/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 Martin Matuska. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid) \
	((dbuf)->db.db_object == (obj) && \
	(dbuf)->db_objset == (os) && \
	(dbuf)->db_level == (level) && \
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
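	 *
	 * Required acquisition order, shown only as an illustrative sketch
	 * (this is the pattern dbuf_find() and dbuf_hash_insert() above
	 * already follow, not new behavior):
	 *
	 *	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	 *	mutex_enter(&db->db_mtx);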
188 */ 189 ASSERT(refcount_is_zero(&db->db_holds)); 190 ASSERT(db->db_state == DB_EVICTING); 191 ASSERT(!MUTEX_HELD(&db->db_mtx)); 192 193 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 194 dbp = &h->hash_table[idx]; 195 while ((dbf = *dbp) != db) { 196 dbp = &dbf->db_hash_next; 197 ASSERT(dbf != NULL); 198 } 199 *dbp = db->db_hash_next; 200 db->db_hash_next = NULL; 201 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 202 atomic_add_64(&dbuf_hash_count, -1); 203 } 204 205 static arc_evict_func_t dbuf_do_evict; 206 207 static void 208 dbuf_evict_user(dmu_buf_impl_t *db) 209 { 210 ASSERT(MUTEX_HELD(&db->db_mtx)); 211 212 if (db->db_level != 0 || db->db_evict_func == NULL) 213 return; 214 215 if (db->db_user_data_ptr_ptr) 216 *db->db_user_data_ptr_ptr = db->db.db_data; 217 db->db_evict_func(&db->db, db->db_user_ptr); 218 db->db_user_ptr = NULL; 219 db->db_user_data_ptr_ptr = NULL; 220 db->db_evict_func = NULL; 221 } 222 223 boolean_t 224 dbuf_is_metadata(dmu_buf_impl_t *db) 225 { 226 if (db->db_level > 0) { 227 return (B_TRUE); 228 } else { 229 boolean_t is_metadata; 230 231 DB_DNODE_ENTER(db); 232 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 233 DB_DNODE_EXIT(db); 234 235 return (is_metadata); 236 } 237 } 238 239 void 240 dbuf_evict(dmu_buf_impl_t *db) 241 { 242 ASSERT(MUTEX_HELD(&db->db_mtx)); 243 ASSERT(db->db_buf == NULL); 244 ASSERT(db->db_data_pending == NULL); 245 246 dbuf_clear(db); 247 dbuf_destroy(db); 248 } 249 250 void 251 dbuf_init(void) 252 { 253 uint64_t hsize = 1ULL << 16; 254 dbuf_hash_table_t *h = &dbuf_hash_table; 255 int i; 256 257 /* 258 * The hash table is big enough to fill all of physical memory 259 * with an average 4K block size. The table will take up 260 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 261 */ 262 while (hsize * 4096 < physmem * PAGESIZE) 263 hsize <<= 1; 264 265 retry: 266 h->hash_table_mask = hsize - 1; 267 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 268 if (h->hash_table == NULL) { 269 /* XXX - we should really return an error instead of assert */ 270 ASSERT(hsize > (1ULL << 10)); 271 hsize >>= 1; 272 goto retry; 273 } 274 275 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 276 sizeof (dmu_buf_impl_t), 277 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 278 279 for (i = 0; i < DBUF_MUTEXES; i++) 280 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 281 } 282 283 void 284 dbuf_fini(void) 285 { 286 dbuf_hash_table_t *h = &dbuf_hash_table; 287 int i; 288 289 for (i = 0; i < DBUF_MUTEXES; i++) 290 mutex_destroy(&h->hash_mutexes[i]); 291 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 292 kmem_cache_destroy(dbuf_cache); 293 } 294 295 /* 296 * Other stuff. 
297 */ 298 299 #ifdef ZFS_DEBUG 300 static void 301 dbuf_verify(dmu_buf_impl_t *db) 302 { 303 dnode_t *dn; 304 dbuf_dirty_record_t *dr; 305 306 ASSERT(MUTEX_HELD(&db->db_mtx)); 307 308 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 309 return; 310 311 ASSERT(db->db_objset != NULL); 312 DB_DNODE_ENTER(db); 313 dn = DB_DNODE(db); 314 if (dn == NULL) { 315 ASSERT(db->db_parent == NULL); 316 ASSERT(db->db_blkptr == NULL); 317 } else { 318 ASSERT3U(db->db.db_object, ==, dn->dn_object); 319 ASSERT3P(db->db_objset, ==, dn->dn_objset); 320 ASSERT3U(db->db_level, <, dn->dn_nlevels); 321 ASSERT(db->db_blkid == DMU_BONUS_BLKID || 322 db->db_blkid == DMU_SPILL_BLKID || 323 !list_is_empty(&dn->dn_dbufs)); 324 } 325 if (db->db_blkid == DMU_BONUS_BLKID) { 326 ASSERT(dn != NULL); 327 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 328 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 329 } else if (db->db_blkid == DMU_SPILL_BLKID) { 330 ASSERT(dn != NULL); 331 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 332 ASSERT0(db->db.db_offset); 333 } else { 334 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 335 } 336 337 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 338 ASSERT(dr->dr_dbuf == db); 339 340 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 341 ASSERT(dr->dr_dbuf == db); 342 343 /* 344 * We can't assert that db_size matches dn_datablksz because it 345 * can be momentarily different when another thread is doing 346 * dnode_set_blksz(). 347 */ 348 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 349 dr = db->db_data_pending; 350 /* 351 * It should only be modified in syncing context, so 352 * make sure we only have one copy of the data. 353 */ 354 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 355 } 356 357 /* verify db->db_blkptr */ 358 if (db->db_blkptr) { 359 if (db->db_parent == dn->dn_dbuf) { 360 /* db is pointed to by the dnode */ 361 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 362 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 363 ASSERT(db->db_parent == NULL); 364 else 365 ASSERT(db->db_parent != NULL); 366 if (db->db_blkid != DMU_SPILL_BLKID) 367 ASSERT3P(db->db_blkptr, ==, 368 &dn->dn_phys->dn_blkptr[db->db_blkid]); 369 } else { 370 /* db is pointed to by an indirect block */ 371 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 372 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 373 ASSERT3U(db->db_parent->db.db_object, ==, 374 db->db.db_object); 375 /* 376 * dnode_grow_indblksz() can make this fail if we don't 377 * have the struct_rwlock. XXX indblksz no longer 378 * grows. safe to do this now? 379 */ 380 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 381 ASSERT3P(db->db_blkptr, ==, 382 ((blkptr_t *)db->db_parent->db.db_data + 383 db->db_blkid % epb)); 384 } 385 } 386 } 387 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 388 (db->db_buf == NULL || db->db_buf->b_data) && 389 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 390 db->db_state != DB_FILL && !dn->dn_free_txg) { 391 /* 392 * If the blkptr isn't set but they have nonzero data, 393 * it had better be dirty, otherwise we'll lose that 394 * data when we evict this buffer. 
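		 *
		 * In other words, with no dirty record the buffer must be
		 * all zeros, which is exactly what the loop below checks,
		 * one uint64_t word at a time (db_size >> 3 words).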
395 */ 396 if (db->db_dirtycnt == 0) { 397 uint64_t *buf = db->db.db_data; 398 int i; 399 400 for (i = 0; i < db->db.db_size >> 3; i++) { 401 ASSERT(buf[i] == 0); 402 } 403 } 404 } 405 DB_DNODE_EXIT(db); 406 } 407 #endif 408 409 static void 410 dbuf_update_data(dmu_buf_impl_t *db) 411 { 412 ASSERT(MUTEX_HELD(&db->db_mtx)); 413 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 414 ASSERT(!refcount_is_zero(&db->db_holds)); 415 *db->db_user_data_ptr_ptr = db->db.db_data; 416 } 417 } 418 419 static void 420 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 421 { 422 ASSERT(MUTEX_HELD(&db->db_mtx)); 423 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 424 db->db_buf = buf; 425 if (buf != NULL) { 426 ASSERT(buf->b_data != NULL); 427 db->db.db_data = buf->b_data; 428 if (!arc_released(buf)) 429 arc_set_callback(buf, dbuf_do_evict, db); 430 dbuf_update_data(db); 431 } else { 432 dbuf_evict_user(db); 433 db->db.db_data = NULL; 434 if (db->db_state != DB_NOFILL) 435 db->db_state = DB_UNCACHED; 436 } 437 } 438 439 /* 440 * Loan out an arc_buf for read. Return the loaned arc_buf. 441 */ 442 arc_buf_t * 443 dbuf_loan_arcbuf(dmu_buf_impl_t *db) 444 { 445 arc_buf_t *abuf; 446 447 mutex_enter(&db->db_mtx); 448 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 449 int blksz = db->db.db_size; 450 spa_t *spa; 451 452 mutex_exit(&db->db_mtx); 453 DB_GET_SPA(&spa, db); 454 abuf = arc_loan_buf(spa, blksz); 455 bcopy(db->db.db_data, abuf->b_data, blksz); 456 } else { 457 abuf = db->db_buf; 458 arc_loan_inuse_buf(abuf, db); 459 dbuf_set_data(db, NULL); 460 mutex_exit(&db->db_mtx); 461 } 462 return (abuf); 463 } 464 465 uint64_t 466 dbuf_whichblock(dnode_t *dn, uint64_t offset) 467 { 468 if (dn->dn_datablkshift) { 469 return (offset >> dn->dn_datablkshift); 470 } else { 471 ASSERT3U(offset, <, dn->dn_datablksz); 472 return (0); 473 } 474 } 475 476 static void 477 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 478 { 479 dmu_buf_impl_t *db = vdb; 480 481 mutex_enter(&db->db_mtx); 482 ASSERT3U(db->db_state, ==, DB_READ); 483 /* 484 * All reads are synchronous, so we must have a hold on the dbuf 485 */ 486 ASSERT(refcount_count(&db->db_holds) > 0); 487 ASSERT(db->db_buf == NULL); 488 ASSERT(db->db.db_data == NULL); 489 if (db->db_level == 0 && db->db_freed_in_flight) { 490 /* we were freed in flight; disregard any error */ 491 arc_release(buf, db); 492 bzero(buf->b_data, db->db.db_size); 493 arc_buf_freeze(buf); 494 db->db_freed_in_flight = FALSE; 495 dbuf_set_data(db, buf); 496 db->db_state = DB_CACHED; 497 } else if (zio == NULL || zio->io_error == 0) { 498 dbuf_set_data(db, buf); 499 db->db_state = DB_CACHED; 500 } else { 501 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 502 ASSERT3P(db->db_buf, ==, NULL); 503 VERIFY(arc_buf_remove_ref(buf, db)); 504 db->db_state = DB_UNCACHED; 505 } 506 cv_broadcast(&db->db_changed); 507 dbuf_rele_and_unlock(db, NULL); 508 } 509 510 static void 511 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 512 { 513 dnode_t *dn; 514 spa_t *spa; 515 zbookmark_t zb; 516 uint32_t aflags = ARC_NOWAIT; 517 518 DB_DNODE_ENTER(db); 519 dn = DB_DNODE(db); 520 ASSERT(!refcount_is_zero(&db->db_holds)); 521 /* We need the struct_rwlock to prevent db_blkptr from changing. 
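	 *
	 * Callers that do not already hold dn_struct_rwlock omit
	 * DB_RF_HAVESTRUCT from their dbuf_read() flags, and dbuf_read()
	 * (below) takes the lock as reader around this call on their
	 * behalf.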
*/ 522 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 523 ASSERT(MUTEX_HELD(&db->db_mtx)); 524 ASSERT(db->db_state == DB_UNCACHED); 525 ASSERT(db->db_buf == NULL); 526 527 if (db->db_blkid == DMU_BONUS_BLKID) { 528 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 529 530 ASSERT3U(bonuslen, <=, db->db.db_size); 531 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 532 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 533 if (bonuslen < DN_MAX_BONUSLEN) 534 bzero(db->db.db_data, DN_MAX_BONUSLEN); 535 if (bonuslen) 536 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 537 DB_DNODE_EXIT(db); 538 dbuf_update_data(db); 539 db->db_state = DB_CACHED; 540 mutex_exit(&db->db_mtx); 541 return; 542 } 543 544 /* 545 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 546 * processes the delete record and clears the bp while we are waiting 547 * for the dn_mtx (resulting in a "no" from block_freed). 548 */ 549 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 550 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 551 BP_IS_HOLE(db->db_blkptr)))) { 552 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 553 554 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, 555 db->db.db_size, db, type)); 556 DB_DNODE_EXIT(db); 557 bzero(db->db.db_data, db->db.db_size); 558 db->db_state = DB_CACHED; 559 *flags |= DB_RF_CACHED; 560 mutex_exit(&db->db_mtx); 561 return; 562 } 563 564 spa = dn->dn_objset->os_spa; 565 DB_DNODE_EXIT(db); 566 567 db->db_state = DB_READ; 568 mutex_exit(&db->db_mtx); 569 570 if (DBUF_IS_L2CACHEABLE(db)) 571 aflags |= ARC_L2CACHE; 572 573 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 574 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 575 db->db.db_object, db->db_level, db->db_blkid); 576 577 dbuf_add_ref(db, NULL); 578 579 (void) arc_read(zio, spa, db->db_blkptr, 580 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 581 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 582 &aflags, &zb); 583 if (aflags & ARC_CACHED) 584 *flags |= DB_RF_CACHED; 585 } 586 587 int 588 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 589 { 590 int err = 0; 591 int havepzio = (zio != NULL); 592 int prefetch; 593 dnode_t *dn; 594 595 /* 596 * We don't have to hold the mutex to check db_state because it 597 * can't be freed while we have a hold on the buffer. 
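	 *
	 * An illustrative caller pattern (a sketch only, not code from this
	 * function; FTAG is the usual hold tag):
	 *
	 *	db = dbuf_hold(dn, blkid, FTAG);
	 *	if (db != NULL) {
	 *		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
	 *		...
	 *		dbuf_rele(db, FTAG);
	 *	}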
598 */ 599 ASSERT(!refcount_is_zero(&db->db_holds)); 600 601 if (db->db_state == DB_NOFILL) 602 return (SET_ERROR(EIO)); 603 604 DB_DNODE_ENTER(db); 605 dn = DB_DNODE(db); 606 if ((flags & DB_RF_HAVESTRUCT) == 0) 607 rw_enter(&dn->dn_struct_rwlock, RW_READER); 608 609 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 610 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 611 DBUF_IS_CACHEABLE(db); 612 613 mutex_enter(&db->db_mtx); 614 if (db->db_state == DB_CACHED) { 615 mutex_exit(&db->db_mtx); 616 if (prefetch) 617 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 618 db->db.db_size, TRUE); 619 if ((flags & DB_RF_HAVESTRUCT) == 0) 620 rw_exit(&dn->dn_struct_rwlock); 621 DB_DNODE_EXIT(db); 622 } else if (db->db_state == DB_UNCACHED) { 623 spa_t *spa = dn->dn_objset->os_spa; 624 625 if (zio == NULL) 626 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 627 dbuf_read_impl(db, zio, &flags); 628 629 /* dbuf_read_impl has dropped db_mtx for us */ 630 631 if (prefetch) 632 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 633 db->db.db_size, flags & DB_RF_CACHED); 634 635 if ((flags & DB_RF_HAVESTRUCT) == 0) 636 rw_exit(&dn->dn_struct_rwlock); 637 DB_DNODE_EXIT(db); 638 639 if (!havepzio) 640 err = zio_wait(zio); 641 } else { 642 mutex_exit(&db->db_mtx); 643 if (prefetch) 644 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 645 db->db.db_size, TRUE); 646 if ((flags & DB_RF_HAVESTRUCT) == 0) 647 rw_exit(&dn->dn_struct_rwlock); 648 DB_DNODE_EXIT(db); 649 650 mutex_enter(&db->db_mtx); 651 if ((flags & DB_RF_NEVERWAIT) == 0) { 652 while (db->db_state == DB_READ || 653 db->db_state == DB_FILL) { 654 ASSERT(db->db_state == DB_READ || 655 (flags & DB_RF_HAVESTRUCT) == 0); 656 cv_wait(&db->db_changed, &db->db_mtx); 657 } 658 if (db->db_state == DB_UNCACHED) 659 err = SET_ERROR(EIO); 660 } 661 mutex_exit(&db->db_mtx); 662 } 663 664 ASSERT(err || havepzio || db->db_state == DB_CACHED); 665 return (err); 666 } 667 668 static void 669 dbuf_noread(dmu_buf_impl_t *db) 670 { 671 ASSERT(!refcount_is_zero(&db->db_holds)); 672 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 673 mutex_enter(&db->db_mtx); 674 while (db->db_state == DB_READ || db->db_state == DB_FILL) 675 cv_wait(&db->db_changed, &db->db_mtx); 676 if (db->db_state == DB_UNCACHED) { 677 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 678 spa_t *spa; 679 680 ASSERT(db->db_buf == NULL); 681 ASSERT(db->db.db_data == NULL); 682 DB_GET_SPA(&spa, db); 683 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 684 db->db_state = DB_FILL; 685 } else if (db->db_state == DB_NOFILL) { 686 dbuf_set_data(db, NULL); 687 } else { 688 ASSERT3U(db->db_state, ==, DB_CACHED); 689 } 690 mutex_exit(&db->db_mtx); 691 } 692 693 /* 694 * This is our just-in-time copy function. It makes a copy of 695 * buffers, that have been modified in a previous transaction 696 * group, before we modify them in the current active group. 697 * 698 * This function is used in two places: when we are dirtying a 699 * buffer for the first time in a txg, and when we are freeing 700 * a range in a dnode that includes this buffer. 701 * 702 * Note that when we are called from dbuf_free_range() we do 703 * not put a hold on the buffer, we just traverse the active 704 * dbuf list for the dnode. 
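 *
 * A worked example of the copy decision below (numbers are illustrative
 * only): with 3 holds and db_dirtycnt == 1, holds > dirtycnt, so the old
 * dirty record is given its own copy of the data; with 1 hold and
 * db_dirtycnt == 1 every hold belongs to a dirty record, so we can simply
 * null out db_data instead of copying.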
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		zio_free(spa, txg, bp);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
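 *
 * dbuf_rm_spill() later in this file is the simplest example of this
 * interface: it frees just the spill block by calling
 * dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx).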
791 */ 792 void 793 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) 794 { 795 dmu_buf_impl_t *db, *db_next; 796 uint64_t txg = tx->tx_txg; 797 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 798 uint64_t first_l1 = start >> epbs; 799 uint64_t last_l1 = end >> epbs; 800 801 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { 802 end = dn->dn_maxblkid; 803 last_l1 = end >> epbs; 804 } 805 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); 806 mutex_enter(&dn->dn_dbufs_mtx); 807 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 808 db_next = list_next(&dn->dn_dbufs, db); 809 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 810 811 if (db->db_level == 1 && 812 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { 813 mutex_enter(&db->db_mtx); 814 if (db->db_last_dirty && 815 db->db_last_dirty->dr_txg < txg) { 816 dbuf_add_ref(db, FTAG); 817 mutex_exit(&db->db_mtx); 818 dbuf_will_dirty(db, tx); 819 dbuf_rele(db, FTAG); 820 } else { 821 mutex_exit(&db->db_mtx); 822 } 823 } 824 825 if (db->db_level != 0) 826 continue; 827 dprintf_dbuf(db, "found buf %s\n", ""); 828 if (db->db_blkid < start || db->db_blkid > end) 829 continue; 830 831 /* found a level 0 buffer in the range */ 832 mutex_enter(&db->db_mtx); 833 if (dbuf_undirty(db, tx)) { 834 /* mutex has been dropped and dbuf destroyed */ 835 continue; 836 } 837 838 if (db->db_state == DB_UNCACHED || 839 db->db_state == DB_NOFILL || 840 db->db_state == DB_EVICTING) { 841 ASSERT(db->db.db_data == NULL); 842 mutex_exit(&db->db_mtx); 843 continue; 844 } 845 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 846 /* will be handled in dbuf_read_done or dbuf_rele */ 847 db->db_freed_in_flight = TRUE; 848 mutex_exit(&db->db_mtx); 849 continue; 850 } 851 if (refcount_count(&db->db_holds) == 0) { 852 ASSERT(db->db_buf); 853 dbuf_clear(db); 854 continue; 855 } 856 /* The dbuf is referenced */ 857 858 if (db->db_last_dirty != NULL) { 859 dbuf_dirty_record_t *dr = db->db_last_dirty; 860 861 if (dr->dr_txg == txg) { 862 /* 863 * This buffer is "in-use", re-adjust the file 864 * size to reflect that this buffer may 865 * contain new data when we sync. 866 */ 867 if (db->db_blkid != DMU_SPILL_BLKID && 868 db->db_blkid > dn->dn_maxblkid) 869 dn->dn_maxblkid = db->db_blkid; 870 dbuf_unoverride(dr); 871 } else { 872 /* 873 * This dbuf is not dirty in the open context. 874 * Either uncache it (if its not referenced in 875 * the open context) or reset its contents to 876 * empty. 877 */ 878 dbuf_fix_old_data(db, txg); 879 } 880 } 881 /* clear the contents if its cached */ 882 if (db->db_state == DB_CACHED) { 883 ASSERT(db->db.db_data != NULL); 884 arc_release(db->db_buf, db); 885 bzero(db->db.db_data, db->db.db_size); 886 arc_buf_freeze(db->db_buf); 887 } 888 889 mutex_exit(&db->db_mtx); 890 } 891 mutex_exit(&dn->dn_dbufs_mtx); 892 } 893 894 static int 895 dbuf_block_freeable(dmu_buf_impl_t *db) 896 { 897 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 898 uint64_t birth_txg = 0; 899 900 /* 901 * We don't need any locking to protect db_blkptr: 902 * If it's syncing, then db_last_dirty will be set 903 * so we'll ignore db_blkptr. 904 */ 905 ASSERT(MUTEX_HELD(&db->db_mtx)); 906 if (db->db_last_dirty) 907 birth_txg = db->db_last_dirty->dr_txg; 908 else if (db->db_blkptr) 909 birth_txg = db->db_blkptr->blk_birth; 910 911 /* 912 * If we don't exist or are in a snapshot, we can't be freed. 
913 * Don't pass the bp to dsl_dataset_block_freeable() since we 914 * are holding the db_mtx lock and might deadlock if we are 915 * prefetching a dedup-ed block. 916 */ 917 if (birth_txg) 918 return (ds == NULL || 919 dsl_dataset_block_freeable(ds, NULL, birth_txg)); 920 else 921 return (FALSE); 922 } 923 924 void 925 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 926 { 927 arc_buf_t *buf, *obuf; 928 int osize = db->db.db_size; 929 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 930 dnode_t *dn; 931 932 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 933 934 DB_DNODE_ENTER(db); 935 dn = DB_DNODE(db); 936 937 /* XXX does *this* func really need the lock? */ 938 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 939 940 /* 941 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 942 * is OK, because there can be no other references to the db 943 * when we are changing its size, so no concurrent DB_FILL can 944 * be happening. 945 */ 946 /* 947 * XXX we should be doing a dbuf_read, checking the return 948 * value and returning that up to our callers 949 */ 950 dbuf_will_dirty(db, tx); 951 952 /* create the data buffer for the new block */ 953 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 954 955 /* copy old block data to the new block */ 956 obuf = db->db_buf; 957 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 958 /* zero the remainder */ 959 if (size > osize) 960 bzero((uint8_t *)buf->b_data + osize, size - osize); 961 962 mutex_enter(&db->db_mtx); 963 dbuf_set_data(db, buf); 964 VERIFY(arc_buf_remove_ref(obuf, db)); 965 db->db.db_size = size; 966 967 if (db->db_level == 0) { 968 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 969 db->db_last_dirty->dt.dl.dr_data = buf; 970 } 971 mutex_exit(&db->db_mtx); 972 973 dnode_willuse_space(dn, size-osize, tx); 974 DB_DNODE_EXIT(db); 975 } 976 977 void 978 dbuf_release_bp(dmu_buf_impl_t *db) 979 { 980 objset_t *os; 981 982 DB_GET_OBJSET(&os, db); 983 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 984 ASSERT(arc_released(os->os_phys_buf) || 985 list_link_active(&os->os_dsl_dataset->ds_synced_link)); 986 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 987 988 (void) arc_release(db->db_buf, db); 989 } 990 991 dbuf_dirty_record_t * 992 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 993 { 994 dnode_t *dn; 995 objset_t *os; 996 dbuf_dirty_record_t **drp, *dr; 997 int drop_struct_lock = FALSE; 998 boolean_t do_free_accounting = B_FALSE; 999 int txgoff = tx->tx_txg & TXG_MASK; 1000 1001 ASSERT(tx->tx_txg != 0); 1002 ASSERT(!refcount_is_zero(&db->db_holds)); 1003 DMU_TX_DIRTY_BUF(tx, db); 1004 1005 DB_DNODE_ENTER(db); 1006 dn = DB_DNODE(db); 1007 /* 1008 * Shouldn't dirty a regular buffer in syncing context. Private 1009 * objects may be dirtied in syncing context, but only if they 1010 * were already pre-dirtied in open context. 1011 */ 1012 ASSERT(!dmu_tx_is_syncing(tx) || 1013 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1014 DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1015 dn->dn_objset->os_dsl_dataset == NULL); 1016 /* 1017 * We make this assert for private objects as well, but after we 1018 * check if we're already dirty. They are allowed to re-dirty 1019 * in syncing context. 1020 */ 1021 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1022 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1023 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1024 1025 mutex_enter(&db->db_mtx); 1026 /* 1027 * XXX make this true for indirects too? 
The problem is that 1028 * transactions created with dmu_tx_create_assigned() from 1029 * syncing context don't bother holding ahead. 1030 */ 1031 ASSERT(db->db_level != 0 || 1032 db->db_state == DB_CACHED || db->db_state == DB_FILL || 1033 db->db_state == DB_NOFILL); 1034 1035 mutex_enter(&dn->dn_mtx); 1036 /* 1037 * Don't set dirtyctx to SYNC if we're just modifying this as we 1038 * initialize the objset. 1039 */ 1040 if (dn->dn_dirtyctx == DN_UNDIRTIED && 1041 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1042 dn->dn_dirtyctx = 1043 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1044 ASSERT(dn->dn_dirtyctx_firstset == NULL); 1045 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1046 } 1047 mutex_exit(&dn->dn_mtx); 1048 1049 if (db->db_blkid == DMU_SPILL_BLKID) 1050 dn->dn_have_spill = B_TRUE; 1051 1052 /* 1053 * If this buffer is already dirty, we're done. 1054 */ 1055 drp = &db->db_last_dirty; 1056 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1057 db->db.db_object == DMU_META_DNODE_OBJECT); 1058 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1059 drp = &dr->dr_next; 1060 if (dr && dr->dr_txg == tx->tx_txg) { 1061 DB_DNODE_EXIT(db); 1062 1063 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1064 /* 1065 * If this buffer has already been written out, 1066 * we now need to reset its state. 1067 */ 1068 dbuf_unoverride(dr); 1069 if (db->db.db_object != DMU_META_DNODE_OBJECT && 1070 db->db_state != DB_NOFILL) 1071 arc_buf_thaw(db->db_buf); 1072 } 1073 mutex_exit(&db->db_mtx); 1074 return (dr); 1075 } 1076 1077 /* 1078 * Only valid if not already dirty. 1079 */ 1080 ASSERT(dn->dn_object == 0 || 1081 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1082 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1083 1084 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1085 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1086 dn->dn_phys->dn_nlevels > db->db_level || 1087 dn->dn_next_nlevels[txgoff] > db->db_level || 1088 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1089 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1090 1091 /* 1092 * We should only be dirtying in syncing context if it's the 1093 * mos or we're initializing the os or it's a special object. 1094 * However, we are allowed to dirty in syncing context provided 1095 * we already dirtied it in open context. Hence we must make 1096 * this assertion only if we're not already dirty. 1097 */ 1098 os = dn->dn_objset; 1099 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1100 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1101 ASSERT(db->db.db_size != 0); 1102 1103 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1104 1105 if (db->db_blkid != DMU_BONUS_BLKID) { 1106 /* 1107 * Update the accounting. 1108 * Note: we delay "free accounting" until after we drop 1109 * the db_mtx. This keeps us from grabbing other locks 1110 * (and possibly deadlocking) in bp_get_dsize() while 1111 * also holding the db_mtx. 1112 */ 1113 dnode_willuse_space(dn, db->db.db_size, tx); 1114 do_free_accounting = dbuf_block_freeable(db); 1115 } 1116 1117 /* 1118 * If this buffer is dirty in an old transaction group we need 1119 * to make a copy of it so that the changes we make in this 1120 * transaction group won't leak out when we sync the older txg. 
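	 *
	 * For example (txg numbers are illustrative): if this dbuf was
	 * dirtied in txg 8, which may still be syncing, and is now being
	 * dirtied again in txg 10, the txg 8 dirty record must keep its
	 * own copy of the data; otherwise the txg 10 changes could be
	 * written out as part of txg 8.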
1121 */ 1122 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1123 if (db->db_level == 0) { 1124 void *data_old = db->db_buf; 1125 1126 if (db->db_state != DB_NOFILL) { 1127 if (db->db_blkid == DMU_BONUS_BLKID) { 1128 dbuf_fix_old_data(db, tx->tx_txg); 1129 data_old = db->db.db_data; 1130 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1131 /* 1132 * Release the data buffer from the cache so 1133 * that we can modify it without impacting 1134 * possible other users of this cached data 1135 * block. Note that indirect blocks and 1136 * private objects are not released until the 1137 * syncing state (since they are only modified 1138 * then). 1139 */ 1140 arc_release(db->db_buf, db); 1141 dbuf_fix_old_data(db, tx->tx_txg); 1142 data_old = db->db_buf; 1143 } 1144 ASSERT(data_old != NULL); 1145 } 1146 dr->dt.dl.dr_data = data_old; 1147 } else { 1148 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1149 list_create(&dr->dt.di.dr_children, 1150 sizeof (dbuf_dirty_record_t), 1151 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1152 } 1153 dr->dr_dbuf = db; 1154 dr->dr_txg = tx->tx_txg; 1155 dr->dr_next = *drp; 1156 *drp = dr; 1157 1158 /* 1159 * We could have been freed_in_flight between the dbuf_noread 1160 * and dbuf_dirty. We win, as though the dbuf_noread() had 1161 * happened after the free. 1162 */ 1163 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1164 db->db_blkid != DMU_SPILL_BLKID) { 1165 mutex_enter(&dn->dn_mtx); 1166 dnode_clear_range(dn, db->db_blkid, 1, tx); 1167 mutex_exit(&dn->dn_mtx); 1168 db->db_freed_in_flight = FALSE; 1169 } 1170 1171 /* 1172 * This buffer is now part of this txg 1173 */ 1174 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1175 db->db_dirtycnt += 1; 1176 ASSERT3U(db->db_dirtycnt, <=, 3); 1177 1178 mutex_exit(&db->db_mtx); 1179 1180 if (db->db_blkid == DMU_BONUS_BLKID || 1181 db->db_blkid == DMU_SPILL_BLKID) { 1182 mutex_enter(&dn->dn_mtx); 1183 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1184 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1185 mutex_exit(&dn->dn_mtx); 1186 dnode_setdirty(dn, tx); 1187 DB_DNODE_EXIT(db); 1188 return (dr); 1189 } else if (do_free_accounting) { 1190 blkptr_t *bp = db->db_blkptr; 1191 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1192 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1193 /* 1194 * This is only a guess -- if the dbuf is dirty 1195 * in a previous txg, we don't know how much 1196 * space it will use on disk yet. We should 1197 * really have the struct_rwlock to access 1198 * db_blkptr, but since this is just a guess, 1199 * it's OK if we get an odd answer. 
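		 *
		 * When the bp is missing or is a hole, the code above falls
		 * back to charging db.db_size, since the true on-disk size
		 * is only known once the block is actually written.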
1200 */ 1201 ddt_prefetch(os->os_spa, bp); 1202 dnode_willuse_space(dn, -willfree, tx); 1203 } 1204 1205 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1206 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1207 drop_struct_lock = TRUE; 1208 } 1209 1210 if (db->db_level == 0) { 1211 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1212 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1213 } 1214 1215 if (db->db_level+1 < dn->dn_nlevels) { 1216 dmu_buf_impl_t *parent = db->db_parent; 1217 dbuf_dirty_record_t *di; 1218 int parent_held = FALSE; 1219 1220 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1221 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1222 1223 parent = dbuf_hold_level(dn, db->db_level+1, 1224 db->db_blkid >> epbs, FTAG); 1225 ASSERT(parent != NULL); 1226 parent_held = TRUE; 1227 } 1228 if (drop_struct_lock) 1229 rw_exit(&dn->dn_struct_rwlock); 1230 ASSERT3U(db->db_level+1, ==, parent->db_level); 1231 di = dbuf_dirty(parent, tx); 1232 if (parent_held) 1233 dbuf_rele(parent, FTAG); 1234 1235 mutex_enter(&db->db_mtx); 1236 /* possible race with dbuf_undirty() */ 1237 if (db->db_last_dirty == dr || 1238 dn->dn_object == DMU_META_DNODE_OBJECT) { 1239 mutex_enter(&di->dt.di.dr_mtx); 1240 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1241 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1242 list_insert_tail(&di->dt.di.dr_children, dr); 1243 mutex_exit(&di->dt.di.dr_mtx); 1244 dr->dr_parent = di; 1245 } 1246 mutex_exit(&db->db_mtx); 1247 } else { 1248 ASSERT(db->db_level+1 == dn->dn_nlevels); 1249 ASSERT(db->db_blkid < dn->dn_nblkptr); 1250 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1251 mutex_enter(&dn->dn_mtx); 1252 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1253 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1254 mutex_exit(&dn->dn_mtx); 1255 if (drop_struct_lock) 1256 rw_exit(&dn->dn_struct_rwlock); 1257 } 1258 1259 dnode_setdirty(dn, tx); 1260 DB_DNODE_EXIT(db); 1261 return (dr); 1262 } 1263 1264 /* 1265 * Return TRUE if this evicted the dbuf. 1266 */ 1267 static boolean_t 1268 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1269 { 1270 dnode_t *dn; 1271 uint64_t txg = tx->tx_txg; 1272 dbuf_dirty_record_t *dr, **drp; 1273 1274 ASSERT(txg != 0); 1275 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1276 ASSERT0(db->db_level); 1277 ASSERT(MUTEX_HELD(&db->db_mtx)); 1278 1279 /* 1280 * If this buffer is not dirty, we're done. 1281 */ 1282 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1283 if (dr->dr_txg <= txg) 1284 break; 1285 if (dr == NULL || dr->dr_txg < txg) 1286 return (B_FALSE); 1287 ASSERT(dr->dr_txg == txg); 1288 ASSERT(dr->dr_dbuf == db); 1289 1290 DB_DNODE_ENTER(db); 1291 dn = DB_DNODE(db); 1292 1293 /* 1294 * Note: This code will probably work even if there are concurrent 1295 * holders, but it is untested in that scenerio, as the ZPL and 1296 * ztest have additional locking (the range locks) that prevents 1297 * that type of concurrent access. 1298 */ 1299 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt); 1300 1301 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1302 1303 ASSERT(db->db.db_size != 0); 1304 1305 /* XXX would be nice to fix up dn_towrite_space[] */ 1306 1307 *drp = dr->dr_next; 1308 1309 /* 1310 * Note that there are three places in dbuf_dirty() 1311 * where this dirty record may be put on a list. 1312 * Make sure to do a list_remove corresponding to 1313 * every one of those list_insert calls. 
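	 *
	 * Specifically (see dbuf_dirty() above): bonus and spill dbufs are
	 * put directly on dn_dirty_records[]; a dbuf with an indirect
	 * parent is put on that parent's dr_children list; and a dbuf whose
	 * parent is the dnode itself is also put on dn_dirty_records[].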
1314 */ 1315 if (dr->dr_parent) { 1316 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1317 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1318 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1319 } else if (db->db_blkid == DMU_SPILL_BLKID || 1320 db->db_level+1 == dn->dn_nlevels) { 1321 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1322 mutex_enter(&dn->dn_mtx); 1323 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1324 mutex_exit(&dn->dn_mtx); 1325 } 1326 DB_DNODE_EXIT(db); 1327 1328 if (db->db_state != DB_NOFILL) { 1329 dbuf_unoverride(dr); 1330 1331 ASSERT(db->db_buf != NULL); 1332 ASSERT(dr->dt.dl.dr_data != NULL); 1333 if (dr->dt.dl.dr_data != db->db_buf) 1334 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); 1335 } 1336 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1337 1338 ASSERT(db->db_dirtycnt > 0); 1339 db->db_dirtycnt -= 1; 1340 1341 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1342 arc_buf_t *buf = db->db_buf; 1343 1344 ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1345 dbuf_set_data(db, NULL); 1346 VERIFY(arc_buf_remove_ref(buf, db)); 1347 dbuf_evict(db); 1348 return (B_TRUE); 1349 } 1350 1351 return (B_FALSE); 1352 } 1353 1354 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1355 void 1356 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1357 { 1358 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1359 1360 ASSERT(tx->tx_txg != 0); 1361 ASSERT(!refcount_is_zero(&db->db_holds)); 1362 1363 DB_DNODE_ENTER(db); 1364 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1365 rf |= DB_RF_HAVESTRUCT; 1366 DB_DNODE_EXIT(db); 1367 (void) dbuf_read(db, NULL, rf); 1368 (void) dbuf_dirty(db, tx); 1369 } 1370 1371 void 1372 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1373 { 1374 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1375 1376 db->db_state = DB_NOFILL; 1377 1378 dmu_buf_will_fill(db_fake, tx); 1379 } 1380 1381 void 1382 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1383 { 1384 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1385 1386 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1387 ASSERT(tx->tx_txg != 0); 1388 ASSERT(db->db_level == 0); 1389 ASSERT(!refcount_is_zero(&db->db_holds)); 1390 1391 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1392 dmu_tx_private_ok(tx)); 1393 1394 dbuf_noread(db); 1395 (void) dbuf_dirty(db, tx); 1396 } 1397 1398 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1399 /* ARGSUSED */ 1400 void 1401 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1402 { 1403 mutex_enter(&db->db_mtx); 1404 DBUF_VERIFY(db); 1405 1406 if (db->db_state == DB_FILL) { 1407 if (db->db_level == 0 && db->db_freed_in_flight) { 1408 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1409 /* we were freed while filling */ 1410 /* XXX dbuf_undirty? */ 1411 bzero(db->db.db_data, db->db.db_size); 1412 db->db_freed_in_flight = FALSE; 1413 } 1414 db->db_state = DB_CACHED; 1415 cv_broadcast(&db->db_changed); 1416 } 1417 mutex_exit(&db->db_mtx); 1418 } 1419 1420 /* 1421 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1422 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
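 *
 * An illustrative use of this interface (a sketch only; the real callers
 * live in the DMU, outside this file): loan an anonymous buffer, fill it,
 * then hand it back so the extra copy can be avoided when possible:
 *
 *	abuf = arc_loan_buf(spa, db->db.db_size);
 *	... fill abuf->b_data ...
 *	dbuf_assign_arcbuf(db, abuf, tx);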
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.
For callers from the DMU we will usually see: 1489 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1490 * For the arc callback, we will usually see: 1491 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1492 * Sometimes, though, we will get a mix of these two: 1493 * DMU: dbuf_clear()->arc_buf_evict() 1494 * ARC: dbuf_do_evict()->dbuf_destroy() 1495 */ 1496 void 1497 dbuf_clear(dmu_buf_impl_t *db) 1498 { 1499 dnode_t *dn; 1500 dmu_buf_impl_t *parent = db->db_parent; 1501 dmu_buf_impl_t *dndb; 1502 int dbuf_gone = FALSE; 1503 1504 ASSERT(MUTEX_HELD(&db->db_mtx)); 1505 ASSERT(refcount_is_zero(&db->db_holds)); 1506 1507 dbuf_evict_user(db); 1508 1509 if (db->db_state == DB_CACHED) { 1510 ASSERT(db->db.db_data != NULL); 1511 if (db->db_blkid == DMU_BONUS_BLKID) { 1512 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1513 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1514 } 1515 db->db.db_data = NULL; 1516 db->db_state = DB_UNCACHED; 1517 } 1518 1519 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1520 ASSERT(db->db_data_pending == NULL); 1521 1522 db->db_state = DB_EVICTING; 1523 db->db_blkptr = NULL; 1524 1525 DB_DNODE_ENTER(db); 1526 dn = DB_DNODE(db); 1527 dndb = dn->dn_dbuf; 1528 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1529 list_remove(&dn->dn_dbufs, db); 1530 (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1531 membar_producer(); 1532 DB_DNODE_EXIT(db); 1533 /* 1534 * Decrementing the dbuf count means that the hold corresponding 1535 * to the removed dbuf is no longer discounted in dnode_move(), 1536 * so the dnode cannot be moved until after we release the hold. 1537 * The membar_producer() ensures visibility of the decremented 1538 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1539 * release any lock. 1540 */ 1541 dnode_rele(dn, db); 1542 db->db_dnode_handle = NULL; 1543 } else { 1544 DB_DNODE_EXIT(db); 1545 } 1546 1547 if (db->db_buf) 1548 dbuf_gone = arc_buf_evict(db->db_buf); 1549 1550 if (!dbuf_gone) 1551 mutex_exit(&db->db_mtx); 1552 1553 /* 1554 * If this dbuf is referenced from an indirect dbuf, 1555 * decrement the ref count on the indirect dbuf. 
1556 */ 1557 if (parent && parent != dndb) 1558 dbuf_rele(parent, db); 1559 } 1560 1561 static int 1562 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1563 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1564 { 1565 int nlevels, epbs; 1566 1567 *parentp = NULL; 1568 *bpp = NULL; 1569 1570 ASSERT(blkid != DMU_BONUS_BLKID); 1571 1572 if (blkid == DMU_SPILL_BLKID) { 1573 mutex_enter(&dn->dn_mtx); 1574 if (dn->dn_have_spill && 1575 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1576 *bpp = &dn->dn_phys->dn_spill; 1577 else 1578 *bpp = NULL; 1579 dbuf_add_ref(dn->dn_dbuf, NULL); 1580 *parentp = dn->dn_dbuf; 1581 mutex_exit(&dn->dn_mtx); 1582 return (0); 1583 } 1584 1585 if (dn->dn_phys->dn_nlevels == 0) 1586 nlevels = 1; 1587 else 1588 nlevels = dn->dn_phys->dn_nlevels; 1589 1590 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1591 1592 ASSERT3U(level * epbs, <, 64); 1593 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1594 if (level >= nlevels || 1595 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1596 /* the buffer has no parent yet */ 1597 return (SET_ERROR(ENOENT)); 1598 } else if (level < nlevels-1) { 1599 /* this block is referenced from an indirect block */ 1600 int err = dbuf_hold_impl(dn, level+1, 1601 blkid >> epbs, fail_sparse, NULL, parentp); 1602 if (err) 1603 return (err); 1604 err = dbuf_read(*parentp, NULL, 1605 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1606 if (err) { 1607 dbuf_rele(*parentp, NULL); 1608 *parentp = NULL; 1609 return (err); 1610 } 1611 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1612 (blkid & ((1ULL << epbs) - 1)); 1613 return (0); 1614 } else { 1615 /* the block is referenced from the dnode */ 1616 ASSERT3U(level, ==, nlevels-1); 1617 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1618 blkid < dn->dn_phys->dn_nblkptr); 1619 if (dn->dn_dbuf) { 1620 dbuf_add_ref(dn->dn_dbuf, NULL); 1621 *parentp = dn->dn_dbuf; 1622 } 1623 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1624 return (0); 1625 } 1626 } 1627 1628 static dmu_buf_impl_t * 1629 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1630 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1631 { 1632 objset_t *os = dn->dn_objset; 1633 dmu_buf_impl_t *db, *odb; 1634 1635 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1636 ASSERT(dn->dn_type != DMU_OT_NONE); 1637 1638 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1639 1640 db->db_objset = os; 1641 db->db.db_object = dn->dn_object; 1642 db->db_level = level; 1643 db->db_blkid = blkid; 1644 db->db_last_dirty = NULL; 1645 db->db_dirtycnt = 0; 1646 db->db_dnode_handle = dn->dn_handle; 1647 db->db_parent = parent; 1648 db->db_blkptr = blkptr; 1649 1650 db->db_user_ptr = NULL; 1651 db->db_user_data_ptr_ptr = NULL; 1652 db->db_evict_func = NULL; 1653 db->db_immediate_evict = 0; 1654 db->db_freed_in_flight = 0; 1655 1656 if (blkid == DMU_BONUS_BLKID) { 1657 ASSERT3P(parent, ==, dn->dn_dbuf); 1658 db->db.db_size = DN_MAX_BONUSLEN - 1659 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1660 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1661 db->db.db_offset = DMU_BONUS_BLKID; 1662 db->db_state = DB_UNCACHED; 1663 /* the bonus dbuf is not placed in the hash table */ 1664 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1665 return (db); 1666 } else if (blkid == DMU_SPILL_BLKID) { 1667 db->db.db_size = (blkptr != NULL) ? 1668 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1669 db->db.db_offset = 0; 1670 } else { 1671 int blocksize = 1672 db->db_level ? 
1<<dn->dn_indblkshift : dn->dn_datablksz; 1673 db->db.db_size = blocksize; 1674 db->db.db_offset = db->db_blkid * blocksize; 1675 } 1676 1677 /* 1678 * Hold the dn_dbufs_mtx while we get the new dbuf 1679 * in the hash table *and* added to the dbufs list. 1680 * This prevents a possible deadlock with someone 1681 * trying to look up this dbuf before its added to the 1682 * dn_dbufs list. 1683 */ 1684 mutex_enter(&dn->dn_dbufs_mtx); 1685 db->db_state = DB_EVICTING; 1686 if ((odb = dbuf_hash_insert(db)) != NULL) { 1687 /* someone else inserted it first */ 1688 kmem_cache_free(dbuf_cache, db); 1689 mutex_exit(&dn->dn_dbufs_mtx); 1690 return (odb); 1691 } 1692 list_insert_head(&dn->dn_dbufs, db); 1693 db->db_state = DB_UNCACHED; 1694 mutex_exit(&dn->dn_dbufs_mtx); 1695 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1696 1697 if (parent && parent != dn->dn_dbuf) 1698 dbuf_add_ref(parent, db); 1699 1700 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1701 refcount_count(&dn->dn_holds) > 0); 1702 (void) refcount_add(&dn->dn_holds, db); 1703 (void) atomic_inc_32_nv(&dn->dn_dbufs_count); 1704 1705 dprintf_dbuf(db, "db=%p\n", db); 1706 1707 return (db); 1708 } 1709 1710 static int 1711 dbuf_do_evict(void *private) 1712 { 1713 arc_buf_t *buf = private; 1714 dmu_buf_impl_t *db = buf->b_private; 1715 1716 if (!MUTEX_HELD(&db->db_mtx)) 1717 mutex_enter(&db->db_mtx); 1718 1719 ASSERT(refcount_is_zero(&db->db_holds)); 1720 1721 if (db->db_state != DB_EVICTING) { 1722 ASSERT(db->db_state == DB_CACHED); 1723 DBUF_VERIFY(db); 1724 db->db_buf = NULL; 1725 dbuf_evict(db); 1726 } else { 1727 mutex_exit(&db->db_mtx); 1728 dbuf_destroy(db); 1729 } 1730 return (0); 1731 } 1732 1733 static void 1734 dbuf_destroy(dmu_buf_impl_t *db) 1735 { 1736 ASSERT(refcount_is_zero(&db->db_holds)); 1737 1738 if (db->db_blkid != DMU_BONUS_BLKID) { 1739 /* 1740 * If this dbuf is still on the dn_dbufs list, 1741 * remove it from that list. 1742 */ 1743 if (db->db_dnode_handle != NULL) { 1744 dnode_t *dn; 1745 1746 DB_DNODE_ENTER(db); 1747 dn = DB_DNODE(db); 1748 mutex_enter(&dn->dn_dbufs_mtx); 1749 list_remove(&dn->dn_dbufs, db); 1750 (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1751 mutex_exit(&dn->dn_dbufs_mtx); 1752 DB_DNODE_EXIT(db); 1753 /* 1754 * Decrementing the dbuf count means that the hold 1755 * corresponding to the removed dbuf is no longer 1756 * discounted in dnode_move(), so the dnode cannot be 1757 * moved until after we release the hold. 1758 */ 1759 dnode_rele(dn, db); 1760 db->db_dnode_handle = NULL; 1761 } 1762 dbuf_hash_remove(db); 1763 } 1764 db->db_parent = NULL; 1765 db->db_buf = NULL; 1766 1767 ASSERT(!list_link_active(&db->db_link)); 1768 ASSERT(db->db.db_data == NULL); 1769 ASSERT(db->db_hash_next == NULL); 1770 ASSERT(db->db_blkptr == NULL); 1771 ASSERT(db->db_data_pending == NULL); 1772 1773 kmem_cache_free(dbuf_cache, db); 1774 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1775 } 1776 1777 void 1778 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1779 { 1780 dmu_buf_impl_t *db = NULL; 1781 blkptr_t *bp = NULL; 1782 1783 ASSERT(blkid != DMU_BONUS_BLKID); 1784 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1785 1786 if (dnode_block_freed(dn, blkid)) 1787 return; 1788 1789 /* dbuf_find() returns with db_mtx held */ 1790 if (db = dbuf_find(dn, 0, blkid)) { 1791 /* 1792 * This dbuf is already in the cache. We assume that 1793 * it is already CACHED, or else about to be either 1794 * read or filled. 
1795 */ 1796 mutex_exit(&db->db_mtx); 1797 return; 1798 } 1799 1800 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1801 if (bp && !BP_IS_HOLE(bp)) { 1802 int priority = dn->dn_type == DMU_OT_DDT_ZAP ? 1803 ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; 1804 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1805 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1806 zbookmark_t zb; 1807 1808 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1809 dn->dn_object, 0, blkid); 1810 1811 (void) arc_read(NULL, dn->dn_objset->os_spa, 1812 bp, NULL, NULL, priority, 1813 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1814 &aflags, &zb); 1815 } 1816 if (db) 1817 dbuf_rele(db, NULL); 1818 } 1819 } 1820 1821 /* 1822 * Returns with db_holds incremented, and db_mtx not held. 1823 * Note: dn_struct_rwlock must be held. 1824 */ 1825 int 1826 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1827 void *tag, dmu_buf_impl_t **dbp) 1828 { 1829 dmu_buf_impl_t *db, *parent = NULL; 1830 1831 ASSERT(blkid != DMU_BONUS_BLKID); 1832 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1833 ASSERT3U(dn->dn_nlevels, >, level); 1834 1835 *dbp = NULL; 1836 top: 1837 /* dbuf_find() returns with db_mtx held */ 1838 db = dbuf_find(dn, level, blkid); 1839 1840 if (db == NULL) { 1841 blkptr_t *bp = NULL; 1842 int err; 1843 1844 ASSERT3P(parent, ==, NULL); 1845 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1846 if (fail_sparse) { 1847 if (err == 0 && bp && BP_IS_HOLE(bp)) 1848 err = SET_ERROR(ENOENT); 1849 if (err) { 1850 if (parent) 1851 dbuf_rele(parent, NULL); 1852 return (err); 1853 } 1854 } 1855 if (err && err != ENOENT) 1856 return (err); 1857 db = dbuf_create(dn, level, blkid, parent, bp); 1858 } 1859 1860 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1861 arc_buf_add_ref(db->db_buf, db); 1862 if (db->db_buf->b_data == NULL) { 1863 dbuf_clear(db); 1864 if (parent) { 1865 dbuf_rele(parent, NULL); 1866 parent = NULL; 1867 } 1868 goto top; 1869 } 1870 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1871 } 1872 1873 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1874 1875 /* 1876 * If this buffer is currently syncing out, and we are are 1877 * still referencing it from db_data, we need to make a copy 1878 * of it in case we decide we want to dirty it again in this txg. 1879 */ 1880 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1881 dn->dn_object != DMU_META_DNODE_OBJECT && 1882 db->db_state == DB_CACHED && db->db_data_pending) { 1883 dbuf_dirty_record_t *dr = db->db_data_pending; 1884 1885 if (dr->dt.dl.dr_data == db->db_buf) { 1886 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1887 1888 dbuf_set_data(db, 1889 arc_buf_alloc(dn->dn_objset->os_spa, 1890 db->db.db_size, db, type)); 1891 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1892 db->db.db_size); 1893 } 1894 } 1895 1896 (void) refcount_add(&db->db_holds, tag); 1897 dbuf_update_data(db); 1898 DBUF_VERIFY(db); 1899 mutex_exit(&db->db_mtx); 1900 1901 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1902 if (parent) 1903 dbuf_rele(parent, NULL); 1904 1905 ASSERT3P(DB_DNODE(db), ==, dn); 1906 ASSERT3U(db->db_blkid, ==, blkid); 1907 ASSERT3U(db->db_level, ==, level); 1908 *dbp = db; 1909 1910 return (0); 1911 } 1912 1913 dmu_buf_impl_t * 1914 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1915 { 1916 dmu_buf_impl_t *db; 1917 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1918 return (err ? 
NULL : db); 1919 } 1920 1921 dmu_buf_impl_t * 1922 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1923 { 1924 dmu_buf_impl_t *db; 1925 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1926 return (err ? NULL : db); 1927 } 1928 1929 void 1930 dbuf_create_bonus(dnode_t *dn) 1931 { 1932 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1933 1934 ASSERT(dn->dn_bonus == NULL); 1935 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 1936 } 1937 1938 int 1939 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 1940 { 1941 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1942 dnode_t *dn; 1943 1944 if (db->db_blkid != DMU_SPILL_BLKID) 1945 return (SET_ERROR(ENOTSUP)); 1946 if (blksz == 0) 1947 blksz = SPA_MINBLOCKSIZE; 1948 if (blksz > SPA_MAXBLOCKSIZE) 1949 blksz = SPA_MAXBLOCKSIZE; 1950 else 1951 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 1952 1953 DB_DNODE_ENTER(db); 1954 dn = DB_DNODE(db); 1955 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 1956 dbuf_new_size(db, blksz, tx); 1957 rw_exit(&dn->dn_struct_rwlock); 1958 DB_DNODE_EXIT(db); 1959 1960 return (0); 1961 } 1962 1963 void 1964 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 1965 { 1966 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 1967 } 1968 1969 #pragma weak dmu_buf_add_ref = dbuf_add_ref 1970 void 1971 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1972 { 1973 int64_t holds = refcount_add(&db->db_holds, tag); 1974 ASSERT(holds > 1); 1975 } 1976 1977 /* 1978 * If you call dbuf_rele() you had better not be referencing the dnode handle 1979 * unless you have some other direct or indirect hold on the dnode. (An indirect 1980 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 1981 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 1982 * dnode's parent dbuf evicting its dnode handles. 1983 */ 1984 #pragma weak dmu_buf_rele = dbuf_rele 1985 void 1986 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1987 { 1988 mutex_enter(&db->db_mtx); 1989 dbuf_rele_and_unlock(db, tag); 1990 } 1991 1992 /* 1993 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 1994 * db_dirtycnt and db_holds to be updated atomically. 1995 */ 1996 void 1997 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 1998 { 1999 int64_t holds; 2000 2001 ASSERT(MUTEX_HELD(&db->db_mtx)); 2002 DBUF_VERIFY(db); 2003 2004 /* 2005 * Remove the reference to the dbuf before removing its hold on the 2006 * dnode so we can guarantee in dnode_move() that a referenced bonus 2007 * buffer has a corresponding dnode hold. 2008 */ 2009 holds = refcount_remove(&db->db_holds, tag); 2010 ASSERT(holds >= 0); 2011 2012 /* 2013 * We can't freeze indirects if there is a possibility that they 2014 * may be modified in the current syncing context. 2015 */ 2016 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 2017 arc_buf_freeze(db->db_buf); 2018 2019 if (holds == db->db_dirtycnt && 2020 db->db_level == 0 && db->db_immediate_evict) 2021 dbuf_evict_user(db); 2022 2023 if (holds == 0) { 2024 if (db->db_blkid == DMU_BONUS_BLKID) { 2025 mutex_exit(&db->db_mtx); 2026 2027 /* 2028 * If the dnode moves here, we cannot cross this barrier 2029 * until the move completes. 2030 */ 2031 DB_DNODE_ENTER(db); 2032 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); 2033 DB_DNODE_EXIT(db); 2034 /* 2035 * The bonus buffer's dnode hold is no longer discounted 2036 * in dnode_move(). The dnode cannot move until after 2037 * the dnode_rele(). 
2038 */ 2039 dnode_rele(DB_DNODE(db), db); 2040 } else if (db->db_buf == NULL) { 2041 /* 2042 * This is a special case: we never associated this 2043 * dbuf with any data allocated from the ARC. 2044 */ 2045 ASSERT(db->db_state == DB_UNCACHED || 2046 db->db_state == DB_NOFILL); 2047 dbuf_evict(db); 2048 } else if (arc_released(db->db_buf)) { 2049 arc_buf_t *buf = db->db_buf; 2050 /* 2051 * This dbuf has anonymous data associated with it. 2052 */ 2053 dbuf_set_data(db, NULL); 2054 VERIFY(arc_buf_remove_ref(buf, db)); 2055 dbuf_evict(db); 2056 } else { 2057 VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2058 2059 /* 2060 * A dbuf will be eligible for eviction if either the 2061 * 'primarycache' property is set or a duplicate 2062 * copy of this buffer is already cached in the arc. 2063 * 2064 * In the case of the 'primarycache' a buffer 2065 * is considered for eviction if it matches the 2066 * criteria set in the property. 2067 * 2068 * To decide if our buffer is considered a 2069 * duplicate, we must call into the arc to determine 2070 * if multiple buffers are referencing the same 2071 * block on-disk. If so, then we simply evict 2072 * ourselves. 2073 */ 2074 if (!DBUF_IS_CACHEABLE(db) || 2075 arc_buf_eviction_needed(db->db_buf)) 2076 dbuf_clear(db); 2077 else 2078 mutex_exit(&db->db_mtx); 2079 } 2080 } else { 2081 mutex_exit(&db->db_mtx); 2082 } 2083 } 2084 2085 #pragma weak dmu_buf_refcount = dbuf_refcount 2086 uint64_t 2087 dbuf_refcount(dmu_buf_impl_t *db) 2088 { 2089 return (refcount_count(&db->db_holds)); 2090 } 2091 2092 void * 2093 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2094 dmu_buf_evict_func_t *evict_func) 2095 { 2096 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2097 user_data_ptr_ptr, evict_func)); 2098 } 2099 2100 void * 2101 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2102 dmu_buf_evict_func_t *evict_func) 2103 { 2104 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2105 2106 db->db_immediate_evict = TRUE; 2107 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2108 user_data_ptr_ptr, evict_func)); 2109 } 2110 2111 void * 2112 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2113 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 2114 { 2115 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2116 ASSERT(db->db_level == 0); 2117 2118 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2119 2120 mutex_enter(&db->db_mtx); 2121 2122 if (db->db_user_ptr == old_user_ptr) { 2123 db->db_user_ptr = user_ptr; 2124 db->db_user_data_ptr_ptr = user_data_ptr_ptr; 2125 db->db_evict_func = evict_func; 2126 2127 dbuf_update_data(db); 2128 } else { 2129 old_user_ptr = db->db_user_ptr; 2130 } 2131 2132 mutex_exit(&db->db_mtx); 2133 return (old_user_ptr); 2134 } 2135 2136 void * 2137 dmu_buf_get_user(dmu_buf_t *db_fake) 2138 { 2139 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2140 ASSERT(!refcount_is_zero(&db->db_holds)); 2141 2142 return (db->db_user_ptr); 2143 } 2144 2145 boolean_t 2146 dmu_buf_freeable(dmu_buf_t *dbuf) 2147 { 2148 boolean_t res = B_FALSE; 2149 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2150 2151 if (db->db_blkptr) 2152 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2153 db->db_blkptr, db->db_blkptr->blk_birth); 2154 2155 return (res); 2156 } 2157 2158 blkptr_t * 2159 dmu_buf_get_blkptr(dmu_buf_t *db) 2160 { 2161 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2162 return (dbi->db_blkptr); 2163 } 2164 2165 static void 2166 
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2167 { 2168 /* ASSERT(dmu_tx_is_syncing(tx)) */ 2169 ASSERT(MUTEX_HELD(&db->db_mtx)); 2170 2171 if (db->db_blkptr != NULL) 2172 return; 2173 2174 if (db->db_blkid == DMU_SPILL_BLKID) { 2175 db->db_blkptr = &dn->dn_phys->dn_spill; 2176 BP_ZERO(db->db_blkptr); 2177 return; 2178 } 2179 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2180 /* 2181 * This buffer was allocated at a time when there were 2182 * no blkptrs available from the dnode, or it was 2183 * inappropriate to hook it in (i.e., nlevels mismatch). 2184 */ 2185 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2186 ASSERT(db->db_parent == NULL); 2187 db->db_parent = dn->dn_dbuf; 2188 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2189 DBUF_VERIFY(db); 2190 } else { 2191 dmu_buf_impl_t *parent = db->db_parent; 2192 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2193 2194 ASSERT(dn->dn_phys->dn_nlevels > 1); 2195 if (parent == NULL) { 2196 mutex_exit(&db->db_mtx); 2197 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2198 (void) dbuf_hold_impl(dn, db->db_level+1, 2199 db->db_blkid >> epbs, FALSE, db, &parent); 2200 rw_exit(&dn->dn_struct_rwlock); 2201 mutex_enter(&db->db_mtx); 2202 db->db_parent = parent; 2203 } 2204 db->db_blkptr = (blkptr_t *)parent->db.db_data + 2205 (db->db_blkid & ((1ULL << epbs) - 1)); 2206 DBUF_VERIFY(db); 2207 } 2208 } 2209 2210 static void 2211 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2212 { 2213 dmu_buf_impl_t *db = dr->dr_dbuf; 2214 dnode_t *dn; 2215 zio_t *zio; 2216 2217 ASSERT(dmu_tx_is_syncing(tx)); 2218 2219 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2220 2221 mutex_enter(&db->db_mtx); 2222 2223 ASSERT(db->db_level > 0); 2224 DBUF_VERIFY(db); 2225 2226 if (db->db_buf == NULL) { 2227 mutex_exit(&db->db_mtx); 2228 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2229 mutex_enter(&db->db_mtx); 2230 } 2231 ASSERT3U(db->db_state, ==, DB_CACHED); 2232 ASSERT(db->db_buf != NULL); 2233 2234 DB_DNODE_ENTER(db); 2235 dn = DB_DNODE(db); 2236 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2237 dbuf_check_blkptr(dn, db); 2238 DB_DNODE_EXIT(db); 2239 2240 db->db_data_pending = dr; 2241 2242 mutex_exit(&db->db_mtx); 2243 dbuf_write(dr, db->db_buf, tx); 2244 2245 zio = dr->dr_zio; 2246 mutex_enter(&dr->dt.di.dr_mtx); 2247 dbuf_sync_list(&dr->dt.di.dr_children, tx); 2248 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2249 mutex_exit(&dr->dt.di.dr_mtx); 2250 zio_nowait(zio); 2251 } 2252 2253 static void 2254 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2255 { 2256 arc_buf_t **datap = &dr->dt.dl.dr_data; 2257 dmu_buf_impl_t *db = dr->dr_dbuf; 2258 dnode_t *dn; 2259 objset_t *os; 2260 uint64_t txg = tx->tx_txg; 2261 2262 ASSERT(dmu_tx_is_syncing(tx)); 2263 2264 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2265 2266 mutex_enter(&db->db_mtx); 2267 /* 2268 * To be synced, we must be dirtied. But we 2269 * might have been freed after being dirtied.
2270 */ 2271 if (db->db_state == DB_UNCACHED) { 2272 /* This buffer has been freed since it was dirtied */ 2273 ASSERT(db->db.db_data == NULL); 2274 } else if (db->db_state == DB_FILL) { 2275 /* This buffer was freed and is now being re-filled */ 2276 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2277 } else { 2278 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2279 } 2280 DBUF_VERIFY(db); 2281 2282 DB_DNODE_ENTER(db); 2283 dn = DB_DNODE(db); 2284 2285 if (db->db_blkid == DMU_SPILL_BLKID) { 2286 mutex_enter(&dn->dn_mtx); 2287 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2288 mutex_exit(&dn->dn_mtx); 2289 } 2290 2291 /* 2292 * If this is a bonus buffer, simply copy the bonus data into the 2293 * dnode. It will be written out when the dnode is synced (and it 2294 * will be synced, since it must have been dirty for dbuf_sync to 2295 * be called). 2296 */ 2297 if (db->db_blkid == DMU_BONUS_BLKID) { 2298 dbuf_dirty_record_t **drp; 2299 2300 ASSERT(*datap != NULL); 2301 ASSERT0(db->db_level); 2302 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2303 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2304 DB_DNODE_EXIT(db); 2305 2306 if (*datap != db->db.db_data) { 2307 zio_buf_free(*datap, DN_MAX_BONUSLEN); 2308 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2309 } 2310 db->db_data_pending = NULL; 2311 drp = &db->db_last_dirty; 2312 while (*drp != dr) 2313 drp = &(*drp)->dr_next; 2314 ASSERT(dr->dr_next == NULL); 2315 ASSERT(dr->dr_dbuf == db); 2316 *drp = dr->dr_next; 2317 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2318 ASSERT(db->db_dirtycnt > 0); 2319 db->db_dirtycnt -= 1; 2320 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2321 return; 2322 } 2323 2324 os = dn->dn_objset; 2325 2326 /* 2327 * This function may have dropped the db_mtx lock, allowing a dmu_sync 2328 * operation to sneak in. As a result, we need to ensure that we 2329 * don't check the dr_override_state until we have returned from 2330 * dbuf_check_blkptr. 2331 */ 2332 dbuf_check_blkptr(dn, db); 2333 2334 /* 2335 * If this buffer is in the middle of an immediate write, 2336 * wait for the synchronous IO to complete. 2337 */ 2338 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2339 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2340 cv_wait(&db->db_changed, &db->db_mtx); 2341 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2342 } 2343 2344 if (db->db_state != DB_NOFILL && 2345 dn->dn_object != DMU_META_DNODE_OBJECT && 2346 refcount_count(&db->db_holds) > 1 && 2347 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2348 *datap == db->db_buf) { 2349 /* 2350 * If this buffer is currently "in use" (i.e., there 2351 * are active holds and db_data still references it), 2352 * then make a copy before we start the write so that 2353 * any modifications from the open txg will not leak 2354 * into this write. 2355 * 2356 * NOTE: this copy does not need to be made for 2357 * objects only modified in the syncing context (e.g. 2358 * dnode blocks).
2359 */ 2360 int blksz = arc_buf_size(*datap); 2361 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2362 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2363 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2364 } 2365 db->db_data_pending = dr; 2366 2367 mutex_exit(&db->db_mtx); 2368 2369 dbuf_write(dr, *datap, tx); 2370 2371 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2372 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2373 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2374 DB_DNODE_EXIT(db); 2375 } else { 2376 /* 2377 * Although zio_nowait() does not "wait for an IO", it does 2378 * initiate the IO. If this is an empty write it seems plausible 2379 * that the IO could actually be completed before the nowait 2380 * returns. We need to DB_DNODE_EXIT() first in case 2381 * zio_nowait() invalidates the dbuf. 2382 */ 2383 DB_DNODE_EXIT(db); 2384 zio_nowait(dr->dr_zio); 2385 } 2386 } 2387 2388 void 2389 dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2390 { 2391 dbuf_dirty_record_t *dr; 2392 2393 while (dr = list_head(list)) { 2394 if (dr->dr_zio != NULL) { 2395 /* 2396 * If we find an already initialized zio then we 2397 * are processing the meta-dnode, and we have finished. 2398 * The dbufs for all dnodes are put back on the list 2399 * during processing, so that we can zio_wait() 2400 * these IOs after initiating all child IOs. 2401 */ 2402 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2403 DMU_META_DNODE_OBJECT); 2404 break; 2405 } 2406 list_remove(list, dr); 2407 if (dr->dr_dbuf->db_level > 0) 2408 dbuf_sync_indirect(dr, tx); 2409 else 2410 dbuf_sync_leaf(dr, tx); 2411 } 2412 } 2413 2414 /* ARGSUSED */ 2415 static void 2416 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2417 { 2418 dmu_buf_impl_t *db = vdb; 2419 dnode_t *dn; 2420 blkptr_t *bp = zio->io_bp; 2421 blkptr_t *bp_orig = &zio->io_bp_orig; 2422 spa_t *spa = zio->io_spa; 2423 int64_t delta; 2424 uint64_t fill = 0; 2425 int i; 2426 2427 ASSERT(db->db_blkptr == bp); 2428 2429 DB_DNODE_ENTER(db); 2430 dn = DB_DNODE(db); 2431 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2432 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2433 zio->io_prev_space_delta = delta; 2434 2435 if (BP_IS_HOLE(bp)) { 2436 ASSERT(bp->blk_fill == 0); 2437 DB_DNODE_EXIT(db); 2438 return; 2439 } 2440 2441 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2442 BP_GET_TYPE(bp) == dn->dn_type) || 2443 (db->db_blkid == DMU_SPILL_BLKID && 2444 BP_GET_TYPE(bp) == dn->dn_bonustype)); 2445 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2446 2447 mutex_enter(&db->db_mtx); 2448 2449 #ifdef ZFS_DEBUG 2450 if (db->db_blkid == DMU_SPILL_BLKID) { 2451 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2452 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2453 db->db_blkptr == &dn->dn_phys->dn_spill); 2454 } 2455 #endif 2456 2457 if (db->db_level == 0) { 2458 mutex_enter(&dn->dn_mtx); 2459 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2460 db->db_blkid != DMU_SPILL_BLKID) 2461 dn->dn_phys->dn_maxblkid = db->db_blkid; 2462 mutex_exit(&dn->dn_mtx); 2463 2464 if (dn->dn_type == DMU_OT_DNODE) { 2465 dnode_phys_t *dnp = db->db.db_data; 2466 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2467 i--, dnp++) { 2468 if (dnp->dn_type != DMU_OT_NONE) 2469 fill++; 2470 } 2471 } else { 2472 fill = 1; 2473 } 2474 } else { 2475 blkptr_t *ibp = db->db.db_data; 2476 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2477 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2478 if (BP_IS_HOLE(ibp)) 2479 continue; 2480 fill += 
ibp->blk_fill; 2481 } 2482 } 2483 DB_DNODE_EXIT(db); 2484 2485 bp->blk_fill = fill; 2486 2487 mutex_exit(&db->db_mtx); 2488 } 2489 2490 /* ARGSUSED */ 2491 static void 2492 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2493 { 2494 dmu_buf_impl_t *db = vdb; 2495 blkptr_t *bp = zio->io_bp; 2496 blkptr_t *bp_orig = &zio->io_bp_orig; 2497 uint64_t txg = zio->io_txg; 2498 dbuf_dirty_record_t **drp, *dr; 2499 2500 ASSERT0(zio->io_error); 2501 ASSERT(db->db_blkptr == bp); 2502 2503 /* 2504 * For nopwrites and rewrites we ensure that the bp matches our 2505 * original and bypass all the accounting. 2506 */ 2507 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2508 ASSERT(BP_EQUAL(bp, bp_orig)); 2509 } else { 2510 objset_t *os; 2511 dsl_dataset_t *ds; 2512 dmu_tx_t *tx; 2513 2514 DB_GET_OBJSET(&os, db); 2515 ds = os->os_dsl_dataset; 2516 tx = os->os_synctx; 2517 2518 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2519 dsl_dataset_block_born(ds, bp, tx); 2520 } 2521 2522 mutex_enter(&db->db_mtx); 2523 2524 DBUF_VERIFY(db); 2525 2526 drp = &db->db_last_dirty; 2527 while ((dr = *drp) != db->db_data_pending) 2528 drp = &dr->dr_next; 2529 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2530 ASSERT(dr->dr_txg == txg); 2531 ASSERT(dr->dr_dbuf == db); 2532 ASSERT(dr->dr_next == NULL); 2533 *drp = dr->dr_next; 2534 2535 #ifdef ZFS_DEBUG 2536 if (db->db_blkid == DMU_SPILL_BLKID) { 2537 dnode_t *dn; 2538 2539 DB_DNODE_ENTER(db); 2540 dn = DB_DNODE(db); 2541 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2542 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2543 db->db_blkptr == &dn->dn_phys->dn_spill); 2544 DB_DNODE_EXIT(db); 2545 } 2546 #endif 2547 2548 if (db->db_level == 0) { 2549 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2550 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2551 if (db->db_state != DB_NOFILL) { 2552 if (dr->dt.dl.dr_data != db->db_buf) 2553 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2554 db)); 2555 else if (!arc_released(db->db_buf)) 2556 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2557 } 2558 } else { 2559 dnode_t *dn; 2560 2561 DB_DNODE_ENTER(db); 2562 dn = DB_DNODE(db); 2563 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2564 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2565 if (!BP_IS_HOLE(db->db_blkptr)) { 2566 int epbs = 2567 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2568 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2569 db->db.db_size); 2570 ASSERT3U(dn->dn_phys->dn_maxblkid 2571 >> (db->db_level * epbs), >=, db->db_blkid); 2572 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2573 } 2574 DB_DNODE_EXIT(db); 2575 mutex_destroy(&dr->dt.di.dr_mtx); 2576 list_destroy(&dr->dt.di.dr_children); 2577 } 2578 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2579 2580 cv_broadcast(&db->db_changed); 2581 ASSERT(db->db_dirtycnt > 0); 2582 db->db_dirtycnt -= 1; 2583 db->db_data_pending = NULL; 2584 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2585 } 2586 2587 static void 2588 dbuf_write_nofill_ready(zio_t *zio) 2589 { 2590 dbuf_write_ready(zio, NULL, zio->io_private); 2591 } 2592 2593 static void 2594 dbuf_write_nofill_done(zio_t *zio) 2595 { 2596 dbuf_write_done(zio, NULL, zio->io_private); 2597 } 2598 2599 static void 2600 dbuf_write_override_ready(zio_t *zio) 2601 { 2602 dbuf_dirty_record_t *dr = zio->io_private; 2603 dmu_buf_impl_t *db = dr->dr_dbuf; 2604 2605 dbuf_write_ready(zio, NULL, db); 2606 } 2607 2608 static void 2609 dbuf_write_override_done(zio_t *zio) 2610 { 2611 dbuf_dirty_record_t *dr = zio->io_private; 2612 
dmu_buf_impl_t *db = dr->dr_dbuf; 2613 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2614 2615 mutex_enter(&db->db_mtx); 2616 if (!BP_EQUAL(zio->io_bp, obp)) { 2617 if (!BP_IS_HOLE(obp)) 2618 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2619 arc_release(dr->dt.dl.dr_data, db); 2620 } 2621 mutex_exit(&db->db_mtx); 2622 2623 dbuf_write_done(zio, NULL, db); 2624 } 2625 2626 static void 2627 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2628 { 2629 dmu_buf_impl_t *db = dr->dr_dbuf; 2630 dnode_t *dn; 2631 objset_t *os; 2632 dmu_buf_impl_t *parent = db->db_parent; 2633 uint64_t txg = tx->tx_txg; 2634 zbookmark_t zb; 2635 zio_prop_t zp; 2636 zio_t *zio; 2637 int wp_flag = 0; 2638 2639 DB_DNODE_ENTER(db); 2640 dn = DB_DNODE(db); 2641 os = dn->dn_objset; 2642 2643 if (db->db_state != DB_NOFILL) { 2644 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2645 /* 2646 * Private object buffers are released here rather 2647 * than in dbuf_dirty() since they are only modified 2648 * in the syncing context and we don't want the 2649 * overhead of making multiple copies of the data. 2650 */ 2651 if (BP_IS_HOLE(db->db_blkptr)) { 2652 arc_buf_thaw(data); 2653 } else { 2654 dbuf_release_bp(db); 2655 } 2656 } 2657 } 2658 2659 if (parent != dn->dn_dbuf) { 2660 ASSERT(parent && parent->db_data_pending); 2661 ASSERT(db->db_level == parent->db_level-1); 2662 ASSERT(arc_released(parent->db_buf)); 2663 zio = parent->db_data_pending->dr_zio; 2664 } else { 2665 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2666 db->db_blkid != DMU_SPILL_BLKID) || 2667 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2668 if (db->db_blkid != DMU_SPILL_BLKID) 2669 ASSERT3P(db->db_blkptr, ==, 2670 &dn->dn_phys->dn_blkptr[db->db_blkid]); 2671 zio = dn->dn_zio; 2672 } 2673 2674 ASSERT(db->db_level == 0 || data == db->db_buf); 2675 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2676 ASSERT(zio); 2677 2678 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2679 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2680 db->db.db_object, db->db_level, db->db_blkid); 2681 2682 if (db->db_blkid == DMU_SPILL_BLKID) 2683 wp_flag = WP_SPILL; 2684 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2685 2686 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp, txg); 2687 DB_DNODE_EXIT(db); 2688 2689 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2690 ASSERT(db->db_state != DB_NOFILL); 2691 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2692 db->db_blkptr, data->b_data, arc_buf_size(data), &zp, 2693 dbuf_write_override_ready, dbuf_write_override_done, dr, 2694 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2695 mutex_enter(&db->db_mtx); 2696 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2697 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2698 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2699 mutex_exit(&db->db_mtx); 2700 } else if (db->db_state == DB_NOFILL) { 2701 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); 2702 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2703 db->db_blkptr, NULL, db->db.db_size, &zp, 2704 dbuf_write_nofill_ready, dbuf_write_nofill_done, db, 2705 ZIO_PRIORITY_ASYNC_WRITE, 2706 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2707 } else { 2708 ASSERT(arc_released(data)); 2709 dr->dr_zio = arc_write(zio, os->os_spa, txg, 2710 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, 2711 dbuf_write_ready, dbuf_write_done, db, 2712 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2713 } 2714 }
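/*
 * A minimal caller sketch (hypothetical, not part of the DMU itself) of the
 * hold/release interface above.  It assumes a dnode "dn" and block id "blkid"
 * obtained elsewhere, uses FTAG in the usual illumos way to tag the hold with
 * the calling function, and elides error handling.  dbuf_hold() returns with
 * db_holds incremented and db_mtx not held, so the caller is responsible for
 * the matching dbuf_rele() with the same tag:
 *
 *	dmu_buf_impl_t *db;
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	db = dbuf_hold(dn, blkid, FTAG);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		(inspect or dirty db->db.db_data here; the hold keeps the
 *		dbuf from being evicted out from under the caller)
 *		dbuf_rele(db, FTAG);
 *	}
 *
 * As the comment above dbuf_rele() explains, the caller must not continue to
 * use the dnode handle after the release unless it holds the dnode in some
 * other direct or indirect way.
 */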