824 last_l1 = end >> epbs;
825 }
826 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
827
828 mutex_enter(&dn->dn_dbufs_mtx);
829 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
830 /* There can't be any dbufs in this range; no need to search. */
831 mutex_exit(&dn->dn_dbufs_mtx);
832 return;
833 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
834 /*
835 * If we are receiving, we expect there to be no dbufs in
836 * the range to be freed, because receive modifies each
837 * block at most once, and in offset order. If this is
838 * not the case, it can lead to performance problems,
839 * so note that we unexpectedly took the slow path.
840 */
841 atomic_inc_64(&zfs_free_range_recv_miss);
842 }
843
844 for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
845 db_next = list_next(&dn->dn_dbufs, db);
846 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
847
848 if (db->db_level == 1 &&
849 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
850 mutex_enter(&db->db_mtx);
851 if (db->db_last_dirty &&
852 db->db_last_dirty->dr_txg < txg) {
853 dbuf_add_ref(db, FTAG);
854 mutex_exit(&db->db_mtx);
855 dbuf_will_dirty(db, tx);
856 dbuf_rele(db, FTAG);
857 } else {
858 mutex_exit(&db->db_mtx);
859 }
860 }
861
862 if (db->db_level != 0)
863 continue;
864 dprintf_dbuf(db, "found buf %s\n", "");
1170 * that we can modify it without impacting
1171 * possible other users of this cached data
1172 * block. Note that indirect blocks and
1173 * private objects are not released until the
1174 * syncing state (since they are only modified
1175 * then).
1176 */
1177 arc_release(db->db_buf, db);
1178 dbuf_fix_old_data(db, tx->tx_txg);
1179 data_old = db->db_buf;
1180 }
1181 ASSERT(data_old != NULL);
1182 }
1183 dr->dt.dl.dr_data = data_old;
1184 } else {
1185 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1186 list_create(&dr->dt.di.dr_children,
1187 sizeof (dbuf_dirty_record_t),
1188 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1189 }
1190 dr->dr_dbuf = db;
1191 dr->dr_txg = tx->tx_txg;
1192 dr->dr_next = *drp;
1193 *drp = dr;
1194
1195 /*
1196 * We could have been freed_in_flight between the dbuf_noread
1197 * and dbuf_dirty. We win, as though the dbuf_noread() had
1198 * happened after the free.
1199 */
1200 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1201 db->db_blkid != DMU_SPILL_BLKID) {
1202 mutex_enter(&dn->dn_mtx);
1203 dnode_clear_range(dn, db->db_blkid, 1, tx);
1204 mutex_exit(&dn->dn_mtx);
1205 db->db_freed_in_flight = FALSE;
1206 }
1207
1208 /*
1209 * This buffer is now part of this txg
1253 dmu_buf_impl_t *parent = db->db_parent;
1254 dbuf_dirty_record_t *di;
1255 int parent_held = FALSE;
1256
1257 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1258 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1259
1260 parent = dbuf_hold_level(dn, db->db_level+1,
1261 db->db_blkid >> epbs, FTAG);
1262 ASSERT(parent != NULL);
1263 parent_held = TRUE;
1264 }
1265 if (drop_struct_lock)
1266 rw_exit(&dn->dn_struct_rwlock);
1267 ASSERT3U(db->db_level+1, ==, parent->db_level);
1268 di = dbuf_dirty(parent, tx);
1269 if (parent_held)
1270 dbuf_rele(parent, FTAG);
1271
1272 mutex_enter(&db->db_mtx);
1273 /* possible race with dbuf_undirty() */
1274 if (db->db_last_dirty == dr ||
1275 dn->dn_object == DMU_META_DNODE_OBJECT) {
1276 mutex_enter(&di->dt.di.dr_mtx);
1277 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1278 ASSERT(!list_link_active(&dr->dr_dirty_node));
1279 list_insert_tail(&di->dt.di.dr_children, dr);
1280 mutex_exit(&di->dt.di.dr_mtx);
1281 dr->dr_parent = di;
1282 }
1283 mutex_exit(&db->db_mtx);
1284 } else {
1285 ASSERT(db->db_level+1 == dn->dn_nlevels);
1286 ASSERT(db->db_blkid < dn->dn_nblkptr);
1287 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1288 mutex_enter(&dn->dn_mtx);
1289 ASSERT(!list_link_active(&dr->dr_dirty_node));
1290 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1291 mutex_exit(&dn->dn_mtx);
1292 if (drop_struct_lock)
1293 rw_exit(&dn->dn_struct_rwlock);
1323 if (dr == NULL || dr->dr_txg < txg)
1324 return (B_FALSE);
1325 ASSERT(dr->dr_txg == txg);
1326 ASSERT(dr->dr_dbuf == db);
1327
1328 DB_DNODE_ENTER(db);
1329 dn = DB_DNODE(db);
1330
1331 /*
1332 * Note: This code will probably work even if there are concurrent
1333 * holders, but it is untested in that scenario, as the ZPL and
1334 * ztest have additional locking (the range locks) that prevents
1335 * that type of concurrent access.
1336 */
1337 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1338
1339 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1340
1341 ASSERT(db->db.db_size != 0);
1342
1343 /* XXX would be nice to fix up dn_towrite_space[] */
1344
1345 *drp = dr->dr_next;
1346
1347 /*
1348 * Note that there are three places in dbuf_dirty()
1349 * where this dirty record may be put on a list.
1350 * Make sure to do a list_remove corresponding to
1351 * every one of those list_insert calls.
1352 */
1353 if (dr->dr_parent) {
1354 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1355 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1356 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1357 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1358 db->db_level+1 == dn->dn_nlevels) {
1359 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1360 mutex_enter(&dn->dn_mtx);
1361 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1362 mutex_exit(&dn->dn_mtx);
1363 }
1503 arc_release(db->db_buf, db);
1504 }
1505 dr->dt.dl.dr_data = buf;
1506 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1507 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1508 arc_release(db->db_buf, db);
1509 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1510 }
1511 db->db_buf = NULL;
1512 }
1513 ASSERT(db->db_buf == NULL);
1514 dbuf_set_data(db, buf);
1515 db->db_state = DB_FILL;
1516 mutex_exit(&db->db_mtx);
1517 (void) dbuf_dirty(db, tx);
1518 dbuf_fill_done(db, tx);
1519 }
1520
1521 /*
1522 * "Clear" the contents of this dbuf. This will mark the dbuf
1523 * EVICTING and clear *most* of its references. Unfortunately,
1524 * when we are not holding the dn_dbufs_mtx, we can't clear the
1525 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1526 * in this case. For callers from the DMU we will usually see:
1527 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1528 * For the arc callback, we will usually see:
1529 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1530 * Sometimes, though, we will get a mix of these two:
1531 * DMU: dbuf_clear()->arc_buf_evict()
1532 * ARC: dbuf_do_evict()->dbuf_destroy()
1533 */
1534 void
1535 dbuf_clear(dmu_buf_impl_t *db)
1536 {
1537 dnode_t *dn;
1538 dmu_buf_impl_t *parent = db->db_parent;
1539 dmu_buf_impl_t *dndb;
1540 int dbuf_gone = FALSE;
1541
1542 ASSERT(MUTEX_HELD(&db->db_mtx));
1543 ASSERT(refcount_is_zero(&db->db_holds));
1690 db->db_evict_func = NULL;
1691 db->db_immediate_evict = 0;
1692 db->db_freed_in_flight = 0;
1693
1694 if (blkid == DMU_BONUS_BLKID) {
1695 ASSERT3P(parent, ==, dn->dn_dbuf);
1696 db->db.db_size = DN_MAX_BONUSLEN -
1697 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1698 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1699 db->db.db_offset = DMU_BONUS_BLKID;
1700 db->db_state = DB_UNCACHED;
1701 /* the bonus dbuf is not placed in the hash table */
1702 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1703 return (db);
1704 } else if (blkid == DMU_SPILL_BLKID) {
1705 db->db.db_size = (blkptr != NULL) ?
1706 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1707 db->db.db_offset = 0;
1708 } else {
1709 int blocksize =
1710 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1711 db->db.db_size = blocksize;
1712 db->db.db_offset = db->db_blkid * blocksize;
1713 }
1714
1715 /*
1716 * Hold the dn_dbufs_mtx while we get the new dbuf
1717 * in the hash table *and* added to the dbufs list.
1718 * This prevents a possible deadlock with someone
1719 * trying to look up this dbuf before it's added to the
1720 * dn_dbufs list.
1721 */
1722 mutex_enter(&dn->dn_dbufs_mtx);
1723 db->db_state = DB_EVICTING;
1724 if ((odb = dbuf_hash_insert(db)) != NULL) {
1725 /* someone else inserted it first */
1726 kmem_cache_free(dbuf_cache, db);
1727 mutex_exit(&dn->dn_dbufs_mtx);
1728 return (odb);
1729 }
1730 list_insert_head(&dn->dn_dbufs, db);
1799 */
1800 dnode_rele(dn, db);
1801 db->db_dnode_handle = NULL;
1802 }
1803 dbuf_hash_remove(db);
1804 }
1805 db->db_parent = NULL;
1806 db->db_buf = NULL;
1807
1808 ASSERT(!list_link_active(&db->db_link));
1809 ASSERT(db->db.db_data == NULL);
1810 ASSERT(db->db_hash_next == NULL);
1811 ASSERT(db->db_blkptr == NULL);
1812 ASSERT(db->db_data_pending == NULL);
1813
1814 kmem_cache_free(dbuf_cache, db);
1815 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1816 }
1817
1818 void
1819 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1820 {
1821 dmu_buf_impl_t *db = NULL;
1822 blkptr_t *bp = NULL;
1823
1824 ASSERT(blkid != DMU_BONUS_BLKID);
1825 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1826
1827 if (dnode_block_freed(dn, blkid))
1828 return;
1829
1830 /* dbuf_find() returns with db_mtx held */
1831 if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1832 /*
1833 * This dbuf is already in the cache. We assume that
1834 * it is already CACHED, or else about to be either
1835 * read or filled.
1836 */
1837 mutex_exit(&db->db_mtx);
1838 return;
1839 }
1840
1841 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1842 if (bp && !BP_IS_HOLE(bp)) {
1843 int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1844 ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1845 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1846 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1847 zbookmark_t zb;
1848
1849 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1850 dn->dn_object, 0, blkid);
1851
1852 (void) arc_read(NULL, dn->dn_objset->os_spa,
1853 bp, NULL, NULL, priority,
1854 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1855 &aflags, &zb);
1856 }
1857 if (db)
1858 dbuf_rele(db, NULL);
1859 }
1860 }
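/*
 * Hypothetical caller sketch (illustrative only; "start" and "end" are
 * assumed block ids): prefetch a range of level-0 blocks while holding
 * dn_struct_rwlock, as the ASSERT above requires.
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	for (uint64_t blkid = start; blkid <= end; blkid++)
 *		dbuf_prefetch(dn, blkid);
 *	rw_exit(&dn->dn_struct_rwlock);
 */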
1861
1862 /*
1863 * Returns with db_holds incremented, and db_mtx not held.
1864 * Note: dn_struct_rwlock must be held.
1865 */
1866 int
1867 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1868 void *tag, dmu_buf_impl_t **dbp)
1869 {
1870 dmu_buf_impl_t *db, *parent = NULL;
1871
1872 ASSERT(blkid != DMU_BONUS_BLKID);
1873 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2514 }
2515 } else {
2516 fill = 1;
2517 }
2518 } else {
2519 blkptr_t *ibp = db->db.db_data;
2520 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2521 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2522 if (BP_IS_HOLE(ibp))
2523 continue;
2524 fill += ibp->blk_fill;
2525 }
2526 }
2527 DB_DNODE_EXIT(db);
2528
2529 bp->blk_fill = fill;
2530
2531 mutex_exit(&db->db_mtx);
2532 }
2533
2534 /* ARGSUSED */
2535 static void
2536 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2537 {
2538 dmu_buf_impl_t *db = vdb;
2539 blkptr_t *bp = zio->io_bp;
2540 blkptr_t *bp_orig = &zio->io_bp_orig;
2541 uint64_t txg = zio->io_txg;
2542 dbuf_dirty_record_t **drp, *dr;
2543
2544 ASSERT0(zio->io_error);
2545 ASSERT(db->db_blkptr == bp);
2546
2547 /*
2548 * For nopwrites and rewrites we ensure that the bp matches our
2549 * original and bypass all the accounting.
2550 */
2551 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2552 ASSERT(BP_EQUAL(bp, bp_orig));
2553 } else {
2554 objset_t *os;
2555 dsl_dataset_t *ds;
2608 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2609 if (!BP_IS_HOLE(db->db_blkptr)) {
2610 int epbs =
2611 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2612 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2613 db->db.db_size);
2614 ASSERT3U(dn->dn_phys->dn_maxblkid
2615 >> (db->db_level * epbs), >=, db->db_blkid);
2616 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2617 }
2618 DB_DNODE_EXIT(db);
2619 mutex_destroy(&dr->dt.di.dr_mtx);
2620 list_destroy(&dr->dt.di.dr_children);
2621 }
2622 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2623
2624 cv_broadcast(&db->db_changed);
2625 ASSERT(db->db_dirtycnt > 0);
2626 db->db_dirtycnt -= 1;
2627 db->db_data_pending = NULL;
2628 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2629 }
2630
2631 static void
2632 dbuf_write_nofill_ready(zio_t *zio)
2633 {
2634 dbuf_write_ready(zio, NULL, zio->io_private);
2635 }
2636
2637 static void
2638 dbuf_write_nofill_done(zio_t *zio)
2639 {
2640 dbuf_write_done(zio, NULL, zio->io_private);
2641 }
2642
2643 static void
2644 dbuf_write_override_ready(zio_t *zio)
2645 {
2646 dbuf_dirty_record_t *dr = zio->io_private;
2647 dmu_buf_impl_t *db = dr->dr_dbuf;
2726
2727 ASSERT(db->db_level == 0 || data == db->db_buf);
2728 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2729 ASSERT(zio);
2730
2731 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2732 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2733 db->db.db_object, db->db_level, db->db_blkid);
2734
2735 if (db->db_blkid == DMU_SPILL_BLKID)
2736 wp_flag = WP_SPILL;
2737 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2738
2739 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2740 DB_DNODE_EXIT(db);
2741
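/*
 * Dispatch the write: a level-0 buffer whose dirty record has already
 * been overridden (e.g. by dmu_sync()) is issued via zio_write() plus
 * zio_write_override() so the previously written block pointer is
 * reused; a DB_NOFILL buffer is issued with no data; everything else
 * is written through arc_write() on the ARC buffer.
 */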
2742 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2743 ASSERT(db->db_state != DB_NOFILL);
2744 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2745 db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2746 dbuf_write_override_ready, dbuf_write_override_done, dr,
2747 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2748 mutex_enter(&db->db_mtx);
2749 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2750 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2751 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2752 mutex_exit(&db->db_mtx);
2753 } else if (db->db_state == DB_NOFILL) {
2754 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2755 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2756 db->db_blkptr, NULL, db->db.db_size, &zp,
2757 dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2758 ZIO_PRIORITY_ASYNC_WRITE,
2759 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2760 } else {
2761 ASSERT(arc_released(data));
2762 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2763 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2764 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2765 dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
2766 ZIO_FLAG_MUSTSUCCEED, &zb);
2767 }
2768 }
|
824 last_l1 = end >> epbs;
825 }
826 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
827
828 mutex_enter(&dn->dn_dbufs_mtx);
829 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
830 /* There can't be any dbufs in this range; no need to search. */
831 mutex_exit(&dn->dn_dbufs_mtx);
832 return;
833 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
834 /*
835 * If we are receiving, we expect there to be no dbufs in
836 * the range to be freed, because receive modifies each
837 * block at most once, and in offset order. If this is
838 * not the case, it can lead to performance problems,
839 * so note that we unexpectedly took the slow path.
840 */
841 atomic_inc_64(&zfs_free_range_recv_miss);
842 }
843
844 for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
845 db_next = list_next(&dn->dn_dbufs, db);
846 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
847
848 if (db->db_level == 1 &&
849 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
850 mutex_enter(&db->db_mtx);
851 if (db->db_last_dirty &&
852 db->db_last_dirty->dr_txg < txg) {
853 dbuf_add_ref(db, FTAG);
854 mutex_exit(&db->db_mtx);
855 dbuf_will_dirty(db, tx);
856 dbuf_rele(db, FTAG);
857 } else {
858 mutex_exit(&db->db_mtx);
859 }
860 }
861
862 if (db->db_level != 0)
863 continue;
864 dprintf_dbuf(db, "found buf %s\n", "");
1170 * that we can modify it without impacting
1171 * possible other users of this cached data
1172 * block. Note that indirect blocks and
1173 * private objects are not released until the
1174 * syncing state (since they are only modified
1175 * then).
1176 */
1177 arc_release(db->db_buf, db);
1178 dbuf_fix_old_data(db, tx->tx_txg);
1179 data_old = db->db_buf;
1180 }
1181 ASSERT(data_old != NULL);
1182 }
1183 dr->dt.dl.dr_data = data_old;
1184 } else {
1185 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1186 list_create(&dr->dt.di.dr_children,
1187 sizeof (dbuf_dirty_record_t),
1188 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1189 }
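/*
 * Record how much dirty space this record accounts for, so it can be
 * retired incrementally as the write's physical children complete
 * (see dbuf_write_physdone()).  Bonus buffers and objsets with no
 * dsl_dataset (e.g. the meta-objset) are not accounted.
 */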
1190 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1191 dr->dr_accounted = db->db.db_size;
1192 dr->dr_dbuf = db;
1193 dr->dr_txg = tx->tx_txg;
1194 dr->dr_next = *drp;
1195 *drp = dr;
1196
1197 /*
1198 * We could have been freed_in_flight between the dbuf_noread
1199 * and dbuf_dirty. We win, as though the dbuf_noread() had
1200 * happened after the free.
1201 */
1202 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1203 db->db_blkid != DMU_SPILL_BLKID) {
1204 mutex_enter(&dn->dn_mtx);
1205 dnode_clear_range(dn, db->db_blkid, 1, tx);
1206 mutex_exit(&dn->dn_mtx);
1207 db->db_freed_in_flight = FALSE;
1208 }
1209
1210 /*
1211 * This buffer is now part of this txg
1255 dmu_buf_impl_t *parent = db->db_parent;
1256 dbuf_dirty_record_t *di;
1257 int parent_held = FALSE;
1258
1259 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1260 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1261
1262 parent = dbuf_hold_level(dn, db->db_level+1,
1263 db->db_blkid >> epbs, FTAG);
1264 ASSERT(parent != NULL);
1265 parent_held = TRUE;
1266 }
1267 if (drop_struct_lock)
1268 rw_exit(&dn->dn_struct_rwlock);
1269 ASSERT3U(db->db_level+1, ==, parent->db_level);
1270 di = dbuf_dirty(parent, tx);
1271 if (parent_held)
1272 dbuf_rele(parent, FTAG);
1273
1274 mutex_enter(&db->db_mtx);
1275 /*
1276 * Since we've dropped the mutex, it's possible that
1277 * dbuf_undirty() might have changed this out from under us.
1278 */
1279 if (db->db_last_dirty == dr ||
1280 dn->dn_object == DMU_META_DNODE_OBJECT) {
1281 mutex_enter(&di->dt.di.dr_mtx);
1282 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1283 ASSERT(!list_link_active(&dr->dr_dirty_node));
1284 list_insert_tail(&di->dt.di.dr_children, dr);
1285 mutex_exit(&di->dt.di.dr_mtx);
1286 dr->dr_parent = di;
1287 }
1288 mutex_exit(&db->db_mtx);
1289 } else {
1290 ASSERT(db->db_level+1 == dn->dn_nlevels);
1291 ASSERT(db->db_blkid < dn->dn_nblkptr);
1292 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1293 mutex_enter(&dn->dn_mtx);
1294 ASSERT(!list_link_active(&dr->dr_dirty_node));
1295 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1296 mutex_exit(&dn->dn_mtx);
1297 if (drop_struct_lock)
1298 rw_exit(&dn->dn_struct_rwlock);
1328 if (dr == NULL || dr->dr_txg < txg)
1329 return (B_FALSE);
1330 ASSERT(dr->dr_txg == txg);
1331 ASSERT(dr->dr_dbuf == db);
1332
1333 DB_DNODE_ENTER(db);
1334 dn = DB_DNODE(db);
1335
1336 /*
1337 * Note: This code will probably work even if there are concurrent
1338 * holders, but it is untested in that scenario, as the ZPL and
1339 * ztest have additional locking (the range locks) that prevents
1340 * that type of concurrent access.
1341 */
1342 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1343
1344 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1345
1346 ASSERT(db->db.db_size != 0);
1347
1348 /*
1349 * Any space we accounted for in dp_dirty_* will be cleaned up by
1350 * dsl_pool_sync(). This is relatively rare, so the discrepancy
1351 * is not a big deal.
1352 */
1353
1354 *drp = dr->dr_next;
1355
1356 /*
1357 * Note that there are three places in dbuf_dirty()
1358 * where this dirty record may be put on a list.
1359 * Make sure to do a list_remove corresponding to
1360 * every one of those list_insert calls.
1361 */
1362 if (dr->dr_parent) {
1363 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1364 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1365 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1366 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1367 db->db_level+1 == dn->dn_nlevels) {
1368 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1369 mutex_enter(&dn->dn_mtx);
1370 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1371 mutex_exit(&dn->dn_mtx);
1372 }
1512 arc_release(db->db_buf, db);
1513 }
1514 dr->dt.dl.dr_data = buf;
1515 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1516 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1517 arc_release(db->db_buf, db);
1518 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1519 }
1520 db->db_buf = NULL;
1521 }
1522 ASSERT(db->db_buf == NULL);
1523 dbuf_set_data(db, buf);
1524 db->db_state = DB_FILL;
1525 mutex_exit(&db->db_mtx);
1526 (void) dbuf_dirty(db, tx);
1527 dbuf_fill_done(db, tx);
1528 }
1529
1530 /*
1531 * "Clear" the contents of this dbuf. This will mark the dbuf
1532 * EVICTING and clear *most* of its references. Unfortunately,
1533 * when we are not holding the dn_dbufs_mtx, we can't clear the
1534 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1535 * in this case. For callers from the DMU we will usually see:
1536 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1537 * For the arc callback, we will usually see:
1538 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1539 * Sometimes, though, we will get a mix of these two:
1540 * DMU: dbuf_clear()->arc_buf_evict()
1541 * ARC: dbuf_do_evict()->dbuf_destroy()
1542 */
1543 void
1544 dbuf_clear(dmu_buf_impl_t *db)
1545 {
1546 dnode_t *dn;
1547 dmu_buf_impl_t *parent = db->db_parent;
1548 dmu_buf_impl_t *dndb;
1549 int dbuf_gone = FALSE;
1550
1551 ASSERT(MUTEX_HELD(&db->db_mtx));
1552 ASSERT(refcount_is_zero(&db->db_holds));
1699 db->db_evict_func = NULL;
1700 db->db_immediate_evict = 0;
1701 db->db_freed_in_flight = 0;
1702
1703 if (blkid == DMU_BONUS_BLKID) {
1704 ASSERT3P(parent, ==, dn->dn_dbuf);
1705 db->db.db_size = DN_MAX_BONUSLEN -
1706 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1707 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1708 db->db.db_offset = DMU_BONUS_BLKID;
1709 db->db_state = DB_UNCACHED;
1710 /* the bonus dbuf is not placed in the hash table */
1711 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1712 return (db);
1713 } else if (blkid == DMU_SPILL_BLKID) {
1714 db->db.db_size = (blkptr != NULL) ?
1715 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1716 db->db.db_offset = 0;
1717 } else {
1718 int blocksize =
1719 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1720 db->db.db_size = blocksize;
1721 db->db.db_offset = db->db_blkid * blocksize;
1722 }
1723
1724 /*
1725 * Hold the dn_dbufs_mtx while we get the new dbuf
1726 * in the hash table *and* added to the dbufs list.
1727 * This prevents a possible deadlock with someone
1728 * trying to look up this dbuf before it's added to the
1729 * dn_dbufs list.
1730 */
1731 mutex_enter(&dn->dn_dbufs_mtx);
1732 db->db_state = DB_EVICTING;
1733 if ((odb = dbuf_hash_insert(db)) != NULL) {
1734 /* someone else inserted it first */
1735 kmem_cache_free(dbuf_cache, db);
1736 mutex_exit(&dn->dn_dbufs_mtx);
1737 return (odb);
1738 }
1739 list_insert_head(&dn->dn_dbufs, db);
1808 */
1809 dnode_rele(dn, db);
1810 db->db_dnode_handle = NULL;
1811 }
1812 dbuf_hash_remove(db);
1813 }
1814 db->db_parent = NULL;
1815 db->db_buf = NULL;
1816
1817 ASSERT(!list_link_active(&db->db_link));
1818 ASSERT(db->db.db_data == NULL);
1819 ASSERT(db->db_hash_next == NULL);
1820 ASSERT(db->db_blkptr == NULL);
1821 ASSERT(db->db_data_pending == NULL);
1822
1823 kmem_cache_free(dbuf_cache, db);
1824 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1825 }
1826
1827 void
1828 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1829 {
1830 dmu_buf_impl_t *db = NULL;
1831 blkptr_t *bp = NULL;
1832
1833 ASSERT(blkid != DMU_BONUS_BLKID);
1834 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1835
1836 if (dnode_block_freed(dn, blkid))
1837 return;
1838
1839 /* dbuf_find() returns with db_mtx held */
1840 if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1841 /*
1842 * This dbuf is already in the cache. We assume that
1843 * it is already CACHED, or else about to be either
1844 * read or filled.
1845 */
1846 mutex_exit(&db->db_mtx);
1847 return;
1848 }
1849
1850 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1851 if (bp && !BP_IS_HOLE(bp)) {
1852 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1853 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1854 zbookmark_t zb;
1855
1856 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1857 dn->dn_object, 0, blkid);
1858
1859 (void) arc_read(NULL, dn->dn_objset->os_spa,
1860 bp, NULL, NULL, prio,
1861 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1862 &aflags, &zb);
1863 }
1864 if (db)
1865 dbuf_rele(db, NULL);
1866 }
1867 }
1868
1869 /*
1870 * Returns with db_holds incremented, and db_mtx not held.
1871 * Note: dn_struct_rwlock must be held.
1872 */
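/*
 * Hypothetical caller sketch (illustrative only; "blkid" is an assumed
 * level-0 block id), observing the locking contract above:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	if (dbuf_hold_impl(dn, 0, blkid, FALSE, FTAG, &db) == 0) {
 *		... use db, which is now held for FTAG ...
 *		dbuf_rele(db, FTAG);
 *	}
 *	rw_exit(&dn->dn_struct_rwlock);
 */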
1873 int
1874 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1875 void *tag, dmu_buf_impl_t **dbp)
1876 {
1877 dmu_buf_impl_t *db, *parent = NULL;
1878
1879 ASSERT(blkid != DMU_BONUS_BLKID);
1880 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2521 }
2522 } else {
2523 fill = 1;
2524 }
2525 } else {
2526 blkptr_t *ibp = db->db.db_data;
2527 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2528 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2529 if (BP_IS_HOLE(ibp))
2530 continue;
2531 fill += ibp->blk_fill;
2532 }
2533 }
2534 DB_DNODE_EXIT(db);
2535
2536 bp->blk_fill = fill;
2537
2538 mutex_exit(&db->db_mtx);
2539 }
2540
2541 /*
2542 * The SPA will call this callback several times for each zio - once
2543 * for every physical child i/o (zio->io_phys_children times). This
2544 * allows the DMU to monitor the progress of each logical i/o. For example,
2545 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2546 * block. There may be a long delay before all copies/fragments are completed,
2547 * so this callback allows us to retire dirty space gradually, as the physical
2548 * i/os complete.
2549 */
2550 /* ARGSUSED */
2551 static void
2552 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2553 {
2554 dmu_buf_impl_t *db = arg;
2555 objset_t *os = db->db_objset;
2556 dsl_pool_t *dp = dmu_objset_pool(os);
2557 dbuf_dirty_record_t *dr;
2558 int delta = 0;
2559
2560 dr = db->db_data_pending;
2561 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2562
2563 /*
2564 * The callback will be called io_phys_children times. Retire one
2565 * portion of our dirty space each time we are called. Any rounding
2566 * error will be cleaned up by dsl_pool_sync()'s call to
2567 * dsl_pool_undirty_space().
2568 */
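/*
 * For example (hypothetical numbers): a dirty record accounting for
 * 128K, written with two physical children (say, two copies of an
 * indirect block), retires 128K / 2 = 64K of dirty space on each of
 * the two callbacks.
 */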
2569 delta = dr->dr_accounted / zio->io_phys_children;
2570 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2571 }
2572
2573 /* ARGSUSED */
2574 static void
2575 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2576 {
2577 dmu_buf_impl_t *db = vdb;
2578 blkptr_t *bp = zio->io_bp;
2579 blkptr_t *bp_orig = &zio->io_bp_orig;
2580 uint64_t txg = zio->io_txg;
2581 dbuf_dirty_record_t **drp, *dr;
2582
2583 ASSERT0(zio->io_error);
2584 ASSERT(db->db_blkptr == bp);
2585
2586 /*
2587 * For nopwrites and rewrites we ensure that the bp matches our
2588 * original and bypass all the accounting.
2589 */
2590 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2591 ASSERT(BP_EQUAL(bp, bp_orig));
2592 } else {
2593 objset_t *os;
2594 dsl_dataset_t *ds;
2647 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2648 if (!BP_IS_HOLE(db->db_blkptr)) {
2649 int epbs =
2650 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2651 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2652 db->db.db_size);
2653 ASSERT3U(dn->dn_phys->dn_maxblkid
2654 >> (db->db_level * epbs), >=, db->db_blkid);
2655 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2656 }
2657 DB_DNODE_EXIT(db);
2658 mutex_destroy(&dr->dt.di.dr_mtx);
2659 list_destroy(&dr->dt.di.dr_children);
2660 }
2661 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2662
2663 cv_broadcast(&db->db_changed);
2664 ASSERT(db->db_dirtycnt > 0);
2665 db->db_dirtycnt -= 1;
2666 db->db_data_pending = NULL;
2667
2668 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2669 }
2670
2671 static void
2672 dbuf_write_nofill_ready(zio_t *zio)
2673 {
2674 dbuf_write_ready(zio, NULL, zio->io_private);
2675 }
2676
2677 static void
2678 dbuf_write_nofill_done(zio_t *zio)
2679 {
2680 dbuf_write_done(zio, NULL, zio->io_private);
2681 }
2682
2683 static void
2684 dbuf_write_override_ready(zio_t *zio)
2685 {
2686 dbuf_dirty_record_t *dr = zio->io_private;
2687 dmu_buf_impl_t *db = dr->dr_dbuf;
2766
2767 ASSERT(db->db_level == 0 || data == db->db_buf);
2768 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2769 ASSERT(zio);
2770
2771 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2772 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2773 db->db.db_object, db->db_level, db->db_blkid);
2774
2775 if (db->db_blkid == DMU_SPILL_BLKID)
2776 wp_flag = WP_SPILL;
2777 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2778
2779 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2780 DB_DNODE_EXIT(db);
2781
2782 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2783 ASSERT(db->db_state != DB_NOFILL);
2784 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2785 db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2786 dbuf_write_override_ready, NULL, dbuf_write_override_done,
2787 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2788 mutex_enter(&db->db_mtx);
2789 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2790 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2791 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2792 mutex_exit(&db->db_mtx);
2793 } else if (db->db_state == DB_NOFILL) {
2794 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2795 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2796 db->db_blkptr, NULL, db->db.db_size, &zp,
2797 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2798 ZIO_PRIORITY_ASYNC_WRITE,
2799 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2800 } else {
2801 ASSERT(arc_released(data));
2802 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2803 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2804 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2805 dbuf_write_physdone, dbuf_write_done, db,
2806 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2807 }
2808 }
|