4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>


 824                 last_l1 = end >> epbs;
 825         }
 826         dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
 827 
 828         mutex_enter(&dn->dn_dbufs_mtx);
 829         if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
 830                 /* There can't be any dbufs in this range; no need to search. */
 831                 mutex_exit(&dn->dn_dbufs_mtx);
 832                 return;
 833         } else if (dmu_objset_is_receiving(dn->dn_objset)) {
 834                 /*
 835                  * If we are receiving, we expect there to be no dbufs in
 836                  * the range to be freed, because receive modifies each
 837                  * block at most once, and in offset order.  If this is
 838                  * not the case, it can lead to performance problems,
 839                  * so note that we unexpectedly took the slow path.
 840                  */
 841                 atomic_inc_64(&zfs_free_range_recv_miss);
 842         }
 843 
 844         for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 845                 db_next = list_next(&dn->dn_dbufs, db);
 846                 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 847 
 848                 if (db->db_level == 1 &&
 849                     db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
 850                         mutex_enter(&db->db_mtx);
 851                         if (db->db_last_dirty &&
 852                             db->db_last_dirty->dr_txg < txg) {
 853                                 dbuf_add_ref(db, FTAG);
 854                                 mutex_exit(&db->db_mtx);
 855                                 dbuf_will_dirty(db, tx);
 856                                 dbuf_rele(db, FTAG);
 857                         } else {
 858                                 mutex_exit(&db->db_mtx);
 859                         }
 860                 }
 861 
 862                 if (db->db_level != 0)
 863                         continue;
 864                 dprintf_dbuf(db, "found buf %s\n", "");


1170                                  * that we can modify it without impacting
1171                                  * possible other users of this cached data
1172                                  * block.  Note that indirect blocks and
1173                                  * private objects are not released until the
1174                                  * syncing state (since they are only modified
1175                                  * then).
1176                                  */
1177                                 arc_release(db->db_buf, db);
1178                                 dbuf_fix_old_data(db, tx->tx_txg);
1179                                 data_old = db->db_buf;
1180                         }
1181                         ASSERT(data_old != NULL);
1182                 }
1183                 dr->dt.dl.dr_data = data_old;
1184         } else {
1185                 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1186                 list_create(&dr->dt.di.dr_children,
1187                     sizeof (dbuf_dirty_record_t),
1188                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
1189         }
1190         dr->dr_dbuf = db;
1191         dr->dr_txg = tx->tx_txg;
1192         dr->dr_next = *drp;
1193         *drp = dr;
1194 
1195         /*
1196          * We could have been freed_in_flight between the dbuf_noread
1197          * and dbuf_dirty.  We win, as though the dbuf_noread() had
1198          * happened after the free.
1199          */
1200         if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1201             db->db_blkid != DMU_SPILL_BLKID) {
1202                 mutex_enter(&dn->dn_mtx);
1203                 dnode_clear_range(dn, db->db_blkid, 1, tx);
1204                 mutex_exit(&dn->dn_mtx);
1205                 db->db_freed_in_flight = FALSE;
1206         }
1207 
1208         /*
1209          * This buffer is now part of this txg


1253                 dmu_buf_impl_t *parent = db->db_parent;
1254                 dbuf_dirty_record_t *di;
1255                 int parent_held = FALSE;
1256 
1257                 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1258                         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1259 
1260                         parent = dbuf_hold_level(dn, db->db_level+1,
1261                             db->db_blkid >> epbs, FTAG);
1262                         ASSERT(parent != NULL);
1263                         parent_held = TRUE;
1264                 }
1265                 if (drop_struct_lock)
1266                         rw_exit(&dn->dn_struct_rwlock);
1267                 ASSERT3U(db->db_level+1, ==, parent->db_level);
1268                 di = dbuf_dirty(parent, tx);
1269                 if (parent_held)
1270                         dbuf_rele(parent, FTAG);
1271 
1272                 mutex_enter(&db->db_mtx);
1273                 /*  possible race with dbuf_undirty() */
1274                 if (db->db_last_dirty == dr ||
1275                     dn->dn_object == DMU_META_DNODE_OBJECT) {
1276                         mutex_enter(&di->dt.di.dr_mtx);
1277                         ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1278                         ASSERT(!list_link_active(&dr->dr_dirty_node));
1279                         list_insert_tail(&di->dt.di.dr_children, dr);
1280                         mutex_exit(&di->dt.di.dr_mtx);
1281                         dr->dr_parent = di;
1282                 }
1283                 mutex_exit(&db->db_mtx);
1284         } else {
1285                 ASSERT(db->db_level+1 == dn->dn_nlevels);
1286                 ASSERT(db->db_blkid < dn->dn_nblkptr);
1287                 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1288                 mutex_enter(&dn->dn_mtx);
1289                 ASSERT(!list_link_active(&dr->dr_dirty_node));
1290                 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1291                 mutex_exit(&dn->dn_mtx);
1292                 if (drop_struct_lock)
1293                         rw_exit(&dn->dn_struct_rwlock);


1323         if (dr == NULL || dr->dr_txg < txg)
1324                 return (B_FALSE);
1325         ASSERT(dr->dr_txg == txg);
1326         ASSERT(dr->dr_dbuf == db);
1327 
1328         DB_DNODE_ENTER(db);
1329         dn = DB_DNODE(db);
1330 
1331         /*
1332          * Note:  This code will probably work even if there are concurrent
1333          * holders, but it is untested in that scenario, as the ZPL and
1334          * ztest have additional locking (the range locks) that prevents
1335          * that type of concurrent access.
1336          */
1337         ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1338 
1339         dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1340 
1341         ASSERT(db->db.db_size != 0);
1342 
1343         /* XXX would be nice to fix up dn_towrite_space[] */
1344 
1345         *drp = dr->dr_next;
1346 
1347         /*
1348          * Note that there are three places in dbuf_dirty()
1349          * where this dirty record may be put on a list.
1350          * Make sure to do a list_remove corresponding to
1351          * every one of those list_insert calls.
1352          */
1353         if (dr->dr_parent) {
1354                 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1355                 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1356                 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1357         } else if (db->db_blkid == DMU_SPILL_BLKID ||
1358             db->db_level+1 == dn->dn_nlevels) {
1359                 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1360                 mutex_enter(&dn->dn_mtx);
1361                 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1362                 mutex_exit(&dn->dn_mtx);
1363         }


1503                                 arc_release(db->db_buf, db);
1504                         }
1505                         dr->dt.dl.dr_data = buf;
1506                         VERIFY(arc_buf_remove_ref(db->db_buf, db));
1507                 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1508                         arc_release(db->db_buf, db);
1509                         VERIFY(arc_buf_remove_ref(db->db_buf, db));
1510                 }
1511                 db->db_buf = NULL;
1512         }
1513         ASSERT(db->db_buf == NULL);
1514         dbuf_set_data(db, buf);
1515         db->db_state = DB_FILL;
1516         mutex_exit(&db->db_mtx);
1517         (void) dbuf_dirty(db, tx);
1518         dbuf_fill_done(db, tx);
1519 }
1520 
1521 /*
1522  * "Clear" the contents of this dbuf.  This will mark the dbuf
1523  * EVICTING and clear *most* of its references.  Unfortunately,
1524  * when we are not holding the dn_dbufs_mtx, we can't clear the
1525  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1526  * in this case.  For callers from the DMU we will usually see:
1527  *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1528  * For the arc callback, we will usually see:
1529  *      dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1530  * Sometimes, though, we will get a mix of these two:
1531  *      DMU: dbuf_clear()->arc_buf_evict()
1532  *      ARC: dbuf_do_evict()->dbuf_destroy()
1533  */
1534 void
1535 dbuf_clear(dmu_buf_impl_t *db)
1536 {
1537         dnode_t *dn;
1538         dmu_buf_impl_t *parent = db->db_parent;
1539         dmu_buf_impl_t *dndb;
1540         int dbuf_gone = FALSE;
1541 
1542         ASSERT(MUTEX_HELD(&db->db_mtx));
1543         ASSERT(refcount_is_zero(&db->db_holds));


1690         db->db_evict_func = NULL;
1691         db->db_immediate_evict = 0;
1692         db->db_freed_in_flight = 0;
1693 
1694         if (blkid == DMU_BONUS_BLKID) {
1695                 ASSERT3P(parent, ==, dn->dn_dbuf);
1696                 db->db.db_size = DN_MAX_BONUSLEN -
1697                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1698                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1699                 db->db.db_offset = DMU_BONUS_BLKID;
1700                 db->db_state = DB_UNCACHED;
1701                 /* the bonus dbuf is not placed in the hash table */
1702                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1703                 return (db);
1704         } else if (blkid == DMU_SPILL_BLKID) {
1705                 db->db.db_size = (blkptr != NULL) ?
1706                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1707                 db->db.db_offset = 0;
1708         } else {
1709                 int blocksize =
1710                     db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1711                 db->db.db_size = blocksize;
1712                 db->db.db_offset = db->db_blkid * blocksize;
1713         }
1714 
1715         /*
1716          * Hold the dn_dbufs_mtx while we get the new dbuf
1717          * in the hash table *and* added to the dbufs list.
1718          * This prevents a possible deadlock with someone
1719          * trying to look up this dbuf before it's added to the
1720          * dn_dbufs list.
1721          */
1722         mutex_enter(&dn->dn_dbufs_mtx);
1723         db->db_state = DB_EVICTING;
1724         if ((odb = dbuf_hash_insert(db)) != NULL) {
1725                 /* someone else inserted it first */
1726                 kmem_cache_free(dbuf_cache, db);
1727                 mutex_exit(&dn->dn_dbufs_mtx);
1728                 return (odb);
1729         }
1730         list_insert_head(&dn->dn_dbufs, db);


1799                          */
1800                         dnode_rele(dn, db);
1801                         db->db_dnode_handle = NULL;
1802                 }
1803                 dbuf_hash_remove(db);
1804         }
1805         db->db_parent = NULL;
1806         db->db_buf = NULL;
1807 
1808         ASSERT(!list_link_active(&db->db_link));
1809         ASSERT(db->db.db_data == NULL);
1810         ASSERT(db->db_hash_next == NULL);
1811         ASSERT(db->db_blkptr == NULL);
1812         ASSERT(db->db_data_pending == NULL);
1813 
1814         kmem_cache_free(dbuf_cache, db);
1815         arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1816 }
1817 
1818 void
1819 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1820 {
1821         dmu_buf_impl_t *db = NULL;
1822         blkptr_t *bp = NULL;
1823 
1824         ASSERT(blkid != DMU_BONUS_BLKID);
1825         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1826 
1827         if (dnode_block_freed(dn, blkid))
1828                 return;
1829 
1830         /* dbuf_find() returns with db_mtx held */
1831         if (db = dbuf_find(dn, 0, blkid)) {
1832                 /*
1833                  * This dbuf is already in the cache.  We assume that
1834                  * it is already CACHED, or else about to be either
1835                  * read or filled.
1836                  */
1837                 mutex_exit(&db->db_mtx);
1838                 return;
1839         }
1840 
1841         if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1842                 if (bp && !BP_IS_HOLE(bp)) {
1843                         int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1844                             ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1845                         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1846                         uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1847                         zbookmark_t zb;
1848 
1849                         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1850                             dn->dn_object, 0, blkid);
1851 
1852                         (void) arc_read(NULL, dn->dn_objset->os_spa,
1853                             bp, NULL, NULL, priority,
1854                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1855                             &aflags, &zb);
1856                 }
1857                 if (db)
1858                         dbuf_rele(db, NULL);
1859         }
1860 }
1861 
1862 /*
1863  * Returns with db_holds incremented, and db_mtx not held.
1864  * Note: dn_struct_rwlock must be held.
1865  */
1866 int
1867 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1868     void *tag, dmu_buf_impl_t **dbp)
1869 {
1870         dmu_buf_impl_t *db, *parent = NULL;
1871 
1872         ASSERT(blkid != DMU_BONUS_BLKID);
1873         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));


2514                         }
2515                 } else {
2516                         fill = 1;
2517                 }
2518         } else {
2519                 blkptr_t *ibp = db->db.db_data;
2520                 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2521                 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2522                         if (BP_IS_HOLE(ibp))
2523                                 continue;
2524                         fill += ibp->blk_fill;
2525                 }
2526         }
2527         DB_DNODE_EXIT(db);
2528 
2529         bp->blk_fill = fill;
2530 
2531         mutex_exit(&db->db_mtx);
2532 }
2533 
2534 /* ARGSUSED */
2535 static void
2536 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2537 {
2538         dmu_buf_impl_t *db = vdb;
2539         blkptr_t *bp = zio->io_bp;
2540         blkptr_t *bp_orig = &zio->io_bp_orig;
2541         uint64_t txg = zio->io_txg;
2542         dbuf_dirty_record_t **drp, *dr;
2543 
2544         ASSERT0(zio->io_error);
2545         ASSERT(db->db_blkptr == bp);
2546 
2547         /*
2548          * For nopwrites and rewrites we ensure that the bp matches our
2549          * original and bypass all the accounting.
2550          */
2551         if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2552                 ASSERT(BP_EQUAL(bp, bp_orig));
2553         } else {
2554                 objset_t *os;
2555                 dsl_dataset_t *ds;


2608                 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2609                 if (!BP_IS_HOLE(db->db_blkptr)) {
2610                         int epbs =
2611                             dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2612                         ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2613                             db->db.db_size);
2614                         ASSERT3U(dn->dn_phys->dn_maxblkid
2615                             >> (db->db_level * epbs), >=, db->db_blkid);
2616                         arc_set_callback(db->db_buf, dbuf_do_evict, db);
2617                 }
2618                 DB_DNODE_EXIT(db);
2619                 mutex_destroy(&dr->dt.di.dr_mtx);
2620                 list_destroy(&dr->dt.di.dr_children);
2621         }
2622         kmem_free(dr, sizeof (dbuf_dirty_record_t));
2623 
2624         cv_broadcast(&db->db_changed);
2625         ASSERT(db->db_dirtycnt > 0);
2626         db->db_dirtycnt -= 1;
2627         db->db_data_pending = NULL;
2628         dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2629 }
2630 
2631 static void
2632 dbuf_write_nofill_ready(zio_t *zio)
2633 {
2634         dbuf_write_ready(zio, NULL, zio->io_private);
2635 }
2636 
2637 static void
2638 dbuf_write_nofill_done(zio_t *zio)
2639 {
2640         dbuf_write_done(zio, NULL, zio->io_private);
2641 }
2642 
2643 static void
2644 dbuf_write_override_ready(zio_t *zio)
2645 {
2646         dbuf_dirty_record_t *dr = zio->io_private;
2647         dmu_buf_impl_t *db = dr->dr_dbuf;


2726 
2727         ASSERT(db->db_level == 0 || data == db->db_buf);
2728         ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2729         ASSERT(zio);
2730 
2731         SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2732             os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2733             db->db.db_object, db->db_level, db->db_blkid);
2734 
2735         if (db->db_blkid == DMU_SPILL_BLKID)
2736                 wp_flag = WP_SPILL;
2737         wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2738 
2739         dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2740         DB_DNODE_EXIT(db);
2741 
2742         if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2743                 ASSERT(db->db_state != DB_NOFILL);
2744                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2745                     db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2746                     dbuf_write_override_ready, dbuf_write_override_done, dr,
2747                     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2748                 mutex_enter(&db->db_mtx);
2749                 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2750                 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2751                     dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2752                 mutex_exit(&db->db_mtx);
2753         } else if (db->db_state == DB_NOFILL) {
2754                 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2755                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2756                     db->db_blkptr, NULL, db->db.db_size, &zp,
2757                     dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2758                     ZIO_PRIORITY_ASYNC_WRITE,
2759                     ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2760         } else {
2761                 ASSERT(arc_released(data));
2762                 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2763                     db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2764                     DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2765                     dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
2766                     ZIO_FLAG_MUSTSUCCEED, &zb);
2767         }
2768 }


 824                 last_l1 = end >> epbs;
 825         }
 826         dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
 827 
 828         mutex_enter(&dn->dn_dbufs_mtx);
 829         if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
 830                 /* There can't be any dbufs in this range; no need to search. */
 831                 mutex_exit(&dn->dn_dbufs_mtx);
 832                 return;
 833         } else if (dmu_objset_is_receiving(dn->dn_objset)) {
 834                 /*
 835                  * If we are receiving, we expect there to be no dbufs in
 836                  * the range to be freed, because receive modifies each
 837                  * block at most once, and in offset order.  If this is
 838                  * not the case, it can lead to performance problems,
 839                  * so note that we unexpectedly took the slow path.
 840                  */
 841                 atomic_inc_64(&zfs_free_range_recv_miss);
 842         }
 843 
 844         for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
 845                 db_next = list_next(&dn->dn_dbufs, db);
 846                 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 847 
 848                 if (db->db_level == 1 &&
 849                     db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
 850                         mutex_enter(&db->db_mtx);
 851                         if (db->db_last_dirty &&
 852                             db->db_last_dirty->dr_txg < txg) {
 853                                 dbuf_add_ref(db, FTAG);
 854                                 mutex_exit(&db->db_mtx);
 855                                 dbuf_will_dirty(db, tx);
 856                                 dbuf_rele(db, FTAG);
 857                         } else {
 858                                 mutex_exit(&db->db_mtx);
 859                         }
 860                 }
 861 
 862                 if (db->db_level != 0)
 863                         continue;
 864                 dprintf_dbuf(db, "found buf %s\n", "");
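
The fast path above depends on the dn_unlisted_l0_blkid bookkeeping: every level-0 dbuf on dn_dbufs is expected to satisfy db_blkid < dn_unlisted_l0_blkid, so a free range that starts at or beyond that block cannot overlap any listed dbuf. A minimal sketch of the update assumed to live in dbuf_create() (illustrative; not a hunk from this review):

        /* Keep dn_unlisted_l0_blkid one past the highest level-0 blkid. */
        if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid)
                dn->dn_unlisted_l0_blkid = db->db_blkid + 1;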


1170                                  * that we can modify it without impacting
1171                                  * possible other users of this cached data
1172                                  * block.  Note that indirect blocks and
1173                                  * private objects are not released until the
1174                                  * syncing state (since they are only modified
1175                                  * then).
1176                                  */
1177                                 arc_release(db->db_buf, db);
1178                                 dbuf_fix_old_data(db, tx->tx_txg);
1179                                 data_old = db->db_buf;
1180                         }
1181                         ASSERT(data_old != NULL);
1182                 }
1183                 dr->dt.dl.dr_data = data_old;
1184         } else {
1185                 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1186                 list_create(&dr->dt.di.dr_children,
1187                     sizeof (dbuf_dirty_record_t),
1188                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
1189         }
1190         if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1191                 dr->dr_accounted = db->db.db_size;
1192         dr->dr_dbuf = db;
1193         dr->dr_txg = tx->tx_txg;
1194         dr->dr_next = *drp;
1195         *drp = dr;
1196 
1197         /*
1198          * We could have been freed_in_flight between the dbuf_noread
1199          * and dbuf_dirty.  We win, as though the dbuf_noread() had
1200          * happened after the free.
1201          */
1202         if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1203             db->db_blkid != DMU_SPILL_BLKID) {
1204                 mutex_enter(&dn->dn_mtx);
1205                 dnode_clear_range(dn, db->db_blkid, 1, tx);
1206                 mutex_exit(&dn->dn_mtx);
1207                 db->db_freed_in_flight = FALSE;
1208         }
1209 
1210         /*
1211          * This buffer is now part of this txg
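
The dr_accounted assignment in this hunk is the hook for the reworked write throttle: each dirty record now remembers how many bytes it charged against the pool's dirty space, so those bytes can be retired piecewise as physical writes complete rather than all at once at txg sync. A rough sketch of the intended pairing (illustrative; the charge happens elsewhere in the patch, and dmu_tx_pool() is assumed as the pool accessor):

        /* Charge the dirty space when the record is created ... */
        dsl_pool_dirty_space(dmu_tx_pool(tx), dr->dr_accounted, tx);

        /*
         * ... and retire a portion per physical child i/o; see
         * dbuf_write_physdone() later in this review.
         */
        dsl_pool_undirty_space(dp, dr->dr_accounted / zio->io_phys_children,
            zio->io_txg);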


1255                 dmu_buf_impl_t *parent = db->db_parent;
1256                 dbuf_dirty_record_t *di;
1257                 int parent_held = FALSE;
1258 
1259                 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1260                         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1261 
1262                         parent = dbuf_hold_level(dn, db->db_level+1,
1263                             db->db_blkid >> epbs, FTAG);
1264                         ASSERT(parent != NULL);
1265                         parent_held = TRUE;
1266                 }
1267                 if (drop_struct_lock)
1268                         rw_exit(&dn->dn_struct_rwlock);
1269                 ASSERT3U(db->db_level+1, ==, parent->db_level);
1270                 di = dbuf_dirty(parent, tx);
1271                 if (parent_held)
1272                         dbuf_rele(parent, FTAG);
1273 
1274                 mutex_enter(&db->db_mtx);
1275                 /*
1276                  * Since we've dropped the mutex, it's possible that
1277                  * dbuf_undirty() might have changed this out from under us.
1278                  */
1279                 if (db->db_last_dirty == dr ||
1280                     dn->dn_object == DMU_META_DNODE_OBJECT) {
1281                         mutex_enter(&di->dt.di.dr_mtx);
1282                         ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1283                         ASSERT(!list_link_active(&dr->dr_dirty_node));
1284                         list_insert_tail(&di->dt.di.dr_children, dr);
1285                         mutex_exit(&di->dt.di.dr_mtx);
1286                         dr->dr_parent = di;
1287                 }
1288                 mutex_exit(&db->db_mtx);
1289         } else {
1290                 ASSERT(db->db_level+1 == dn->dn_nlevels);
1291                 ASSERT(db->db_blkid < dn->dn_nblkptr);
1292                 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1293                 mutex_enter(&dn->dn_mtx);
1294                 ASSERT(!list_link_active(&dr->dr_dirty_node));
1295                 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1296                 mutex_exit(&dn->dn_mtx);
1297                 if (drop_struct_lock)
1298                         rw_exit(&dn->dn_struct_rwlock);


1328         if (dr == NULL || dr->dr_txg < txg)
1329                 return (B_FALSE);
1330         ASSERT(dr->dr_txg == txg);
1331         ASSERT(dr->dr_dbuf == db);
1332 
1333         DB_DNODE_ENTER(db);
1334         dn = DB_DNODE(db);
1335 
1336         /*
1337          * Note:  This code will probably work even if there are concurrent
1338          * holders, but it is untested in that scenario, as the ZPL and
1339          * ztest have additional locking (the range locks) that prevents
1340          * that type of concurrent access.
1341          */
1342         ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1343 
1344         dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1345 
1346         ASSERT(db->db.db_size != 0);
1347 
1348         /*
1349          * Any space we accounted for in dp_dirty_* will be cleaned up by
1350          * dsl_pool_sync().  This is relatively rare so the discrepancy
1351          * is not a big deal.
1352          */
1353 
1354         *drp = dr->dr_next;
1355 
1356         /*
1357          * Note that there are three places in dbuf_dirty()
1358          * where this dirty record may be put on a list.
1359          * Make sure to do a list_remove corresponding to
1360          * every one of those list_insert calls.
1361          */
1362         if (dr->dr_parent) {
1363                 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1364                 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1365                 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1366         } else if (db->db_blkid == DMU_SPILL_BLKID ||
1367             db->db_level+1 == dn->dn_nlevels) {
1368                 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1369                 mutex_enter(&dn->dn_mtx);
1370                 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1371                 mutex_exit(&dn->dn_mtx);
1372         }


1512                                 arc_release(db->db_buf, db);
1513                         }
1514                         dr->dt.dl.dr_data = buf;
1515                         VERIFY(arc_buf_remove_ref(db->db_buf, db));
1516                 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1517                         arc_release(db->db_buf, db);
1518                         VERIFY(arc_buf_remove_ref(db->db_buf, db));
1519                 }
1520                 db->db_buf = NULL;
1521         }
1522         ASSERT(db->db_buf == NULL);
1523         dbuf_set_data(db, buf);
1524         db->db_state = DB_FILL;
1525         mutex_exit(&db->db_mtx);
1526         (void) dbuf_dirty(db, tx);
1527         dbuf_fill_done(db, tx);
1528 }
1529 
1530 /*
1531  * "Clear" the contents of this dbuf.  This will mark the dbuf
1532  * EVICTING and clear *most* of its references.  Unfortunately,
1533  * when we are not holding the dn_dbufs_mtx, we can't clear the
1534  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1535  * in this case.  For callers from the DMU we will usually see:
1536  *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1537  * For the arc callback, we will usually see:
1538  *      dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1539  * Sometimes, though, we will get a mix of these two:
1540  *      DMU: dbuf_clear()->arc_buf_evict()
1541  *      ARC: dbuf_do_evict()->dbuf_destroy()
1542  */
1543 void
1544 dbuf_clear(dmu_buf_impl_t *db)
1545 {
1546         dnode_t *dn;
1547         dmu_buf_impl_t *parent = db->db_parent;
1548         dmu_buf_impl_t *dndb;
1549         int dbuf_gone = FALSE;
1550 
1551         ASSERT(MUTEX_HELD(&db->db_mtx));
1552         ASSERT(refcount_is_zero(&db->db_holds));


1699         db->db_evict_func = NULL;
1700         db->db_immediate_evict = 0;
1701         db->db_freed_in_flight = 0;
1702 
1703         if (blkid == DMU_BONUS_BLKID) {
1704                 ASSERT3P(parent, ==, dn->dn_dbuf);
1705                 db->db.db_size = DN_MAX_BONUSLEN -
1706                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1707                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1708                 db->db.db_offset = DMU_BONUS_BLKID;
1709                 db->db_state = DB_UNCACHED;
1710                 /* the bonus dbuf is not placed in the hash table */
1711                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1712                 return (db);
1713         } else if (blkid == DMU_SPILL_BLKID) {
1714                 db->db.db_size = (blkptr != NULL) ?
1715                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1716                 db->db.db_offset = 0;
1717         } else {
1718                 int blocksize =
1719                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1720                 db->db.db_size = blocksize;
1721                 db->db.db_offset = db->db_blkid * blocksize;
1722         }
1723 
1724         /*
1725          * Hold the dn_dbufs_mtx while we get the new dbuf
1726          * in the hash table *and* added to the dbufs list.
1727          * This prevents a possible deadlock with someone
1728          * trying to look up this dbuf before it's added to the
1729          * dn_dbufs list.
1730          */
1731         mutex_enter(&dn->dn_dbufs_mtx);
1732         db->db_state = DB_EVICTING;
1733         if ((odb = dbuf_hash_insert(db)) != NULL) {
1734                 /* someone else inserted it first */
1735                 kmem_cache_free(dbuf_cache, db);
1736                 mutex_exit(&dn->dn_dbufs_mtx);
1737                 return (odb);
1738         }
1739         list_insert_head(&dn->dn_dbufs, db);
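
As a worked example of the bonus-buffer sizing in this hunk (using the on-disk constants, where sizeof (blkptr_t) is 128 and DN_MAX_BONUSLEN is 320):

        /* Illustrative arithmetic only. */
        /* dn_nblkptr == 1: 320 - 0 * 128 == 320 bytes of bonus space */
        /* dn_nblkptr == 3: 320 - 2 * 128 ==  64 bytes of bonus space */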


1808                          */
1809                         dnode_rele(dn, db);
1810                         db->db_dnode_handle = NULL;
1811                 }
1812                 dbuf_hash_remove(db);
1813         }
1814         db->db_parent = NULL;
1815         db->db_buf = NULL;
1816 
1817         ASSERT(!list_link_active(&db->db_link));
1818         ASSERT(db->db.db_data == NULL);
1819         ASSERT(db->db_hash_next == NULL);
1820         ASSERT(db->db_blkptr == NULL);
1821         ASSERT(db->db_data_pending == NULL);
1822 
1823         kmem_cache_free(dbuf_cache, db);
1824         arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1825 }
1826 
1827 void
1828 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1829 {
1830         dmu_buf_impl_t *db = NULL;
1831         blkptr_t *bp = NULL;
1832 
1833         ASSERT(blkid != DMU_BONUS_BLKID);
1834         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1835 
1836         if (dnode_block_freed(dn, blkid))
1837                 return;
1838 
1839         /* dbuf_find() returns with db_mtx held */
1840         if (db = dbuf_find(dn, 0, blkid)) {
1841                 /*
1842                  * This dbuf is already in the cache.  We assume that
1843                  * it is already CACHED, or else about to be either
1844                  * read or filled.
1845                  */
1846                 mutex_exit(&db->db_mtx);
1847                 return;
1848         }
1849 
1850         if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1851                 if (bp && !BP_IS_HOLE(bp)) {
1852                         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1853                         uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1854                         zbookmark_t zb;
1855 
1856                         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1857                             dn->dn_object, 0, blkid);
1858 
1859                         (void) arc_read(NULL, dn->dn_objset->os_spa,
1860                             bp, NULL, NULL, prio,
1861                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1862                             &aflags, &zb);
1863                 }
1864                 if (db)
1865                         dbuf_rele(db, NULL);
1866         }
1867 }
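
        /*
         * Usage sketch (hypothetical caller, not part of this review):
         * the new zio_priority_t argument lets the caller choose the
         * i/o scheduler class directly, replacing the old per-type
         * priority selection.  Per the ASSERTs above, dn_struct_rwlock
         * must be held:
         *
         *      rw_enter(&dn->dn_struct_rwlock, RW_READER);
         *      dbuf_prefetch(dn, blkid, ZIO_PRIORITY_ASYNC_READ);
         *      rw_exit(&dn->dn_struct_rwlock);
         */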
1868 
1869 /*
1870  * Returns with db_holds incremented, and db_mtx not held.
1871  * Note: dn_struct_rwlock must be held.
1872  */
1873 int
1874 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1875     void *tag, dmu_buf_impl_t **dbp)
1876 {
1877         dmu_buf_impl_t *db, *parent = NULL;
1878 
1879         ASSERT(blkid != DMU_BONUS_BLKID);
1880         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));


2521                         }
2522                 } else {
2523                         fill = 1;
2524                 }
2525         } else {
2526                 blkptr_t *ibp = db->db.db_data;
2527                 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2528                 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2529                         if (BP_IS_HOLE(ibp))
2530                                 continue;
2531                         fill += ibp->blk_fill;
2532                 }
2533         }
2534         DB_DNODE_EXIT(db);
2535 
2536         bp->blk_fill = fill;
2537 
2538         mutex_exit(&db->db_mtx);
2539 }
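
        /*
         * Worked example (illustrative): for a 16K indirect block,
         * db_size >> SPA_BLKPTRSHIFT == 16384 >> 7 == 128 block
         * pointers are scanned, and blk_fill accumulates the count of
         * non-hole blocks referenced beneath each one.
         */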
2540 
2541 /*
2542  * The SPA will call this callback several times for each zio - once
2543  * for every physical child i/o (zio->io_phys_children times).  This
2544  * allows the DMU to monitor the progress of each logical i/o.  For example,
2545  * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2546  * block.  There may be a long delay before all copies/fragments are completed,
2547  * so this callback allows us to retire dirty space gradually, as the physical
2548  * i/os complete.
2549  */
2550 /* ARGSUSED */
2551 static void
2552 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2553 {
2554         dmu_buf_impl_t *db = arg;
2555         objset_t *os = db->db_objset;
2556         dsl_pool_t *dp = dmu_objset_pool(os);
2557         dbuf_dirty_record_t *dr;
2558         int delta = 0;
2559 
2560         dr = db->db_data_pending;
2561         ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2562 
2563         /*
2564          * The callback will be called io_phys_children times.  Retire one
2565          * portion of our dirty space each time we are called.  Any rounding
2566          * error will be cleaned up by dsl_pool_sync()'s call to
2567          * dsl_pool_undirty_space().
2568          */
2569         delta = dr->dr_accounted / zio->io_phys_children;
2570         dsl_pool_undirty_space(dp, delta, zio->io_txg);
2571 }
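
        /*
         * Worked example (illustrative): a 128K dirty record
         * (dr_accounted == 131072) written with io_phys_children == 3,
         * e.g. a three-way mirror, retires 131072 / 3 == 43690 bytes
         * per callback; the 2-byte remainder is the rounding error
         * that dsl_pool_sync() cleans up, as noted above.
         */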
2572 
2573 /* ARGSUSED */
2574 static void
2575 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2576 {
2577         dmu_buf_impl_t *db = vdb;
2578         blkptr_t *bp = zio->io_bp;
2579         blkptr_t *bp_orig = &zio->io_bp_orig;
2580         uint64_t txg = zio->io_txg;
2581         dbuf_dirty_record_t **drp, *dr;
2582 
2583         ASSERT0(zio->io_error);
2584         ASSERT(db->db_blkptr == bp);
2585 
2586         /*
2587          * For nopwrites and rewrites we ensure that the bp matches our
2588          * original and bypass all the accounting.
2589          */
2590         if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2591                 ASSERT(BP_EQUAL(bp, bp_orig));
2592         } else {
2593                 objset_t *os;
2594                 dsl_dataset_t *ds;


2647                 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2648                 if (!BP_IS_HOLE(db->db_blkptr)) {
2649                         int epbs =
2650                             dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2651                         ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2652                             db->db.db_size);
2653                         ASSERT3U(dn->dn_phys->dn_maxblkid
2654                             >> (db->db_level * epbs), >=, db->db_blkid);
2655                         arc_set_callback(db->db_buf, dbuf_do_evict, db);
2656                 }
2657                 DB_DNODE_EXIT(db);
2658                 mutex_destroy(&dr->dt.di.dr_mtx);
2659                 list_destroy(&dr->dt.di.dr_children);
2660         }
2661         kmem_free(dr, sizeof (dbuf_dirty_record_t));
2662 
2663         cv_broadcast(&db->db_changed);
2664         ASSERT(db->db_dirtycnt > 0);
2665         db->db_dirtycnt -= 1;
2666         db->db_data_pending = NULL;
2667 
2668         dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2669 }
2670 
2671 static void
2672 dbuf_write_nofill_ready(zio_t *zio)
2673 {
2674         dbuf_write_ready(zio, NULL, zio->io_private);
2675 }
2676 
2677 static void
2678 dbuf_write_nofill_done(zio_t *zio)
2679 {
2680         dbuf_write_done(zio, NULL, zio->io_private);
2681 }
2682 
2683 static void
2684 dbuf_write_override_ready(zio_t *zio)
2685 {
2686         dbuf_dirty_record_t *dr = zio->io_private;
2687         dmu_buf_impl_t *db = dr->dr_dbuf;


2766 
2767         ASSERT(db->db_level == 0 || data == db->db_buf);
2768         ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2769         ASSERT(zio);
2770 
2771         SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2772             os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2773             db->db.db_object, db->db_level, db->db_blkid);
2774 
2775         if (db->db_blkid == DMU_SPILL_BLKID)
2776                 wp_flag = WP_SPILL;
2777         wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2778 
2779         dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2780         DB_DNODE_EXIT(db);
2781 
2782         if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2783                 ASSERT(db->db_state != DB_NOFILL);
2784                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2785                     db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2786                     dbuf_write_override_ready, NULL, dbuf_write_override_done,
2787                     dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2788                 mutex_enter(&db->db_mtx);
2789                 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2790                 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2791                     dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2792                 mutex_exit(&db->db_mtx);
2793         } else if (db->db_state == DB_NOFILL) {
2794                 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2795                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2796                     db->db_blkptr, NULL, db->db.db_size, &zp,
2797                     dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2798                     ZIO_PRIORITY_ASYNC_WRITE,
2799                     ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2800         } else {
2801                 ASSERT(arc_released(data));
2802                 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2803                     db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2804                     DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2805                     dbuf_write_physdone, dbuf_write_done, db,
2806                     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2807         }
2808 }
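
Taken together, the three write paths above show how the new accounting is threaded through the i/o pipeline: zio_write() and arc_write() now take a ready/physdone/done callback triple rather than just ready/done, with the override and nofill paths passing NULL for physdone. A sketch of the assumed callback ordering for one logical write:

        /*
         *      dbuf_write_ready()      called once, when the block
         *                              pointer has been filled in
         *      dbuf_write_physdone()   called io_phys_children times,
         *                              once per completed physical
         *                              child i/o
         *      dbuf_write_done()       called once, when the logical
         *                              i/o completes
         */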