Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

*** 839,849 **** * so note that we unexpectedly took the slow path. */ atomic_inc_64(&zfs_free_range_recv_miss); } ! for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level == 1 && db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { --- 839,849 ---- * so note that we unexpectedly took the slow path. */ atomic_inc_64(&zfs_free_range_recv_miss); } ! for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level == 1 && db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
*** 1185,1194 **** --- 1185,1196 ---- mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } + if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) + dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr;
*** 1268,1278 **** di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); ! /* possible race with dbuf_undirty() */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); --- 1270,1283 ---- di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); ! /* ! * Since we've dropped the mutex, it's possible that ! * dbuf_undirty() might have changed this out from under us. ! */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node));
*** 1338,1348 **** dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); ! /* XXX would be nice to fix up dn_towrite_space[] */ *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() --- 1343,1357 ---- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); ! /* ! * Any space we accounted for in dp_dirty_* will be cleaned up by ! * dsl_pool_sync(). This is relatively rare so the discrepancy ! * is not a big deal. ! */ *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty()
*** 1518,1528 **** dbuf_fill_done(db, tx); } /* * "Clear" the contents of this dbuf. This will mark the dbuf ! * EVICTING and clear *most* of its references. Unfortunetely, * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: --- 1527,1537 ---- dbuf_fill_done(db, tx); } /* * "Clear" the contents of this dbuf. This will mark the dbuf ! * EVICTING and clear *most* of its references. Unfortunately, * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see:
*** 1705,1715 **** db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = ! db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* --- 1714,1724 ---- db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = ! db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /*
*** 1814,1824 **** kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void ! dbuf_prefetch(dnode_t *dn, uint64_t blkid) { dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); --- 1823,1833 ---- kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void ! dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) { dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; ASSERT(blkid != DMU_BONUS_BLKID);
*** 1838,1858 **** return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { - int priority = dn->dn_type == DMU_OT_DDT_ZAP ? - ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); (void) arc_read(NULL, dn->dn_objset->os_spa, ! bp, NULL, NULL, priority, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } if (db) dbuf_rele(db, NULL); --- 1847,1865 ---- return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); (void) arc_read(NULL, dn->dn_objset->os_spa, ! bp, NULL, NULL, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } if (db) dbuf_rele(db, NULL);
*** 2529,2540 **** --- 2536,2579 ---- bp->blk_fill = fill; mutex_exit(&db->db_mtx); } + /* + * The SPA will call this callback several times for each zio - once + * for every physical child i/o (zio->io_phys_children times). This + * allows the DMU to monitor the progress of each logical i/o. For example, + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z + * block. There may be a long delay before all copies/fragments are completed, + * so this callback allows us to retire dirty space gradually, as the physical + * i/os complete. + */ /* ARGSUSED */ static void + dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) + { + dmu_buf_impl_t *db = arg; + objset_t *os = db->db_objset; + dsl_pool_t *dp = dmu_objset_pool(os); + dbuf_dirty_record_t *dr; + int delta = 0; + + dr = db->db_data_pending; + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dsl_pool_sync()'s call to + * dsl_pool_undirty_space(). + */ + delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); + } + + /* ARGSUSED */ + static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig;
*** 2623,2632 **** --- 2662,2672 ---- cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } static void dbuf_write_nofill_ready(zio_t *zio)
*** 2741,2768 **** if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ASSERT(db->db_state != DB_NOFILL); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, data->b_data, arc_buf_size(data), &zp, ! dbuf_write_override_ready, dbuf_write_override_done, dr, ! ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, ! dbuf_write_nofill_ready, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); dr->dr_zio = arc_write(zio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, ! dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ! ZIO_FLAG_MUSTSUCCEED, &zb); } } --- 2781,2808 ---- if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ASSERT(db->db_state != DB_NOFILL); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, data->b_data, arc_buf_size(data), &zp, ! dbuf_write_override_ready, NULL, dbuf_write_override_done, ! dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, ! dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); dr->dr_zio = arc_write(zio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, ! dbuf_write_physdone, dbuf_write_done, db, ! ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } }