Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

@@ -839,11 +839,11 @@
                  * so note that we unexpectedly took the slow path.
                  */
                 atomic_inc_64(&zfs_free_range_recv_miss);
         }
 
-        for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+        for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
                 db_next = list_next(&dn->dn_dbufs, db);
                 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
                 if (db->db_level == 1 &&
                     db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {

@@ -1185,10 +1185,12 @@
                 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
                 list_create(&dr->dt.di.dr_children,
                     sizeof (dbuf_dirty_record_t),
                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
         }
+        if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+                dr->dr_accounted = db->db.db_size;
         dr->dr_dbuf = db;
         dr->dr_txg = tx->tx_txg;
         dr->dr_next = *drp;
         *drp = dr;
 

@@ -1268,11 +1270,14 @@
                 di = dbuf_dirty(parent, tx);
                 if (parent_held)
                         dbuf_rele(parent, FTAG);
 
                 mutex_enter(&db->db_mtx);
-                /*  possible race with dbuf_undirty() */
+                /*
+                 * Since we've dropped the mutex, it's possible that
+                 * dbuf_undirty() might have changed this out from under us.
+                 */
                 if (db->db_last_dirty == dr ||
                     dn->dn_object == DMU_META_DNODE_OBJECT) {
                         mutex_enter(&di->dt.di.dr_mtx);
                         ASSERT3U(di->dr_txg, ==, tx->tx_txg);
                         ASSERT(!list_link_active(&dr->dr_dirty_node));

@@ -1338,11 +1343,15 @@
 
         dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
         ASSERT(db->db.db_size != 0);
 
-        /* XXX would be nice to fix up dn_towrite_space[] */
+        /*
+         * Any space we accounted for in dp_dirty_* will be cleaned up by
+         * dsl_pool_sync().  This is relatively rare so the discrepancy
+         * is not a big deal.
+         */
 
         *drp = dr->dr_next;
 
         /*
          * Note that there are three places in dbuf_dirty()

@@ -1518,11 +1527,11 @@
         dbuf_fill_done(db, tx);
 }
 
 /*
  * "Clear" the contents of this dbuf.  This will mark the dbuf
- * EVICTING and clear *most* of its references.  Unfortunetely,
+ * EVICTING and clear *most* of its references.  Unfortunately,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
  * in this case.  For callers from the DMU we will usually see:
  *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:

@@ -1705,11 +1714,11 @@
                 db->db.db_size = (blkptr != NULL) ?
                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
                 db->db.db_offset = 0;
         } else {
                 int blocksize =
-                    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
+                    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
                 db->db.db_size = blocksize;
                 db->db.db_offset = db->db_blkid * blocksize;
         }
 
         /*

@@ -1814,11 +1823,11 @@
         kmem_cache_free(dbuf_cache, db);
         arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
 void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
 {
         dmu_buf_impl_t *db = NULL;
         blkptr_t *bp = NULL;
 
         ASSERT(blkid != DMU_BONUS_BLKID);

@@ -1838,21 +1847,19 @@
                 return;
         }
 
         if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
                 if (bp && !BP_IS_HOLE(bp)) {
-                        int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
-                            ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
                         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                         uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                         zbookmark_t zb;
 
                         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                             dn->dn_object, 0, blkid);
 
                         (void) arc_read(NULL, dn->dn_objset->os_spa,
-                            bp, NULL, NULL, priority,
+                            bp, NULL, NULL, prio,
                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                             &aflags, &zb);
                 }
                 if (db)
                         dbuf_rele(db, NULL);

@@ -2529,12 +2536,44 @@
         bp->blk_fill = fill;
 
         mutex_exit(&db->db_mtx);
 }
 
+/*
+ * Physical-completion callback for dbuf writes.  The SPA invokes it once
+ * per physical child i/o of a zio (zio->io_phys_children times in all),
+ * which lets the DMU monitor the progress of each logical i/o.  A logical
+ * write may consist of, for example, 2 copies of an indirect block or many
+ * fragments of a RAID-Z block, and there may be a long delay before all of
+ * them are completed.  Handling each physical completion as it arrives
+ * lets us retire dirty space gradually instead of all at once.
+ */
 /* ARGSUSED */
 static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+        dmu_buf_impl_t *dbuf = arg;
+        dsl_pool_t *pool = dmu_objset_pool(dbuf->db_objset);
+        dbuf_dirty_record_t *pending;
+        int share;
+
+        pending = dbuf->db_data_pending;
+        ASSERT3U(pending->dr_txg, ==, zio->io_txg);
+
+        /*
+         * We are called io_phys_children times for this zio, so hand
+         * back an equal share of the accounted dirty space on each call.
+         * Whatever the integer division leaves over is cleaned up by
+         * dsl_pool_sync()'s call to dsl_pool_undirty_space().
+         */
+        share = pending->dr_accounted / zio->io_phys_children;
+        dsl_pool_undirty_space(pool, share, zio->io_txg);
+}
+
+/* ARGSUSED */
+static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
         dmu_buf_impl_t *db = vdb;
         blkptr_t *bp = zio->io_bp;
         blkptr_t *bp_orig = &zio->io_bp_orig;

@@ -2623,10 +2662,11 @@
 
         cv_broadcast(&db->db_changed);
         ASSERT(db->db_dirtycnt > 0);
         db->db_dirtycnt -= 1;
         db->db_data_pending = NULL;
+
         dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 }
 
 static void
 dbuf_write_nofill_ready(zio_t *zio)

@@ -2741,28 +2781,28 @@
 
         if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
                 ASSERT(db->db_state != DB_NOFILL);
                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
                     db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
-                    dbuf_write_override_ready, dbuf_write_override_done, dr,
-                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+                    dbuf_write_override_ready, NULL, dbuf_write_override_done,
+                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                 mutex_enter(&db->db_mtx);
                 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
                     dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
                 mutex_exit(&db->db_mtx);
         } else if (db->db_state == DB_NOFILL) {
                 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
                     db->db_blkptr, NULL, db->db.db_size, &zp,
-                    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+                    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
                     ZIO_PRIORITY_ASYNC_WRITE,
                     ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
         } else {
                 ASSERT(arc_released(data));
                 dr->dr_zio = arc_write(zio, os->os_spa, txg,
                     db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
                     DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
-                    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-                    ZIO_FLAG_MUSTSUCCEED, &zb);
+                    dbuf_write_physdone, dbuf_write_done, db,
+                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
         }
 }