Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

@@ -369,17 +369,15 @@
  */
 static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
-        dsl_pool_t *dp = NULL;
         dmu_buf_t **dbp;
         uint64_t blkid, nblks, i;
         uint32_t dbuf_flags;
         int err;
         zio_t *zio;
-        hrtime_t start;
 
         ASSERT(length <= DMU_MAX_ACCESS);
 
         dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
         if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)

@@ -403,13 +401,10 @@
                 }
                 nblks = 1;
         }
         dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
-        if (dn->dn_objset->os_dsl_dataset)
-                dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-        start = gethrtime();
         zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
         blkid = dbuf_whichblock(dn, offset);
         for (i = 0; i < nblks; i++) {
                 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
                 if (db == NULL) {

@@ -426,13 +421,10 @@
         }
         rw_exit(&dn->dn_struct_rwlock);
 
         /* wait for async i/o */
         err = zio_wait(zio);
-        /* track read overhead when we are in sync context */
-        if (dp && dsl_pool_sync_context(dp))
-                dp->dp_read_overhead += gethrtime() - start;
         if (err) {
                 dmu_buf_rele_array(dbp, nblks, tag);
                 return (err);
         }
 

@@ -510,16 +502,26 @@
         }
 
         kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
+/*
+ * Issue prefetch i/os for the given blocks.
+ *
+ * Note: The assumption is that we *know* these blocks will be needed
+ * almost immediately.  Therefore, the prefetch i/os will be issued at
+ * ZIO_PRIORITY_SYNC_READ.
+ *
+ * Note: indirect blocks and other metadata will be read synchronously,
+ * causing this function to block if they are not already cached.
+ */
 void
 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 {
         dnode_t *dn;
         uint64_t blkid;
-        int nblks, i, err;
+        int nblks, err;
 
         if (zfs_prefetch_disable)
                 return;
 
         if (len == 0) {  /* they're interested in the bonus buffer */

@@ -528,11 +530,11 @@
                 if (object == 0 || object >= DN_MAX_OBJECT)
                         return;
 
                 rw_enter(&dn->dn_struct_rwlock, RW_READER);
                 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-                dbuf_prefetch(dn, blkid);
+                dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
                 rw_exit(&dn->dn_struct_rwlock);
                 return;
         }
 
         /*

@@ -545,20 +547,20 @@
                 return;
 
         rw_enter(&dn->dn_struct_rwlock, RW_READER);
         if (dn->dn_datablkshift) {
                 int blkshift = dn->dn_datablkshift;
-                nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
-                    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+                nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+                    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
         } else {
                 nblks = (offset < dn->dn_datablksz);
         }
 
         if (nblks != 0) {
                 blkid = dbuf_whichblock(dn, offset);
-                for (i = 0; i < nblks; i++)
-                        dbuf_prefetch(dn, blkid+i);
+                for (int i = 0; i < nblks; i++)
+                        dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
         }
 
         rw_exit(&dn->dn_struct_rwlock);
 
         dnode_rele(dn, FTAG);

@@ -1354,11 +1356,11 @@
         dsa->dsa_zgd = zgd;
         dsa->dsa_tx = tx;
 
         zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
             zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
-            dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+            dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
             ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
         return (0);
 }
 

@@ -1494,12 +1496,13 @@
         dsa->dsa_zgd = zgd;
         dsa->dsa_tx = NULL;
 
         zio_nowait(arc_write(pio, os->os_spa, txg,
             bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
-            DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
-            dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+            DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
+            NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+            ZIO_FLAG_CANFAIL, &zb));
 
         return (0);
 }
 
 int