Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

@@ -37,34 +37,14 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 
 /*
  * ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
-        0,      /* ZIO_PRIORITY_NOW             */
-        0,      /* ZIO_PRIORITY_SYNC_READ       */
-        0,      /* ZIO_PRIORITY_SYNC_WRITE      */
-        0,      /* ZIO_PRIORITY_LOG_WRITE       */
-        1,      /* ZIO_PRIORITY_CACHE_FILL      */
-        1,      /* ZIO_PRIORITY_AGG             */
-        4,      /* ZIO_PRIORITY_FREE            */
-        4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
-        6,      /* ZIO_PRIORITY_ASYNC_READ      */
-        10,     /* ZIO_PRIORITY_RESILVER        */
-        20,     /* ZIO_PRIORITY_SCRUB           */
-        2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
-};
-
-/*
- * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
-char *zio_type_name[ZIO_TYPES] = {
+const char *zio_type_name[ZIO_TYPES] = {
         "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
         "zio_ioctl"
 };
 
 /*

@@ -484,11 +464,14 @@
         mutex_enter(&pio->io_lock);
         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
                 *errorp = zio_worst_error(*errorp, zio->io_error);
         pio->io_reexecute |= zio->io_reexecute;
         ASSERT3U(*countp, >, 0);
-        if (--*countp == 0 && pio->io_stall == countp) {
+
+        (*countp)--;
+
+        if (*countp == 0 && pio->io_stall == countp) {
                 pio->io_stall = NULL;
                 mutex_exit(&pio->io_lock);
                 zio_execute(pio);
         } else {
                 mutex_exit(&pio->io_lock);

@@ -508,11 +491,11 @@
  * ==========================================================================
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    zio_type_t type, int priority, enum zio_flag flags,
+    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
     vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
     enum zio_stage stage, enum zio_stage pipeline)
 {
         zio_t *zio;
 

@@ -618,11 +601,11 @@
 }
 
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 {
         zio_t *zio;
 
         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
             data, size, done, private,

@@ -634,12 +617,13 @@
 }
 
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 {
         zio_t *zio;
 
         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&

@@ -654,19 +638,20 @@
             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
         zio->io_ready = ready;
+        zio->io_physdone = physdone;
         zio->io_prop = *zp;
 
         return (zio);
 }
 
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
-    uint64_t size, zio_done_func_t *done, void *private, int priority,
-    enum zio_flag flags, zbookmark_t *zb)
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
 {
         zio_t *zio;
 
         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,

@@ -738,11 +723,11 @@
          */
         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 
         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
-            NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
+            NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 
 
         return (zio);
 }

@@ -776,36 +761,36 @@
         return (zio);
 }
 
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+    zio_done_func_t *done, void *private, enum zio_flag flags)
 {
         zio_t *zio;
         int c;
 
         if (vd->vdev_children == 0) {
                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-                    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+                    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 
                 zio->io_cmd = cmd;
         } else {
                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 
                 for (c = 0; c < vd->vdev_children; c++)
                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
-                            done, private, priority, flags));
+                            done, private, flags));
         }
 
         return (zio);
 }
 
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
         zio_t *zio;
 
         ASSERT(vd->vdev_children == 0);
         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||

@@ -822,11 +807,11 @@
 }
 
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
         zio_t *zio;
 
         ASSERT(vd->vdev_children == 0);
         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||

@@ -857,12 +842,12 @@
 /*
  * Create a child I/O to do some work for us.
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-        void *data, uint64_t size, int type, int priority, enum zio_flag flags,
-        zio_done_func_t *done, void *private)
+        void *data, uint64_t size, int type, zio_priority_t priority,
+        enum zio_flag flags, zio_done_func_t *done, void *private)
 {
         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
         zio_t *zio;
 
         ASSERT(vd->vdev_parent ==

@@ -893,25 +878,29 @@
 
         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 
+        zio->io_physdone = pio->io_physdone;
+        if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+                zio->io_logical->io_phys_children++;
+
         return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-        int type, int priority, enum zio_flag flags,
+        int type, zio_priority_t priority, enum zio_flag flags,
         zio_done_func_t *done, void *private)
 {
         zio_t *zio;
 
         ASSERT(vd->vdev_ops->vdev_op_leaf);
 
         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
             data, size, done, private, type, priority,
-            flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
+            flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
             vd, offset, NULL,
             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
         return (zio);
 }

@@ -918,11 +907,11 @@
 
 void
 zio_flush(zio_t *zio, vdev_t *vd)
 {
         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
-            NULL, NULL, ZIO_PRIORITY_NOW,
+            NULL, NULL,
             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
 void
 zio_shrink(zio_t *zio, uint64_t size)

@@ -1819,11 +1808,11 @@
                 zp.zp_dedup_verify = B_FALSE;
                 zp.zp_nopwrite = B_FALSE;
 
                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
-                    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
+                    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
                     &pio->io_bookmark));
         }
 
         /*

@@ -2196,11 +2185,11 @@
                         ddt_exit(ddt);
                         return (ZIO_PIPELINE_CONTINUE);
                 }
 
                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-                    zio->io_orig_size, &czp, NULL,
+                    zio->io_orig_size, &czp, NULL, NULL,
                     zio_ddt_ditto_write_done, dde, zio->io_priority,
                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;

@@ -2218,11 +2207,11 @@
                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
                 ddt_phys_fill(ddp, bp);
                 ddt_phys_addref(ddp);
         } else {
                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-                    zio->io_orig_size, zp, zio_ddt_child_write_ready,
+                    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
                     zio_ddt_child_write_done, dde, zio->io_priority,
                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
                 dde->dde_lead_zio[p] = cio;

@@ -2635,10 +2624,17 @@
         }
 
         if (zio->io_error)
                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+        if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+            zio->io_physdone != NULL) {
+                ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+                ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+                zio->io_physdone(zio->io_logical);
+        }
+
         return (ZIO_PIPELINE_CONTINUE);
 }
 
 void
 zio_vdev_io_reissue(zio_t *zio)