4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

@@ -85,18 +85,16 @@
  */
 static int zfs_ccw_retry_interval = 300;
 
 typedef enum zti_modes {
         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
-        ZTI_MODE_ONLINE_PERCENT,        /* value is % of online CPUs */
         ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
         ZTI_MODE_NULL,                  /* don't create a taskq */
         ZTI_NMODES
 } zti_modes_t;
 
 #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
-#define ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
 #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
 #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 
 #define ZTI_N(n)        ZTI_P(n, 1)
 #define ZTI_ONE         ZTI_N(1)

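The hunk above removes ZTI_MODE_ONLINE_PERCENT and its ZTI_PCT() initializer: with the batch taskq now sized through the zio_taskq_batch_pct tunable (next hunk), no table entry used the per-entry percentage mode any more. For orientation, a sketch of how the remaining macros are consumed -- the zio_taskq_info_t below paraphrases the definition that sits near this hunk in spa.c and is an assumption, not part of the diff:

        typedef struct zio_taskq_info {
                zti_modes_t zti_mode;   /* one of the ZTI_MODE_* values */
                uint_t zti_value;       /* FIXED: # of threads; BATCH: ignored */
                uint_t zti_count;       /* taskqs per zio type/queue pair */
        } zio_taskq_info_t;

        /* so ZTI_ONE == ZTI_N(1) == ZTI_P(1, 1) expands to: */
        /* { ZTI_MODE_FIXED, 1, 1 }  -- one taskq with one thread */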
@@ -144,11 +142,11 @@
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
-uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
+uint_t          zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
 id_t            zio_taskq_psrset_bind = PS_NONE;
 boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 
 boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */

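Lowering zio_taskq_batch_pct from 100 to 75 deliberately leaves CPU headroom: the batch taskq does the cpu-intensive work (notably write issue, i.e. compression and checksumming), and sizing it at 75% of the CPUs keeps it from monopolizing every processor in the pset. A minimal sketch of the sizing arithmetic, assuming TASKQ_THREADS_CPU_PCT makes the taskq treat this value as a percentage of online CPUs:

        #include <stdio.h>

        /* assumed TASKQ_THREADS_CPU_PCT sizing model */
        static unsigned
        batch_nthreads(unsigned ncpus, unsigned pct)
        {
                unsigned n = ncpus * pct / 100;
                return (n > 0 ? n : 1);         /* always at least one thread */
        }

        int
        main(void)
        {
                /* e.g. a 32-CPU system: 32 threads before, 24 after */
                printf("%u -> %u\n",
                    batch_nthreads(32, 100), batch_nthreads(32, 75));
                return (0);
        }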
@@ -840,13 +838,10 @@
         ASSERT3U(count, >, 0);
 
         tqs->stqs_count = count;
         tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
-        for (uint_t i = 0; i < count; i++) {
-                taskq_t *tq;
-
                 switch (mode) {
                 case ZTI_MODE_FIXED:
                         ASSERT3U(value, >=, 1);
                         value = MAX(value, 1);
                         break;

@@ -855,21 +850,20 @@
                         batch = B_TRUE;
                         flags |= TASKQ_THREADS_CPU_PCT;
                         value = zio_taskq_batch_pct;
                         break;
 
-                case ZTI_MODE_ONLINE_PERCENT:
-                        flags |= TASKQ_THREADS_CPU_PCT;
-                        break;
-
                 default:
                         panic("unrecognized mode for %s_%s taskq (%u:%u) in "
                             "spa_activate()",
                             zio_type_name[t], zio_taskq_types[q], mode, value);
                         break;
                 }
 
+        for (uint_t i = 0; i < count; i++) {
+                taskq_t *tq;
+
                 if (count > 1) {
                         (void) snprintf(name, sizeof (name), "%s_%s_%u",
                             zio_type_name[t], zio_taskq_types[q], i);
                 } else {
                         (void) snprintf(name, sizeof (name), "%s_%s",

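These two hunks are one restructuring: the mode switch is loop-invariant (mode and value are identical for every one of the count taskqs), so it now runs once, before the creation loop, instead of on every iteration, and the ZTI_MODE_ONLINE_PERCENT case vanishes along with the mode itself. The switch body keeps its old two-level indentation, which is why it appears here as unchanged context rather than as moved lines.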
@@ -881,11 +875,20 @@
                                 flags |= TASKQ_DC_BATCH;
 
                         tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                             spa->spa_proc, zio_taskq_basedc, flags);
                 } else {
-                        tq = taskq_create_proc(name, value, maxclsyspri, 50,
+                        pri_t pri = maxclsyspri;
+                        /*
+                         * The write issue taskq can be extremely CPU
+                         * intensive.  Run it at slightly lower priority
+                         * than the other taskqs.
+                         */
+                        if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+                                pri--;
+
+                        tq = taskq_create_proc(name, value, pri, 50,
                             INT_MAX, spa->spa_proc, flags);
                 }
 
                 tqs->stqs_taskq[i] = tq;
         }

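maxclsyspri is the top priority of the system scheduling class, where every zio taskq previously ran. Dropping the write issue taskq to maxclsyspri - 1 keeps it in the same class but lets the interrupt-side and other zio threads win the CPU when write issue is saturated with compression and checksum work. A toy sketch of the selection, with hypothetical stand-ins for the kernel types and values:

        typedef int pri_t;              /* stand-in for the kernel pri_t */
        #define MAXCLSYSPRI     99      /* hypothetical numeric value */

        /* write issue runs one notch below every other zio taskq */
        static pri_t
        zio_taskq_pri(int is_write_issue)
        {
                return (is_write_issue ? MAXCLSYSPRI - 1 : MAXCLSYSPRI);
        }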
@@ -5735,11 +5738,37 @@
         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
             zio->io_flags));
         return (0);
 }
 
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
 static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+        zio_t *zio = zio_root(spa, NULL, NULL, 0);
+        bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+        VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+        zio_t *zio = zio_root(spa, NULL, NULL, 0);
+        VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+            spa_free_sync_cb, zio, tx), ==, 0);
+        VERIFY0(zio_wait(zio));
+}
+
+
+static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
         char *packed = NULL;
         size_t bufsize;
         size_t nvsize = 0;

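As the comments say, spa_sync_frees() and spa_sync_deferred_frees() exist so each phase can be timed by name: a separate, non-inlined function gets its own fbt entry/return probes, so something like dtrace -n 'fbt::spa_sync_frees:entry { self->ts = timestamp; } fbt::spa_sync_frees:return /self->ts/ { @ = quantize(timestamp - self->ts); self->ts = 0; }' (a sketch, untested) can quantize time spent syncing frees without instrumenting all of spa_sync(). A generic userland illustration of the same out-lining pattern (not illumos code):

        #include <stdio.h>

        /* noinline keeps the phase visible to function-boundary tracers */
        __attribute__((noinline)) static void
        sync_frees_phase(void)
        {
                puts("syncing frees");  /* stands in for the zio work */
        }

        int
        main(void)
        {
                sync_frees_phase();     /* tracer sees entry and return */
                return (0);
        }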
@@ -6061,11 +6090,10 @@
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
         dsl_pool_t *dp = spa->spa_dsl_pool;
         objset_t *mos = spa->spa_meta_objset;
-        bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
         vdev_t *rvd = spa->spa_root_vdev;
         vdev_t *vd;
         dmu_tx_t *tx;
         int error;

@@ -6141,14 +6169,11 @@
         if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
             !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
             !txg_list_empty(&dp->dp_sync_tasks, txg) ||
             ((dsl_scan_active(dp->dp_scan) ||
             txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
-                zio_t *zio = zio_root(spa, NULL, NULL, 0);
-                VERIFY3U(bpobj_iterate(defer_bpo,
-                    spa_free_sync_cb, zio, tx), ==, 0);
-                VERIFY0(zio_wait(zio));
+                spa_sync_deferred_frees(spa, tx);
         }
 
         /*
          * Iterate to convergence.
          */

@@ -6162,17 +6187,14 @@
                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
                 spa_errlog_sync(spa, txg);
                 dsl_pool_sync(dp, txg);
 
                 if (pass < zfs_sync_pass_deferred_free) {
-                        zio_t *zio = zio_root(spa, NULL, NULL, 0);
-                        bplist_iterate(free_bpl, spa_free_sync_cb,
-                            zio, tx);
-                        VERIFY(zio_wait(zio) == 0);
+                        spa_sync_frees(spa, free_bpl, tx);
                 } else {
                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
-                            defer_bpo, tx);
+                            &spa->spa_deferred_bpobj, tx);
                 }
 
                 ddt_sync(spa, txg);
                 dsl_scan_sync(dp, tx);
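With the helpers in place, the pass policy above reads directly: while pass < zfs_sync_pass_deferred_free (a tunable defined in zio.c), frees generated during the pass are issued to disk immediately via spa_sync_frees(); in later passes they are only enqueued onto spa_deferred_bpobj, and the spa_sync_deferred_frees() call earlier in spa_sync() drains that backlog at the start of a subsequent txg's sync. Deferring the actual frees keeps late sync passes cheap so the txg can converge.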