4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

@@ -125,10 +125,11 @@
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <vm/anon.h>
 #include <sys/fs/swapnode.h>
 #include <sys/dnlc.h>

@@ -145,22 +146,24 @@
 
 static kmutex_t         arc_reclaim_thr_lock;
 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 static uint8_t          arc_thread_exit;
 
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
 #define ARC_REDUCE_DNLC_PERCENT 3
 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 
 typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 } arc_reclaim_strategy_t;
 
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
 /* number of seconds before growing cache again */
 static int              arc_grow_retry = 60;
 
 /* shift of arc_c for calculating both min and max arc_p */
 static int              arc_p_min_shift = 4;

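arc_evict_iterations bounds how many list entries the eviction loops in arc_evict_*() examine before yielding arcs_mtx (see the hunks further down). Since it is an ordinary module global, it can in principle be tuned without a rebuild; a hypothetical /etc/system entry, assuming the stock illumos zfs module, would look like:

    * illustration only; 100 is the shipped default
    set zfs:arc_evict_iterations = 100
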
@@ -172,10 +175,15 @@
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
 static int              arc_min_prefetch_lifespan;
 
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
 static int arc_dead;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */

@@ -467,10 +475,11 @@
 typedef struct arc_write_callback arc_write_callback_t;
 
 struct arc_write_callback {
         void            *awcb_private;
         arc_done_func_t *awcb_ready;
+        arc_done_func_t *awcb_physdone;
         arc_done_func_t *awcb_done;
         arc_buf_t       *awcb_buf;
 };
 
 struct arc_buf_hdr {

@@ -1161,11 +1170,11 @@
         arc_state_t *old_state = ab->b_state;
         int64_t refcnt = refcount_count(&ab->b_refcnt);
         uint64_t from_delta, to_delta;
 
         ASSERT(MUTEX_HELD(hash_lock));
-        ASSERT(new_state != old_state);
+        ASSERT3P(new_state, !=, old_state);
         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
         from_delta = to_delta = ab->b_datacnt * ab->b_size;

@@ -1776,10 +1785,12 @@
         arc_buf_hdr_t *ab, *ab_prev = NULL;
         list_t *list = &state->arcs_list[type];
         kmutex_t *hash_lock;
         boolean_t have_lock;
         void *stolen = NULL;
+        arc_buf_hdr_t marker = { 0 };
+        int count = 0;
 
         ASSERT(state == arc_mru || state == arc_mfu);
 
         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 

@@ -1799,10 +1810,37 @@
                 }
                 /* "lookahead" for better eviction candidate */
                 if (recycle && ab->b_size != bytes &&
                     ab_prev && ab_prev->b_size == bytes)
                         continue;
+
+                /* ignore markers */
+                if (ab->b_spa == 0)
+                        continue;
+
+                /*
+                 * It may take a long time to evict all the bufs requested.
+                 * To avoid blocking all arc activity, periodically drop
+                 * the arcs_mtx and give other threads a chance to run
+                 * before reacquiring the lock.
+                 *
+                 * If we are looking for a buffer to recycle, we are in
+                 * the hot code path, so don't sleep.
+                 */
+                if (!recycle && count++ > arc_evict_iterations) {
+                        list_insert_after(list, ab, &marker);
+                        mutex_exit(&evicted_state->arcs_mtx);
+                        mutex_exit(&state->arcs_mtx);
+                        kpreempt(KPREEMPT_SYNC);
+                        mutex_enter(&state->arcs_mtx);
+                        mutex_enter(&evicted_state->arcs_mtx);
+                        ab_prev = list_prev(list, &marker);
+                        list_remove(list, &marker);
+                        count = 0;
+                        continue;
+                }
+
                 hash_lock = HDR_LOCK(ab);
                 have_lock = MUTEX_HELD(hash_lock);
                 if (have_lock || mutex_tryenter(hash_lock)) {
                         ASSERT0(refcount_count(&ab->b_refcnt));
                         ASSERT(ab->b_datacnt > 0);

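This loop and the one in arc_evict_ghost() further down share the same yield-and-resume idiom: once the iteration budget is spent, a zeroed arc_buf_hdr_t is linked in as a placeholder, arcs_mtx is dropped so other ARC users can make progress, and the walk resumes from the placeholder after the lock is reacquired (headers with b_spa == 0 are recognized and skipped as markers). A condensed sketch of the idiom, with illustrative names, a single lock, and the per-buffer work elided:

    static void
    scan_with_yield(list_t *list, kmutex_t *lock, int max_iters)
    {
            arc_buf_hdr_t marker = { 0 };   /* b_spa == 0 identifies markers */
            arc_buf_hdr_t *ab, *ab_prev;
            int count = 0;

            mutex_enter(lock);
            for (ab = list_tail(list); ab != NULL; ab = ab_prev) {
                    ab_prev = list_prev(list, ab);
                    if (ab->b_spa == 0)             /* another thread's marker */
                            continue;
                    if (count++ > max_iters) {
                            /* remember our place, then let others run */
                            list_insert_after(list, ab, &marker);
                            mutex_exit(lock);
                            kpreempt(KPREEMPT_SYNC);
                            mutex_enter(lock);
                            ab_prev = list_prev(list, &marker);
                            list_remove(list, &marker);
                            count = 0;
                            continue;
                    }
                    /* ... examine/evict ab here ... */
            }
            mutex_exit(lock);
    }
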
@@ -1880,30 +1918,16 @@
 
         if (missed)
                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
 
         /*
-         * We have just evicted some data into the ghost state, make
-         * sure we also adjust the ghost state size if necessary.
+         * Note: we have just evicted some data into the ghost state,
+         * potentially putting the ghost size over the desired size.  Rather
+         * than evicting from the ghost list in this hot code path, leave
+         * this chore to the arc_reclaim_thread().
          */
-        if (arc_no_grow &&
-            arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
-                int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
-                    arc_mru_ghost->arcs_size - arc_c;
 
-                if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-                        int64_t todelete =
-                            MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-                        arc_evict_ghost(arc_mru_ghost, NULL, todelete);
-                } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
-                        int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
-                            arc_mru_ghost->arcs_size +
-                            arc_mfu_ghost->arcs_size - arc_c);
-                        arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
-                }
-        }
-
         return (stolen);
 }
 
 /*
  * Remove buffers from list until we've removed the specified number of

@@ -1916,16 +1940,19 @@
         arc_buf_hdr_t marker = { 0 };
         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
         kmutex_t *hash_lock;
         uint64_t bytes_deleted = 0;
         uint64_t bufs_skipped = 0;
+        int count = 0;
 
         ASSERT(GHOST_STATE(state));
 top:
         mutex_enter(&state->arcs_mtx);
         for (ab = list_tail(list); ab; ab = ab_prev) {
                 ab_prev = list_prev(list, ab);
+                if (ab->b_type > ARC_BUFC_NUMTYPES)
+                        panic("invalid ab=%p", (void *)ab);
                 if (spa && ab->b_spa != spa)
                         continue;
 
                 /* ignore markers */
                 if (ab->b_spa == 0)

@@ -1933,10 +1960,27 @@
 
                 hash_lock = HDR_LOCK(ab);
                 /* caller may be trying to modify this buffer, skip it */
                 if (MUTEX_HELD(hash_lock))
                         continue;
+
+                /*
+                 * It may take a long time to evict all the bufs requested.
+                 * To avoid blocking all arc activity, periodically drop
+                 * the arcs_mtx and give other threads a chance to run
+                 * before reacquiring the lock.
+                 */
+                if (count++ > arc_evict_iterations) {
+                        list_insert_after(list, ab, &marker);
+                        mutex_exit(&state->arcs_mtx);
+                        kpreempt(KPREEMPT_SYNC);
+                        mutex_enter(&state->arcs_mtx);
+                        ab_prev = list_prev(list, &marker);
+                        list_remove(list, &marker);
+                        count = 0;
+                        continue;
+                }
                 if (mutex_tryenter(hash_lock)) {
                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
                         ASSERT(ab->b_buf == NULL);
                         ARCSTAT_BUMP(arcstat_deleted);
                         bytes_deleted += ab->b_size;

@@ -1968,13 +2012,15 @@
                         mutex_enter(hash_lock);
                         mutex_exit(hash_lock);
                         mutex_enter(&state->arcs_mtx);
                         ab_prev = list_prev(list, &marker);
                         list_remove(list, &marker);
-                } else
+                } else {
                         bufs_skipped += 1;
         }
+
+        }
         mutex_exit(&state->arcs_mtx);
 
         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
             (bytes < 0 || bytes_deleted < bytes)) {
                 list = &state->arcs_list[ARC_BUFC_METADATA];

@@ -2823,11 +2869,11 @@
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, uint32_t *arc_flags,
+    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
     const zbookmark_t *zb)
 {
         arc_buf_hdr_t *hdr;
         arc_buf_t *buf = NULL;
         kmutex_t *hash_lock;

@@ -3426,11 +3472,23 @@
         }
         arc_cksum_compute(buf, B_FALSE);
         hdr->b_flags |= ARC_IO_IN_PROGRESS;
 }
 
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write.  See the comment in dbuf_write_physdone() for details.
+ */
 static void
+arc_write_physdone(zio_t *zio)
+{
+        arc_write_callback_t *cb = zio->io_private;
+        if (cb->awcb_physdone != NULL)
+                cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
+static void
 arc_write_done(zio_t *zio)
 {
         arc_write_callback_t *callback = zio->io_private;
         arc_buf_t *buf = callback->awcb_buf;
         arc_buf_hdr_t *hdr = buf->b_hdr;

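The new awcb_physdone slot gives arc_write() callers a per-physical-write notification alongside the existing ready/done callbacks; arc_write_physdone() simply forwards to it when it is non-NULL. A hypothetical caller (the real consumer in this change is dbuf_write_physdone(), per the comment above) would register it roughly like so, assuming the ZIO_PRIORITY_ASYNC_WRITE priority introduced by this work:

    /* arc_done_func_t: called once per physical write issued for the logical write */
    static void
    my_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
    {
            /* e.g. account another chunk of dirty data as written out */
    }

    zio = arc_write(pio, spa, txg, bp, buf, l2arc, l2arc_compress, &zp,
        my_ready, my_physdone, my_done, cb_arg, ZIO_PRIORITY_ASYNC_WRITE,
        zio_flags, &zb);
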
@@ -3506,12 +3564,13 @@
 }
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, const zbookmark_t *zb)
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_t *zb)
 {
         arc_buf_hdr_t *hdr = buf->b_hdr;
         arc_write_callback_t *callback;
         zio_t *zio;
 

@@ -3524,22 +3583,24 @@
                 hdr->b_flags |= ARC_L2CACHE;
         if (l2arc_compress)
                 hdr->b_flags |= ARC_L2COMPRESS;
         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
         callback->awcb_ready = ready;
+        callback->awcb_physdone = physdone;
         callback->awcb_done = done;
         callback->awcb_private = private;
         callback->awcb_buf = buf;
 
         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
-            arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+            arc_write_ready, arc_write_physdone, arc_write_done, callback,
+            priority, zio_flags, zb);
 
         return (zio);
 }
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
         uint64_t available_memory = ptob(freemem);
         static uint64_t page_load = 0;
         static uint64_t last_txg = 0;

@@ -3546,11 +3607,12 @@
 
 #if defined(__i386)
         available_memory =
             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
 #endif
-        if (available_memory >= zfs_write_limit_max)
+
+        if (freemem > physmem * arc_lotsfree_percent / 100)
                 return (0);
 
         if (txg > last_txg) {
                 last_txg = txg;
                 page_load = 0;

@@ -3570,24 +3632,10 @@
                 /* memory is low, delay before restarting */
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                 return (SET_ERROR(EAGAIN));
         }
         page_load = 0;
-
-        if (arc_size > arc_c_min) {
-                uint64_t evictable_memory =
-                    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-                    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-                    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-                    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-                available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-        }
-
-        if (inflight_data > available_memory / 4) {
-                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-                return (SET_ERROR(ERESTART));
-        }
 #endif
         return (0);
 }
 
 void

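The old early-out compared available memory against zfs_write_limit_max; the new one depends only on free physical memory. As a worked example with the default arc_lotsfree_percent of 10:

    throttle threshold = physmem * arc_lotsfree_percent / 100
    e.g. physmem = 32 GiB, arc_lotsfree_percent = 10  =>  threshold = 3.2 GiB

so on a 32 GiB machine arc_memory_throttle() returns immediately whenever more than about 3.2 GiB is free, and only below that does the per-txg page_load accounting above come into play.
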
@@ -3601,19 +3649,10 @@
 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
         int error;
         uint64_t anon_size;
 
-#ifdef ZFS_DEBUG
-        /*
-         * Once in a while, fail for no reason.  Everything should cope.
-         */
-        if (spa_get_random(10000) == 0) {
-                dprintf("forcing random failure\n");
-                return (SET_ERROR(ERESTART));
-        }
-#endif
         if (reserve > arc_c/4 && !arc_no_grow)
                 arc_c = MIN(arc_c_max, reserve * 4);
         if (reserve > arc_c)
                 return (SET_ERROR(ENOMEM));
 

@@ -3627,11 +3666,12 @@
         /*
          * Writes will, almost always, require additional memory allocations
          * in order to compress/encrypt/etc the data.  We therefore need to
          * make sure that there is sufficient available memory for this.
          */
-        if (error = arc_memory_throttle(reserve, anon_size, txg))
+        error = arc_memory_throttle(reserve, txg);
+        if (error != 0)
                 return (error);
 
         /*
          * Throttle writes when the amount of dirty data in the cache
          * gets too large.  We try to keep the cache less than half full

@@ -3776,15 +3816,24 @@
             TS_RUN, minclsyspri);
 
         arc_dead = FALSE;
         arc_warm = B_FALSE;
 
-        if (zfs_write_limit_max == 0)
-                zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-        else
-                zfs_write_limit_shift = 0;
-        mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+        /*
+         * Calculate maximum amount of dirty data per pool.
+         *
+         * If it has been set by /etc/system, take that.
+         * Otherwise, use a percentage of physical memory defined by
+         * zfs_dirty_data_max_percent (default 10%) with a cap at
+         * zfs_dirty_data_max_max (default 4GB).
+         */
+        if (zfs_dirty_data_max == 0) {
+                zfs_dirty_data_max = physmem * PAGESIZE *
+                    zfs_dirty_data_max_percent / 100;
+                zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+                    zfs_dirty_data_max_max);
+        }
 }
 
 void
 arc_fini(void)
 {

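With the defaults named in the comment (zfs_dirty_data_max_percent = 10, zfs_dirty_data_max_max = 4 GB), the auto-sizing works out to, for example:

    zfs_dirty_data_max = MIN(physmem * PAGESIZE * 10 / 100, 4 GB)
      16 GiB of RAM  ->  1.6 GiB of dirty data per pool
      64 GiB of RAM  ->  6.4 GiB, capped at 4 GiB

An explicit /etc/system setting takes precedence over both (hypothetical value shown):

    set zfs:zfs_dirty_data_max = 0x40000000
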
@@ -3821,12 +3870,10 @@
         mutex_destroy(&arc_mru_ghost->arcs_mtx);
         mutex_destroy(&arc_mfu->arcs_mtx);
         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
         mutex_destroy(&arc_l2c_only->arcs_mtx);
 
-        mutex_destroy(&zfs_write_limit_lock);
-
         buf_fini();
 
         ASSERT(arc_loaned_bytes == 0);
 }