4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
*** 125,134 ****
--- 125,135 ----
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+ #include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
*** 145,166 ****
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
- extern int zfs_write_limit_shift;
- extern uint64_t zfs_write_limit_max;
- extern kmutex_t zfs_write_limit_lock;
-
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
--- 146,169 ----
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
+ /*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+ int arc_evict_iterations = 100;
+
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
*** 172,181 ****
--- 175,189 ----
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
static int arc_min_prefetch_lifespan;
+ /*
+ * If this percent of memory is free, don't throttle.
+ */
+ int arc_lotsfree_percent = 10;
+
static int arc_dead;
/*
* The arc has filled available memory and has now warmed up.
*/
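
A note on arc_lotsfree_percent: it feeds the rewritten check in arc_memory_throttle(), further down in this diff. Condensed, that check is:

	/* from arc_memory_throttle(), later in this diff */
	if (freemem > physmem * arc_lotsfree_percent / 100)
		return (0);	/* >10% of memory free: don't throttle */

so with the default of 10, memory-pressure write throttling cannot engage while more than a tenth of physical memory is free.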
*** 467,476 ****
--- 475,485 ----
typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
struct arc_buf_hdr {
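
The new awcb_physdone hook slots between the existing two: awcb_ready runs roughly when the buffer contents are finalized and the write is about to issue, awcb_physdone runs once per physical write completed on behalf of the logical write (see the arc_write_physdone() comment later in this diff), and awcb_done runs when the whole logical write finishes. A minimal sketch of a matching callback set; the names are hypothetical and the bodies are placeholders:

	static void
	my_ready(zio_t *zio, arc_buf_t *buf, void *private)
	{
		/* contents are frozen; nothing may modify buf past here */
	}

	static void
	my_physdone(zio_t *zio, arc_buf_t *buf, void *private)
	{
		/* one physical write (e.g. one copy) has completed */
	}

	static void
	my_done(zio_t *zio, arc_buf_t *buf, void *private)
	{
		/* the logical write, all copies included, is finished */
	}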
*** 1161,1171 ****
arc_state_t *old_state = ab->b_state;
int64_t refcnt = refcount_count(&ab->b_refcnt);
uint64_t from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
! ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
--- 1170,1180 ----
arc_state_t *old_state = ab->b_state;
int64_t refcnt = refcount_count(&ab->b_refcnt);
uint64_t from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
! ASSERT3P(new_state, !=, old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
*** 1776,1785 ****
--- 1785,1796 ----
arc_buf_hdr_t *ab, *ab_prev = NULL;
list_t *list = &state->arcs_list[type];
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ arc_buf_hdr_t marker = { 0 };
+ int count = 0;
ASSERT(state == arc_mru || state == arc_mfu);
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
*** 1799,1808 ****
--- 1810,1846 ----
}
/* "lookahead" for better eviction candidate */
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ *
+ * If we are looking for a buffer to recycle, we are in
+ * the hot code path, so don't sleep.
+ */
+ if (!recycle && count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&state->arcs_mtx);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(&state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
+
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
ASSERT0(refcount_count(&ab->b_refcnt));
ASSERT(ab->b_datacnt > 0);
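
The stack-allocated marker is what makes the periodic unlock safe: once arcs_mtx is dropped, ab itself may be evicted or moved, so the loop pins its position with a dummy header instead (markers are recognized, and skipped, by b_spa == 0). The essence of the pattern, condensed from the block above:

	list_insert_after(list, ab, &marker);	/* pin our position */
	mutex_exit(&state->arcs_mtx);		/* let blocked threads in */
	kpreempt(KPREEMPT_SYNC);		/* yield the CPU */
	mutex_enter(&state->arcs_mtx);
	ab_prev = list_prev(list, &marker);	/* resume from the marker */
	list_remove(list, &marker);

The same pattern, minus the recycle special case, appears again in arc_evict_ghost() below.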
*** 1880,1909 ****
if (missed)
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
! * We have just evicted some data into the ghost state, make
! * sure we also adjust the ghost state size if necessary.
*/
- if (arc_no_grow &&
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
- arc_mru_ghost->arcs_size - arc_c;
- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
- int64_t todelete =
- MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, NULL, todelete);
- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
- arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
- }
- }
-
return (stolen);
}
/*
* Remove buffers from list until we've removed the specified number of
--- 1918,1933 ----
if (missed)
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
! * Note: we have just evicted some data into the ghost state,
! * potentially putting the ghost size over the desired size. Rather
! * than evicting from the ghost list in this hot code path, leave
! * this chore to the arc_reclaim_thread().
*/
return (stolen);
}
/*
* Remove buffers from list until we've removed the specified number of
*** 1916,1931 ****
--- 1940,1958 ----
arc_buf_hdr_t marker = { 0 };
list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ int count = 0;
ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->arcs_mtx);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+ if (ab->b_type >= ARC_BUFC_NUMTYPES)
+ panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;
/* ignore markers */
if (ab->b_spa == 0)
*** 1933,1942 ****
--- 1960,1986 ----
hash_lock = HDR_LOCK(ab);
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ */
+ if (count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&state->arcs_mtx);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(&state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += ab->b_size;
*** 1968,1980 ****
mutex_enter(hash_lock);
mutex_exit(hash_lock);
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
! } else
bufs_skipped += 1;
}
mutex_exit(&state->arcs_mtx);
if (list == &state->arcs_list[ARC_BUFC_DATA] &&
(bytes < 0 || bytes_deleted < bytes)) {
list = &state->arcs_list[ARC_BUFC_METADATA];
--- 2012,2026 ----
mutex_enter(hash_lock);
mutex_exit(hash_lock);
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
! } else {
bufs_skipped += 1;
}
+
+ }
mutex_exit(&state->arcs_mtx);
if (list == &state->arcs_list[ARC_BUFC_DATA] &&
(bytes < 0 || bytes_deleted < bytes)) {
list = &state->arcs_list[ARC_BUFC_METADATA];
*** 2823,2833 ****
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
! void *private, int priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
--- 2869,2879 ----
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
! void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
*** 3426,3436 ****
--- 3472,3494 ----
}
arc_cksum_compute(buf, B_FALSE);
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
+ /*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
static void
+ arc_write_physdone(zio_t *zio)
+ {
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+ }
+
+ static void
arc_write_done(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
*** 3506,3517 ****
}
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
! const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
! void *private, int priority, int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
--- 3564,3576 ----
}
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
! const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
! arc_done_func_t *done, void *private, zio_priority_t priority,
! int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
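
Since arc_write_physdone() tolerates a NULL awcb_physdone, callers that don't need per-physical-write notification can simply pass NULL for the new argument. A hypothetical call site against the new signature, reusing the callback sketch from the arc_write_callback_t hunk above (priority and flags are illustrative):

	zio = arc_write(pio, spa, txg, bp, buf,
	    B_TRUE /* l2arc */, B_FALSE /* l2arc_compress */, &zp,
	    my_ready, my_physdone, my_done, private,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);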
*** 3524,3545 ****
hdr->b_flags |= ARC_L2CACHE;
if (l2arc_compress)
hdr->b_flags |= ARC_L2COMPRESS;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
! arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
static int
! arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
--- 3583,3606 ----
hdr->b_flags |= ARC_L2CACHE;
if (l2arc_compress)
hdr->b_flags |= ARC_L2COMPRESS;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
! arc_write_ready, arc_write_physdone, arc_write_done, callback,
! priority, zio_flags, zb);
return (zio);
}
static int
! arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
*** 3546,3556 ****
#if defined(__i386)
available_memory =
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
! if (available_memory >= zfs_write_limit_max)
return (0);
if (txg > last_txg) {
last_txg = txg;
page_load = 0;
--- 3607,3618 ----
#if defined(__i386)
available_memory =
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
!
! if (freemem > physmem * arc_lotsfree_percent / 100)
return (0);
if (txg > last_txg) {
last_txg = txg;
page_load = 0;
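
For scale, assuming 4 KB pages on a 16 GB machine with the default arc_lotsfree_percent of 10 (both assumptions are illustrative):

	/*
	 * physmem   = 16 GB / 4 KB       = 4,194,304 pages
	 * threshold = physmem * 10 / 100 =   419,430 pages (~1.6 GB)
	 */

While freemem stays above that threshold the function returns 0 immediately; only below it does the per-txg page_load accounting run and potentially return EAGAIN.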
*** 3570,3593 ****
/* memory is low, delay before restarting */
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
return (SET_ERROR(EAGAIN));
}
page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
-
- if (inflight_data > available_memory / 4) {
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (SET_ERROR(ERESTART));
- }
#endif
return (0);
}
void
--- 3632,3641 ----
*** 3601,3619 ****
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
int error;
uint64_t anon_size;
- #ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (SET_ERROR(ERESTART));
- }
- #endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c)
return (SET_ERROR(ENOMEM));
--- 3649,3658 ----
*** 3627,3637 ****
/*
* Writes will, almost always, require additional memory allocations
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
! if (error = arc_memory_throttle(reserve, anon_size, txg))
return (error);
/*
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
--- 3666,3677 ----
/*
* Writes will, almost always, require additional memory allocations
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
! error = arc_memory_throttle(reserve, txg);
! if (error != 0)
return (error);
/*
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
*** 3776,3790 ****
TS_RUN, minclsyspri);
arc_dead = FALSE;
arc_warm = B_FALSE;
! if (zfs_write_limit_max == 0)
! zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
! else
! zfs_write_limit_shift = 0;
! mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
arc_fini(void)
{
--- 3816,3839 ----
TS_RUN, minclsyspri);
arc_dead = FALSE;
arc_warm = B_FALSE;
! /*
! * Calculate maximum amount of dirty data per pool.
! *
! * If it has been set by /etc/system, take that.
! * Otherwise, use a percentage of physical memory defined by
! * zfs_dirty_data_max_percent (default 10%) with a cap at
! * zfs_dirty_data_max_max (default 4GB).
! */
! if (zfs_dirty_data_max == 0) {
! zfs_dirty_data_max = physmem * PAGESIZE *
! zfs_dirty_data_max_percent / 100;
! zfs_dirty_data_max = MIN(zfs_dirty_data_max,
! zfs_dirty_data_max_max);
! }
}
void
arc_fini(void)
{
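
To make the new sizing concrete: on an 8 GB machine the default works out to about 0.8 GB of dirty data per pool (8 GB * 10 / 100), while on a 64 GB machine the 6.4 GB result is capped at the 4 GB zfs_dirty_data_max_max. Because the code above only computes a default when zfs_dirty_data_max is still zero, an /etc/system setting takes precedence; a hypothetical override pinning the limit at 1 GB (assuming the usual module:variable syntax):

	set zfs:zfs_dirty_data_max = 0x40000000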
*** 3821,3832 ****
mutex_destroy(&arc_mru_ghost->arcs_mtx);
mutex_destroy(&arc_mfu->arcs_mtx);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
mutex_destroy(&arc_l2c_only->arcs_mtx);
- mutex_destroy(&zfs_write_limit_lock);
-
buf_fini();
ASSERT(arc_loaned_bytes == 0);
}
--- 3870,3879 ----