ARC pressure valve implementation

*** 147,156 ****
--- 147,161 ----
  static kmutex_t		arc_reclaim_thr_lock;
  static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
  static uint8_t		arc_thread_exit;
  
+ static kmutex_t		arc_pressure_thr_lock;
+ static kcondvar_t	arc_pressure_thr_cv;
+ static uint8_t		arc_pressure_thread_exit;
+ static uint64_t		arc_pressure_threshold;
+ 
  #define	ARC_REDUCE_DNLC_PERCENT	3
  uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
  
  typedef enum arc_reclaim_strategy {
  	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
*** 293,302 ****
--- 298,308 ----
  	kstat_named_t	arcstat_c_max;
  	kstat_named_t	arcstat_size;
  	kstat_named_t	arcstat_hdr_size;
  	kstat_named_t	arcstat_data_size;
  	kstat_named_t	arcstat_other_size;
+ 	kstat_named_t	arcstat_growth_rate;
  	kstat_named_t	arcstat_l2_hits;
  	kstat_named_t	arcstat_l2_misses;
  	kstat_named_t	arcstat_l2_feeds;
  	kstat_named_t	arcstat_l2_rw_clash;
  	kstat_named_t	arcstat_l2_read_bytes;
*** 359,368 ****
--- 365,375 ----
  	{ "c_max",			KSTAT_DATA_UINT64 },
  	{ "size",			KSTAT_DATA_UINT64 },
  	{ "hdr_size",			KSTAT_DATA_UINT64 },
  	{ "data_size",			KSTAT_DATA_UINT64 },
  	{ "other_size",			KSTAT_DATA_UINT64 },
+ 	{ "growth_rate",		KSTAT_DATA_UINT64 },
  	{ "l2_hits",			KSTAT_DATA_UINT64 },
  	{ "l2_misses",			KSTAT_DATA_UINT64 },
  	{ "l2_feeds",			KSTAT_DATA_UINT64 },
  	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
  	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
*** 428,437 ****
--- 435,461 ----
  	} else {							\
  		ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);	\
  	}								\
  }
  
+ /*
+  * This macro allows us to use kstats as floating averages. Each time we
+  * update this kstat, we first factor it and the update value by `factor'
+  * to shrink the new value's contribution to the overall average. This
+  * macro assumes that integer loads and stores are atomic, but is not
+  * safe for multiple writers updating the kstat in parallel (only the
+  * last writer's update will remain).
+  */
+ #define	ARCSTAT_F_AVG(stat, value, factor) \
+ 	do { \
+ 		uint64_t x = ARCSTAT(stat); \
+ 		x = x - x / factor + (value) / factor; \
+ 		ARCSTAT(stat) = x; \
+ 		_NOTE(CONSTCOND) \
+ 	} while (0)
+ 
  kstat_t			*arc_ksp;
  static arc_state_t	*arc_anon;
  static arc_state_t	*arc_mru;
  static arc_state_t	*arc_mru_ghost;
  static arc_state_t	*arc_mfu;
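For intuition, the factored update above is an exponential moving average that weights each new sample at 1/factor. Below is a minimal userland sketch of the same arithmetic; the helper name, sample values, and output are purely illustrative and not part of the patch, and the kstat plumbing and lint annotations are omitted.

	#include <stdio.h>
	#include <stdint.h>

	/* Stand-alone equivalent of one ARCSTAT_F_AVG update. */
	static uint64_t
	f_avg(uint64_t avg, uint64_t value, uint64_t factor)
	{
		/* Drop 1/factor of the old average, add 1/factor of the sample. */
		return (avg - avg / factor + value / factor);
	}

	int
	main(void)
	{
		uint64_t samples[] = { 800, 800, 800, 100, 100, 100 };
		uint64_t avg = 0;

		for (int i = 0; i < 6; i++) {
			avg = f_avg(avg, samples[i], 4);
			printf("sample %llu -> average %llu\n",
			    (unsigned long long)samples[i],
			    (unsigned long long)avg);
		}
		return (0);
	}

With a factor of 4 the average tracks a step change within a handful of updates, which is what makes it usable later as a cheap, smoothed growth-rate estimate for the pressure thread.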
*** 459,468 ****
--- 483,493 ----
  	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
  
  static int		arc_no_grow;	/* Don't try to grow cache size */
  static uint64_t		arc_tempreserve;
  static uint64_t		arc_loaned_bytes;
+ static uint64_t		arc_bytes_allocd = 0;
  
  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
  
  typedef struct arc_callback arc_callback_t;
*** 1272,1281 ****
--- 1297,1307 ----
  		break;
  	}
  
  	ARCSTAT_INCR(arcstat_meta_used, space);
  	atomic_add_64(&arc_size, space);
+ 	atomic_add_64(&arc_bytes_allocd, space);
  }
  
  void
  arc_space_return(uint64_t space, arc_space_type_t type)
  {
*** 1308,1317 ****
--- 1334,1344 ----
  arc_data_buf_alloc(uint64_t size)
  {
  	if (arc_evict_needed(ARC_BUFC_DATA))
  		cv_signal(&arc_reclaim_thr_cv);
  	atomic_add_64(&arc_size, size);
+ 	atomic_add_64(&arc_bytes_allocd, size);
  	return (zio_data_buf_alloc(size));
  }
  
  void
  arc_data_buf_free(void *buf, uint64_t size)
*** 2109,2119 ****
--- 2136,2181 ----
  		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
  		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
  	}
  }
  
+ #define	ACCURACY	1000
+ static void
+ arc_reclaim_bytes(uint64_t to_evict)
+ {
+ 	uint64_t to_evict_data_mru, to_evict_data_mfu;
+ 	uint64_t to_evict_meta_mru, to_evict_meta_mfu;
+ 
+ 	to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 	to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 	to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 	to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 
+ 	if (to_evict_meta_mru > 0)
+ 		(void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
+ 		    ARC_BUFC_METADATA);
+ 	if (to_evict_data_mru > 0)
+ 		(void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
+ 		    ARC_BUFC_DATA);
+ 	if (to_evict_meta_mfu > 0)
+ 		(void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
+ 		    ARC_BUFC_METADATA);
+ 	if (to_evict_data_mfu > 0)
+ 		(void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
+ 		    ARC_BUFC_DATA);
+ }
+ 
+ static void
  arc_do_user_evicts(void)
  {
  	mutex_enter(&arc_eviction_mtx);
  	while (arc_eviction_list != NULL) {
  		arc_buf_t *buf = arc_eviction_list;
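The split in arc_reclaim_bytes() is plain fixed-point arithmetic: each bucket's share of the combined MRU+MFU footprint, scaled by ACCURACY to avoid floating point, multiplied by the eviction target. A userland sketch of just that proportioning (the share_of helper and the sizes are hypothetical, not from the patch):

	#include <stdio.h>
	#include <stdint.h>

	#define	ACCURACY	1000	/* same fixed-point scale as the patch */

	/* Portion of to_evict charged to one bucket, proportional to its size. */
	static uint64_t
	share_of(uint64_t bucket, uint64_t total, uint64_t to_evict)
	{
		return ((((bucket * ACCURACY) / total) * to_evict) / ACCURACY);
	}

	int
	main(void)
	{
		/* Hypothetical evictable sizes, in bytes. */
		uint64_t mru_data = 600ULL << 20, mru_meta = 100ULL << 20;
		uint64_t mfu_data = 250ULL << 20, mfu_meta = 50ULL << 20;
		uint64_t total = mru_data + mru_meta + mfu_data + mfu_meta;
		uint64_t to_evict = 64ULL << 20;

		printf("evict %llu MiB of mru data\n",
		    (unsigned long long)(share_of(mru_data, total, to_evict) >> 20));
		printf("evict %llu MiB of mru meta\n",
		    (unsigned long long)(share_of(mru_meta, total, to_evict) >> 20));
		printf("evict %llu MiB of mfu data\n",
		    (unsigned long long)(share_of(mfu_data, total, to_evict) >> 20));
		printf("evict %llu MiB of mfu meta\n",
		    (unsigned long long)(share_of(mfu_meta, total, to_evict) >> 20));
		return (0);
	}

Note that the kernel function divides by the total arcs_size of MRU+MFU while the numerators are the evictable arcs_lsize values, so the four portions can sum to less than to_evict whenever part of the cache is not evictable.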
*** 2247,2256 ****
--- 2309,2339 ----
  
  	if (arc_size > arc_c)
  		arc_adjust();
  }
  
+ #define	PHYSMEM_PRESSURE_FRACTION	100
+ 
+ static boolean_t
+ arc_mem_pressure(void)
+ {
+ #ifdef _KERNEL
+ 	uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
+ 
+ 	if ((freemem < lotsfree + needfree + extra) ||
+ 	    (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
+ 	    (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
+ 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
+ 	    physmem / PHYSMEM_PRESSURE_FRACTION))
+ 		return (B_TRUE);
+ 
+ 	return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
+ #else
+ 	return (0);
+ #endif
+ }
+ 
  /*
   * Determine if the system is under memory pressure and is asking
   * to reclaim memory. A return value of 1 indicates that the system
   * is under memory pressure and that the arc should adjust accordingly.
   */
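With PHYSMEM_PRESSURE_FRACTION set to 100, the checks above pad the usual freemem/availrmem thresholds by roughly one percent of physical memory, and the final fallback reports pressure once free memory alone drops below that one percent. A userland mock-up of the fallback comparison only (the page counts are hypothetical):

	#include <stdio.h>
	#include <stdint.h>

	#define	PHYSMEM_PRESSURE_FRACTION	100

	int
	main(void)
	{
		uint64_t physmem = (8ULL << 30) / 4096;		/* 8 GiB of 4K pages */
		uint64_t samples[] = { 500000, 50000, 20000 };	/* freemem, in pages */

		for (int i = 0; i < 3; i++) {
			/* Pressure once freemem falls under physmem/100 pages. */
			int pressure = samples[i] <
			    physmem / PHYSMEM_PRESSURE_FRACTION;
			printf("freemem %llu pages -> pressure %d\n",
			    (unsigned long long)samples[i], pressure);
		}
		return (0);
	}

Because of the extra one-percent cushion on top of desfree, this heuristic appears intended to trip slightly before the corresponding checks in arc_reclaim_needed() would.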
*** 2389,2399 ****
--- 2472,2559 ----
  	 */
  	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
  		vmem_qcache_reap(zio_arena);
  }
  
+ #define	RECLAIMS_PER_SEC	20
+ #define	STAT_UPDATES_PER_SEC	5
+ 
+ /*
+  * During heavy use, the ARC naturally wants to oscillate its arc_c around
+  * a maximum memory pressure point which corresponds to the arc_reclaim_needed
+  * function evaluating to 1. This results in the arc_size slowly growing
+  * towards this reclaim_needed threshold and exceeding it periodically. Once
+  * this happens, both arc_c and arc_size are down-adjusted by the
+  * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
+  * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
+  * cache which contains very large numbers of objects is extremely expensive
+  * from an xcall perspective (several seconds of heavy CPU use):
+  *
+  * (mem)
+  * ^          arc_reclaim_thread reacts
+  * |            |                   |
+  * |            V                   V
+  * |
+  * |            +                   +
+  * |           /|                  /|
+  * | ......./..|................/..|.............. arc_reclaim_needed threshold
+  * |      /     \_____________/     \___________/(etc)
+  * |    /           kmem reap           kmem reap
+  * |  /
+  * |/
+  * +----------------------------------------------------------------->
+  *                                                              (time)
+  *
+  * To help address this stairstep pattern, the arc_pressure_thread periodically
+  * gauges the distance between the current arc_size and the arc_reclaim_needed
+  * threshold by way of an estimation algorithm (in arc_mem_pressure).
+  */
  static void
+ arc_pressure_thread(void)
+ {
+ 	clock_t last_update = ddi_get_lbolt();
+ 	callb_cpr_t cpr;
+ 
+ 	CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
+ 
+ 	mutex_enter(&arc_pressure_thr_lock);
+ 	while (arc_pressure_thread_exit == 0) {
+ 		clock_t now;
+ 
+ 		now = ddi_get_lbolt();
+ 		if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
+ 			uint64_t new_rate;
+ 
+ 			new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
+ 			    hz) / (now - last_update);
+ 
+ 			if (ARCSTAT(arcstat_growth_rate) < new_rate)
+ 				ARCSTAT(arcstat_growth_rate) = new_rate;
+ 			else
+ 				ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
+ 			last_update = now;
+ 		}
+ 
+ 		arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
+ 		if (arc_size > arc_pressure_threshold) {
+ 			arc_reclaim_bytes(arc_size - arc_pressure_threshold);
+ 		}
+ 
+ 		CALLB_CPR_SAFE_BEGIN(&cpr);
+ 		(void) cv_timedwait(&arc_pressure_thr_cv,
+ 		    &arc_pressure_thr_lock,
+ 		    ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
+ 		CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
+ 	}
+ 
+ 	arc_pressure_thread_exit = 0;
+ 	cv_broadcast(&arc_pressure_thr_cv);
+ 	CALLB_CPR_EXIT(&cpr);		/* drops arc_pressure_thr_lock */
+ 	thread_exit();
+ }
+ 
+ static void
  arc_reclaim_thread(void)
  {
  	clock_t growtime = 0;
  	arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
  	callb_cpr_t cpr;
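The rate bookkeeping in arc_pressure_thread() is small enough to model in isolation: bytes allocated since the last sample are rescaled to bytes per second, tracked as a peak-following decayed average, and subtracted from arc_c to place the pre-emptive eviction threshold. A userland sketch of that loop; HZ, the tick deltas, and the allocation samples are hypothetical stand-ins for the kernel's hz, ddi_get_lbolt() deltas, and arc_bytes_allocd:

	#include <stdio.h>
	#include <stdint.h>

	#define	HZ	100	/* stand-in for the kernel's hz */

	int
	main(void)
	{
		/* Bytes allocated and ticks elapsed in each sampling interval. */
		uint64_t allocd[] = { 40ULL << 20, 60ULL << 20, 5ULL << 20, 1ULL << 20 };
		uint64_t ticks[] = { 20, 20, 20, 20 };
		uint64_t growth_rate = 0;
		uint64_t arc_c = 4ULL << 30;	/* 4 GiB target size */

		for (int i = 0; i < 4; i++) {
			/* Rescale the interval's allocations to bytes per second. */
			uint64_t new_rate = (allocd[i] * HZ) / ticks[i];

			/* Follow peaks immediately; decay with a 1/4-weight average. */
			if (growth_rate < new_rate)
				growth_rate = new_rate;
			else
				growth_rate = growth_rate - growth_rate / 4 +
				    new_rate / 4;

			printf("rate %llu MiB/s -> pressure threshold %llu MiB\n",
			    (unsigned long long)(growth_rate >> 20),
			    (unsigned long long)((arc_c - growth_rate) >> 20));
		}
		return (0);
	}

Keeping the threshold roughly one second's worth of recent growth below arc_c lets arc_reclaim_bytes() trim the cache in small slices, twenty times a second, before arc_size ever crosses the arc_reclaim_needed line.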
*** 2498,2508 ****
  	/*
  	 * If we're within (2 * maxblocksize) bytes of the target
  	 * cache size, increment the target cache size
  	 */
! 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
  		atomic_add_64(&arc_c, (int64_t)bytes);
  		if (arc_c > arc_c_max)
  			arc_c = arc_c_max;
  		else if (state == arc_anon)
  			atomic_add_64(&arc_p, (int64_t)bytes);
--- 2658,2669 ----
  	/*
  	 * If we're within (2 * maxblocksize) bytes of the target
  	 * cache size, increment the target cache size
  	 */
! 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
! 	    (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
  		atomic_add_64(&arc_c, (int64_t)bytes);
  		if (arc_c > arc_c_max)
  			arc_c = arc_c_max;
  		else if (state == arc_anon)
  			atomic_add_64(&arc_p, (int64_t)bytes);
*** 2571,2580 ****
--- 2732,2742 ----
  		} else {
  			ASSERT(type == ARC_BUFC_DATA);
  			buf->b_data = zio_data_buf_alloc(size);
  			ARCSTAT_INCR(arcstat_data_size, size);
  			atomic_add_64(&arc_size, size);
+ 			atomic_add_64(&arc_bytes_allocd, size);
  		}
  		goto out;
  	}
  
  	/*
*** 2603,2612 ****
--- 2765,2775 ----
  		} else {
  			ASSERT(type == ARC_BUFC_DATA);
  			buf->b_data = zio_data_buf_alloc(size);
  			ARCSTAT_INCR(arcstat_data_size, size);
  			atomic_add_64(&arc_size, size);
+ 			atomic_add_64(&arc_bytes_allocd, size);
  		}
  		ARCSTAT_BUMP(arcstat_recycle_miss);
  	}
  	ASSERT(buf->b_data != NULL);
  out:
*** 3769,3778 ****
--- 3932,3944 ----
  arc_init(void)
  {
  	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
  	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
  
+ 	mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ 	cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
+ 
  	/* Convert seconds to clock ticks */
  	arc_min_prefetch_lifespan = 1 * hz;
  
  	/* Start out with 1/8 of all memory */
  	arc_c = physmem * PAGESIZE / 8;
*** 3784,3793 ****
--- 3950,3961 ----
  	 * need to limit the cache to 1/8 of VM size.
  	 */
  	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
  #endif
  
+ 	/* initial sensible value */
+ 	arc_pressure_threshold = arc_c;
  	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
  	arc_c_min = MAX(arc_c / 4, 64<<20);
  	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
  	if (arc_c * 8 >= 1<<30)
  		arc_c_max = (arc_c * 8) - (1<<30);
*** 3885,3894 ****
--- 4053,4064 ----
  		kstat_install(arc_ksp);
  	}
  
  	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
  	    TS_RUN, minclsyspri);
+ 	(void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
+ 	    TS_RUN, minclsyspri);
  
  	arc_dead = FALSE;
  	arc_warm = B_FALSE;
  
  	/*
*** 3914,3923 ****
--- 4084,4099 ----
  	arc_thread_exit = 1;
  	while (arc_thread_exit != 0)
  		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
  	mutex_exit(&arc_reclaim_thr_lock);
  
+ 	mutex_enter(&arc_pressure_thr_lock);
+ 	arc_pressure_thread_exit = 1;
+ 	while (arc_pressure_thread_exit != 0)
+ 		cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
+ 	mutex_exit(&arc_pressure_thr_lock);
+ 
  	arc_flush(NULL);
  
  	arc_dead = TRUE;
  
  	if (arc_ksp != NULL) {
*** 3926,3935 ****
--- 4102,4113 ----
  	}
  
  	mutex_destroy(&arc_eviction_mtx);
  	mutex_destroy(&arc_reclaim_thr_lock);
  	cv_destroy(&arc_reclaim_thr_cv);
+ 	mutex_destroy(&arc_pressure_thr_lock);
+ 	cv_destroy(&arc_pressure_thr_cv);
  
  	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
  	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
  	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
  	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);