ARC pressure valve implementation
@@ -147,10 +147,15 @@
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
+static kmutex_t arc_pressure_thr_lock;
+static kcondvar_t arc_pressure_thr_cv;
+static uint8_t arc_pressure_thread_exit;
+static uint64_t arc_pressure_threshold;
+
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
@@ -293,10 +298,11 @@
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
kstat_named_t arcstat_other_size;
+ kstat_named_t arcstat_growth_rate;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_rw_clash;
kstat_named_t arcstat_l2_read_bytes;
@@ -359,10 +365,11 @@
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
+ { "growth_rate", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
@@ -428,10 +435,27 @@
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by `factor'
+ * to shrink the new value's contribution to the overall average. This
+ * macro assumes that integer loads and stores are atomic, but is not
+ * safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG(stat, value, factor) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / (factor) + (value) / (factor); \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
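
The ARCSTAT_F_AVG macro above is an exponential moving average with weight 1/factor, done entirely in integer math. A minimal userland sketch of the same update rule (the function and variable names here are illustrative, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for the ARCSTAT_F_AVG update rule. */
    static uint64_t
    f_avg(uint64_t avg, uint64_t value, uint64_t factor)
    {
        /* The old average loses 1/factor of its weight to the new value. */
        return (avg - avg / factor + value / factor);
    }

    int
    main(void)
    {
        uint64_t avg = 0;

        /* A constant input of 1000 converges on 1000 within ~25 steps. */
        for (int i = 0; i < 25; i++)
            avg = f_avg(avg, 1000, 4);
        printf("avg = %llu\n", (unsigned long long)avg);
        return (0);
    }

With factor = 4 each new sample contributes 25%, so a transient burst decays with a half-life of a little over two updates; a larger factor smooths harder but reacts more slowly.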
@@ -459,10 +483,11 @@
((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
+static uint64_t arc_bytes_allocd = 0;
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
@@ -1272,10 +1297,11 @@
break;
}
ARCSTAT_INCR(arcstat_meta_used, space);
atomic_add_64(&arc_size, space);
+ atomic_add_64(&arc_bytes_allocd, space);
}
void
arc_space_return(uint64_t space, arc_space_type_t type)
{
@@ -1308,10 +1334,11 @@
arc_data_buf_alloc(uint64_t size)
{
if (arc_evict_needed(ARC_BUFC_DATA))
cv_signal(&arc_reclaim_thr_cv);
atomic_add_64(&arc_size, size);
+ atomic_add_64(&arc_bytes_allocd, size);
return (zio_data_buf_alloc(size));
}
void
arc_data_buf_free(void *buf, uint64_t size)
@@ -2109,11 +2136,46 @@
delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
arc_evict_ghost(arc_mfu_ghost, NULL, delta);
}
}
+#define ACCURACY 1000
+
static void
+arc_reclaim_bytes(uint64_t to_evict)
+{
+ uint64_t to_evict_data_mru, to_evict_data_mfu;
+ uint64_t to_evict_meta_mru, to_evict_meta_mfu;
+ uint64_t total = arc_mru->arcs_size + arc_mfu->arcs_size;
+
+ /* Nothing on the MRU or MFU lists to evict from. */
+ if (total == 0)
+ return;
+
+ /*
+ * Split the eviction target across the four buckets in proportion
+ * to their share of the cache, using ACCURACY as a fixed-point
+ * scale to keep the ratios in integer math.
+ */
+ to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+ to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+ to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+ to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+
+ if (to_evict_meta_mru > 0)
+ (void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
+ ARC_BUFC_METADATA);
+ if (to_evict_data_mru > 0)
+ (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
+ ARC_BUFC_DATA);
+ if (to_evict_meta_mfu > 0)
+ (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
+ ARC_BUFC_METADATA);
+ if (to_evict_data_mfu > 0)
+ (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
+ ARC_BUFC_DATA);
+}
+
+static void
arc_do_user_evicts(void)
{
mutex_enter(&arc_eviction_mtx);
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
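
The ACCURACY factor in arc_reclaim_bytes() is plain fixed-point scaling, computing to_evict * (bucket / total) without floating point. A standalone sketch with hypothetical sizes (none of these numbers come from the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define ACCURACY 1000

    int
    main(void)
    {
        uint64_t mru_meta = 100ULL << 20;   /* hypothetical: 100 MiB bucket */
        uint64_t total = 1000ULL << 20;     /* hypothetical: 1 GiB MRU+MFU */
        uint64_t to_evict = 50ULL << 20;    /* hypothetical: 50 MiB target */

        /* Scale by ACCURACY before dividing so the ratio survives. */
        uint64_t evict = ((mru_meta * ACCURACY / total) * to_evict) /
            ACCURACY;

        printf("evict %llu MiB from MRU metadata\n",
            (unsigned long long)(evict >> 20)); /* prints 5 */
        return (0);
    }

One consequence of ACCURACY = 1000 is per-mille resolution: a bucket holding less than 0.1% of the cache rounds down to an eviction share of zero.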
@@ -2247,10 +2309,31 @@
if (arc_size > arc_c)
arc_adjust();
}
+#define PHYSMEM_PRESSURE_FRACTION 100
+
+static boolean_t
+arc_mem_pressure(void)
+{
+#ifdef _KERNEL
+ uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
+
+ /*
+ * Apply the same checks as arc_reclaim_needed(), but with an
+ * `extra' headroom of 1% of physmem so that pressure is signaled
+ * before the hard reclaim thresholds are actually crossed.
+ */
+ if ((freemem < lotsfree + needfree + extra) ||
+ (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
+ (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
+ physmem / PHYSMEM_PRESSURE_FRACTION))
+ return (B_TRUE);
+
+ return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
+#else
+ return (B_FALSE);
+#endif
+}
+
/*
* Determine if the system is under memory pressure and is asking
* to reclaim memory. A return value of 1 indicates that the system
* is under memory pressure and that the arc should adjust accordingly.
*/
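
The numbers below give a feel for how much headroom PHYSMEM_PRESSURE_FRACTION buys. The page counts are hypothetical stand-ins for the kernel globals (freemem, lotsfree, desfree, needfree) read by arc_mem_pressure():

    #include <stdio.h>
    #include <stdint.h>

    #define PHYSMEM_PRESSURE_FRACTION 100

    int
    main(void)
    {
        /* Hypothetical 32 GiB machine with 4 KiB pages. */
        uint64_t physmem = 8ULL << 20;      /* 8M pages */
        uint64_t desfree = 16384, lotsfree = 131072, needfree = 0;
        uint64_t freemem = 200000;

        /* 1% of physmem on top of desfree trips pressure early. */
        uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;

        printf("extra = %llu pages, pressure = %s\n",
            (unsigned long long)extra,
            freemem < lotsfree + needfree + extra ? "yes" : "no");
        return (0);
    }

Here extra comes to 100270 pages (roughly 392 MiB), so pressure is reported while freemem is still well above lotsfree; that early warning is what lets the valve evict before arc_reclaim_needed() fires.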
@@ -2389,11 +2472,88 @@
*/
if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
vmem_qcache_reap(zio_arena);
}
+#define RECLAIMS_PER_SEC 20
+#define STAT_UPDATES_PER_SEC 5
+
+/*
+ * During heavy use, the ARC naturally wants to oscillate its arc_c around
+ * a maximum memory pressure point which corresponds to the arc_reclaim_needed
+ * function evaluating to 1. This results in the arc_size slowly growing
+ * towards this reclaim_needed threshold and exceeding it periodically. Once
+ * this happens, both arc_c and arc_size are down-adjusted by the
+ * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
+ * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
+ * cache which contains very large numbers of objects is extremely expensive
+ * from an xcall perspective (several seconds of heavy CPU use):
+ *
+ * (mem)
+ * ^ arc_reclaim_thread reacts
+ * | | |
+ * | V V
+ * |
+ * | + +
+ * | /| /|
+ * | ......./..|................/..|.............. arc_reclaim_needed threshold
+ * | / \_____________/ \___________/(etc)
+ * | / kmem reap kmem reap
+ * | /
+ * |/
+ * +----------------------------------------------------------------->
+ * (time)
+ *
+ * To help address this stairstep pattern, the arc_pressure_thread periodically
+ * gauges the distance of the current arc_size to the arc_reclaim_needed
+ * threshold by way of an estimation algorithm (in arc_mem_pressure) and,
+ * once arc_size crosses the derived arc_pressure_threshold, evicts buffers
+ * directly (via arc_reclaim_bytes) before the reclaim threshold is hit.
+ */
static void
+arc_pressure_thread(void)
+{
+ clock_t last_update = ddi_get_lbolt();
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_pressure_thr_lock);
+ while (arc_pressure_thread_exit == 0) {
+ clock_t now;
+
+ now = ddi_get_lbolt();
+ if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
+ uint64_t new_rate;
+
+ new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
+ hz) / (now - last_update);
+
+ if (ARCSTAT(arcstat_growth_rate) < new_rate)
+ ARCSTAT(arcstat_growth_rate) = new_rate;
+ else
+ ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
+ last_update = now;
+ }
+
+ /*
+ * Keep one average growth interval of headroom below arc_c,
+ * guarding against unsigned underflow in case the measured
+ * growth rate ever exceeds arc_c itself.
+ */
+ if (arc_c > ARCSTAT(arcstat_growth_rate))
+ arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
+ else
+ arc_pressure_threshold = arc_c_min;
+ if (arc_size > arc_pressure_threshold) {
+ arc_reclaim_bytes(arc_size - arc_pressure_threshold);
+ }
+
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_pressure_thr_cv,
+ &arc_pressure_thr_lock,
+ ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
+ CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
+ }
+
+ arc_pressure_thread_exit = 0;
+ cv_broadcast(&arc_pressure_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_pressure_thr_lock */
+ thread_exit();
+}
+
+static void
arc_reclaim_thread(void)
{
clock_t growtime = 0;
arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
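
The growth-rate tracking in arc_pressure_thread() is a peak-hold estimator: the kstat jumps to any new maximum immediately and otherwise decays through ARCSTAT_F_AVG. A minimal userland sketch of that policy (names and sample values are illustrative, not from the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* Jump to a new peak at once; decay toward lower samples slowly. */
    static uint64_t
    update_rate(uint64_t rate, uint64_t sample)
    {
        if (rate < sample)
            return (sample);
        return (rate - rate / 4 + sample / 4);
    }

    int
    main(void)
    {
        uint64_t samples[] = { 100, 400, 50, 50, 50, 50 };  /* MB/s */
        uint64_t rate = 0;

        for (int i = 0; i < 6; i++) {
            rate = update_rate(rate, samples[i]);
            printf("sample %3llu -> rate %3llu\n",
                (unsigned long long)samples[i],
                (unsigned long long)rate);
        }
        return (0);
    }

The asymmetry is deliberate: an allocation burst widens the gap between arc_pressure_threshold and arc_c right away, while the headroom is only handed back gradually once the burst subsides.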
@@ -2498,11 +2658,12 @@
/*
* If we're within (2 * maxblocksize) bytes of the target
* cache size, increment the target cache size
*/
- if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
+ (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
atomic_add_64(&arc_c, (int64_t)bytes);
if (arc_c > arc_c_max)
arc_c = arc_c_max;
else if (state == arc_anon)
atomic_add_64(&arc_p, (int64_t)bytes);
@@ -2571,10 +2732,11 @@
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
+ atomic_add_64(&arc_bytes_allocd, size);
}
goto out;
}
/*
@@ -2603,10 +2765,11 @@
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
+ atomic_add_64(&arc_bytes_allocd, size);
}
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
out:
@@ -3769,10 +3932,13 @@
arc_init(void)
{
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
+
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
/* Start out with 1/8 of all memory */
arc_c = physmem * PAGESIZE / 8;
@@ -3784,10 +3950,12 @@
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
+ /* Initial sensible value; arc_pressure_thread refines it. */
+ arc_pressure_threshold = arc_c;
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<20);
/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
if (arc_c * 8 >= 1<<30)
arc_c_max = (arc_c * 8) - (1<<30);
@@ -3885,10 +4053,12 @@
kstat_install(arc_ksp);
}
(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
+ (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
arc_dead = FALSE;
arc_warm = B_FALSE;
/*
@@ -3914,10 +4084,16 @@
arc_thread_exit = 1;
while (arc_thread_exit != 0)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
mutex_exit(&arc_reclaim_thr_lock);
+ mutex_enter(&arc_pressure_thr_lock);
+ arc_pressure_thread_exit = 1;
+ while (arc_pressure_thread_exit != 0)
+ cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
+ mutex_exit(&arc_pressure_thr_lock);
+
arc_flush(NULL);
arc_dead = TRUE;
if (arc_ksp != NULL) {
@@ -3926,10 +4102,12 @@
}
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
+ mutex_destroy(&arc_pressure_thr_lock);
+ cv_destroy(&arc_pressure_thr_cv);
list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);