ARC pressure valve implementation
@@ -147,10 +147,15 @@
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
+static kmutex_t arc_pressure_thr_lock;
+static kcondvar_t arc_pressure_thr_cv;
+static uint8_t arc_pressure_thread_exit;
+static uint64_t arc_pressure_threshold;
+
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
@@ -293,10 +298,11 @@
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
kstat_named_t arcstat_other_size;
+ kstat_named_t arcstat_growth_rate;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_rw_clash;
kstat_named_t arcstat_l2_read_bytes;
@@ -359,10 +365,11 @@
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
+ { "growth_rate", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
@@ -428,10 +435,27 @@
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by `factor'
+ * to shrink the new value's contribution to the overall average. This
+ * macro assumes that integer loads and stores are atomic, but is not
+ * safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG(stat, value, factor) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / (factor) + (value) / (factor); \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
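
The ARCSTAT_F_AVG macro above is an exponential moving average with weight 1/factor, done entirely in integer math. A minimal userland sketch of the same update rule (the function and variable names here are illustrative, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for the ARCSTAT_F_AVG update rule. */
    static uint64_t
    f_avg(uint64_t avg, uint64_t value, uint64_t factor)
    {
        /* The old average loses 1/factor of its weight to the new value. */
        return (avg - avg / factor + value / factor);
    }

    int
    main(void)
    {
        uint64_t avg = 0;

        /* A constant input of 1000 converges on 1000 within ~25 steps. */
        for (int i = 0; i < 25; i++)
            avg = f_avg(avg, 1000, 4);
        printf("avg = %llu\n", (unsigned long long)avg);
        return (0);
    }

With factor = 4 each new sample contributes 25%, so a transient burst decays with a half-life of a little over two updates; a larger factor smooths harder but reacts more slowly.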
@@ -459,10 +483,11 @@
((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
+static uint64_t arc_bytes_allocd = 0;
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
typedef struct arc_callback arc_callback_t;
@@ -1272,10 +1297,11 @@
break;
}
ARCSTAT_INCR(arcstat_meta_used, space);
atomic_add_64(&arc_size, space);
+ atomic_add_64(&arc_bytes_allocd, space);
}
void
arc_space_return(uint64_t space, arc_space_type_t type)
{
@@ -1308,10 +1334,11 @@
arc_data_buf_alloc(uint64_t size)
{
if (arc_evict_needed(ARC_BUFC_DATA))
cv_signal(&arc_reclaim_thr_cv);
atomic_add_64(&arc_size, size);
+ atomic_add_64(&arc_bytes_allocd, size);
return (zio_data_buf_alloc(size));
}
void
arc_data_buf_free(void *buf, uint64_t size)
@@ -2109,11 +2136,46 @@
delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
arc_evict_ghost(arc_mfu_ghost, NULL, delta);
}
}
+#define ACCURACY 1000
+
static void
+arc_reclaim_bytes(uint64_t to_evict)
+{
+ uint64_t to_evict_data_mru, to_evict_data_mfu;
+ uint64_t to_evict_meta_mru, to_evict_meta_mfu;
+ uint64_t total = arc_mru->arcs_size + arc_mfu->arcs_size;
+
+ /* Nothing on the MRU or MFU lists to evict from. */
+ if (total == 0)
+ return;
+
+ /*
+ * Split the eviction target across the four buckets in proportion
+ * to their share of the cache, using ACCURACY as a fixed-point
+ * scale to keep the ratios in integer math.
+ */
+ to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+ to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+ to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+ to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
+ ACCURACY) / total) * to_evict) / ACCURACY;
+
+ if (to_evict_meta_mru > 0)
+ (void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
+ ARC_BUFC_METADATA);
+ if (to_evict_data_mru > 0)
+ (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
+ ARC_BUFC_DATA);
+ if (to_evict_meta_mfu > 0)
+ (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
+ ARC_BUFC_METADATA);
+ if (to_evict_data_mfu > 0)
+ (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
+ ARC_BUFC_DATA);
+}
+
+static void
arc_do_user_evicts(void)
{
mutex_enter(&arc_eviction_mtx);
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
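
The ACCURACY factor in arc_reclaim_bytes() is plain fixed-point scaling, computing to_evict * (bucket / total) without floating point. A standalone sketch with hypothetical sizes (none of these numbers come from the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define ACCURACY 1000

    int
    main(void)
    {
        uint64_t mru_meta = 100ULL << 20;   /* hypothetical: 100 MiB bucket */
        uint64_t total = 1000ULL << 20;     /* hypothetical: 1 GiB MRU+MFU */
        uint64_t to_evict = 50ULL << 20;    /* hypothetical: 50 MiB target */

        /* Scale by ACCURACY before dividing so the ratio survives. */
        uint64_t evict = ((mru_meta * ACCURACY / total) * to_evict) /
            ACCURACY;

        printf("evict %llu MiB from MRU metadata\n",
            (unsigned long long)(evict >> 20)); /* prints 5 */
        return (0);
    }

One consequence of ACCURACY = 1000 is per-mille resolution: a bucket holding less than 0.1% of the cache rounds down to an eviction share of zero.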
@@ -2247,10 +2309,31 @@
if (arc_size > arc_c)
arc_adjust();
}
+#define PHYSMEM_PRESSURE_FRACTION 100
+
+static boolean_t
+arc_mem_pressure(void)
+{
+#ifdef _KERNEL
+ uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
+
+ /*
+ * Apply the same checks as arc_reclaim_needed(), but with an
+ * `extra' headroom of 1% of physmem so that pressure is signaled
+ * before the hard reclaim thresholds are actually crossed.
+ */
+ if ((freemem < lotsfree + needfree + extra) ||
+ (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
+ (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
+ physmem / PHYSMEM_PRESSURE_FRACTION))
+ return (B_TRUE);
+
+ return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
+#else
+ return (B_FALSE);
+#endif
+}
+
/*
* Determine if the system is under memory pressure and is asking
* to reclaim memory. A return value of 1 indicates that the system
* is under memory pressure and that the arc should adjust accordingly.
*/
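
The numbers below give a feel for how much headroom PHYSMEM_PRESSURE_FRACTION buys. The page counts are hypothetical stand-ins for the kernel globals (freemem, lotsfree, desfree, needfree) read by arc_mem_pressure():

    #include <stdio.h>
    #include <stdint.h>

    #define PHYSMEM_PRESSURE_FRACTION 100

    int
    main(void)
    {
        /* Hypothetical 32 GiB machine with 4 KiB pages. */
        uint64_t physmem = 8ULL << 20;      /* 8M pages */
        uint64_t desfree = 16384, lotsfree = 131072, needfree = 0;
        uint64_t freemem = 200000;

        /* 1% of physmem on top of desfree trips pressure early. */
        uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;

        printf("extra = %llu pages, pressure = %s\n",
            (unsigned long long)extra,
            freemem < lotsfree + needfree + extra ? "yes" : "no");
        return (0);
    }

Here extra comes to 100270 pages (roughly 392 MiB), so pressure is reported while freemem is still well above lotsfree; that early warning is what lets the valve evict before arc_reclaim_needed() fires.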
@@ -2389,11 +2472,88 @@
*/
if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
vmem_qcache_reap(zio_arena);
}
+#define RECLAIMS_PER_SEC 20
+#define STAT_UPDATES_PER_SEC 5
+
+/*
+ * During heavy use, the ARC naturally wants to oscillate its arc_c around
+ * a maximum memory pressure point which corresponds to the arc_reclaim_needed
+ * function evaluating to 1. This results in the arc_size slowly growing
+ * towards this reclaim_needed threshold and exceeding it periodically. Once
+ * this happens, both arc_c and arc_size are down-adjusted by the
+ * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
+ * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
+ * cache which contains very large numbers of objects is extremely expensive
+ * from an xcall perspective (several seconds of heavy CPU use):
+ *
+ * (mem)
+ * ^ arc_reclaim_thread reacts
+ * | | |
+ * | V V
+ * |
+ * | + +
+ * | /| /|
+ * | ......./..|................/..|.............. arc_reclaim_needed threshold
+ * | / \_____________/ \___________/(etc)
+ * | / kmem reap kmem reap
+ * | /
+ * |/
+ * +----------------------------------------------------------------->
+ * (time)
+ *
+ * To help address this stairstep pattern, the arc_pressure_thread periodically
+ * gauges the distance of the current arc_size to the arc_reclaim_needed
+ * threshold by way of an estimation algorithm (in arc_mem_pressure) and,
+ * once arc_size crosses the derived arc_pressure_threshold, evicts buffers
+ * directly (via arc_reclaim_bytes) before the reclaim threshold is hit.
+ */
static void
+arc_pressure_thread(void)
+{
+ clock_t last_update = ddi_get_lbolt();
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_pressure_thr_lock);
+ while (arc_pressure_thread_exit == 0) {
+ clock_t now;
+
+ now = ddi_get_lbolt();
+ if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
+ uint64_t new_rate;
+
+ new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
+ hz) / (now - last_update);
+
+ if (ARCSTAT(arcstat_growth_rate) < new_rate)
+ ARCSTAT(arcstat_growth_rate) = new_rate;
+ else
+ ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
+ last_update = now;
+ }
+
+ /*
+ * Keep one average growth interval of headroom below arc_c,
+ * guarding against unsigned underflow in case the measured
+ * growth rate ever exceeds arc_c itself.
+ */
+ if (arc_c > ARCSTAT(arcstat_growth_rate))
+ arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
+ else
+ arc_pressure_threshold = arc_c_min;
+ if (arc_size > arc_pressure_threshold) {
+ arc_reclaim_bytes(arc_size - arc_pressure_threshold);
+ }
+
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_pressure_thr_cv,
+ &arc_pressure_thr_lock,
+ ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
+ CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
+ }
+
+ arc_pressure_thread_exit = 0;
+ cv_broadcast(&arc_pressure_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_pressure_thr_lock */
+ thread_exit();
+}
+
+static void
arc_reclaim_thread(void)
{
clock_t growtime = 0;
arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
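
The growth-rate tracking in arc_pressure_thread() is a peak-hold estimator: the kstat jumps to any new maximum immediately and otherwise decays through ARCSTAT_F_AVG. A minimal userland sketch of that policy (names and sample values are illustrative, not from the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* Jump to a new peak at once; decay toward lower samples slowly. */
    static uint64_t
    update_rate(uint64_t rate, uint64_t sample)
    {
        if (rate < sample)
            return (sample);
        return (rate - rate / 4 + sample / 4);
    }

    int
    main(void)
    {
        uint64_t samples[] = { 100, 400, 50, 50, 50, 50 };  /* MB/s */
        uint64_t rate = 0;

        for (int i = 0; i < 6; i++) {
            rate = update_rate(rate, samples[i]);
            printf("sample %3llu -> rate %3llu\n",
                (unsigned long long)samples[i],
                (unsigned long long)rate);
        }
        return (0);
    }

The asymmetry is deliberate: an allocation burst widens the gap between arc_pressure_threshold and arc_c right away, while the headroom is only handed back gradually once the burst subsides.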
@@ -2498,11 +2658,12 @@
/*
* If we're within (2 * maxblocksize) bytes of the target
* cache size, increment the target cache size
*/
- if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
+ (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
atomic_add_64(&arc_c, (int64_t)bytes);
if (arc_c > arc_c_max)
arc_c = arc_c_max;
else if (state == arc_anon)
atomic_add_64(&arc_p, (int64_t)bytes);
@@ -2571,10 +2732,11 @@
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
+ atomic_add_64(&arc_bytes_allocd, size);
}
goto out;
}
/*
@@ -2603,10 +2765,11 @@
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
+ atomic_add_64(&arc_bytes_allocd, size);
}
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
out:
@@ -3769,10 +3932,13 @@
arc_init(void)
{
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
+
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
/* Start out with 1/8 of all memory */
arc_c = physmem * PAGESIZE / 8;
@@ -3784,10 +3950,12 @@
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
+ /* Initial sensible value; arc_pressure_thread refines it. */
+ arc_pressure_threshold = arc_c;
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<20);
/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
if (arc_c * 8 >= 1<<30)
arc_c_max = (arc_c * 8) - (1<<30);
@@ -3885,10 +4053,12 @@
kstat_install(arc_ksp);
}
(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
+ (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
arc_dead = FALSE;
arc_warm = B_FALSE;
/*
@@ -3914,10 +4084,16 @@
arc_thread_exit = 1;
while (arc_thread_exit != 0)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
mutex_exit(&arc_reclaim_thr_lock);
+ mutex_enter(&arc_pressure_thr_lock);
+ arc_pressure_thread_exit = 1;
+ while (arc_pressure_thread_exit != 0)
+ cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
+ mutex_exit(&arc_pressure_thr_lock);
+
arc_flush(NULL);
arc_dead = TRUE;
if (arc_ksp != NULL) {
@@ -3926,10 +4102,12 @@
}
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
+ mutex_destroy(&arc_pressure_thr_lock);
+ cv_destroy(&arc_pressure_thr_cv);
list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);