ARC pressure valve implementation

*** 147,156 ****
--- 147,161 ----
  static kmutex_t		arc_reclaim_thr_lock;
  static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
  static uint8_t		arc_thread_exit;
  
+ static kmutex_t		arc_pressure_thr_lock;
+ static kcondvar_t	arc_pressure_thr_cv;
+ static uint8_t		arc_pressure_thread_exit;
+ static uint64_t		arc_pressure_threshold;
+ 
  #define	ARC_REDUCE_DNLC_PERCENT	3
  uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
  
  typedef enum arc_reclaim_strategy {
  	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
*** 293,302 ****
--- 298,308 ----
  	kstat_named_t	arcstat_c_max;
  	kstat_named_t	arcstat_size;
  	kstat_named_t	arcstat_hdr_size;
  	kstat_named_t	arcstat_data_size;
  	kstat_named_t	arcstat_other_size;
+ 	kstat_named_t	arcstat_growth_rate;
  	kstat_named_t	arcstat_l2_hits;
  	kstat_named_t	arcstat_l2_misses;
  	kstat_named_t	arcstat_l2_feeds;
  	kstat_named_t	arcstat_l2_rw_clash;
  	kstat_named_t	arcstat_l2_read_bytes;
*** 359,368 ****
--- 365,375 ----
  	{ "c_max",			KSTAT_DATA_UINT64 },
  	{ "size",			KSTAT_DATA_UINT64 },
  	{ "hdr_size",			KSTAT_DATA_UINT64 },
  	{ "data_size",			KSTAT_DATA_UINT64 },
  	{ "other_size",			KSTAT_DATA_UINT64 },
+ 	{ "growth_rate",		KSTAT_DATA_UINT64 },
  	{ "l2_hits",			KSTAT_DATA_UINT64 },
  	{ "l2_misses",			KSTAT_DATA_UINT64 },
  	{ "l2_feeds",			KSTAT_DATA_UINT64 },
  	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
  	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
*** 428,437 ****
--- 435,461 ----
  	} else {							\
  		ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);	\
  	}								\
  }
  
+ /*
+  * This macro allows us to use kstats as floating averages. Each time we
+  * update this kstat, we first factor it and the update value by `factor'
+  * to shrink the new value's contribution to the overall average. This
+  * macro assumes that integer loads and stores are atomic, but is not
+  * safe for multiple writers updating the kstat in parallel (only the
+  * last writer's update will remain).
+  */
+ #define	ARCSTAT_F_AVG(stat, value, factor) \
+ 	do { \
+ 		uint64_t x = ARCSTAT(stat); \
+ 		x = x - x / factor + (value) / factor; \
+ 		ARCSTAT(stat) = x; \
+ 		_NOTE(CONSTCOND) \
+ 	} while (0)
+ 
  kstat_t			*arc_ksp;
  static arc_state_t	*arc_anon;
  static arc_state_t	*arc_mru;
  static arc_state_t	*arc_mru_ghost;
  static arc_state_t	*arc_mfu;
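For intuition, the factored update above is an exponential moving average that weights each new sample at 1/factor. Below is a minimal userland sketch of the same arithmetic; the helper name, sample values, and output are purely illustrative and not part of the patch, and the kstat plumbing and lint annotations are omitted.

	#include <stdio.h>
	#include <stdint.h>

	/* Stand-alone equivalent of one ARCSTAT_F_AVG update. */
	static uint64_t
	f_avg(uint64_t avg, uint64_t value, uint64_t factor)
	{
		/* Drop 1/factor of the old average, add 1/factor of the sample. */
		return (avg - avg / factor + value / factor);
	}

	int
	main(void)
	{
		uint64_t samples[] = { 800, 800, 800, 100, 100, 100 };
		uint64_t avg = 0;

		for (int i = 0; i < 6; i++) {
			avg = f_avg(avg, samples[i], 4);
			printf("sample %llu -> average %llu\n",
			    (unsigned long long)samples[i],
			    (unsigned long long)avg);
		}
		return (0);
	}

With a factor of 4 the average tracks a step change within a handful of updates, which is what makes it usable later as a cheap, smoothed growth-rate estimate for the pressure thread.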
*** 459,468 ****
--- 483,493 ----
  	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
  
  static int		arc_no_grow;	/* Don't try to grow cache size */
  static uint64_t		arc_tempreserve;
  static uint64_t		arc_loaned_bytes;
+ static uint64_t		arc_bytes_allocd = 0;
  
  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
  
  typedef struct arc_callback arc_callback_t;
*** 1272,1281 ****
--- 1297,1307 ----
  		break;
  	}
  
  	ARCSTAT_INCR(arcstat_meta_used, space);
  	atomic_add_64(&arc_size, space);
+ 	atomic_add_64(&arc_bytes_allocd, space);
  }
  
  void
  arc_space_return(uint64_t space, arc_space_type_t type)
  {
*** 1308,1317 ****
--- 1334,1344 ----
  arc_data_buf_alloc(uint64_t size)
  {
  	if (arc_evict_needed(ARC_BUFC_DATA))
  		cv_signal(&arc_reclaim_thr_cv);
  	atomic_add_64(&arc_size, size);
+ 	atomic_add_64(&arc_bytes_allocd, size);
  	return (zio_data_buf_alloc(size));
  }
  
  void
  arc_data_buf_free(void *buf, uint64_t size)
*** 2109,2119 ****
--- 2136,2181 ----
  		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
  		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
  	}
  }
  
+ #define	ACCURACY	1000
+ static void
+ arc_reclaim_bytes(uint64_t to_evict)
+ {
+ 	uint64_t to_evict_data_mru, to_evict_data_mfu;
+ 	uint64_t to_evict_meta_mru, to_evict_meta_mfu;
+ 
+ 	to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 	to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 	to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 	to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
+ 	    ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
+ 	    to_evict) / ACCURACY;
+ 
+ 	if (to_evict_meta_mru > 0)
+ 		(void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
+ 		    ARC_BUFC_METADATA);
+ 	if (to_evict_data_mru > 0)
+ 		(void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
+ 		    ARC_BUFC_DATA);
+ 	if (to_evict_meta_mfu > 0)
+ 		(void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
+ 		    ARC_BUFC_METADATA);
+ 	if (to_evict_data_mfu > 0)
+ 		(void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
+ 		    ARC_BUFC_DATA);
+ }
+ 
+ static void
  arc_do_user_evicts(void)
  {
  	mutex_enter(&arc_eviction_mtx);
  	while (arc_eviction_list != NULL) {
  		arc_buf_t *buf = arc_eviction_list;
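The split in arc_reclaim_bytes() is plain fixed-point arithmetic: each bucket's share of the combined MRU+MFU footprint, scaled by ACCURACY to avoid floating point, multiplied by the eviction target. A userland sketch of just that proportioning (the share_of helper and the sizes are hypothetical, not from the patch):

	#include <stdio.h>
	#include <stdint.h>

	#define	ACCURACY	1000	/* same fixed-point scale as the patch */

	/* Portion of to_evict charged to one bucket, proportional to its size. */
	static uint64_t
	share_of(uint64_t bucket, uint64_t total, uint64_t to_evict)
	{
		return ((((bucket * ACCURACY) / total) * to_evict) / ACCURACY);
	}

	int
	main(void)
	{
		/* Hypothetical evictable sizes, in bytes. */
		uint64_t mru_data = 600ULL << 20, mru_meta = 100ULL << 20;
		uint64_t mfu_data = 250ULL << 20, mfu_meta = 50ULL << 20;
		uint64_t total = mru_data + mru_meta + mfu_data + mfu_meta;
		uint64_t to_evict = 64ULL << 20;

		printf("evict %llu MiB of mru data\n",
		    (unsigned long long)(share_of(mru_data, total, to_evict) >> 20));
		printf("evict %llu MiB of mru meta\n",
		    (unsigned long long)(share_of(mru_meta, total, to_evict) >> 20));
		printf("evict %llu MiB of mfu data\n",
		    (unsigned long long)(share_of(mfu_data, total, to_evict) >> 20));
		printf("evict %llu MiB of mfu meta\n",
		    (unsigned long long)(share_of(mfu_meta, total, to_evict) >> 20));
		return (0);
	}

Note that the kernel function divides by the total arcs_size of MRU+MFU while the numerators are the evictable arcs_lsize values, so the four portions can sum to less than to_evict whenever part of the cache is not evictable.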
*** 2247,2256 ****
--- 2309,2339 ----
  
  	if (arc_size > arc_c)
  		arc_adjust();
  }
  
+ #define	PHYSMEM_PRESSURE_FRACTION	100
+ 
+ static boolean_t
+ arc_mem_pressure(void)
+ {
+ #ifdef _KERNEL
+ 	uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
+ 
+ 	if ((freemem < lotsfree + needfree + extra) ||
+ 	    (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
+ 	    (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
+ 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
+ 	    physmem / PHYSMEM_PRESSURE_FRACTION))
+ 		return (B_TRUE);
+ 
+ 	return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
+ #else
+ 	return (0);
+ #endif
+ }
+ 
  /*
   * Determine if the system is under memory pressure and is asking
   * to reclaim memory. A return value of 1 indicates that the system
   * is under memory pressure and that the arc should adjust accordingly.
   */
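With PHYSMEM_PRESSURE_FRACTION set to 100, the checks above pad the usual freemem/availrmem thresholds by roughly one percent of physical memory, and the final fallback reports pressure once free memory alone drops below that one percent. A userland mock-up of the fallback comparison only (the page counts are hypothetical):

	#include <stdio.h>
	#include <stdint.h>

	#define	PHYSMEM_PRESSURE_FRACTION	100

	int
	main(void)
	{
		uint64_t physmem = (8ULL << 30) / 4096;		/* 8 GiB of 4K pages */
		uint64_t samples[] = { 500000, 50000, 20000 };	/* freemem, in pages */

		for (int i = 0; i < 3; i++) {
			/* Pressure once freemem falls under physmem/100 pages. */
			int pressure = samples[i] <
			    physmem / PHYSMEM_PRESSURE_FRACTION;
			printf("freemem %llu pages -> pressure %d\n",
			    (unsigned long long)samples[i], pressure);
		}
		return (0);
	}

Because of the extra one-percent cushion on top of desfree, this heuristic appears intended to trip slightly before the corresponding checks in arc_reclaim_needed() would.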
*** 2389,2399 ****
--- 2472,2559 ----
  	 */
  	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
  		vmem_qcache_reap(zio_arena);
  }
  
+ #define	RECLAIMS_PER_SEC	20
+ #define	STAT_UPDATES_PER_SEC	5
+ 
+ /*
+  * During heavy use, the ARC naturally wants to oscillate its arc_c around
+  * a maximum memory pressure point which corresponds to the arc_reclaim_needed
+  * function evaluating to 1. This results in the arc_size slowly growing
+  * towards this reclaim_needed threshold and exceeding it periodically. Once
+  * this happens, both arc_c and arc_size are down-adjusted by the
+  * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
+  * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
+  * cache which contains very large numbers of objects is extremely expensive
+  * from an xcall perspective (several seconds of heavy CPU use):
+  *
+  * (mem)
+  * ^          arc_reclaim_thread reacts
+  * |            |                   |
+  * |            V                   V
+  * |
+  * |            +                   +
+  * |           /|                  /|
+  * | ......./..|................/..|.............. arc_reclaim_needed threshold
+  * |      /     \_____________/     \___________/(etc)
+  * |    /           kmem reap           kmem reap
+  * |  /
+  * |/
+  * +----------------------------------------------------------------->
+  *                                                              (time)
+  *
+  * To help address this stairstep pattern, the arc_pressure_thread periodically
+  * gauges the distance between the current arc_size and the arc_reclaim_needed
+  * threshold by way of an estimation algorithm (in arc_mem_pressure).
+  */
  static void
+ arc_pressure_thread(void)
+ {
+ 	clock_t last_update = ddi_get_lbolt();
+ 	callb_cpr_t cpr;
+ 
+ 	CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
+ 
+ 	mutex_enter(&arc_pressure_thr_lock);
+ 	while (arc_pressure_thread_exit == 0) {
+ 		clock_t now;
+ 
+ 		now = ddi_get_lbolt();
+ 		if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
+ 			uint64_t new_rate;
+ 
+ 			new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
+ 			    hz) / (now - last_update);
+ 
+ 			if (ARCSTAT(arcstat_growth_rate) < new_rate)
+ 				ARCSTAT(arcstat_growth_rate) = new_rate;
+ 			else
+ 				ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
+ 			last_update = now;
+ 		}
+ 
+ 		arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
+ 		if (arc_size > arc_pressure_threshold) {
+ 			arc_reclaim_bytes(arc_size - arc_pressure_threshold);
+ 		}
+ 
+ 		CALLB_CPR_SAFE_BEGIN(&cpr);
+ 		(void) cv_timedwait(&arc_pressure_thr_cv,
+ 		    &arc_pressure_thr_lock,
+ 		    ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
+ 		CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
+ 	}
+ 
+ 	arc_pressure_thread_exit = 0;
+ 	cv_broadcast(&arc_pressure_thr_cv);
+ 	CALLB_CPR_EXIT(&cpr);		/* drops arc_pressure_thr_lock */
+ 	thread_exit();
+ }
+ 
+ static void
  arc_reclaim_thread(void)
  {
  	clock_t growtime = 0;
  	arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
  	callb_cpr_t cpr;
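The rate bookkeeping in arc_pressure_thread() is small enough to model in isolation: bytes allocated since the last sample are rescaled to bytes per second, tracked as a peak-following decayed average, and subtracted from arc_c to place the pre-emptive eviction threshold. A userland sketch of that loop; HZ, the tick deltas, and the allocation samples are hypothetical stand-ins for the kernel's hz, ddi_get_lbolt() deltas, and arc_bytes_allocd:

	#include <stdio.h>
	#include <stdint.h>

	#define	HZ	100	/* stand-in for the kernel's hz */

	int
	main(void)
	{
		/* Bytes allocated and ticks elapsed in each sampling interval. */
		uint64_t allocd[] = { 40ULL << 20, 60ULL << 20, 5ULL << 20, 1ULL << 20 };
		uint64_t ticks[] = { 20, 20, 20, 20 };
		uint64_t growth_rate = 0;
		uint64_t arc_c = 4ULL << 30;	/* 4 GiB target size */

		for (int i = 0; i < 4; i++) {
			/* Rescale the interval's allocations to bytes per second. */
			uint64_t new_rate = (allocd[i] * HZ) / ticks[i];

			/* Follow peaks immediately; decay with a 1/4-weight average. */
			if (growth_rate < new_rate)
				growth_rate = new_rate;
			else
				growth_rate = growth_rate - growth_rate / 4 +
				    new_rate / 4;

			printf("rate %llu MiB/s -> pressure threshold %llu MiB\n",
			    (unsigned long long)(growth_rate >> 20),
			    (unsigned long long)((arc_c - growth_rate) >> 20));
		}
		return (0);
	}

Keeping the threshold roughly one second's worth of recent growth below arc_c lets arc_reclaim_bytes() trim the cache in small slices, twenty times a second, before arc_size ever crosses the arc_reclaim_needed line.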
*** 2498,2508 ****
  	/*
  	 * If we're within (2 * maxblocksize) bytes of the target
  	 * cache size, increment the target cache size
  	 */
! 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
  		atomic_add_64(&arc_c, (int64_t)bytes);
  		if (arc_c > arc_c_max)
  			arc_c = arc_c_max;
  		else if (state == arc_anon)
  			atomic_add_64(&arc_p, (int64_t)bytes);
--- 2658,2669 ----
  	/*
  	 * If we're within (2 * maxblocksize) bytes of the target
  	 * cache size, increment the target cache size
  	 */
! 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
! 	    (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
  		atomic_add_64(&arc_c, (int64_t)bytes);
  		if (arc_c > arc_c_max)
  			arc_c = arc_c_max;
  		else if (state == arc_anon)
  			atomic_add_64(&arc_p, (int64_t)bytes);
*** 2571,2580 ****
--- 2732,2742 ----
  		} else {
  			ASSERT(type == ARC_BUFC_DATA);
  			buf->b_data = zio_data_buf_alloc(size);
  			ARCSTAT_INCR(arcstat_data_size, size);
  			atomic_add_64(&arc_size, size);
+ 			atomic_add_64(&arc_bytes_allocd, size);
  		}
  		goto out;
  	}
  
  	/*
*** 2603,2612 ****
--- 2765,2775 ----
  		} else {
  			ASSERT(type == ARC_BUFC_DATA);
  			buf->b_data = zio_data_buf_alloc(size);
  			ARCSTAT_INCR(arcstat_data_size, size);
  			atomic_add_64(&arc_size, size);
+ 			atomic_add_64(&arc_bytes_allocd, size);
  		}
  		ARCSTAT_BUMP(arcstat_recycle_miss);
  	}
  	ASSERT(buf->b_data != NULL);
  out:
*** 3769,3778 ****
--- 3932,3944 ----
  arc_init(void)
  {
  	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
  	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
  
+ 	mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ 	cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
+ 
  	/* Convert seconds to clock ticks */
  	arc_min_prefetch_lifespan = 1 * hz;
  
  	/* Start out with 1/8 of all memory */
  	arc_c = physmem * PAGESIZE / 8;
*** 3784,3793 ****
--- 3950,3961 ----
  	 * need to limit the cache to 1/8 of VM size.
  	 */
  	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
  #endif
  
+ 	/* initial sensible value */
+ 	arc_pressure_threshold = arc_c;
  	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
  	arc_c_min = MAX(arc_c / 4, 64<<20);
  	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
  	if (arc_c * 8 >= 1<<30)
  		arc_c_max = (arc_c * 8) - (1<<30);
*** 3885,3894 ****
--- 4053,4064 ----
  		kstat_install(arc_ksp);
  	}
  
  	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
  	    TS_RUN, minclsyspri);
+ 	(void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
+ 	    TS_RUN, minclsyspri);
  
  	arc_dead = FALSE;
  	arc_warm = B_FALSE;
  
  	/*
*** 3914,3923 ****
--- 4084,4099 ----
  	arc_thread_exit = 1;
  	while (arc_thread_exit != 0)
  		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
  	mutex_exit(&arc_reclaim_thr_lock);
  
+ 	mutex_enter(&arc_pressure_thr_lock);
+ 	arc_pressure_thread_exit = 1;
+ 	while (arc_pressure_thread_exit != 0)
+ 		cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
+ 	mutex_exit(&arc_pressure_thr_lock);
+ 
  	arc_flush(NULL);
  
  	arc_dead = TRUE;
  
  	if (arc_ksp != NULL) {
*** 3926,3935 ****
--- 4102,4113 ----
  	}
  
  	mutex_destroy(&arc_eviction_mtx);
  	mutex_destroy(&arc_reclaim_thr_lock);
  	cv_destroy(&arc_reclaim_thr_cv);
+ 	mutex_destroy(&arc_pressure_thr_lock);
+ 	cv_destroy(&arc_pressure_thr_cv);
  
  	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
  	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
  	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
  	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);