132 #ifdef _KERNEL
133 #include <sys/vmsystm.h>
134 #include <vm/anon.h>
135 #include <sys/fs/swapnode.h>
136 #include <sys/dnlc.h>
137 #endif
138 #include <sys/callb.h>
139 #include <sys/kstat.h>
140 #include <zfs_fletcher.h>
141
142 #ifndef _KERNEL
143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144 boolean_t arc_watch = B_FALSE;
145 int arc_procfd;
146 #endif
147
148 static kmutex_t arc_reclaim_thr_lock;
149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 static uint8_t arc_thread_exit;
151
152 #define ARC_REDUCE_DNLC_PERCENT 3
153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
154
155 typedef enum arc_reclaim_strategy {
156 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 } arc_reclaim_strategy_t;
159
160 /*
161 * The number of iterations through arc_evict_*() before we
162 * drop & reacquire the lock.
163 */
164 int arc_evict_iterations = 100;
165
166 /* number of seconds before growing cache again */
167 static int arc_grow_retry = 60;
168
169 /* shift of arc_c for calculating both min and max arc_p */
170 static int arc_p_min_shift = 4;
171
278 * indirect prefetch buffers that have not lived long enough, or are
279 * not from the spa we're trying to evict from.
280 */
281 kstat_named_t arcstat_evict_skip;
282 kstat_named_t arcstat_evict_l2_cached;
283 kstat_named_t arcstat_evict_l2_eligible;
284 kstat_named_t arcstat_evict_l2_ineligible;
285 kstat_named_t arcstat_hash_elements;
286 kstat_named_t arcstat_hash_elements_max;
287 kstat_named_t arcstat_hash_collisions;
288 kstat_named_t arcstat_hash_chains;
289 kstat_named_t arcstat_hash_chain_max;
290 kstat_named_t arcstat_p;
291 kstat_named_t arcstat_c;
292 kstat_named_t arcstat_c_min;
293 kstat_named_t arcstat_c_max;
294 kstat_named_t arcstat_size;
295 kstat_named_t arcstat_hdr_size;
296 kstat_named_t arcstat_data_size;
297 kstat_named_t arcstat_other_size;
298 kstat_named_t arcstat_l2_hits;
299 kstat_named_t arcstat_l2_misses;
300 kstat_named_t arcstat_l2_feeds;
301 kstat_named_t arcstat_l2_rw_clash;
302 kstat_named_t arcstat_l2_read_bytes;
303 kstat_named_t arcstat_l2_write_bytes;
304 kstat_named_t arcstat_l2_writes_sent;
305 kstat_named_t arcstat_l2_writes_done;
306 kstat_named_t arcstat_l2_writes_error;
307 kstat_named_t arcstat_l2_writes_hdr_miss;
308 kstat_named_t arcstat_l2_evict_lock_retry;
309 kstat_named_t arcstat_l2_evict_reading;
310 kstat_named_t arcstat_l2_free_on_write;
311 kstat_named_t arcstat_l2_abort_lowmem;
312 kstat_named_t arcstat_l2_cksum_bad;
313 kstat_named_t arcstat_l2_io_error;
314 kstat_named_t arcstat_l2_size;
315 kstat_named_t arcstat_l2_asize;
316 kstat_named_t arcstat_l2_hdr_size;
317 kstat_named_t arcstat_l2_compress_successes;
344 { "deleted", KSTAT_DATA_UINT64 },
345 { "recycle_miss", KSTAT_DATA_UINT64 },
346 { "mutex_miss", KSTAT_DATA_UINT64 },
347 { "evict_skip", KSTAT_DATA_UINT64 },
348 { "evict_l2_cached", KSTAT_DATA_UINT64 },
349 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
350 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
351 { "hash_elements", KSTAT_DATA_UINT64 },
352 { "hash_elements_max", KSTAT_DATA_UINT64 },
353 { "hash_collisions", KSTAT_DATA_UINT64 },
354 { "hash_chains", KSTAT_DATA_UINT64 },
355 { "hash_chain_max", KSTAT_DATA_UINT64 },
356 { "p", KSTAT_DATA_UINT64 },
357 { "c", KSTAT_DATA_UINT64 },
358 { "c_min", KSTAT_DATA_UINT64 },
359 { "c_max", KSTAT_DATA_UINT64 },
360 { "size", KSTAT_DATA_UINT64 },
361 { "hdr_size", KSTAT_DATA_UINT64 },
362 { "data_size", KSTAT_DATA_UINT64 },
363 { "other_size", KSTAT_DATA_UINT64 },
364 { "l2_hits", KSTAT_DATA_UINT64 },
365 { "l2_misses", KSTAT_DATA_UINT64 },
366 { "l2_feeds", KSTAT_DATA_UINT64 },
367 { "l2_rw_clash", KSTAT_DATA_UINT64 },
368 { "l2_read_bytes", KSTAT_DATA_UINT64 },
369 { "l2_write_bytes", KSTAT_DATA_UINT64 },
370 { "l2_writes_sent", KSTAT_DATA_UINT64 },
371 { "l2_writes_done", KSTAT_DATA_UINT64 },
372 { "l2_writes_error", KSTAT_DATA_UINT64 },
373 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
374 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
375 { "l2_evict_reading", KSTAT_DATA_UINT64 },
376 { "l2_free_on_write", KSTAT_DATA_UINT64 },
377 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
378 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
379 { "l2_io_error", KSTAT_DATA_UINT64 },
380 { "l2_size", KSTAT_DATA_UINT64 },
381 { "l2_asize", KSTAT_DATA_UINT64 },
382 { "l2_hdr_size", KSTAT_DATA_UINT64 },
383 { "l2_compress_successes", KSTAT_DATA_UINT64 },
413 /*
414 * We define a macro to allow ARC hits/misses to be easily broken down by
415 * two separate conditions, giving a total of four different subtypes for
416 * each of hits and misses (so eight statistics total).
417 */
418 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
419 if (cond1) { \
420 if (cond2) { \
421 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
422 } else { \
423 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
424 } \
425 } else { \
426 if (cond2) { \
427 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
428 } else { \
429 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
430 } \
431 }
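/*
 * For example, ARCSTAT_CONDSTAT(is_demand, demand, prefetch, is_data, data,
 * metadata, hits) bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */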
432
433 kstat_t *arc_ksp;
434 static arc_state_t *arc_anon;
435 static arc_state_t *arc_mru;
436 static arc_state_t *arc_mru_ghost;
437 static arc_state_t *arc_mfu;
438 static arc_state_t *arc_mfu_ghost;
439 static arc_state_t *arc_l2c_only;
440
441 /*
442 * There are several ARC variables that are critical to export as kstats --
443 * but we don't want to have to grovel around in the kstat whenever we wish to
444 * manipulate them. For these variables, we therefore define them to be in
445 * terms of the statistic variable. This assures that we are not introducing
446 * the possibility of inconsistency by having shadow copies of the variables,
447 * while still allowing the code to be readable.
448 */
449 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
450 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
451 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
452 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
453 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
454 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
455 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
456 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
457
458 #define L2ARC_IS_VALID_COMPRESS(_c_) \
459 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
460
461 static int arc_no_grow; /* Don't try to grow cache size */
462 static uint64_t arc_tempreserve;
463 static uint64_t arc_loaned_bytes;
464
465 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
466
467 typedef struct arc_callback arc_callback_t;
468
469 struct arc_callback {
470 void *acb_private;
471 arc_done_func_t *acb_done;
472 arc_buf_t *acb_buf;
473 zio_t *acb_zio_dummy;
474 arc_callback_t *acb_next;
475 };
476
477 typedef struct arc_write_callback arc_write_callback_t;
478
479 struct arc_write_callback {
480 void *awcb_private;
481 arc_done_func_t *awcb_ready;
482 arc_done_func_t *awcb_physdone;
483 arc_done_func_t *awcb_done;
1257 {
1258 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259
1260 switch (type) {
1261 case ARC_SPACE_DATA:
1262 ARCSTAT_INCR(arcstat_data_size, space);
1263 break;
1264 case ARC_SPACE_OTHER:
1265 ARCSTAT_INCR(arcstat_other_size, space);
1266 break;
1267 case ARC_SPACE_HDRS:
1268 ARCSTAT_INCR(arcstat_hdr_size, space);
1269 break;
1270 case ARC_SPACE_L2HDRS:
1271 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272 break;
1273 }
1274
1275 ARCSTAT_INCR(arcstat_meta_used, space);
1276 atomic_add_64(&arc_size, space);
1277 }
1278
1279 void
1280 arc_space_return(uint64_t space, arc_space_type_t type)
1281 {
1282 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283
1284 switch (type) {
1285 case ARC_SPACE_DATA:
1286 ARCSTAT_INCR(arcstat_data_size, -space);
1287 break;
1288 case ARC_SPACE_OTHER:
1289 ARCSTAT_INCR(arcstat_other_size, -space);
1290 break;
1291 case ARC_SPACE_HDRS:
1292 ARCSTAT_INCR(arcstat_hdr_size, -space);
1293 break;
1294 case ARC_SPACE_L2HDRS:
1295 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1296 break;
1297 }
1298
1299 ASSERT(arc_meta_used >= space);
1300 if (arc_meta_max < arc_meta_used)
1301 arc_meta_max = arc_meta_used;
1302 ARCSTAT_INCR(arcstat_meta_used, -space);
1303 ASSERT(arc_size >= space);
1304 atomic_add_64(&arc_size, -space);
1305 }
1306
1307 void *
1308 arc_data_buf_alloc(uint64_t size)
1309 {
1310 if (arc_evict_needed(ARC_BUFC_DATA))
1311 cv_signal(&arc_reclaim_thr_cv);
1312 atomic_add_64(&arc_size, size);
1313 return (zio_data_buf_alloc(size));
1314 }
1315
1316 void
1317 arc_data_buf_free(void *buf, uint64_t size)
1318 {
1319 zio_data_buf_free(buf, size);
1320 ASSERT(arc_size >= size);
1321 atomic_add_64(&arc_size, -size);
1322 }
1323
1324 arc_buf_t *
1325 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1326 {
1327 arc_buf_hdr_t *hdr;
1328 arc_buf_t *buf;
1329
1330 ASSERT3U(size, >, 0);
1331 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1332 ASSERT(BUF_EMPTY(hdr));
2094 /*
2095 * Adjust ghost lists
2096 */
2097
2098 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2099
2100 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2101 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2102 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2103 }
2104
2105 adjustment =
2106 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2107
2108 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2109 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2110 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2111 }
2112 }
2113
2114 static void
2115 arc_do_user_evicts(void)
2116 {
2117 mutex_enter(&arc_eviction_mtx);
2118 while (arc_eviction_list != NULL) {
2119 arc_buf_t *buf = arc_eviction_list;
2120 arc_eviction_list = buf->b_next;
2121 mutex_enter(&buf->b_evict_lock);
2122 buf->b_hdr = NULL;
2123 mutex_exit(&buf->b_evict_lock);
2124 mutex_exit(&arc_eviction_mtx);
2125
2126 if (buf->b_efunc != NULL)
2127 VERIFY0(buf->b_efunc(buf->b_private));
2128
2129 buf->b_efunc = NULL;
2130 buf->b_private = NULL;
2131 kmem_cache_free(buf_cache, buf);
2132 mutex_enter(&arc_eviction_mtx);
2133 }
2134 mutex_exit(&arc_eviction_mtx);
2232 to_free = arc_c >> arc_shrink_shift;
2233 #endif
2234 if (arc_c > arc_c_min + to_free)
2235 atomic_add_64(&arc_c, -to_free);
2236 else
2237 arc_c = arc_c_min;
2238
2239 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2240 if (arc_c > arc_size)
2241 arc_c = MAX(arc_size, arc_c_min);
2242 if (arc_p > arc_c)
2243 arc_p = (arc_c >> 1);
2244 ASSERT(arc_c >= arc_c_min);
2245 ASSERT((int64_t)arc_p >= 0);
2246 }
2247
2248 if (arc_size > arc_c)
2249 arc_adjust();
2250 }
2251
2252 /*
2253 * Determine if the system is under memory pressure and is asking
2254 * to reclaim memory. A return value of 1 indicates that the system
2255 * is under memory pressure and that the arc should adjust accordingly.
2256 */
2257 static int
2258 arc_reclaim_needed(void)
2259 {
2260 uint64_t extra;
2261
2262 #ifdef _KERNEL
2263
2264 if (needfree)
2265 return (1);
2266
2267 /*
2268 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2269 */
2270 extra = desfree;
2271
2374 prev_cache = zio_buf_cache[i];
2375 kmem_cache_reap_now(zio_buf_cache[i]);
2376 }
2377 if (zio_data_buf_cache[i] != prev_data_cache) {
2378 prev_data_cache = zio_data_buf_cache[i];
2379 kmem_cache_reap_now(zio_data_buf_cache[i]);
2380 }
2381 }
2382 kmem_cache_reap_now(buf_cache);
2383 kmem_cache_reap_now(hdr_cache);
2384 kmem_cache_reap_now(range_seg_cache);
2385
2386 /*
2387 * Ask the vmem arena to reclaim unused memory from its
2388 * quantum caches.
2389 */
2390 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2391 vmem_qcache_reap(zio_arena);
2392 }
2393
2394 static void
2395 arc_reclaim_thread(void)
2396 {
2397 clock_t growtime = 0;
2398 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2399 callb_cpr_t cpr;
2400
2401 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2402
2403 mutex_enter(&arc_reclaim_thr_lock);
2404 while (arc_thread_exit == 0) {
2405 if (arc_reclaim_needed()) {
2406
2407 if (arc_no_grow) {
2408 if (last_reclaim == ARC_RECLAIM_CONS) {
2409 last_reclaim = ARC_RECLAIM_AGGR;
2410 } else {
2411 last_reclaim = ARC_RECLAIM_CONS;
2412 }
2413 } else {
2414 arc_no_grow = TRUE;
2483 delta = MIN(bytes * mult, arc_p);
2484 arc_p = MAX(arc_p_min, arc_p - delta);
2485 }
2486 ASSERT((int64_t)arc_p >= 0);
2487
2488 if (arc_reclaim_needed()) {
2489 cv_signal(&arc_reclaim_thr_cv);
2490 return;
2491 }
2492
2493 if (arc_no_grow)
2494 return;
2495
2496 if (arc_c >= arc_c_max)
2497 return;
2498
2499 /*
2500 * If we're within (2 * maxblocksize) bytes of the target
2501 * cache size, increment the target cache size
2502 */
2503 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2504 atomic_add_64(&arc_c, (int64_t)bytes);
2505 if (arc_c > arc_c_max)
2506 arc_c = arc_c_max;
2507 else if (state == arc_anon)
2508 atomic_add_64(&arc_p, (int64_t)bytes);
2509 if (arc_p > arc_c)
2510 arc_p = arc_c;
2511 }
2512 ASSERT((int64_t)arc_p >= 0);
2513 }
2514
2515 /*
2516 * Check if the cache has reached its limits and eviction is required
2517 * prior to insert.
2518 */
2519 static int
2520 arc_evict_needed(arc_buf_contents_t type)
2521 {
2522 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2523 return (1);
2556 {
2557 arc_state_t *state = buf->b_hdr->b_state;
2558 uint64_t size = buf->b_hdr->b_size;
2559 arc_buf_contents_t type = buf->b_hdr->b_type;
2560
2561 arc_adapt(size, state);
2562
2563 /*
2564 * We have not yet reached cache maximum size,
2565 * just allocate a new buffer.
2566 */
2567 if (!arc_evict_needed(type)) {
2568 if (type == ARC_BUFC_METADATA) {
2569 buf->b_data = zio_buf_alloc(size);
2570 arc_space_consume(size, ARC_SPACE_DATA);
2571 } else {
2572 ASSERT(type == ARC_BUFC_DATA);
2573 buf->b_data = zio_data_buf_alloc(size);
2574 ARCSTAT_INCR(arcstat_data_size, size);
2575 atomic_add_64(&arc_size, size);
2576 }
2577 goto out;
2578 }
2579
2580 /*
2581 * If we are prefetching from the mfu ghost list, this buffer
2582 * will end up on the mru list; so steal space from there.
2583 */
2584 if (state == arc_mfu_ghost)
2585 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2586 else if (state == arc_mru_ghost)
2587 state = arc_mru;
2588
2589 if (state == arc_mru || state == arc_anon) {
2590 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2591 state = (arc_mfu->arcs_lsize[type] >= size &&
2592 arc_p > mru_used) ? arc_mfu : arc_mru;
2593 } else {
2594 /* MFU cases */
2595 uint64_t mfu_space = arc_c - arc_p;
2596 state = (arc_mru->arcs_lsize[type] >= size &&
2597 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2598 }
2599 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2600 if (type == ARC_BUFC_METADATA) {
2601 buf->b_data = zio_buf_alloc(size);
2602 arc_space_consume(size, ARC_SPACE_DATA);
2603 } else {
2604 ASSERT(type == ARC_BUFC_DATA);
2605 buf->b_data = zio_data_buf_alloc(size);
2606 ARCSTAT_INCR(arcstat_data_size, size);
2607 atomic_add_64(&arc_size, size);
2608 }
2609 ARCSTAT_BUMP(arcstat_recycle_miss);
2610 }
2611 ASSERT(buf->b_data != NULL);
2612 out:
2613 /*
2614 * Update the state size. Note that ghost states have a
2615 * "ghost size" and so don't need to be updated.
2616 */
2617 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2618 arc_buf_hdr_t *hdr = buf->b_hdr;
2619
2620 atomic_add_64(&hdr->b_state->arcs_size, size);
2621 if (list_link_active(&hdr->b_arc_node)) {
2622 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2623 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2624 }
2625 /*
2626 * If we are growing the cache, and we are adding anonymous
2627 * data, and we have outgrown arc_p, update arc_p
3754 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3755 arc_tempreserve>>10,
3756 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3757 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3758 reserve>>10, arc_c>>10);
3759 return (SET_ERROR(ERESTART));
3760 }
3761 atomic_add_64(&arc_tempreserve, reserve);
3762 return (0);
3763 }
3764
3765 /* Tuneable, default is 64, which is essentially arbitrary */
3766 int zfs_flush_ntasks = 64;
3767
3768 void
3769 arc_init(void)
3770 {
3771 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3772 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3773
3774 /* Convert seconds to clock ticks */
3775 arc_min_prefetch_lifespan = 1 * hz;
3776
3777 /* Start out with 1/8 of all memory */
3778 arc_c = physmem * PAGESIZE / 8;
3779
3780 #ifdef _KERNEL
3781 /*
3782 * On architectures where the physical memory can be larger
3783 * than the addressable space (intel in 32-bit mode), we may
3784 * need to limit the cache to 1/8 of VM size.
3785 */
3786 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3787 #endif
3788
3789 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3790 arc_c_min = MAX(arc_c / 4, 64<<20);
3791 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3792 if (arc_c * 8 >= 1<<30)
3793 arc_c_max = (arc_c * 8) - (1<<30);
3794 else
3795 arc_c_max = arc_c_min;
3796 arc_c_max = MAX(arc_c * 6, arc_c_max);
3797
3798 /*
3799 * Allow the tunables to override our calculations if they are
3800 * reasonable (i.e. over 64MB)
3801 */
3802 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3803 arc_c_max = zfs_arc_max;
3804 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3805 arc_c_min = zfs_arc_min;
3806
3807 arc_c = arc_c_max;
3808 arc_p = (arc_c >> 1);
3870
3871 arc_flush_taskq = taskq_create("arc_flush_tq",
3872 max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
3873 buf_init();
3874
3875 arc_thread_exit = 0;
3876 arc_eviction_list = NULL;
3877 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3878 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3879
3880 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3881 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3882
3883 if (arc_ksp != NULL) {
3884 arc_ksp->ks_data = &arc_stats;
3885 kstat_install(arc_ksp);
3886 }
3887
3888 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3889 TS_RUN, minclsyspri);
3890
3891 arc_dead = FALSE;
3892 arc_warm = B_FALSE;
3893
3894 /*
3895 * Calculate maximum amount of dirty data per pool.
3896 *
3897 * If it has been set by /etc/system, take that.
3898 * Otherwise, use a percentage of physical memory defined by
3899 * zfs_dirty_data_max_percent (default 10%) with a cap at
3900 * zfs_dirty_data_max_max (default 4GB).
3901 */
3902 if (zfs_dirty_data_max == 0) {
3903 zfs_dirty_data_max = physmem * PAGESIZE *
3904 zfs_dirty_data_max_percent / 100;
3905 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3906 zfs_dirty_data_max_max);
3907 }
3908 }
3909
3910 void
3911 arc_fini(void)
3912 {
3913 mutex_enter(&arc_reclaim_thr_lock);
3914 arc_thread_exit = 1;
3915 while (arc_thread_exit != 0)
3916 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3917 mutex_exit(&arc_reclaim_thr_lock);
3918
3919 arc_flush(NULL);
3920
3921 arc_dead = TRUE;
3922
3923 if (arc_ksp != NULL) {
3924 kstat_delete(arc_ksp);
3925 arc_ksp = NULL;
3926 }
3927
3928 mutex_destroy(&arc_eviction_mtx);
3929 mutex_destroy(&arc_reclaim_thr_lock);
3930 cv_destroy(&arc_reclaim_thr_cv);
3931
3932 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3933 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3934 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3935 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3936 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3937 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3938 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3939 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3940
3941 mutex_destroy(&arc_anon->arcs_mtx);
3942 mutex_destroy(&arc_mru->arcs_mtx);
3943 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3944 mutex_destroy(&arc_mfu->arcs_mtx);
3945 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3946 mutex_destroy(&arc_l2c_only->arcs_mtx);
3947
3948 taskq_destroy(arc_flush_taskq);
3949 buf_fini();
3950
|
132 #ifdef _KERNEL
133 #include <sys/vmsystm.h>
134 #include <vm/anon.h>
135 #include <sys/fs/swapnode.h>
136 #include <sys/dnlc.h>
137 #endif
138 #include <sys/callb.h>
139 #include <sys/kstat.h>
140 #include <zfs_fletcher.h>
141
142 #ifndef _KERNEL
143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144 boolean_t arc_watch = B_FALSE;
145 int arc_procfd;
146 #endif
147
148 static kmutex_t arc_reclaim_thr_lock;
149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 static uint8_t arc_thread_exit;
151
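/* used by the pre-emptive reclaim thread; see arc_pressure_thread() */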
152 static kmutex_t arc_pressure_thr_lock;
153 static kcondvar_t arc_pressure_thr_cv;
154 static uint8_t arc_pressure_thread_exit;
155 static uint64_t arc_pressure_threshold;
156
157 #define ARC_REDUCE_DNLC_PERCENT 3
158 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
159
160 typedef enum arc_reclaim_strategy {
161 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
162 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
163 } arc_reclaim_strategy_t;
164
165 /*
166 * The number of iterations through arc_evict_*() before we
167 * drop & reacquire the lock.
168 */
169 int arc_evict_iterations = 100;
170
171 /* number of seconds before growing cache again */
172 static int arc_grow_retry = 60;
173
174 /* shift of arc_c for calculating both min and max arc_p */
175 static int arc_p_min_shift = 4;
176
283 * indirect prefetch buffers that have not lived long enough, or are
284 * not from the spa we're trying to evict from.
285 */
286 kstat_named_t arcstat_evict_skip;
287 kstat_named_t arcstat_evict_l2_cached;
288 kstat_named_t arcstat_evict_l2_eligible;
289 kstat_named_t arcstat_evict_l2_ineligible;
290 kstat_named_t arcstat_hash_elements;
291 kstat_named_t arcstat_hash_elements_max;
292 kstat_named_t arcstat_hash_collisions;
293 kstat_named_t arcstat_hash_chains;
294 kstat_named_t arcstat_hash_chain_max;
295 kstat_named_t arcstat_p;
296 kstat_named_t arcstat_c;
297 kstat_named_t arcstat_c_min;
298 kstat_named_t arcstat_c_max;
299 kstat_named_t arcstat_size;
300 kstat_named_t arcstat_hdr_size;
301 kstat_named_t arcstat_data_size;
302 kstat_named_t arcstat_other_size;
303 kstat_named_t arcstat_growth_rate;
304 kstat_named_t arcstat_l2_hits;
305 kstat_named_t arcstat_l2_misses;
306 kstat_named_t arcstat_l2_feeds;
307 kstat_named_t arcstat_l2_rw_clash;
308 kstat_named_t arcstat_l2_read_bytes;
309 kstat_named_t arcstat_l2_write_bytes;
310 kstat_named_t arcstat_l2_writes_sent;
311 kstat_named_t arcstat_l2_writes_done;
312 kstat_named_t arcstat_l2_writes_error;
313 kstat_named_t arcstat_l2_writes_hdr_miss;
314 kstat_named_t arcstat_l2_evict_lock_retry;
315 kstat_named_t arcstat_l2_evict_reading;
316 kstat_named_t arcstat_l2_free_on_write;
317 kstat_named_t arcstat_l2_abort_lowmem;
318 kstat_named_t arcstat_l2_cksum_bad;
319 kstat_named_t arcstat_l2_io_error;
320 kstat_named_t arcstat_l2_size;
321 kstat_named_t arcstat_l2_asize;
322 kstat_named_t arcstat_l2_hdr_size;
323 kstat_named_t arcstat_l2_compress_successes;
350 { "deleted", KSTAT_DATA_UINT64 },
351 { "recycle_miss", KSTAT_DATA_UINT64 },
352 { "mutex_miss", KSTAT_DATA_UINT64 },
353 { "evict_skip", KSTAT_DATA_UINT64 },
354 { "evict_l2_cached", KSTAT_DATA_UINT64 },
355 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
356 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
357 { "hash_elements", KSTAT_DATA_UINT64 },
358 { "hash_elements_max", KSTAT_DATA_UINT64 },
359 { "hash_collisions", KSTAT_DATA_UINT64 },
360 { "hash_chains", KSTAT_DATA_UINT64 },
361 { "hash_chain_max", KSTAT_DATA_UINT64 },
362 { "p", KSTAT_DATA_UINT64 },
363 { "c", KSTAT_DATA_UINT64 },
364 { "c_min", KSTAT_DATA_UINT64 },
365 { "c_max", KSTAT_DATA_UINT64 },
366 { "size", KSTAT_DATA_UINT64 },
367 { "hdr_size", KSTAT_DATA_UINT64 },
368 { "data_size", KSTAT_DATA_UINT64 },
369 { "other_size", KSTAT_DATA_UINT64 },
370 { "growth_rate", KSTAT_DATA_UINT64 },
371 { "l2_hits", KSTAT_DATA_UINT64 },
372 { "l2_misses", KSTAT_DATA_UINT64 },
373 { "l2_feeds", KSTAT_DATA_UINT64 },
374 { "l2_rw_clash", KSTAT_DATA_UINT64 },
375 { "l2_read_bytes", KSTAT_DATA_UINT64 },
376 { "l2_write_bytes", KSTAT_DATA_UINT64 },
377 { "l2_writes_sent", KSTAT_DATA_UINT64 },
378 { "l2_writes_done", KSTAT_DATA_UINT64 },
379 { "l2_writes_error", KSTAT_DATA_UINT64 },
380 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
381 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
382 { "l2_evict_reading", KSTAT_DATA_UINT64 },
383 { "l2_free_on_write", KSTAT_DATA_UINT64 },
384 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
385 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
386 { "l2_io_error", KSTAT_DATA_UINT64 },
387 { "l2_size", KSTAT_DATA_UINT64 },
388 { "l2_asize", KSTAT_DATA_UINT64 },
389 { "l2_hdr_size", KSTAT_DATA_UINT64 },
390 { "l2_compress_successes", KSTAT_DATA_UINT64 },
420 /*
421 * We define a macro to allow ARC hits/misses to be easily broken down by
422 * two separate conditions, giving a total of four different subtypes for
423 * each of hits and misses (so eight statistics total).
424 */
425 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
426 if (cond1) { \
427 if (cond2) { \
428 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
429 } else { \
430 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
431 } \
432 } else { \
433 if (cond2) { \
434 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
435 } else { \
436 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
437 } \
438 }
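/*
 * For example, ARCSTAT_CONDSTAT(is_demand, demand, prefetch, is_data, data,
 * metadata, hits) bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */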
439
440 /*
441 * This macro allows us to use kstats as floating averages. Each time we
442 * update this kstat, we first factor it and the update value by `factor'
443 * to shrink the new value's contribution to the overall average. This
444 * macro assumes that integer loads and stores are atomic, but is not
445 * safe for multiple writers updating the kstat in parallel (only the
446 * last writer's update will remain).
447 */
448 #define ARCSTAT_F_AVG(stat, value, factor) \
449 do { \
450 uint64_t x = ARCSTAT(stat); \
451 x = x - x / factor + (value) / factor; \
452 ARCSTAT(stat) = x; \
454 _NOTE(CONSTCOND) \
455 } while (0)
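/*
 * For example, with factor = 4 each update moves the average about one
 * quarter of the way toward the new value: x becomes roughly
 * (3 * x + value) / 4, subject to integer-division truncation.
 */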
456
457 kstat_t *arc_ksp;
458 static arc_state_t *arc_anon;
459 static arc_state_t *arc_mru;
460 static arc_state_t *arc_mru_ghost;
461 static arc_state_t *arc_mfu;
462 static arc_state_t *arc_mfu_ghost;
463 static arc_state_t *arc_l2c_only;
464
465 /*
466 * There are several ARC variables that are critical to export as kstats --
467 * but we don't want to have to grovel around in the kstat whenever we wish to
468 * manipulate them. For these variables, we therefore define them to be in
469 * terms of the statistic variable. This assures that we are not introducing
470 * the possibility of inconsistency by having shadow copies of the variables,
471 * while still allowing the code to be readable.
472 */
473 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
474 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
475 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
476 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
477 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
478 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
479 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
480 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
481
482 #define L2ARC_IS_VALID_COMPRESS(_c_) \
483 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
484
485 static int arc_no_grow; /* Don't try to grow cache size */
486 static uint64_t arc_tempreserve;
487 static uint64_t arc_loaned_bytes;
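/*
 * Bytes allocated by the ARC since the last growth-rate sample; periodically
 * read and reset by arc_pressure_thread() to compute arcstat_growth_rate.
 */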
488 static uint64_t arc_bytes_allocd = 0;
489
490 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
491
492 typedef struct arc_callback arc_callback_t;
493
494 struct arc_callback {
495 void *acb_private;
496 arc_done_func_t *acb_done;
497 arc_buf_t *acb_buf;
498 zio_t *acb_zio_dummy;
499 arc_callback_t *acb_next;
500 };
501
502 typedef struct arc_write_callback arc_write_callback_t;
503
504 struct arc_write_callback {
505 void *awcb_private;
506 arc_done_func_t *awcb_ready;
507 arc_done_func_t *awcb_physdone;
508 arc_done_func_t *awcb_done;
1282 {
1283 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1284
1285 switch (type) {
1286 case ARC_SPACE_DATA:
1287 ARCSTAT_INCR(arcstat_data_size, space);
1288 break;
1289 case ARC_SPACE_OTHER:
1290 ARCSTAT_INCR(arcstat_other_size, space);
1291 break;
1292 case ARC_SPACE_HDRS:
1293 ARCSTAT_INCR(arcstat_hdr_size, space);
1294 break;
1295 case ARC_SPACE_L2HDRS:
1296 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1297 break;
1298 }
1299
1300 ARCSTAT_INCR(arcstat_meta_used, space);
1301 atomic_add_64(&arc_size, space);
1302 atomic_add_64(&arc_bytes_allocd, space);
1303 }
1304
1305 void
1306 arc_space_return(uint64_t space, arc_space_type_t type)
1307 {
1308 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1309
1310 switch (type) {
1311 case ARC_SPACE_DATA:
1312 ARCSTAT_INCR(arcstat_data_size, -space);
1313 break;
1314 case ARC_SPACE_OTHER:
1315 ARCSTAT_INCR(arcstat_other_size, -space);
1316 break;
1317 case ARC_SPACE_HDRS:
1318 ARCSTAT_INCR(arcstat_hdr_size, -space);
1319 break;
1320 case ARC_SPACE_L2HDRS:
1321 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1322 break;
1323 }
1324
1325 ASSERT(arc_meta_used >= space);
1326 if (arc_meta_max < arc_meta_used)
1327 arc_meta_max = arc_meta_used;
1328 ARCSTAT_INCR(arcstat_meta_used, -space);
1329 ASSERT(arc_size >= space);
1330 atomic_add_64(&arc_size, -space);
1331 }
1332
1333 void *
1334 arc_data_buf_alloc(uint64_t size)
1335 {
1336 if (arc_evict_needed(ARC_BUFC_DATA))
1337 cv_signal(&arc_reclaim_thr_cv);
1338 atomic_add_64(&arc_size, size);
1339 atomic_add_64(&arc_bytes_allocd, size);
1340 return (zio_data_buf_alloc(size));
1341 }
1342
1343 void
1344 arc_data_buf_free(void *buf, uint64_t size)
1345 {
1346 zio_data_buf_free(buf, size);
1347 ASSERT(arc_size >= size);
1348 atomic_add_64(&arc_size, -size);
1349 }
1350
1351 arc_buf_t *
1352 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1353 {
1354 arc_buf_hdr_t *hdr;
1355 arc_buf_t *buf;
1356
1357 ASSERT3U(size, >, 0);
1358 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1359 ASSERT(BUF_EMPTY(hdr));
2121 /*
2122 * Adjust ghost lists
2123 */
2124
2125 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2126
2127 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2128 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2129 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2130 }
2131
2132 adjustment =
2133 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2134
2135 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2136 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2137 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2138 }
2139 }
2140
2141 #define ACCURACY 1000
2142
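/*
 * Evict roughly "to_evict" bytes, split across the MRU and MFU lists in
 * proportion to the evictable data and metadata each currently holds.
 * ACCURACY scales the intermediate ratios so they can be computed with
 * integer arithmetic without losing too much precision.
 */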
2143 static void
2144 arc_reclaim_bytes(uint64_t to_evict)
2145 {
2146 uint64_t to_evict_data_mru, to_evict_data_mfu;
2147 uint64_t to_evict_meta_mru, to_evict_meta_mfu;
2148
2149 to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
2150 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2151 to_evict) / ACCURACY;
2152 to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
2153 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2154 to_evict) / ACCURACY;
2155 to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
2156 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2157 to_evict) / ACCURACY;
2158 to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
2159 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2160 to_evict) / ACCURACY;
2161
2162 if (to_evict_meta_mru > 0)
2163 (void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
2164 ARC_BUFC_METADATA);
2165 if (to_evict_data_mru > 0)
2166 (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
2167 ARC_BUFC_DATA);
2168 if (to_evict_meta_mfu > 0)
2169 (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
2170 ARC_BUFC_METADATA);
2171 if (to_evict_data_mfu > 0)
2172 (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
2173 ARC_BUFC_DATA);
2174 }
2175
2176 static void
2177 arc_do_user_evicts(void)
2178 {
2179 mutex_enter(&arc_eviction_mtx);
2180 while (arc_eviction_list != NULL) {
2181 arc_buf_t *buf = arc_eviction_list;
2182 arc_eviction_list = buf->b_next;
2183 mutex_enter(&buf->b_evict_lock);
2184 buf->b_hdr = NULL;
2185 mutex_exit(&buf->b_evict_lock);
2186 mutex_exit(&arc_eviction_mtx);
2187
2188 if (buf->b_efunc != NULL)
2189 VERIFY0(buf->b_efunc(buf->b_private));
2190
2191 buf->b_efunc = NULL;
2192 buf->b_private = NULL;
2193 kmem_cache_free(buf_cache, buf);
2194 mutex_enter(&arc_eviction_mtx);
2195 }
2196 mutex_exit(&arc_eviction_mtx);
2294 to_free = arc_c >> arc_shrink_shift;
2295 #endif
2296 if (arc_c > arc_c_min + to_free)
2297 atomic_add_64(&arc_c, -to_free);
2298 else
2299 arc_c = arc_c_min;
2300
2301 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2302 if (arc_c > arc_size)
2303 arc_c = MAX(arc_size, arc_c_min);
2304 if (arc_p > arc_c)
2305 arc_p = (arc_c >> 1);
2306 ASSERT(arc_c >= arc_c_min);
2307 ASSERT((int64_t)arc_p >= 0);
2308 }
2309
2310 if (arc_size > arc_c)
2311 arc_adjust();
2312 }
2313
2314 #define PHYSMEM_PRESSURE_FRACTION 100
2315
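/*
 * Estimate whether the system is within roughly physmem /
 * PHYSMEM_PRESSURE_FRACTION pages of the point where arc_reclaim_needed()
 * would start returning 1, i.e. whether memory pressure is imminent.
 */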
2316 static boolean_t
2317 arc_mem_pressure(void)
2318 {
2319 #ifdef _KERNEL
2320 uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
2321
2322 if ((freemem < lotsfree + needfree + extra) ||
2323 (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
2324 (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
2325 (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
2326 physmem / PHYSMEM_PRESSURE_FRACTION))
2327 return (B_TRUE);
2328
2329 return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
2330 #else
2331 return (B_FALSE);
2332 #endif
2333 }
2334
2335 /*
2336 * Determine if the system is under memory pressure and is asking
2337 * to reclaim memory. A return value of 1 indicates that the system
2338 * is under memory pressure and that the arc should adjust accordingly.
2339 */
2340 static int
2341 arc_reclaim_needed(void)
2342 {
2343 uint64_t extra;
2344
2345 #ifdef _KERNEL
2346
2347 if (needfree)
2348 return (1);
2349
2350 /*
2351 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2352 */
2353 extra = desfree;
2354
2457 prev_cache = zio_buf_cache[i];
2458 kmem_cache_reap_now(zio_buf_cache[i]);
2459 }
2460 if (zio_data_buf_cache[i] != prev_data_cache) {
2461 prev_data_cache = zio_data_buf_cache[i];
2462 kmem_cache_reap_now(zio_data_buf_cache[i]);
2463 }
2464 }
2465 kmem_cache_reap_now(buf_cache);
2466 kmem_cache_reap_now(hdr_cache);
2467 kmem_cache_reap_now(range_seg_cache);
2468
2469 /*
2470 * Ask the vmem arena to reclaim unused memory from its
2471 * quantum caches.
2472 */
2473 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2474 vmem_qcache_reap(zio_arena);
2475 }
2476
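/*
 * The pressure thread wakes up RECLAIMS_PER_SEC times a second to check for
 * (and relieve) incipient pressure, and refreshes its allocation growth-rate
 * estimate STAT_UPDATES_PER_SEC times a second.
 */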
2477 #define RECLAIMS_PER_SEC 20
2478 #define STAT_UPDATES_PER_SEC 5
2479
2480 /*
2481 * During heavy use, the ARC naturally wants to oscillate its arc_c around
2482 * a maximum memory pressure point which corresponds to the arc_reclaim_needed
2483 * function evaluating to 1. This results in the arc_size slowly growing
2484 * towards this reclaim_needed threshold and exceeding it periodically. Once
2485 * this happens, both arc_c and arc_size are down-adjusted by the
2486 * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
2487 * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
2488 * cache which contains very large numbers of objects is extremely expensive
2489 * from an xcall perspective (several seconds of heavy CPU use):
2490 *
2491 * (mem)
2492 * ^ arc_reclaim_thread reacts
2493 * | | |
2494 * | V V
2495 * |
2496 * | + +
2497 * | /| /|
2498 * | ......./..|................/..|.............. arc_reclaim_needed threshold
2499 * | / \_____________/ \___________/(etc)
2500 * | / kmem reap kmem reap
2501 * | /
2502 * |/
2503 * +----------------------------------------------------------------->
2504 * (time)
2505 *
2506 * To help address this stairstep pattern, the arc_pressure_thread periodically
2507 * gauges the distance from the current arc_size to the arc_reclaim_needed
2508 * threshold (see arc_mem_pressure) and evicts ahead of time via arc_reclaim_bytes.
2509 */
2510 static void
2511 arc_pressure_thread(void)
2512 {
2513 clock_t last_update = ddi_get_lbolt();
2514 callb_cpr_t cpr;
2515
2516 CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
2517
2518 mutex_enter(&arc_pressure_thr_lock);
2519 while (arc_pressure_thread_exit == 0) {
2520 clock_t now;
2521
2522 now = ddi_get_lbolt();
2523 if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
2524 uint64_t new_rate;
2525
2526 new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
2527 hz) / (now - last_update);
2528
2529 if (ARCSTAT(arcstat_growth_rate) < new_rate)
2530 ARCSTAT(arcstat_growth_rate) = new_rate;
2531 else
2532 ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
2533 last_update = now;
2534 }
2535
2536 arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
2537 if (arc_size > arc_pressure_threshold) {
2538 arc_reclaim_bytes(arc_size - arc_pressure_threshold);
2539 }
2540
2541 CALLB_CPR_SAFE_BEGIN(&cpr);
2542 (void) cv_timedwait(&arc_pressure_thr_cv,
2543 &arc_pressure_thr_lock,
2544 ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
2545 CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
2546 }
2547
2548 arc_pressure_thread_exit = 0;
2549 cv_broadcast(&arc_pressure_thr_cv);
2550 CALLB_CPR_EXIT(&cpr); /* drops arc_pressure_thr_lock */
2551 thread_exit();
2552 }
2553
2554 static void
2555 arc_reclaim_thread(void)
2556 {
2557 clock_t growtime = 0;
2558 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2559 callb_cpr_t cpr;
2560
2561 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2562
2563 mutex_enter(&arc_reclaim_thr_lock);
2564 while (arc_thread_exit == 0) {
2565 if (arc_reclaim_needed()) {
2566
2567 if (arc_no_grow) {
2568 if (last_reclaim == ARC_RECLAIM_CONS) {
2569 last_reclaim = ARC_RECLAIM_AGGR;
2570 } else {
2571 last_reclaim = ARC_RECLAIM_CONS;
2572 }
2573 } else {
2574 arc_no_grow = TRUE;
2643 delta = MIN(bytes * mult, arc_p);
2644 arc_p = MAX(arc_p_min, arc_p - delta);
2645 }
2646 ASSERT((int64_t)arc_p >= 0);
2647
2648 if (arc_reclaim_needed()) {
2649 cv_signal(&arc_reclaim_thr_cv);
2650 return;
2651 }
2652
2653 if (arc_no_grow)
2654 return;
2655
2656 if (arc_c >= arc_c_max)
2657 return;
2658
2659 /*
2660 * Grow the target cache size if we're within (2 * maxblocksize) bytes of
2661 * it, or past the pressure threshold with no memory pressure observed.
2662 */
2663 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
2664 (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
2665 atomic_add_64(&arc_c, (int64_t)bytes);
2666 if (arc_c > arc_c_max)
2667 arc_c = arc_c_max;
2668 else if (state == arc_anon)
2669 atomic_add_64(&arc_p, (int64_t)bytes);
2670 if (arc_p > arc_c)
2671 arc_p = arc_c;
2672 }
2673 ASSERT((int64_t)arc_p >= 0);
2674 }
2675
2676 /*
2677 * Check if the cache has reached its limits and eviction is required
2678 * prior to insert.
2679 */
2680 static int
2681 arc_evict_needed(arc_buf_contents_t type)
2682 {
2683 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2684 return (1);
2717 {
2718 arc_state_t *state = buf->b_hdr->b_state;
2719 uint64_t size = buf->b_hdr->b_size;
2720 arc_buf_contents_t type = buf->b_hdr->b_type;
2721
2722 arc_adapt(size, state);
2723
2724 /*
2725 * We have not yet reached cache maximum size,
2726 * just allocate a new buffer.
2727 */
2728 if (!arc_evict_needed(type)) {
2729 if (type == ARC_BUFC_METADATA) {
2730 buf->b_data = zio_buf_alloc(size);
2731 arc_space_consume(size, ARC_SPACE_DATA);
2732 } else {
2733 ASSERT(type == ARC_BUFC_DATA);
2734 buf->b_data = zio_data_buf_alloc(size);
2735 ARCSTAT_INCR(arcstat_data_size, size);
2736 atomic_add_64(&arc_size, size);
2737 atomic_add_64(&arc_bytes_allocd, size);
2738 }
2739 goto out;
2740 }
2741
2742 /*
2743 * If we are prefetching from the mfu ghost list, this buffer
2744 * will end up on the mru list; so steal space from there.
2745 */
2746 if (state == arc_mfu_ghost)
2747 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2748 else if (state == arc_mru_ghost)
2749 state = arc_mru;
2750
2751 if (state == arc_mru || state == arc_anon) {
2752 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2753 state = (arc_mfu->arcs_lsize[type] >= size &&
2754 arc_p > mru_used) ? arc_mfu : arc_mru;
2755 } else {
2756 /* MFU cases */
2757 uint64_t mfu_space = arc_c - arc_p;
2758 state = (arc_mru->arcs_lsize[type] >= size &&
2759 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2760 }
2761 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2762 if (type == ARC_BUFC_METADATA) {
2763 buf->b_data = zio_buf_alloc(size);
2764 arc_space_consume(size, ARC_SPACE_DATA);
2765 } else {
2766 ASSERT(type == ARC_BUFC_DATA);
2767 buf->b_data = zio_data_buf_alloc(size);
2768 ARCSTAT_INCR(arcstat_data_size, size);
2769 atomic_add_64(&arc_size, size);
2770 atomic_add_64(&arc_bytes_allocd, size);
2771 }
2772 ARCSTAT_BUMP(arcstat_recycle_miss);
2773 }
2774 ASSERT(buf->b_data != NULL);
2775 out:
2776 /*
2777 * Update the state size. Note that ghost states have a
2778 * "ghost size" and so don't need to be updated.
2779 */
2780 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2781 arc_buf_hdr_t *hdr = buf->b_hdr;
2782
2783 atomic_add_64(&hdr->b_state->arcs_size, size);
2784 if (list_link_active(&hdr->b_arc_node)) {
2785 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2786 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2787 }
2788 /*
2789 * If we are growing the cache, and we are adding anonymous
2790 * data, and we have outgrown arc_p, update arc_p
3917 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3918 arc_tempreserve>>10,
3919 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3920 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3921 reserve>>10, arc_c>>10);
3922 return (SET_ERROR(ERESTART));
3923 }
3924 atomic_add_64(&arc_tempreserve, reserve);
3925 return (0);
3926 }
3927
3928 /* Tuneable, default is 64, which is essentially arbitrary */
3929 int zfs_flush_ntasks = 64;
3930
3931 void
3932 arc_init(void)
3933 {
3934 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3935 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3936
3937 mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3938 cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
3939
3940 /* Convert seconds to clock ticks */
3941 arc_min_prefetch_lifespan = 1 * hz;
3942
3943 /* Start out with 1/8 of all memory */
3944 arc_c = physmem * PAGESIZE / 8;
3945
3946 #ifdef _KERNEL
3947 /*
3948 * On architectures where the physical memory can be larger
3949 * than the addressable space (intel in 32-bit mode), we may
3950 * need to limit the cache to 1/8 of VM size.
3951 */
3952 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3953 #endif
3954
3955 /* initial sensible value */
3956 arc_pressure_threshold = arc_c;
3957 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3958 arc_c_min = MAX(arc_c / 4, 64<<20);
3959 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3960 if (arc_c * 8 >= 1<<30)
3961 arc_c_max = (arc_c * 8) - (1<<30);
3962 else
3963 arc_c_max = arc_c_min;
3964 arc_c_max = MAX(arc_c * 6, arc_c_max);
3965
3966 /*
3967 * Allow the tunables to override our calculations if they are
3968 * reasonable (i.e. over 64MB)
3969 */
3970 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3971 arc_c_max = zfs_arc_max;
3972 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3973 arc_c_min = zfs_arc_min;
3974
3975 arc_c = arc_c_max;
3976 arc_p = (arc_c >> 1);
4038
4039 arc_flush_taskq = taskq_create("arc_flush_tq",
4040 max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
4041 buf_init();
4042
4043 arc_thread_exit = 0;
4044 arc_eviction_list = NULL;
4045 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4046 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4047
4048 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4049 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4050
4051 if (arc_ksp != NULL) {
4052 arc_ksp->ks_data = &arc_stats;
4053 kstat_install(arc_ksp);
4054 }
4055
4056 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4057 TS_RUN, minclsyspri);
4058 (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
4059 TS_RUN, minclsyspri);
4060
4061 arc_dead = FALSE;
4062 arc_warm = B_FALSE;
4063
4064 /*
4065 * Calculate maximum amount of dirty data per pool.
4066 *
4067 * If it has been set by /etc/system, take that.
4068 * Otherwise, use a percentage of physical memory defined by
4069 * zfs_dirty_data_max_percent (default 10%) with a cap at
4070 * zfs_dirty_data_max_max (default 4GB).
4071 */
4072 if (zfs_dirty_data_max == 0) {
4073 zfs_dirty_data_max = physmem * PAGESIZE *
4074 zfs_dirty_data_max_percent / 100;
4075 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4076 zfs_dirty_data_max_max);
4077 }
4078 }
4079
4080 void
4081 arc_fini(void)
4082 {
4083 mutex_enter(&arc_reclaim_thr_lock);
4084 arc_thread_exit = 1;
4085 while (arc_thread_exit != 0)
4086 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4087 mutex_exit(&arc_reclaim_thr_lock);
4088
4089 mutex_enter(&arc_pressure_thr_lock);
4090 arc_pressure_thread_exit = 1;
4091 while (arc_pressure_thread_exit != 0)
4092 cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
4093 mutex_exit(&arc_pressure_thr_lock);
4094
4095 arc_flush(NULL);
4096
4097 arc_dead = TRUE;
4098
4099 if (arc_ksp != NULL) {
4100 kstat_delete(arc_ksp);
4101 arc_ksp = NULL;
4102 }
4103
4104 mutex_destroy(&arc_eviction_mtx);
4105 mutex_destroy(&arc_reclaim_thr_lock);
4106 cv_destroy(&arc_reclaim_thr_cv);
4107 mutex_destroy(&arc_pressure_thr_lock);
4108 cv_destroy(&arc_pressure_thr_cv);
4109
4110 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4111 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4112 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4113 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4114 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4115 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4116 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4117 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4118
4119 mutex_destroy(&arc_anon->arcs_mtx);
4120 mutex_destroy(&arc_mru->arcs_mtx);
4121 mutex_destroy(&arc_mru_ghost->arcs_mtx);
4122 mutex_destroy(&arc_mfu->arcs_mtx);
4123 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4124 mutex_destroy(&arc_l2c_only->arcs_mtx);
4125
4126 taskq_destroy(arc_flush_taskq);
4127 buf_fini();
4128
|