ARC pressure valve implementation

Old version of arc.c (before the change):

 132 #ifdef _KERNEL
 133 #include <sys/vmsystm.h>
 134 #include <vm/anon.h>
 135 #include <sys/fs/swapnode.h>
 136 #include <sys/dnlc.h>
 137 #endif
 138 #include <sys/callb.h>
 139 #include <sys/kstat.h>
 140 #include <zfs_fletcher.h>
 141 
 142 #ifndef _KERNEL
 143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 144 boolean_t arc_watch = B_FALSE;
 145 int arc_procfd;
 146 #endif
 147 
 148 static kmutex_t         arc_reclaim_thr_lock;
 149 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 150 static uint8_t          arc_thread_exit;
 151 
 152 #define ARC_REDUCE_DNLC_PERCENT 3
 153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154 
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159 
 160 /*
 161  * The number of iterations through arc_evict_*() before we
 162  * drop & reacquire the lock.
 163  */
 164 int arc_evict_iterations = 100;
 165 
 166 /* number of seconds before growing cache again */
 167 static int              arc_grow_retry = 60;
 168 
 169 /* shift of arc_c for calculating both min and max arc_p */
 170 static int              arc_p_min_shift = 4;
 171 


 278          * indirect prefetch buffers that have not lived long enough, or are
 279          * not from the spa we're trying to evict from.
 280          */
 281         kstat_named_t arcstat_evict_skip;
 282         kstat_named_t arcstat_evict_l2_cached;
 283         kstat_named_t arcstat_evict_l2_eligible;
 284         kstat_named_t arcstat_evict_l2_ineligible;
 285         kstat_named_t arcstat_hash_elements;
 286         kstat_named_t arcstat_hash_elements_max;
 287         kstat_named_t arcstat_hash_collisions;
 288         kstat_named_t arcstat_hash_chains;
 289         kstat_named_t arcstat_hash_chain_max;
 290         kstat_named_t arcstat_p;
 291         kstat_named_t arcstat_c;
 292         kstat_named_t arcstat_c_min;
 293         kstat_named_t arcstat_c_max;
 294         kstat_named_t arcstat_size;
 295         kstat_named_t arcstat_hdr_size;
 296         kstat_named_t arcstat_data_size;
 297         kstat_named_t arcstat_other_size;
 298         kstat_named_t arcstat_l2_hits;
 299         kstat_named_t arcstat_l2_misses;
 300         kstat_named_t arcstat_l2_feeds;
 301         kstat_named_t arcstat_l2_rw_clash;
 302         kstat_named_t arcstat_l2_read_bytes;
 303         kstat_named_t arcstat_l2_write_bytes;
 304         kstat_named_t arcstat_l2_writes_sent;
 305         kstat_named_t arcstat_l2_writes_done;
 306         kstat_named_t arcstat_l2_writes_error;
 307         kstat_named_t arcstat_l2_writes_hdr_miss;
 308         kstat_named_t arcstat_l2_evict_lock_retry;
 309         kstat_named_t arcstat_l2_evict_reading;
 310         kstat_named_t arcstat_l2_free_on_write;
 311         kstat_named_t arcstat_l2_abort_lowmem;
 312         kstat_named_t arcstat_l2_cksum_bad;
 313         kstat_named_t arcstat_l2_io_error;
 314         kstat_named_t arcstat_l2_size;
 315         kstat_named_t arcstat_l2_asize;
 316         kstat_named_t arcstat_l2_hdr_size;
 317         kstat_named_t arcstat_l2_compress_successes;


 344         { "deleted",                    KSTAT_DATA_UINT64 },
 345         { "recycle_miss",               KSTAT_DATA_UINT64 },
 346         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 347         { "evict_skip",                 KSTAT_DATA_UINT64 },
 348         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 349         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 350         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 351         { "hash_elements",              KSTAT_DATA_UINT64 },
 352         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 353         { "hash_collisions",            KSTAT_DATA_UINT64 },
 354         { "hash_chains",                KSTAT_DATA_UINT64 },
 355         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 356         { "p",                          KSTAT_DATA_UINT64 },
 357         { "c",                          KSTAT_DATA_UINT64 },
 358         { "c_min",                      KSTAT_DATA_UINT64 },
 359         { "c_max",                      KSTAT_DATA_UINT64 },
 360         { "size",                       KSTAT_DATA_UINT64 },
 361         { "hdr_size",                   KSTAT_DATA_UINT64 },
 362         { "data_size",                  KSTAT_DATA_UINT64 },
 363         { "other_size",                 KSTAT_DATA_UINT64 },
 364         { "l2_hits",                    KSTAT_DATA_UINT64 },
 365         { "l2_misses",                  KSTAT_DATA_UINT64 },
 366         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 367         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 368         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 369         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 370         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 371         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 372         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 373         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 374         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 375         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 376         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 377         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 378         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 379         { "l2_io_error",                KSTAT_DATA_UINT64 },
 380         { "l2_size",                    KSTAT_DATA_UINT64 },
 381         { "l2_asize",                   KSTAT_DATA_UINT64 },
 382         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 383         { "l2_compress_successes",      KSTAT_DATA_UINT64 },


 413 /*
 414  * We define a macro to allow ARC hits/misses to be easily broken down by
 415  * two separate conditions, giving a total of four different subtypes for
 416  * each of hits and misses (so eight statistics total).
 417  */
 418 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 419         if (cond1) {                                                    \
 420                 if (cond2) {                                            \
 421                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 422                 } else {                                                \
 423                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 424                 }                                                       \
 425         } else {                                                        \
 426                 if (cond2) {                                            \
 427                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 428                 } else {                                                \
 429                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 430                 }                                                       \
 431         }
 432 
 433 kstat_t                 *arc_ksp;
 434 static arc_state_t      *arc_anon;
 435 static arc_state_t      *arc_mru;
 436 static arc_state_t      *arc_mru_ghost;
 437 static arc_state_t      *arc_mfu;
 438 static arc_state_t      *arc_mfu_ghost;
 439 static arc_state_t      *arc_l2c_only;
 440 
 441 /*
 442  * There are several ARC variables that are critical to export as kstats --
 443  * but we don't want to have to grovel around in the kstat whenever we wish to
 444  * manipulate them.  For these variables, we therefore define them to be in
 445  * terms of the statistic variable.  This assures that we are not introducing
 446  * the possibility of inconsistency by having shadow copies of the variables,
 447  * while still allowing the code to be readable.
 448  */
 449 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 450 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 451 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 452 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 453 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 454 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 455 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 456 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 457 
 458 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 459         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 460 
 461 static int              arc_no_grow;    /* Don't try to grow cache size */
 462 static uint64_t         arc_tempreserve;
 463 static uint64_t         arc_loaned_bytes;
 464 
 465 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 466 
 467 typedef struct arc_callback arc_callback_t;
 468 
 469 struct arc_callback {
 470         void                    *acb_private;
 471         arc_done_func_t         *acb_done;
 472         arc_buf_t               *acb_buf;
 473         zio_t                   *acb_zio_dummy;
 474         arc_callback_t          *acb_next;
 475 };
 476 
 477 typedef struct arc_write_callback arc_write_callback_t;
 478 
 479 struct arc_write_callback {
 480         void            *awcb_private;
 481         arc_done_func_t *awcb_ready;
 482         arc_done_func_t *awcb_physdone;
 483         arc_done_func_t *awcb_done;


1257 {
1258         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259 
1260         switch (type) {
1261         case ARC_SPACE_DATA:
1262                 ARCSTAT_INCR(arcstat_data_size, space);
1263                 break;
1264         case ARC_SPACE_OTHER:
1265                 ARCSTAT_INCR(arcstat_other_size, space);
1266                 break;
1267         case ARC_SPACE_HDRS:
1268                 ARCSTAT_INCR(arcstat_hdr_size, space);
1269                 break;
1270         case ARC_SPACE_L2HDRS:
1271                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272                 break;
1273         }
1274 
1275         ARCSTAT_INCR(arcstat_meta_used, space);
1276         atomic_add_64(&arc_size, space);
1277 }
1278 
1279 void
1280 arc_space_return(uint64_t space, arc_space_type_t type)
1281 {
1282         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283 
1284         switch (type) {
1285         case ARC_SPACE_DATA:
1286                 ARCSTAT_INCR(arcstat_data_size, -space);
1287                 break;
1288         case ARC_SPACE_OTHER:
1289                 ARCSTAT_INCR(arcstat_other_size, -space);
1290                 break;
1291         case ARC_SPACE_HDRS:
1292                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1293                 break;
1294         case ARC_SPACE_L2HDRS:
1295                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1296                 break;
1297         }
1298 
1299         ASSERT(arc_meta_used >= space);
1300         if (arc_meta_max < arc_meta_used)
1301                 arc_meta_max = arc_meta_used;
1302         ARCSTAT_INCR(arcstat_meta_used, -space);
1303         ASSERT(arc_size >= space);
1304         atomic_add_64(&arc_size, -space);
1305 }
1306 
1307 void *
1308 arc_data_buf_alloc(uint64_t size)
1309 {
1310         if (arc_evict_needed(ARC_BUFC_DATA))
1311                 cv_signal(&arc_reclaim_thr_cv);
1312         atomic_add_64(&arc_size, size);
1313         return (zio_data_buf_alloc(size));
1314 }
1315 
1316 void
1317 arc_data_buf_free(void *buf, uint64_t size)
1318 {
1319         zio_data_buf_free(buf, size);
1320         ASSERT(arc_size >= size);
1321         atomic_add_64(&arc_size, -size);
1322 }
1323 
1324 arc_buf_t *
1325 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1326 {
1327         arc_buf_hdr_t *hdr;
1328         arc_buf_t *buf;
1329 
1330         ASSERT3U(size, >, 0);
1331         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1332         ASSERT(BUF_EMPTY(hdr));


2094         /*
2095          * Adjust ghost lists
2096          */
2097 
2098         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2099 
2100         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2101                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2102                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2103         }
2104 
2105         adjustment =
2106             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2107 
2108         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2109                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2110                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2111         }
2112 }
2113 
2114 static void
2115 arc_do_user_evicts(void)
2116 {
2117         mutex_enter(&arc_eviction_mtx);
2118         while (arc_eviction_list != NULL) {
2119                 arc_buf_t *buf = arc_eviction_list;
2120                 arc_eviction_list = buf->b_next;
2121                 mutex_enter(&buf->b_evict_lock);
2122                 buf->b_hdr = NULL;
2123                 mutex_exit(&buf->b_evict_lock);
2124                 mutex_exit(&arc_eviction_mtx);
2125 
2126                 if (buf->b_efunc != NULL)
2127                         VERIFY0(buf->b_efunc(buf->b_private));
2128 
2129                 buf->b_efunc = NULL;
2130                 buf->b_private = NULL;
2131                 kmem_cache_free(buf_cache, buf);
2132                 mutex_enter(&arc_eviction_mtx);
2133         }
2134         mutex_exit(&arc_eviction_mtx);


2232                 to_free = arc_c >> arc_shrink_shift;
2233 #endif
2234                 if (arc_c > arc_c_min + to_free)
2235                         atomic_add_64(&arc_c, -to_free);
2236                 else
2237                         arc_c = arc_c_min;
2238 
2239                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2240                 if (arc_c > arc_size)
2241                         arc_c = MAX(arc_size, arc_c_min);
2242                 if (arc_p > arc_c)
2243                         arc_p = (arc_c >> 1);
2244                 ASSERT(arc_c >= arc_c_min);
2245                 ASSERT((int64_t)arc_p >= 0);
2246         }
2247 
2248         if (arc_size > arc_c)
2249                 arc_adjust();
2250 }
2251 
2252 /*
2253  * Determine if the system is under memory pressure and is asking
2254  * to reclaim memory. A return value of 1 indicates that the system
2255  * is under memory pressure and that the arc should adjust accordingly.
2256  */
2257 static int
2258 arc_reclaim_needed(void)
2259 {
2260         uint64_t extra;
2261 
2262 #ifdef _KERNEL
2263 
2264         if (needfree)
2265                 return (1);
2266 
2267         /*
2268          * take 'desfree' extra pages, so we reclaim sooner, rather than later
2269          */
2270         extra = desfree;
2271 


2374                         prev_cache = zio_buf_cache[i];
2375                         kmem_cache_reap_now(zio_buf_cache[i]);
2376                 }
2377                 if (zio_data_buf_cache[i] != prev_data_cache) {
2378                         prev_data_cache = zio_data_buf_cache[i];
2379                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2380                 }
2381         }
2382         kmem_cache_reap_now(buf_cache);
2383         kmem_cache_reap_now(hdr_cache);
2384         kmem_cache_reap_now(range_seg_cache);
2385 
2386         /*
 2387          * Ask the vmem arena to reclaim unused memory from its
2388          * quantum caches.
2389          */
2390         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2391                 vmem_qcache_reap(zio_arena);
2392 }
2393 
2394 static void
2395 arc_reclaim_thread(void)
2396 {
2397         clock_t                 growtime = 0;
2398         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2399         callb_cpr_t             cpr;
2400 
2401         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2402 
2403         mutex_enter(&arc_reclaim_thr_lock);
2404         while (arc_thread_exit == 0) {
2405                 if (arc_reclaim_needed()) {
2406 
2407                         if (arc_no_grow) {
2408                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2409                                         last_reclaim = ARC_RECLAIM_AGGR;
2410                                 } else {
2411                                         last_reclaim = ARC_RECLAIM_CONS;
2412                                 }
2413                         } else {
2414                                 arc_no_grow = TRUE;


2483                 delta = MIN(bytes * mult, arc_p);
2484                 arc_p = MAX(arc_p_min, arc_p - delta);
2485         }
2486         ASSERT((int64_t)arc_p >= 0);
2487 
2488         if (arc_reclaim_needed()) {
2489                 cv_signal(&arc_reclaim_thr_cv);
2490                 return;
2491         }
2492 
2493         if (arc_no_grow)
2494                 return;
2495 
2496         if (arc_c >= arc_c_max)
2497                 return;
2498 
2499         /*
2500          * If we're within (2 * maxblocksize) bytes of the target
2501          * cache size, increment the target cache size
2502          */
2503         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2504                 atomic_add_64(&arc_c, (int64_t)bytes);
2505                 if (arc_c > arc_c_max)
2506                         arc_c = arc_c_max;
2507                 else if (state == arc_anon)
2508                         atomic_add_64(&arc_p, (int64_t)bytes);
2509                 if (arc_p > arc_c)
2510                         arc_p = arc_c;
2511         }
2512         ASSERT((int64_t)arc_p >= 0);
2513 }
2514 
2515 /*
2516  * Check if the cache has reached its limits and eviction is required
2517  * prior to insert.
2518  */
2519 static int
2520 arc_evict_needed(arc_buf_contents_t type)
2521 {
2522         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2523                 return (1);


2556 {
2557         arc_state_t             *state = buf->b_hdr->b_state;
2558         uint64_t                size = buf->b_hdr->b_size;
2559         arc_buf_contents_t      type = buf->b_hdr->b_type;
2560 
2561         arc_adapt(size, state);
2562 
2563         /*
2564          * We have not yet reached cache maximum size,
2565          * just allocate a new buffer.
2566          */
2567         if (!arc_evict_needed(type)) {
2568                 if (type == ARC_BUFC_METADATA) {
2569                         buf->b_data = zio_buf_alloc(size);
2570                         arc_space_consume(size, ARC_SPACE_DATA);
2571                 } else {
2572                         ASSERT(type == ARC_BUFC_DATA);
2573                         buf->b_data = zio_data_buf_alloc(size);
2574                         ARCSTAT_INCR(arcstat_data_size, size);
2575                         atomic_add_64(&arc_size, size);
2576                 }
2577                 goto out;
2578         }
2579 
2580         /*
2581          * If we are prefetching from the mfu ghost list, this buffer
2582          * will end up on the mru list; so steal space from there.
2583          */
2584         if (state == arc_mfu_ghost)
2585                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2586         else if (state == arc_mru_ghost)
2587                 state = arc_mru;
2588 
2589         if (state == arc_mru || state == arc_anon) {
2590                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2591                 state = (arc_mfu->arcs_lsize[type] >= size &&
2592                     arc_p > mru_used) ? arc_mfu : arc_mru;
2593         } else {
2594                 /* MFU cases */
2595                 uint64_t mfu_space = arc_c - arc_p;
2596                 state =  (arc_mru->arcs_lsize[type] >= size &&
2597                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2598         }
2599         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2600                 if (type == ARC_BUFC_METADATA) {
2601                         buf->b_data = zio_buf_alloc(size);
2602                         arc_space_consume(size, ARC_SPACE_DATA);
2603                 } else {
2604                         ASSERT(type == ARC_BUFC_DATA);
2605                         buf->b_data = zio_data_buf_alloc(size);
2606                         ARCSTAT_INCR(arcstat_data_size, size);
2607                         atomic_add_64(&arc_size, size);
2608                 }
2609                 ARCSTAT_BUMP(arcstat_recycle_miss);
2610         }
2611         ASSERT(buf->b_data != NULL);
2612 out:
2613         /*
2614          * Update the state size.  Note that ghost states have a
2615          * "ghost size" and so don't need to be updated.
2616          */
2617         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2618                 arc_buf_hdr_t *hdr = buf->b_hdr;
2619 
2620                 atomic_add_64(&hdr->b_state->arcs_size, size);
2621                 if (list_link_active(&hdr->b_arc_node)) {
2622                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2623                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2624                 }
2625                 /*
2626                  * If we are growing the cache, and we are adding anonymous
2627                  * data, and we have outgrown arc_p, update arc_p


3754                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3755                     arc_tempreserve>>10,
3756                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3757                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3758                     reserve>>10, arc_c>>10);
3759                 return (SET_ERROR(ERESTART));
3760         }
3761         atomic_add_64(&arc_tempreserve, reserve);
3762         return (0);
3763 }
3764 
3765 /* Tuneable, default is 64, which is essentially arbitrary */
3766 int zfs_flush_ntasks = 64;
3767 
3768 void
3769 arc_init(void)
3770 {
3771         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3772         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3773 
3774         /* Convert seconds to clock ticks */
3775         arc_min_prefetch_lifespan = 1 * hz;
3776 
3777         /* Start out with 1/8 of all memory */
3778         arc_c = physmem * PAGESIZE / 8;
3779 
3780 #ifdef _KERNEL
3781         /*
3782          * On architectures where the physical memory can be larger
3783          * than the addressable space (intel in 32-bit mode), we may
3784          * need to limit the cache to 1/8 of VM size.
3785          */
3786         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3787 #endif
3788 
3789         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3790         arc_c_min = MAX(arc_c / 4, 64<<20);
3791         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3792         if (arc_c * 8 >= 1<<30)
3793                 arc_c_max = (arc_c * 8) - (1<<30);
3794         else
3795                 arc_c_max = arc_c_min;
3796         arc_c_max = MAX(arc_c * 6, arc_c_max);
3797 
3798         /*
3799          * Allow the tunables to override our calculations if they are
 3800          * reasonable (i.e. over 64MB)
3801          */
3802         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3803                 arc_c_max = zfs_arc_max;
3804         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3805                 arc_c_min = zfs_arc_min;
3806 
3807         arc_c = arc_c_max;
3808         arc_p = (arc_c >> 1);


3870 
3871         arc_flush_taskq = taskq_create("arc_flush_tq",
3872             max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
3873         buf_init();
3874 
3875         arc_thread_exit = 0;
3876         arc_eviction_list = NULL;
3877         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3878         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3879 
3880         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3881             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3882 
3883         if (arc_ksp != NULL) {
3884                 arc_ksp->ks_data = &arc_stats;
3885                 kstat_install(arc_ksp);
3886         }
3887 
3888         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3889             TS_RUN, minclsyspri);
3890 
3891         arc_dead = FALSE;
3892         arc_warm = B_FALSE;
3893 
3894         /*
3895          * Calculate maximum amount of dirty data per pool.
3896          *
3897          * If it has been set by /etc/system, take that.
3898          * Otherwise, use a percentage of physical memory defined by
3899          * zfs_dirty_data_max_percent (default 10%) with a cap at
3900          * zfs_dirty_data_max_max (default 4GB).
3901          */
3902         if (zfs_dirty_data_max == 0) {
3903                 zfs_dirty_data_max = physmem * PAGESIZE *
3904                     zfs_dirty_data_max_percent / 100;
3905                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3906                     zfs_dirty_data_max_max);
3907         }
3908 }
3909 
3910 void
3911 arc_fini(void)
3912 {
3913         mutex_enter(&arc_reclaim_thr_lock);
3914         arc_thread_exit = 1;
3915         while (arc_thread_exit != 0)
3916                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3917         mutex_exit(&arc_reclaim_thr_lock);
3918 
3919         arc_flush(NULL);
3920 
3921         arc_dead = TRUE;
3922 
3923         if (arc_ksp != NULL) {
3924                 kstat_delete(arc_ksp);
3925                 arc_ksp = NULL;
3926         }
3927 
3928         mutex_destroy(&arc_eviction_mtx);
3929         mutex_destroy(&arc_reclaim_thr_lock);
3930         cv_destroy(&arc_reclaim_thr_cv);
3931 
3932         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3933         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3934         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3935         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3936         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3937         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3938         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3939         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3940 
3941         mutex_destroy(&arc_anon->arcs_mtx);
3942         mutex_destroy(&arc_mru->arcs_mtx);
3943         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3944         mutex_destroy(&arc_mfu->arcs_mtx);
3945         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3946         mutex_destroy(&arc_l2c_only->arcs_mtx);
3947 
3948         taskq_destroy(arc_flush_taskq);
3949         buf_fini();
3950 


New version of arc.c (with the ARC pressure valve):

 132 #ifdef _KERNEL
 133 #include <sys/vmsystm.h>
 134 #include <vm/anon.h>
 135 #include <sys/fs/swapnode.h>
 136 #include <sys/dnlc.h>
 137 #endif
 138 #include <sys/callb.h>
 139 #include <sys/kstat.h>
 140 #include <zfs_fletcher.h>
 141 
 142 #ifndef _KERNEL
 143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 144 boolean_t arc_watch = B_FALSE;
 145 int arc_procfd;
 146 #endif
 147 
 148 static kmutex_t         arc_reclaim_thr_lock;
 149 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 150 static uint8_t          arc_thread_exit;
 151 
 152 static kmutex_t         arc_pressure_thr_lock;
 153 static kcondvar_t       arc_pressure_thr_cv;
 154 static uint8_t          arc_pressure_thread_exit;
 155 static uint64_t         arc_pressure_threshold;
 156 
 157 #define ARC_REDUCE_DNLC_PERCENT 3
 158 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 159 
 160 typedef enum arc_reclaim_strategy {
 161         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 162         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 163 } arc_reclaim_strategy_t;
 164 
 165 /*
 166  * The number of iterations through arc_evict_*() before we
 167  * drop & reacquire the lock.
 168  */
 169 int arc_evict_iterations = 100;
 170 
 171 /* number of seconds before growing cache again */
 172 static int              arc_grow_retry = 60;
 173 
 174 /* shift of arc_c for calculating both min and max arc_p */
 175 static int              arc_p_min_shift = 4;
 176 


 283          * indirect prefetch buffers that have not lived long enough, or are
 284          * not from the spa we're trying to evict from.
 285          */
 286         kstat_named_t arcstat_evict_skip;
 287         kstat_named_t arcstat_evict_l2_cached;
 288         kstat_named_t arcstat_evict_l2_eligible;
 289         kstat_named_t arcstat_evict_l2_ineligible;
 290         kstat_named_t arcstat_hash_elements;
 291         kstat_named_t arcstat_hash_elements_max;
 292         kstat_named_t arcstat_hash_collisions;
 293         kstat_named_t arcstat_hash_chains;
 294         kstat_named_t arcstat_hash_chain_max;
 295         kstat_named_t arcstat_p;
 296         kstat_named_t arcstat_c;
 297         kstat_named_t arcstat_c_min;
 298         kstat_named_t arcstat_c_max;
 299         kstat_named_t arcstat_size;
 300         kstat_named_t arcstat_hdr_size;
 301         kstat_named_t arcstat_data_size;
 302         kstat_named_t arcstat_other_size;
 303         kstat_named_t arcstat_growth_rate;
 304         kstat_named_t arcstat_l2_hits;
 305         kstat_named_t arcstat_l2_misses;
 306         kstat_named_t arcstat_l2_feeds;
 307         kstat_named_t arcstat_l2_rw_clash;
 308         kstat_named_t arcstat_l2_read_bytes;
 309         kstat_named_t arcstat_l2_write_bytes;
 310         kstat_named_t arcstat_l2_writes_sent;
 311         kstat_named_t arcstat_l2_writes_done;
 312         kstat_named_t arcstat_l2_writes_error;
 313         kstat_named_t arcstat_l2_writes_hdr_miss;
 314         kstat_named_t arcstat_l2_evict_lock_retry;
 315         kstat_named_t arcstat_l2_evict_reading;
 316         kstat_named_t arcstat_l2_free_on_write;
 317         kstat_named_t arcstat_l2_abort_lowmem;
 318         kstat_named_t arcstat_l2_cksum_bad;
 319         kstat_named_t arcstat_l2_io_error;
 320         kstat_named_t arcstat_l2_size;
 321         kstat_named_t arcstat_l2_asize;
 322         kstat_named_t arcstat_l2_hdr_size;
 323         kstat_named_t arcstat_l2_compress_successes;


 350         { "deleted",                    KSTAT_DATA_UINT64 },
 351         { "recycle_miss",               KSTAT_DATA_UINT64 },
 352         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 353         { "evict_skip",                 KSTAT_DATA_UINT64 },
 354         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 355         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 356         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 357         { "hash_elements",              KSTAT_DATA_UINT64 },
 358         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 359         { "hash_collisions",            KSTAT_DATA_UINT64 },
 360         { "hash_chains",                KSTAT_DATA_UINT64 },
 361         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 362         { "p",                          KSTAT_DATA_UINT64 },
 363         { "c",                          KSTAT_DATA_UINT64 },
 364         { "c_min",                      KSTAT_DATA_UINT64 },
 365         { "c_max",                      KSTAT_DATA_UINT64 },
 366         { "size",                       KSTAT_DATA_UINT64 },
 367         { "hdr_size",                   KSTAT_DATA_UINT64 },
 368         { "data_size",                  KSTAT_DATA_UINT64 },
 369         { "other_size",                 KSTAT_DATA_UINT64 },
 370         { "growth_rate",                KSTAT_DATA_UINT64 },
 371         { "l2_hits",                    KSTAT_DATA_UINT64 },
 372         { "l2_misses",                  KSTAT_DATA_UINT64 },
 373         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 374         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 375         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 376         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 377         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 378         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 379         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 380         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 381         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 382         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 383         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 384         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 385         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 386         { "l2_io_error",                KSTAT_DATA_UINT64 },
 387         { "l2_size",                    KSTAT_DATA_UINT64 },
 388         { "l2_asize",                   KSTAT_DATA_UINT64 },
 389         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 390         { "l2_compress_successes",      KSTAT_DATA_UINT64 },


 420 /*
 421  * We define a macro to allow ARC hits/misses to be easily broken down by
 422  * two separate conditions, giving a total of four different subtypes for
 423  * each of hits and misses (so eight statistics total).
 424  */
 425 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 426         if (cond1) {                                                    \
 427                 if (cond2) {                                            \
 428                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 429                 } else {                                                \
 430                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 431                 }                                                       \
 432         } else {                                                        \
 433                 if (cond2) {                                            \
 434                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 435                 } else {                                                \
 436                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 437                 }                                                       \
 438         }
 439 
 440 /*
 441  * This macro allows us to use kstats as floating averages. Each time we
 442  * update this kstat, we first factor it and the update value by `factor'
 443  * to shrink the new value's contribution to the overall average. This
 444  * macro assumes that integer loads and stores are atomic, but is not
 445  * safe for multiple writers updating the kstat in parallel (only the
 446  * last writer's update will remain).
 447  */
 448 #define ARCSTAT_F_AVG(stat, value, factor) \
 449         do { \
 450                 uint64_t x = ARCSTAT(stat); \
 451                 x = x - x / factor + (value) / factor; \
 452                 ARCSTAT(stat) = x; \
 453                 _NOTE(CONSTCOND) \
 455         } while (0)
 456 
 457 kstat_t                 *arc_ksp;
 458 static arc_state_t      *arc_anon;
 459 static arc_state_t      *arc_mru;
 460 static arc_state_t      *arc_mru_ghost;
 461 static arc_state_t      *arc_mfu;
 462 static arc_state_t      *arc_mfu_ghost;
 463 static arc_state_t      *arc_l2c_only;
 464 
 465 /*
 466  * There are several ARC variables that are critical to export as kstats --
 467  * but we don't want to have to grovel around in the kstat whenever we wish to
 468  * manipulate them.  For these variables, we therefore define them to be in
 469  * terms of the statistic variable.  This assures that we are not introducing
 470  * the possibility of inconsistency by having shadow copies of the variables,
 471  * while still allowing the code to be readable.
 472  */
 473 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 474 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 475 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 476 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 477 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 478 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 479 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 480 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 481 
 482 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 483         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 484 
 485 static int              arc_no_grow;    /* Don't try to grow cache size */
 486 static uint64_t         arc_tempreserve;
 487 static uint64_t         arc_loaned_bytes;
 488 static uint64_t         arc_bytes_allocd = 0;
 489 
 490 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 491 
 492 typedef struct arc_callback arc_callback_t;
 493 
 494 struct arc_callback {
 495         void                    *acb_private;
 496         arc_done_func_t         *acb_done;
 497         arc_buf_t               *acb_buf;
 498         zio_t                   *acb_zio_dummy;
 499         arc_callback_t          *acb_next;
 500 };
 501 
 502 typedef struct arc_write_callback arc_write_callback_t;
 503 
 504 struct arc_write_callback {
 505         void            *awcb_private;
 506         arc_done_func_t *awcb_ready;
 507         arc_done_func_t *awcb_physdone;
 508         arc_done_func_t *awcb_done;

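For reference, the ARC hit and miss paths invoke ARCSTAT_CONDSTAT with calls of
roughly the following shape (adapted from a call site outside this excerpt;
treat the details as illustrative). The token pasting selects exactly one of
four counters:

    ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
        demand, prefetch,
        hdr->b_type != ARC_BUFC_METADATA,
        data, metadata,
        hits);

    /* ...which expands (reformatted) to: */
    if (!(hdr->b_flags & ARC_PREFETCH)) {
            if (hdr->b_type != ARC_BUFC_METADATA)
                    ARCSTAT_BUMP(arcstat_demand_data_hits);
            else
                    ARCSTAT_BUMP(arcstat_demand_metadata_hits);
    } else {
            if (hdr->b_type != ARC_BUFC_METADATA)
                    ARCSTAT_BUMP(arcstat_prefetch_data_hits);
            else
                    ARCSTAT_BUMP(arcstat_prefetch_metadata_hits);
    }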

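Plugging numbers into ARCSTAT_F_AVG makes the decay behavior concrete. The
following is a minimal userland sketch, not kernel code: a plain uint64_t
stands in for the kstat, and f_avg is an invented name. With factor = 4, each
update keeps 3/4 of the history and folds in 1/4 of the new sample:

    #include <stdio.h>
    #include <stdint.h>

    /* The ARCSTAT_F_AVG update rule as a pure function. */
    static uint64_t
    f_avg(uint64_t avg, uint64_t value, uint64_t factor)
    {
            return (avg - avg / factor + value / factor);
    }

    int
    main(void)
    {
            uint64_t avg = 0;
            /* A burst of allocation followed by quiet periods. */
            uint64_t samples[] = { 4000, 4000, 0, 0, 0, 0 };

            for (int i = 0; i < 6; i++) {
                    avg = f_avg(avg, samples[i], 4);
                    printf("sample %4llu -> avg %llu\n",
                        (unsigned long long)samples[i],
                        (unsigned long long)avg);
            }
            /* avg runs 1000, 1750, 1313, 985, 739, 555. */
            return (0);
    }

Note how arc_pressure_thread() further down uses this asymmetrically: a new
rate above the current average replaces it outright, while lower rates only
decay it with factor 4, so the estimate reacts instantly to allocation bursts
and relaxes gradually afterwards.
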
1282 {
1283         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1284 
1285         switch (type) {
1286         case ARC_SPACE_DATA:
1287                 ARCSTAT_INCR(arcstat_data_size, space);
1288                 break;
1289         case ARC_SPACE_OTHER:
1290                 ARCSTAT_INCR(arcstat_other_size, space);
1291                 break;
1292         case ARC_SPACE_HDRS:
1293                 ARCSTAT_INCR(arcstat_hdr_size, space);
1294                 break;
1295         case ARC_SPACE_L2HDRS:
1296                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1297                 break;
1298         }
1299 
1300         ARCSTAT_INCR(arcstat_meta_used, space);
1301         atomic_add_64(&arc_size, space);
1302         atomic_add_64(&arc_bytes_allocd, space);
1303 }
1304 
1305 void
1306 arc_space_return(uint64_t space, arc_space_type_t type)
1307 {
1308         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1309 
1310         switch (type) {
1311         case ARC_SPACE_DATA:
1312                 ARCSTAT_INCR(arcstat_data_size, -space);
1313                 break;
1314         case ARC_SPACE_OTHER:
1315                 ARCSTAT_INCR(arcstat_other_size, -space);
1316                 break;
1317         case ARC_SPACE_HDRS:
1318                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1319                 break;
1320         case ARC_SPACE_L2HDRS:
1321                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1322                 break;
1323         }
1324 
1325         ASSERT(arc_meta_used >= space);
1326         if (arc_meta_max < arc_meta_used)
1327                 arc_meta_max = arc_meta_used;
1328         ARCSTAT_INCR(arcstat_meta_used, -space);
1329         ASSERT(arc_size >= space);
1330         atomic_add_64(&arc_size, -space);
1331 }
1332 
1333 void *
1334 arc_data_buf_alloc(uint64_t size)
1335 {
1336         if (arc_evict_needed(ARC_BUFC_DATA))
1337                 cv_signal(&arc_reclaim_thr_cv);
1338         atomic_add_64(&arc_size, size);
1339         atomic_add_64(&arc_bytes_allocd, size);
1340         return (zio_data_buf_alloc(size));
1341 }
1342 
1343 void
1344 arc_data_buf_free(void *buf, uint64_t size)
1345 {
1346         zio_data_buf_free(buf, size);
1347         ASSERT(arc_size >= size);
1348         atomic_add_64(&arc_size, -size);
1349 }
1350 
1351 arc_buf_t *
1352 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1353 {
1354         arc_buf_hdr_t *hdr;
1355         arc_buf_t *buf;
1356 
1357         ASSERT3U(size, >, 0);
1358         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1359         ASSERT(BUF_EMPTY(hdr));


2121         /*
2122          * Adjust ghost lists
2123          */
2124 
2125         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2126 
2127         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2128                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2129                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2130         }
2131 
2132         adjustment =
2133             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2134 
2135         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2136                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2137                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2138         }
2139 }
2140 
2141 #define ACCURACY        1000
2142 
2143 static void
2144 arc_reclaim_bytes(uint64_t to_evict)
2145 {
2146         uint64_t to_evict_data_mru, to_evict_data_mfu;
2147         uint64_t to_evict_meta_mru, to_evict_meta_mfu;
2148 
2149         to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
2150             ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2151             to_evict) / ACCURACY;
2152         to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
2153             ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2154             to_evict) / ACCURACY;
2155         to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
2156             ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2157             to_evict) / ACCURACY;
2158         to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
2159             ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2160             to_evict) / ACCURACY;
2161 
2162         if (to_evict_meta_mru > 0)
2163                 (void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
2164                     ARC_BUFC_METADATA);
2165         if (to_evict_data_mru > 0)
2166                 (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
2167                     ARC_BUFC_DATA);
2168         if (to_evict_meta_mfu > 0)
2169                 (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
2170                     ARC_BUFC_METADATA);
2171         if (to_evict_data_mfu > 0)
2172                 (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
2173                     ARC_BUFC_DATA);
2174 }
2175 
2176 static void
2177 arc_do_user_evicts(void)
2178 {
2179         mutex_enter(&arc_eviction_mtx);
2180         while (arc_eviction_list != NULL) {
2181                 arc_buf_t *buf = arc_eviction_list;
2182                 arc_eviction_list = buf->b_next;
2183                 mutex_enter(&buf->b_evict_lock);
2184                 buf->b_hdr = NULL;
2185                 mutex_exit(&buf->b_evict_lock);
2186                 mutex_exit(&arc_eviction_mtx);
2187 
2188                 if (buf->b_efunc != NULL)
2189                         VERIFY0(buf->b_efunc(buf->b_private));
2190 
2191                 buf->b_efunc = NULL;
2192                 buf->b_private = NULL;
2193                 kmem_cache_free(buf_cache, buf);
2194                 mutex_enter(&arc_eviction_mtx);
2195         }
2196         mutex_exit(&arc_eviction_mtx);

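arc_reclaim_bytes() above splits the eviction target across the four evictable
lists pro rata. Scaling each list's fraction to parts per thousand first
(ACCURACY = 1000) keeps the intermediate products within 64 bits (multiplying
lsize directly by to_evict could overflow for large caches), at the cost of at
most about 0.1% rounding error. Below is a userland sketch with hypothetical
sizes; share() is an invented helper, and the denominator is simplified to the
sum of the four lists where the kernel uses arc_mru->arcs_size +
arc_mfu->arcs_size:

    #include <stdio.h>
    #include <stdint.h>

    #define ACCURACY        1000    /* same fixed-point scale as above */

    /* Pro-rata share of to_evict owed by a list of size lsize. */
    static uint64_t
    share(uint64_t lsize, uint64_t total, uint64_t to_evict)
    {
            return ((((lsize * ACCURACY) / total) * to_evict) / ACCURACY);
    }

    int
    main(void)
    {
            /* Hypothetical evictable-list sizes, in bytes. */
            uint64_t mru_meta = 100ULL << 20, mru_data = 300ULL << 20;
            uint64_t mfu_meta = 200ULL << 20, mfu_data = 400ULL << 20;
            uint64_t total = mru_meta + mru_data + mfu_meta + mfu_data;
            uint64_t to_evict = 50ULL << 20;        /* evict 50MB */

            /* Prints 5MB, 15MB, 10MB, 20MB, summing to the target. */
            printf("%llu %llu %llu %llu\n",
                (unsigned long long)share(mru_meta, total, to_evict),
                (unsigned long long)share(mru_data, total, to_evict),
                (unsigned long long)share(mfu_meta, total, to_evict),
                (unsigned long long)share(mfu_data, total, to_evict));
            return (0);
    }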

2294                 to_free = arc_c >> arc_shrink_shift;
2295 #endif
2296                 if (arc_c > arc_c_min + to_free)
2297                         atomic_add_64(&arc_c, -to_free);
2298                 else
2299                         arc_c = arc_c_min;
2300 
2301                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2302                 if (arc_c > arc_size)
2303                         arc_c = MAX(arc_size, arc_c_min);
2304                 if (arc_p > arc_c)
2305                         arc_p = (arc_c >> 1);
2306                 ASSERT(arc_c >= arc_c_min);
2307                 ASSERT((int64_t)arc_p >= 0);
2308         }
2309 
2310         if (arc_size > arc_c)
2311                 arc_adjust();
2312 }
2313 
2314 #define PHYSMEM_PRESSURE_FRACTION       100
2315 
2316 static boolean_t
2317 arc_mem_pressure(void)
2318 {
2319 #ifdef _KERNEL
2320         uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
2321 
2322         if ((freemem < lotsfree + needfree + extra) ||
2323             (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
2324             (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
2325             (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
2326             physmem / PHYSMEM_PRESSURE_FRACTION))
2327                 return (B_TRUE);
2328 
2329         return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
2330 #else
2331         return (0);
2332 #endif
2333 }
2334 
2335 /*
2336  * Determine if the system is under memory pressure and is asking
2337  * to reclaim memory. A return value of 1 indicates that the system
2338  * is under memory pressure and that the arc should adjust accordingly.
2339  */
2340 static int
2341 arc_reclaim_needed(void)
2342 {
2343         uint64_t extra;
2344 
2345 #ifdef _KERNEL
2346 
2347         if (needfree)
2348                 return (1);
2349 
2350         /*
2351          * take 'desfree' extra pages, so we reclaim sooner, rather than later
2352          */
2353         extra = desfree;
2354 


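The new arc_mem_pressure() above is essentially the long-standing
arc_reclaim_needed() heuristic with an added safety margin of one percent of
physical memory. Restated as a pure function over its inputs (a sketch only:
the kernel globals become parameters, and the zio_arena quantum-cache clause
is omitted because vmem is not available in userland):

    #include <stdint.h>

    #define PHYSMEM_PRESSURE_FRACTION       100     /* margin: physmem/100 */

    typedef enum { B_FALSE, B_TRUE } boolean_t;

    /* All arguments are page counts mirroring the kernel globals. */
    static boolean_t
    mem_pressure(uint64_t freemem, uint64_t lotsfree, uint64_t needfree,
        uint64_t desfree, uint64_t availrmem, uint64_t swapfs_minfree,
        uint64_t swapfs_reserve, uint64_t physmem)
    {
            /* Reclaim a bit before the VM system would force the issue. */
            uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;

            if (freemem < lotsfree + needfree + extra ||
                needfree != 0 ||
                availrmem < swapfs_minfree + swapfs_reserve + extra)
                    return (B_TRUE);

            /* Within 1% of physmem of empty also counts as pressure. */
            return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION ?
                B_TRUE : B_FALSE);
    }
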
2457                         prev_cache = zio_buf_cache[i];
2458                         kmem_cache_reap_now(zio_buf_cache[i]);
2459                 }
2460                 if (zio_data_buf_cache[i] != prev_data_cache) {
2461                         prev_data_cache = zio_data_buf_cache[i];
2462                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2463                 }
2464         }
2465         kmem_cache_reap_now(buf_cache);
2466         kmem_cache_reap_now(hdr_cache);
2467         kmem_cache_reap_now(range_seg_cache);
2468 
2469         /*
 2470          * Ask the vmem arena to reclaim unused memory from its
2471          * quantum caches.
2472          */
2473         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2474                 vmem_qcache_reap(zio_arena);
2475 }
2476 
2477 #define RECLAIMS_PER_SEC        20
2478 #define STAT_UPDATES_PER_SEC    5
2479 
2480 /*
 2481  * During heavy use, the ARC naturally wants to oscillate its arc_c around
2482  * a maximum memory pressure point which corresponds to the arc_reclaim_needed
2483  * function evaluating to 1. This results in the arc_size slowly growing
2484  * towards this reclaim_needed threshold and exceeding it periodically. Once
2485  * this happens, both arc_c and arc_size are down-adjusted by the
2486  * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
2487  * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
2488  * cache which contains very large numbers of objects is extremely expensive
2489  * from an xcall perspective (several seconds of heavy CPU use):
2490  *
2491  * (mem)
2492  * ^         arc_reclaim_thread reacts
2493  * |           |                   |
2494  * |           V                   V
2495  * |
2496  * |           +                   +
2497  * |          /|                  /|
2498  * | ......./..|................/..|.............. arc_reclaim_needed threshold
2499  * |      /     \_____________/     \___________/(etc)
2500  * |    /          kmem reap          kmem reap
2501  * |  /
2502  * |/
2503  * +----------------------------------------------------------------->
2504  *                                                            (time)
2505  *
2506  * To help address this stairstep pattern, the arc_pressure_thread periodically
2507  * gauges the distance of the current arc_size to the arc_reclaim_needed
2508  * threshold by way of an estimation algorithm (in arc_mem_pressure).
2509  */
2510 static void
2511 arc_pressure_thread(void)
2512 {
2513         clock_t                 last_update = ddi_get_lbolt();
2514         callb_cpr_t             cpr;
2515 
2516         CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
2517 
2518         mutex_enter(&arc_pressure_thr_lock);
2519         while (arc_pressure_thread_exit == 0) {
2520                 clock_t now;
2521 
2522                 now = ddi_get_lbolt();
2523                 if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
2524                         uint64_t new_rate;
2525 
2526                         new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
2527                             hz) / (now - last_update);
2528 
2529                         if (ARCSTAT(arcstat_growth_rate) < new_rate)
2530                                 ARCSTAT(arcstat_growth_rate) = new_rate;
2531                         else
2532                                 ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
2533                         last_update = now;
2534                 }
2535 
2536                 arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
2537                 if (arc_size > arc_pressure_threshold) {
2538                         arc_reclaim_bytes(arc_size - arc_pressure_threshold);
2539                 }
2540 
2541                 CALLB_CPR_SAFE_BEGIN(&cpr);
2542                 (void) cv_timedwait(&arc_pressure_thr_cv,
2543                     &arc_pressure_thr_lock,
2544                     ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
2545                 CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
2546         }
2547 
2548         arc_pressure_thread_exit = 0;
2549         cv_broadcast(&arc_pressure_thr_cv);
2550         CALLB_CPR_EXIT(&cpr);               /* drops arc_pressure_thr_lock */
2551         thread_exit();
2552 }
2553 
2554 static void
2555 arc_reclaim_thread(void)
2556 {
2557         clock_t                 growtime = 0;
2558         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2559         callb_cpr_t             cpr;
2560 
2561         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2562 
2563         mutex_enter(&arc_reclaim_thr_lock);
2564         while (arc_thread_exit == 0) {
2565                 if (arc_reclaim_needed()) {
2566 
2567                         if (arc_no_grow) {
2568                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2569                                         last_reclaim = ARC_RECLAIM_AGGR;
2570                                 } else {
2571                                         last_reclaim = ARC_RECLAIM_CONS;
2572                                 }
2573                         } else {
2574                                 arc_no_grow = TRUE;


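The cadence constants above work out as follows: the pressure thread wakes 20
times a second (every hz / RECLAIMS_PER_SEC ticks) and refreshes the
growth-rate estimate at most 5 times a second, normalizing the bytes allocated
since the last update to bytes per second. A sketch of that normalization with
hypothetical numbers (hz = 100):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint64_t hz = 100;              /* clock ticks per second */
            uint64_t elapsed = 20;          /* hz / STAT_UPDATES_PER_SEC */
            uint64_t bytes_allocd = 8ULL << 20; /* 8MB since last update */

            /* Same normalization as arc_pressure_thread(). */
            uint64_t new_rate = bytes_allocd * hz / elapsed;
            printf("growth rate: %llu bytes/sec\n", /* 41943040 = 40MB/s */
                (unsigned long long)new_rate);
            return (0);
    }

Because arc_pressure_threshold is arc_c minus the growth rate, the valve
begins evicting roughly one second's worth of allocations before arc_size
would collide with arc_c, rather than waiting for arc_reclaim_needed() to
trip.
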
2643                 delta = MIN(bytes * mult, arc_p);
2644                 arc_p = MAX(arc_p_min, arc_p - delta);
2645         }
2646         ASSERT((int64_t)arc_p >= 0);
2647 
2648         if (arc_reclaim_needed()) {
2649                 cv_signal(&arc_reclaim_thr_cv);
2650                 return;
2651         }
2652 
2653         if (arc_no_grow)
2654                 return;
2655 
2656         if (arc_c >= arc_c_max)
2657                 return;
2658 
2659         /*
2660          * If we're within (2 * maxblocksize) bytes of the target
2661          * cache size, increment the target cache size
2662          */
2663         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
2664             (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
2665                 atomic_add_64(&arc_c, (int64_t)bytes);
2666                 if (arc_c > arc_c_max)
2667                         arc_c = arc_c_max;
2668                 else if (state == arc_anon)
2669                         atomic_add_64(&arc_p, (int64_t)bytes);
2670                 if (arc_p > arc_c)
2671                         arc_p = arc_c;
2672         }
2673         ASSERT((int64_t)arc_p >= 0);
2674 }
2675 
2676 /*
2677  * Check if the cache has reached its limits and eviction is required
2678  * prior to insert.
2679  */
2680 static int
2681 arc_evict_needed(arc_buf_contents_t type)
2682 {
2683         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2684                 return (1);


2717 {
2718         arc_state_t             *state = buf->b_hdr->b_state;
2719         uint64_t                size = buf->b_hdr->b_size;
2720         arc_buf_contents_t      type = buf->b_hdr->b_type;
2721 
2722         arc_adapt(size, state);
2723 
2724         /*
2725          * We have not yet reached cache maximum size,
2726          * just allocate a new buffer.
2727          */
2728         if (!arc_evict_needed(type)) {
2729                 if (type == ARC_BUFC_METADATA) {
2730                         buf->b_data = zio_buf_alloc(size);
2731                         arc_space_consume(size, ARC_SPACE_DATA);
2732                 } else {
2733                         ASSERT(type == ARC_BUFC_DATA);
2734                         buf->b_data = zio_data_buf_alloc(size);
2735                         ARCSTAT_INCR(arcstat_data_size, size);
2736                         atomic_add_64(&arc_size, size);
2737                         atomic_add_64(&arc_bytes_allocd, size);
2738                 }
2739                 goto out;
2740         }
2741 
2742         /*
2743          * If we are prefetching from the mfu ghost list, this buffer
2744          * will end up on the mru list; so steal space from there.
2745          */
2746         if (state == arc_mfu_ghost)
2747                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2748         else if (state == arc_mru_ghost)
2749                 state = arc_mru;
2750 
2751         if (state == arc_mru || state == arc_anon) {
2752                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2753                 state = (arc_mfu->arcs_lsize[type] >= size &&
2754                     arc_p > mru_used) ? arc_mfu : arc_mru;
2755         } else {
2756                 /* MFU cases */
2757                 uint64_t mfu_space = arc_c - arc_p;
2758                 state =  (arc_mru->arcs_lsize[type] >= size &&
2759                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2760         }
2761         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2762                 if (type == ARC_BUFC_METADATA) {
2763                         buf->b_data = zio_buf_alloc(size);
2764                         arc_space_consume(size, ARC_SPACE_DATA);
2765                 } else {
2766                         ASSERT(type == ARC_BUFC_DATA);
2767                         buf->b_data = zio_data_buf_alloc(size);
2768                         ARCSTAT_INCR(arcstat_data_size, size);
2769                         atomic_add_64(&arc_size, size);
2770                         atomic_add_64(&arc_bytes_allocd, size);
2771                 }
2772                 ARCSTAT_BUMP(arcstat_recycle_miss);
2773         }
2774         ASSERT(buf->b_data != NULL);
2775 out:
2776         /*
2777          * Update the state size.  Note that ghost states have a
2778          * "ghost size" and so don't need to be updated.
2779          */
2780         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2781                 arc_buf_hdr_t *hdr = buf->b_hdr;
2782 
2783                 atomic_add_64(&hdr->b_state->arcs_size, size);
2784                 if (list_link_active(&hdr->b_arc_node)) {
2785                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2786                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2787                 }
2788                 /*
2789                  * If we are growing the cache, and we are adding anonymous
2790                  * data, and we have outgrown arc_p, update arc_p


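The other half of the valve sits in arc_adapt() above: arc_c may now also grow
once arc_size has crossed the pressure threshold, provided arc_mem_pressure()
reports headroom. The grow condition as a standalone predicate (a sketch;
SPA_MAXBLOCKSHIFT was 17, i.e. 128K maximum blocks, when this change was
written):

    #include <stdint.h>

    #define SPA_MAXBLOCKSHIFT       17      /* 128K maximum block size */

    /* Mirror of the grow test in arc_adapt(); sketch only. */
    static int
    arc_can_grow(uint64_t arc_size, uint64_t arc_c,
        uint64_t pressure_threshold, int mem_pressure)
    {
            return (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
                (arc_size >= pressure_threshold && !mem_pressure));
    }
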
3917                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3918                     arc_tempreserve>>10,
3919                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3920                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3921                     reserve>>10, arc_c>>10);
3922                 return (SET_ERROR(ERESTART));
3923         }
3924         atomic_add_64(&arc_tempreserve, reserve);
3925         return (0);
3926 }
3927 
3928 /* Tuneable, default is 64, which is essentially arbitrary */
3929 int zfs_flush_ntasks = 64;
3930 
3931 void
3932 arc_init(void)
3933 {
3934         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3935         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3936 
3937         mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3938         cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
3939 
3940         /* Convert seconds to clock ticks */
3941         arc_min_prefetch_lifespan = 1 * hz;
3942 
3943         /* Start out with 1/8 of all memory */
3944         arc_c = physmem * PAGESIZE / 8;
3945 
3946 #ifdef _KERNEL
3947         /*
3948          * On architectures where the physical memory can be larger
3949          * than the addressable space (intel in 32-bit mode), we may
3950          * need to limit the cache to 1/8 of VM size.
3951          */
3952         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3953 #endif
3954 
3955         /* initial sensible value */
3956         arc_pressure_threshold = arc_c;
3957         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3958         arc_c_min = MAX(arc_c / 4, 64<<20);
3959         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3960         if (arc_c * 8 >= 1<<30)
3961                 arc_c_max = (arc_c * 8) - (1<<30);
3962         else
3963                 arc_c_max = arc_c_min;
3964         arc_c_max = MAX(arc_c * 6, arc_c_max);
3965 
3966         /*
3967          * Allow the tunables to override our calculations if they are
 3968          * reasonable (i.e. over 64MB)
3969          */
3970         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3971                 arc_c_max = zfs_arc_max;
3972         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3973                 arc_c_min = zfs_arc_min;
3974 
3975         arc_c = arc_c_max;
3976         arc_p = (arc_c >> 1);


4038 
4039         arc_flush_taskq = taskq_create("arc_flush_tq",
4040             max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
4041         buf_init();
4042 
4043         arc_thread_exit = 0;
4044         arc_eviction_list = NULL;
4045         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4046         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4047 
4048         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4049             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4050 
4051         if (arc_ksp != NULL) {
4052                 arc_ksp->ks_data = &arc_stats;
4053                 kstat_install(arc_ksp);
4054         }
4055 
4056         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4057             TS_RUN, minclsyspri);
4058         (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
4059             TS_RUN, minclsyspri);
4060 
4061         arc_dead = FALSE;
4062         arc_warm = B_FALSE;
4063 
4064         /*
4065          * Calculate maximum amount of dirty data per pool.
4066          *
4067          * If it has been set by /etc/system, take that.
4068          * Otherwise, use a percentage of physical memory defined by
4069          * zfs_dirty_data_max_percent (default 10%) with a cap at
4070          * zfs_dirty_data_max_max (default 4GB).
4071          */
4072         if (zfs_dirty_data_max == 0) {
4073                 zfs_dirty_data_max = physmem * PAGESIZE *
4074                     zfs_dirty_data_max_percent / 100;
4075                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4076                     zfs_dirty_data_max_max);
4077         }
4078 }
4079 
4080 void
4081 arc_fini(void)
4082 {
4083         mutex_enter(&arc_reclaim_thr_lock);
4084         arc_thread_exit = 1;
4085         while (arc_thread_exit != 0)
4086                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4087         mutex_exit(&arc_reclaim_thr_lock);
4088 
4089         mutex_enter(&arc_pressure_thr_lock);
4090         arc_pressure_thread_exit = 1;
4091         while (arc_pressure_thread_exit != 0)
4092                 cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
4093         mutex_exit(&arc_pressure_thr_lock);
4094 
4095         arc_flush(NULL);
4096 
4097         arc_dead = TRUE;
4098 
4099         if (arc_ksp != NULL) {
4100                 kstat_delete(arc_ksp);
4101                 arc_ksp = NULL;
4102         }
4103 
4104         mutex_destroy(&arc_eviction_mtx);
4105         mutex_destroy(&arc_reclaim_thr_lock);
4106         cv_destroy(&arc_reclaim_thr_cv);
4107         mutex_destroy(&arc_pressure_thr_lock);
4108         cv_destroy(&arc_pressure_thr_cv);
4109 
4110         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4111         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4112         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4113         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4114         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4115         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4116         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4117         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4118 
4119         mutex_destroy(&arc_anon->arcs_mtx);
4120         mutex_destroy(&arc_mru->arcs_mtx);
4121         mutex_destroy(&arc_mru_ghost->arcs_mtx);
4122         mutex_destroy(&arc_mfu->arcs_mtx);
4123         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4124         mutex_destroy(&arc_l2c_only->arcs_mtx);
4125 
4126         taskq_destroy(arc_flush_taskq);
4127         buf_fini();
4128