Print this page
ARC pressure valve implementation

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
↓ open down ↓ 141 lines elided ↑ open up ↑
 142  142  #ifndef _KERNEL
 143  143  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 144  144  boolean_t arc_watch = B_FALSE;
 145  145  int arc_procfd;
 146  146  #endif
 147  147  
 148  148  static kmutex_t         arc_reclaim_thr_lock;
 149  149  static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 150  150  static uint8_t          arc_thread_exit;
 151  151  
static kmutex_t		arc_pressure_thr_lock;	/* protects the pressure thr */
static kcondvar_t	arc_pressure_thr_cv;	/* wakes/stops the pressure thr */
static uint8_t		arc_pressure_thread_exit; /* exit request/ack flag */
/* arc_size level above which arc_pressure_thread starts evicting */
static uint64_t		arc_pressure_threshold;

 152  157  #define ARC_REDUCE_DNLC_PERCENT 3
 153  158  uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154  159  
/* How aggressively the reclaim path should shrink the kmem caches */
typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
 159  164  
 160  165  /*
 161  166   * The number of iterations through arc_evict_*() before we
↓ open down ↓ 126 lines elided ↑ open up ↑
 288  293          kstat_named_t arcstat_hash_chains;
 289  294          kstat_named_t arcstat_hash_chain_max;
 290  295          kstat_named_t arcstat_p;
 291  296          kstat_named_t arcstat_c;
 292  297          kstat_named_t arcstat_c_min;
 293  298          kstat_named_t arcstat_c_max;
 294  299          kstat_named_t arcstat_size;
 295  300          kstat_named_t arcstat_hdr_size;
 296  301          kstat_named_t arcstat_data_size;
 297  302          kstat_named_t arcstat_other_size;
      303 +        kstat_named_t arcstat_growth_rate;
 298  304          kstat_named_t arcstat_l2_hits;
 299  305          kstat_named_t arcstat_l2_misses;
 300  306          kstat_named_t arcstat_l2_feeds;
 301  307          kstat_named_t arcstat_l2_rw_clash;
 302  308          kstat_named_t arcstat_l2_read_bytes;
 303  309          kstat_named_t arcstat_l2_write_bytes;
 304  310          kstat_named_t arcstat_l2_writes_sent;
 305  311          kstat_named_t arcstat_l2_writes_done;
 306  312          kstat_named_t arcstat_l2_writes_error;
 307  313          kstat_named_t arcstat_l2_writes_hdr_miss;
↓ open down ↓ 46 lines elided ↑ open up ↑
 354  360          { "hash_chains",                KSTAT_DATA_UINT64 },
 355  361          { "hash_chain_max",             KSTAT_DATA_UINT64 },
 356  362          { "p",                          KSTAT_DATA_UINT64 },
 357  363          { "c",                          KSTAT_DATA_UINT64 },
 358  364          { "c_min",                      KSTAT_DATA_UINT64 },
 359  365          { "c_max",                      KSTAT_DATA_UINT64 },
 360  366          { "size",                       KSTAT_DATA_UINT64 },
 361  367          { "hdr_size",                   KSTAT_DATA_UINT64 },
 362  368          { "data_size",                  KSTAT_DATA_UINT64 },
 363  369          { "other_size",                 KSTAT_DATA_UINT64 },
      370 +        { "growth_rate",                KSTAT_DATA_UINT64 },
 364  371          { "l2_hits",                    KSTAT_DATA_UINT64 },
 365  372          { "l2_misses",                  KSTAT_DATA_UINT64 },
 366  373          { "l2_feeds",                   KSTAT_DATA_UINT64 },
 367  374          { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 368  375          { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 369  376          { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 370  377          { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 371  378          { "l2_writes_done",             KSTAT_DATA_UINT64 },
 372  379          { "l2_writes_error",            KSTAT_DATA_UINT64 },
 373  380          { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
↓ open down ↓ 49 lines elided ↑ open up ↑
 423  430                          ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 424  431                  }                                                       \
 425  432          } else {                                                        \
 426  433                  if (cond2) {                                            \
 427  434                          ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 428  435                  } else {                                                \
 429  436                          ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 430  437                  }                                                       \
 431  438          }
 432  439  
      440 +/*
      441 + * This macro allows us to use kstats as floating averages. Each time we
      442 + * update this kstat, we first factor it and the update value by `factor'
      443 + * to shrink the new value's contribution to the overall average. This
      444 + * macro assumes that integer loads and stores are atomic, but is not
      445 + * safe for multiple writers updating the kstat in parallel (only the
      446 + * last writer's update will remain).
      447 + */
      448 +#define ARCSTAT_F_AVG(stat, value, factor) \
      449 +        do { \
      450 +                uint64_t x = ARCSTAT(stat); \
      451 +                x = x - x / factor + (value) / factor; \
      452 +                ARCSTAT(stat) = x; \
      453 +                _NOTE(NOTREACHED) \
      454 +                _NOTE(CONSTCOND) \
      455 +        } while (0)
      456 +
 433  457  kstat_t                 *arc_ksp;
 434  458  static arc_state_t      *arc_anon;
 435  459  static arc_state_t      *arc_mru;
 436  460  static arc_state_t      *arc_mru_ghost;
 437  461  static arc_state_t      *arc_mfu;
 438  462  static arc_state_t      *arc_mfu_ghost;
 439  463  static arc_state_t      *arc_l2c_only;
 440  464  
 441  465  /*
 442  466   * There are several ARC variables that are critical to export as kstats --
↓ open down ↓ 11 lines elided ↑ open up ↑
 454  478  #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 455  479  #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 456  480  #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 457  481  
 458  482  #define L2ARC_IS_VALID_COMPRESS(_c_) \
 459  483          ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 460  484  
static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;
/* bytes allocated since the pressure thread's last growth-rate update */
static uint64_t		arc_bytes_allocd = 0;
 464  489  
 465  490  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 466  491  
 467  492  typedef struct arc_callback arc_callback_t;
 468  493  
 469  494  struct arc_callback {
 470  495          void                    *acb_private;
 471  496          arc_done_func_t         *acb_done;
 472  497          arc_buf_t               *acb_buf;
 473  498          zio_t                   *acb_zio_dummy;
↓ open down ↓ 793 lines elided ↑ open up ↑
1267 1292          case ARC_SPACE_HDRS:
1268 1293                  ARCSTAT_INCR(arcstat_hdr_size, space);
1269 1294                  break;
1270 1295          case ARC_SPACE_L2HDRS:
1271 1296                  ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272 1297                  break;
1273 1298          }
1274 1299  
1275 1300          ARCSTAT_INCR(arcstat_meta_used, space);
1276 1301          atomic_add_64(&arc_size, space);
     1302 +        atomic_add_64(&arc_bytes_allocd, space);
1277 1303  }
1278 1304  
1279 1305  void
1280 1306  arc_space_return(uint64_t space, arc_space_type_t type)
1281 1307  {
1282 1308          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283 1309  
1284 1310          switch (type) {
1285 1311          case ARC_SPACE_DATA:
1286 1312                  ARCSTAT_INCR(arcstat_data_size, -space);
↓ open down ↓ 16 lines elided ↑ open up ↑
1303 1329          ASSERT(arc_size >= space);
1304 1330          atomic_add_64(&arc_size, -space);
1305 1331  }
1306 1332  
void *
arc_data_buf_alloc(uint64_t size)
{
	/*
	 * Nudge the reclaim thread before growing arc_size, so eviction
	 * can proceed concurrently with this allocation.
	 */
	if (arc_evict_needed(ARC_BUFC_DATA))
		cv_signal(&arc_reclaim_thr_cv);
	atomic_add_64(&arc_size, size);
	/* feed the pressure thread's allocation-rate estimator */
	atomic_add_64(&arc_bytes_allocd, size);
	return (zio_data_buf_alloc(size));
}
1315 1342  
void
arc_data_buf_free(void *buf, uint64_t size)
{
	/* Return the buffer to the zio data cache and shrink arc_size. */
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}
↓ open down ↓ 781 lines elided ↑ open up ↑
2104 2131  
2105 2132          adjustment =
2106 2133              arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2107 2134  
2108 2135          if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2109 2136                  delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2110 2137                  arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2111 2138          }
2112 2139  }
2113 2140  
     2141 +#define ACCURACY        1000
     2142 +
2114 2143  static void
     2144 +arc_reclaim_bytes(uint64_t to_evict)
     2145 +{
     2146 +        uint64_t to_evict_data_mru, to_evict_data_mfu;
     2147 +        uint64_t to_evict_meta_mru, to_evict_meta_mfu;
     2148 +
     2149 +        to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
     2150 +            ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
     2151 +            to_evict) / ACCURACY;
     2152 +        to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
     2153 +            ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
     2154 +            to_evict) / ACCURACY;
     2155 +        to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
     2156 +            ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
     2157 +            to_evict) / ACCURACY;
     2158 +        to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
     2159 +            ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
     2160 +            to_evict) / ACCURACY;
     2161 +
     2162 +        if (to_evict_meta_mru > 0)
     2163 +                (void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
     2164 +                    ARC_BUFC_METADATA);
     2165 +        if (to_evict_data_mru > 0)
     2166 +                (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
     2167 +                    ARC_BUFC_DATA);
     2168 +        if (to_evict_meta_mfu > 0)
     2169 +                (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
     2170 +                    ARC_BUFC_METADATA);
     2171 +        if (to_evict_data_mfu > 0)
     2172 +                (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
     2173 +                    ARC_BUFC_DATA);
     2174 +}
     2175 +
     2176 +static void
2115 2177  arc_do_user_evicts(void)
2116 2178  {
2117 2179          mutex_enter(&arc_eviction_mtx);
2118 2180          while (arc_eviction_list != NULL) {
2119 2181                  arc_buf_t *buf = arc_eviction_list;
2120 2182                  arc_eviction_list = buf->b_next;
2121 2183                  mutex_enter(&buf->b_evict_lock);
2122 2184                  buf->b_hdr = NULL;
2123 2185                  mutex_exit(&buf->b_evict_lock);
2124 2186                  mutex_exit(&arc_eviction_mtx);
↓ open down ↓ 117 lines elided ↑ open up ↑
2242 2304                  if (arc_p > arc_c)
2243 2305                          arc_p = (arc_c >> 1);
2244 2306                  ASSERT(arc_c >= arc_c_min);
2245 2307                  ASSERT((int64_t)arc_p >= 0);
2246 2308          }
2247 2309  
2248 2310          if (arc_size > arc_c)
2249 2311                  arc_adjust();
2250 2312  }
2251 2313  
     2314 +#define PHYSMEM_PRESSURE_FRACTION       100
     2315 +
     2316 +static boolean_t
     2317 +arc_mem_pressure(void)
     2318 +{
     2319 +#ifdef _KERNEL
     2320 +        uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
     2321 +
     2322 +        if ((freemem < lotsfree + needfree + extra) ||
     2323 +            (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
     2324 +            (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
     2325 +            (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
     2326 +            physmem / PHYSMEM_PRESSURE_FRACTION))
     2327 +                return (B_TRUE);
     2328 +
     2329 +        return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
     2330 +#else
     2331 +        return (0);
     2332 +#endif
     2333 +}
     2334 +
2252 2335  /*
2253 2336   * Determine if the system is under memory pressure and is asking
2254 2337   * to reclaim memory. A return value of 1 indicates that the system
2255 2338   * is under memory pressure and that the arc should adjust accordingly.
2256 2339   */
2257 2340  static int
2258 2341  arc_reclaim_needed(void)
2259 2342  {
2260 2343          uint64_t extra;
2261 2344  
↓ open down ↓ 122 lines elided ↑ open up ↑
2384 2467          kmem_cache_reap_now(range_seg_cache);
2385 2468  
2386 2469          /*
 2387 2470           * Ask the vmem arena to reclaim unused memory from its
2388 2471           * quantum caches.
2389 2472           */
2390 2473          if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2391 2474                  vmem_qcache_reap(zio_arena);
2392 2475  }
2393 2476  
#define	RECLAIMS_PER_SEC	20
#define	STAT_UPDATES_PER_SEC	5

/*
 * During heavy use, the ARC naturally wants to oscillate its arc_c around
 * a maximum memory pressure point which corresponds to the arc_reclaim_needed
 * function evaluating to 1. This results in the arc_size slowly growing
 * towards this reclaim_needed threshold and exceeding it periodically. Once
 * this happens, both arc_c and arc_size are down-adjusted by the
 * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
 * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
 * cache which contains very large numbers of objects is extremely expensive
 * from an xcall perspective (several seconds of heavy CPU use):
 *
 * (mem)
 * ^         arc_reclaim_thread reacts
 * |           |                   |
 * |           V                   V
 * |
 * |           +                   +
 * |          /|                  /|
 * | ......./..|................/..|.............. arc_reclaim_needed threshold
 * |      /     \_____________/     \___________/(etc)
 * |    /          kmem reap          kmem reap
 * |  /
 * |/
 * +----------------------------------------------------------------->
 *                                                            (time)
 *
 * To help address this stairstep pattern, the arc_pressure_thread periodically
 * gauges the distance of the current arc_size to the arc_reclaim_needed
 * threshold by way of an estimation algorithm (in arc_mem_pressure).
 */
static void
arc_pressure_thread(void)
{
	clock_t			last_update = ddi_get_lbolt();
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_pressure_thr_lock);
	while (arc_pressure_thread_exit == 0) {
		clock_t now;

		now = ddi_get_lbolt();
		if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
			uint64_t new_rate;

			/*
			 * Convert bytes allocated since the last update into
			 * a bytes-per-second rate; the atomic swap also
			 * resets the accumulator for the next interval.
			 */
			new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
			    hz) / (now - last_update);

			/*
			 * Track growth spikes immediately, but let them
			 * decay via a floating average (factor 4) so a
			 * short burst fades over a few intervals.
			 */
			if (ARCSTAT(arcstat_growth_rate) < new_rate)
				ARCSTAT(arcstat_growth_rate) = new_rate;
			else
				ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
			last_update = now;
		}

		/*
		 * Keep headroom below arc_c equal to one second of growth
		 * at the observed rate, and evict down to that threshold.
		 * NOTE(review): if growth_rate ever exceeded arc_c this
		 * unsigned subtraction would wrap to a huge threshold,
		 * silently disabling reclaim -- confirm this cannot occur.
		 */
		arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
		if (arc_size > arc_pressure_threshold) {
			arc_reclaim_bytes(arc_size - arc_pressure_threshold);
		}

		/* CPR-safe timed sleep; also the exit-notification point */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_pressure_thr_cv,
		    &arc_pressure_thr_lock,
		    ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
		CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
	}

	/* Ack the exit request (arc_fini waits for this) and tear down. */
	arc_pressure_thread_exit = 0;
	cv_broadcast(&arc_pressure_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_pressure_thr_lock */
	thread_exit();
}
     2553 +
     2554 +static void
2395 2555  arc_reclaim_thread(void)
2396 2556  {
2397 2557          clock_t                 growtime = 0;
2398 2558          arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2399 2559          callb_cpr_t             cpr;
2400 2560  
2401 2561          CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2402 2562  
2403 2563          mutex_enter(&arc_reclaim_thr_lock);
2404 2564          while (arc_thread_exit == 0) {
↓ open down ↓ 88 lines elided ↑ open up ↑
2493 2653          if (arc_no_grow)
2494 2654                  return;
2495 2655  
2496 2656          if (arc_c >= arc_c_max)
2497 2657                  return;
2498 2658  
2499 2659          /*
2500 2660           * If we're within (2 * maxblocksize) bytes of the target
2501 2661           * cache size, increment the target cache size
2502 2662           */
2503      -        if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
     2663 +        if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
     2664 +            (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
2504 2665                  atomic_add_64(&arc_c, (int64_t)bytes);
2505 2666                  if (arc_c > arc_c_max)
2506 2667                          arc_c = arc_c_max;
2507 2668                  else if (state == arc_anon)
2508 2669                          atomic_add_64(&arc_p, (int64_t)bytes);
2509 2670                  if (arc_p > arc_c)
2510 2671                          arc_p = arc_c;
2511 2672          }
2512 2673          ASSERT((int64_t)arc_p >= 0);
2513 2674  }
↓ open down ↓ 52 lines elided ↑ open up ↑
2566 2727           */
2567 2728          if (!arc_evict_needed(type)) {
2568 2729                  if (type == ARC_BUFC_METADATA) {
2569 2730                          buf->b_data = zio_buf_alloc(size);
2570 2731                          arc_space_consume(size, ARC_SPACE_DATA);
2571 2732                  } else {
2572 2733                          ASSERT(type == ARC_BUFC_DATA);
2573 2734                          buf->b_data = zio_data_buf_alloc(size);
2574 2735                          ARCSTAT_INCR(arcstat_data_size, size);
2575 2736                          atomic_add_64(&arc_size, size);
     2737 +                        atomic_add_64(&arc_bytes_allocd, size);
2576 2738                  }
2577 2739                  goto out;
2578 2740          }
2579 2741  
2580 2742          /*
2581 2743           * If we are prefetching from the mfu ghost list, this buffer
2582 2744           * will end up on the mru list; so steal space from there.
2583 2745           */
2584 2746          if (state == arc_mfu_ghost)
2585 2747                  state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
↓ open down ↓ 12 lines elided ↑ open up ↑
2598 2760          }
2599 2761          if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2600 2762                  if (type == ARC_BUFC_METADATA) {
2601 2763                          buf->b_data = zio_buf_alloc(size);
2602 2764                          arc_space_consume(size, ARC_SPACE_DATA);
2603 2765                  } else {
2604 2766                          ASSERT(type == ARC_BUFC_DATA);
2605 2767                          buf->b_data = zio_data_buf_alloc(size);
2606 2768                          ARCSTAT_INCR(arcstat_data_size, size);
2607 2769                          atomic_add_64(&arc_size, size);
     2770 +                        atomic_add_64(&arc_bytes_allocd, size);
2608 2771                  }
2609 2772                  ARCSTAT_BUMP(arcstat_recycle_miss);
2610 2773          }
2611 2774          ASSERT(buf->b_data != NULL);
2612 2775  out:
2613 2776          /*
2614 2777           * Update the state size.  Note that ghost states have a
2615 2778           * "ghost size" and so don't need to be updated.
2616 2779           */
2617 2780          if (!GHOST_STATE(buf->b_hdr->b_state)) {
↓ open down ↓ 1146 lines elided ↑ open up ↑
3764 3927  
3765 3928  /* Tuneable, default is 64, which is essentially arbitrary */
3766 3929  int zfs_flush_ntasks = 64;
3767 3930  
3768 3931  void
3769 3932  arc_init(void)
3770 3933  {
3771 3934          mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3772 3935          cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3773 3936  
     3937 +        mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
     3938 +        cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
     3939 +
3774 3940          /* Convert seconds to clock ticks */
3775 3941          arc_min_prefetch_lifespan = 1 * hz;
3776 3942  
3777 3943          /* Start out with 1/8 of all memory */
3778 3944          arc_c = physmem * PAGESIZE / 8;
3779 3945  
3780 3946  #ifdef _KERNEL
3781 3947          /*
3782 3948           * On architectures where the physical memory can be larger
3783 3949           * than the addressable space (intel in 32-bit mode), we may
3784 3950           * need to limit the cache to 1/8 of VM size.
3785 3951           */
3786 3952          arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3787 3953  #endif
3788 3954  
     3955 +        /* initial sensible value */
     3956 +        arc_pressure_threshold = arc_c;
3789 3957          /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3790 3958          arc_c_min = MAX(arc_c / 4, 64<<20);
3791 3959          /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3792 3960          if (arc_c * 8 >= 1<<30)
3793 3961                  arc_c_max = (arc_c * 8) - (1<<30);
3794 3962          else
3795 3963                  arc_c_max = arc_c_min;
3796 3964          arc_c_max = MAX(arc_c * 6, arc_c_max);
3797 3965  
3798 3966          /*
↓ open down ↓ 81 lines elided ↑ open up ↑
3880 4048          arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3881 4049              sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3882 4050  
3883 4051          if (arc_ksp != NULL) {
3884 4052                  arc_ksp->ks_data = &arc_stats;
3885 4053                  kstat_install(arc_ksp);
3886 4054          }
3887 4055  
3888 4056          (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3889 4057              TS_RUN, minclsyspri);
     4058 +        (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
     4059 +            TS_RUN, minclsyspri);
3890 4060  
3891 4061          arc_dead = FALSE;
3892 4062          arc_warm = B_FALSE;
3893 4063  
3894 4064          /*
3895 4065           * Calculate maximum amount of dirty data per pool.
3896 4066           *
3897 4067           * If it has been set by /etc/system, take that.
3898 4068           * Otherwise, use a percentage of physical memory defined by
3899 4069           * zfs_dirty_data_max_percent (default 10%) with a cap at
↓ open down ↓ 9 lines elided ↑ open up ↑
3909 4079  
3910 4080  void
3911 4081  arc_fini(void)
3912 4082  {
3913 4083          mutex_enter(&arc_reclaim_thr_lock);
3914 4084          arc_thread_exit = 1;
3915 4085          while (arc_thread_exit != 0)
3916 4086                  cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3917 4087          mutex_exit(&arc_reclaim_thr_lock);
3918 4088  
     4089 +        mutex_enter(&arc_pressure_thr_lock);
     4090 +        arc_pressure_thread_exit = 1;
     4091 +        while (arc_pressure_thread_exit != 0)
     4092 +                cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
     4093 +        mutex_exit(&arc_pressure_thr_lock);
     4094 +
3919 4095          arc_flush(NULL);
3920 4096  
3921 4097          arc_dead = TRUE;
3922 4098  
3923 4099          if (arc_ksp != NULL) {
3924 4100                  kstat_delete(arc_ksp);
3925 4101                  arc_ksp = NULL;
3926 4102          }
3927 4103  
3928 4104          mutex_destroy(&arc_eviction_mtx);
3929 4105          mutex_destroy(&arc_reclaim_thr_lock);
3930 4106          cv_destroy(&arc_reclaim_thr_cv);
     4107 +        mutex_destroy(&arc_pressure_thr_lock);
     4108 +        cv_destroy(&arc_pressure_thr_cv);
3931 4109  
3932 4110          list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3933 4111          list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3934 4112          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3935 4113          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3936 4114          list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3937 4115          list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3938 4116          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3939 4117          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3940 4118  
↓ open down ↓ 1427 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX