 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block (an illustrative sketch follows
 * this comment).
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
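
/*
 * Illustrative sketch of the variable-size eviction described in item 3
 * above.  This is a sketch only, not the implementation: the list being
 * walked, the "needed" byte count, and evict_one() are hypothetical,
 * while list_tail(), list_prev(), and refcount_count() are the usual
 * kernel primitives.
 *
 *	uint64_t freed = 0;
 *	arc_buf_hdr_t *hdr, *prev;
 *
 *	for (hdr = list_tail(list); hdr != NULL && freed < needed;
 *	    hdr = prev) {
 *		prev = list_prev(list, hdr);
 *		if (refcount_count(&hdr->b_refcnt) != 0)
 *			continue;
 *		freed += hdr->b_size;
 *		evict_one(hdr);
 *	}
 */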

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
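 * For example, acquiring a hash lock from list context can be sketched
 * as follows (hdr, list_lock, and hash_lock are hypothetical locals;
 * HDR_LOCK(), mutex_tryenter(), and mutex_exit() are the real
 * primitives):
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *
 *	if (!mutex_tryenter(hash_lock)) {
 *		mutex_exit(list_lock);
 *		(back off and retry rather than block, which could deadlock)
 *	} else {
 *		(fields protected by the hash lock may now be used)
 *		mutex_exit(hash_lock);
 *	}
 *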
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) { \
	uint64_t m; \
	while ((val) > (m = arc_stats.stat.value.ui64) && \
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
		continue; \
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
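
/*
 * Example usage (illustrative only; the statistic names are assumed to
 * be fields of arc_stats and "size" is a hypothetical byte count):
 *
 *	ARCSTAT_BUMP(arcstat_hits);		(count one hit)
 *	ARCSTAT_INCR(arcstat_l2_size, size);	(add a byte count)
 *	ARCSTAT_MAX(arcstat_l2_size_max, size);	(lock-free running maximum)
 *
 * ARCSTAT_MAX() keeps retrying the compare-and-swap until either the
 * stored value is already >= val or the swap succeeds, so concurrent
 * updaters cannot lose a larger maximum.
 */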

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx)	(buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
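
/*
 * Illustrative use of HDR_LOCK() (a sketch only; hdr stands for an
 * arc_buf_hdr_t pointer already in hand):
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *
 *	mutex_enter(hash_lock);
 *	(fields of *hdr protected by the hash lock may now be examined)
 *	mutex_exit(hash_lock);
 */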

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
#define	L2ARC_FEED_SECS		1			/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200			/* min caching interval ms */

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/*
 * L2ARC Performance Tunables
 */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
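
/*
 * Roughly how the tunables above combine (an illustrative sketch, not
 * the exact code): each feed cycle tries to write about
 *
 *	size = l2arc_write_max +
 *	    (cache still warming up ? l2arc_write_boost : 0);
 *
 * bytes to the device, eviction keeps roughly l2arc_headroom such
 * writes' worth of space free ahead of the write hand, and the feed
 * thread normally runs every l2arc_feed_secs seconds, dropping to no
 * less than l2arc_feed_min_ms milliseconds between runs when
 * l2arc_feed_again enables turbo warmup.
 */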

/*
 * L2ARC Internals
 */
typedef struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_write;	/* desired write size, bytes */
	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (SET_ERROR(ERESTART));
	}
#endif
	if (reserve > arc_c / 4 && !arc_no_grow)
		arc_c = MIN(arc_c_max, reserve * 4);
	if (reserve > arc_c)
		return (SET_ERROR(ENOMEM));
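
	/*
	 * For example (numbers purely illustrative): with arc_c at 64MB,
	 * a 32MB reserve exceeds arc_c / 4 (16MB), so arc_c is first
	 * raised to MIN(arc_c_max, 4 * 32MB = 128MB) and only then
	 * compared against the reserve.
	 */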

	/*
	 * Don't count loaned bufs as in flight dirty data to prevent long
	 * network delays from blocking transactions that are ready to be
	 * assigned to a txg.
	 */
	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);

	/*
	 * Writes will, almost always, require additional memory allocations
	 * in order to compress/encrypt/etc the data. We therefore need to
	 * make sure that there is sufficient available memory for this.
	 */
	if ((error = arc_memory_throttle(reserve, anon_size, txg)) != 0)
		return (error);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large. We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail. Not a huge deal.
	 */
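	/*
	 * Concretely (an illustrative example, not a separate policy):
	 * with arc_c at 1GB the check below trips once
	 * reserve + arc_tempreserve + anon_size exceeds 512MB (arc_c / 2)
	 * and anonymous buffers alone exceed 256MB (arc_c / 4).
	 */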

	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve >> 10,
		    arc_anon->arcs_lsize[ARC_BUFC_METADATA] >> 10,
		    arc_anon->arcs_lsize[ARC_BUFC_DATA] >> 10,