110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139
140 #ifndef _KERNEL
141 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
142 boolean_t arc_watch = B_FALSE;
143 int arc_procfd;
144 #endif
145
146 static kmutex_t arc_reclaim_thr_lock;
147 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
148 static uint8_t arc_thread_exit;
149
150 extern int zfs_write_limit_shift;
151 extern uint64_t zfs_write_limit_max;
152 extern kmutex_t zfs_write_limit_lock;
153
154 #define ARC_REDUCE_DNLC_PERCENT 3
155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157 typedef enum arc_reclaim_strategy {
158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160 } arc_reclaim_strategy_t;
161
162 /* number of seconds before growing cache again */
163 static int arc_grow_retry = 60;
164
165 /* shift of arc_c for calculating both min and max arc_p */
166 static int arc_p_min_shift = 4;
167
168 /* log2(fraction of arc to reclaim) */
169 static int arc_shrink_shift = 5;
170
171 /*
172 * minimum lifespan of a prefetch block in clock ticks
173 * (initialized in arc_init())
174 */
175 static int arc_min_prefetch_lifespan;
176
177 static int arc_dead;
178
179 /*
180 * The arc has filled available memory and has now warmed up.
181 */
182 static boolean_t arc_warm;
183
184 /*
185 * These tunables are for performance analysis.
186 */
187 uint64_t zfs_arc_max;
188 uint64_t zfs_arc_min;
189 uint64_t zfs_arc_meta_limit = 0;
190 int zfs_arc_grow_retry = 0;
191 int zfs_arc_shrink_shift = 0;
192 int zfs_arc_p_min_shift = 0;
193 int zfs_disable_dup_eviction = 0;
194
195 /*
196 * Note that buffers can be in one of 6 states:
452 static uint64_t arc_tempreserve;
453 static uint64_t arc_loaned_bytes;
454
455 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
456
457 typedef struct arc_callback arc_callback_t;
458
459 struct arc_callback {
460 void *acb_private;
461 arc_done_func_t *acb_done;
462 arc_buf_t *acb_buf;
463 zio_t *acb_zio_dummy;
464 arc_callback_t *acb_next;
465 };
466
467 typedef struct arc_write_callback arc_write_callback_t;
468
469 struct arc_write_callback {
470 void *awcb_private;
471 arc_done_func_t *awcb_ready;
472 arc_done_func_t *awcb_done;
473 arc_buf_t *awcb_buf;
474 };
475
476 struct arc_buf_hdr {
477 /* protected by hash lock */
478 dva_t b_dva;
479 uint64_t b_birth;
480 uint64_t b_cksum0;
481
482 kmutex_t b_freeze_lock;
483 zio_cksum_t *b_freeze_cksum;
484 void *b_thawed;
485
486 arc_buf_hdr_t *b_hash_next;
487 arc_buf_t *b_buf;
488 uint32_t b_flags;
489 uint32_t b_datacnt;
490
491 arc_callback_t *b_acb;
1146 list_insert_head(&state->arcs_list[ab->b_type], ab);
1147 ASSERT(ab->b_datacnt > 0);
1148 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1149 mutex_exit(&state->arcs_mtx);
1150 }
1151 return (cnt);
1152 }
1153
1154 /*
1155 * Move the supplied buffer to the indicated state. The mutex
1156 * for the buffer must be held by the caller.
1157 */
1158 static void
1159 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1160 {
1161 arc_state_t *old_state = ab->b_state;
1162 int64_t refcnt = refcount_count(&ab->b_refcnt);
1163 uint64_t from_delta, to_delta;
1164
1165 ASSERT(MUTEX_HELD(hash_lock));
1166 ASSERT(new_state != old_state);
1167 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1168 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1169 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1170
1171 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1172
1173 /*
1174 * If this buffer is evictable, transfer it from the
1175 * old state list to the new state list.
1176 */
1177 if (refcnt == 0) {
1178 if (old_state != arc_anon) {
1179 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1180 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1181
1182 if (use_mutex)
1183 mutex_enter(&old_state->arcs_mtx);
1184
1185 ASSERT(list_link_active(&ab->b_arc_node));
1186 list_remove(&old_state->arcs_list[ab->b_type], ab);
1761 * - look for a buffer to evict that is `bytes' long.
1762 * - return the data block from this buffer rather than freeing it.
1763 * This flag is used by callers that are trying to make space for a
1764 * new buffer in a full arc cache.
1765 *
1766 * This function makes a "best effort". It skips over any buffers
1767 * it can't get a hash_lock on, and so may not catch all candidates.
1768 * It may also return without evicting as much space as requested.
1769 */
1770 static void *
1771 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1772 arc_buf_contents_t type)
1773 {
1774 arc_state_t *evicted_state;
1775 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1776 arc_buf_hdr_t *ab, *ab_prev = NULL;
1777 list_t *list = &state->arcs_list[type];
1778 kmutex_t *hash_lock;
1779 boolean_t have_lock;
1780 void *stolen = NULL;
1781
1782 ASSERT(state == arc_mru || state == arc_mfu);
1783
1784 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1785
1786 mutex_enter(&state->arcs_mtx);
1787 mutex_enter(&evicted_state->arcs_mtx);
1788
1789 for (ab = list_tail(list); ab; ab = ab_prev) {
1790 ab_prev = list_prev(list, ab);
1791 /* prefetch buffers have a minimum lifespan */
1792 if (HDR_IO_IN_PROGRESS(ab) ||
1793 (spa && ab->b_spa != spa) ||
1794 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1795 ddi_get_lbolt() - ab->b_arc_access <
1796 arc_min_prefetch_lifespan)) {
1797 skipped++;
1798 continue;
1799 }
1800 /* "lookahead" for better eviction candidate */
1801 if (recycle && ab->b_size != bytes &&
1802 ab_prev && ab_prev->b_size == bytes)
1803 continue;
1804 hash_lock = HDR_LOCK(ab);
1805 have_lock = MUTEX_HELD(hash_lock);
1806 if (have_lock || mutex_tryenter(hash_lock)) {
1807 ASSERT0(refcount_count(&ab->b_refcnt));
1808 ASSERT(ab->b_datacnt > 0);
1809 while (ab->b_buf) {
1810 arc_buf_t *buf = ab->b_buf;
1811 if (!mutex_tryenter(&buf->b_evict_lock)) {
1812 missed += 1;
1813 break;
1814 }
1815 if (buf->b_data) {
1816 bytes_evicted += ab->b_size;
1817 if (recycle && ab->b_type == type &&
1818 ab->b_size == bytes &&
1819 !HDR_L2_WRITING(ab)) {
1820 stolen = buf->b_data;
1821 recycle = FALSE;
1822 }
1823 }
1865 break;
1866 } else {
1867 missed += 1;
1868 }
1869 }
1870
1871 mutex_exit(&evicted_state->arcs_mtx);
1872 mutex_exit(&state->arcs_mtx);
1873
1874 if (bytes_evicted < bytes)
1875 dprintf("only evicted %lld bytes from %x",
1876 (longlong_t)bytes_evicted, state);
1877
1878 if (skipped)
1879 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1880
1881 if (missed)
1882 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1883
1884 /*
1885 * We have just evicted some data into the ghost state, so make
1886 * sure we also adjust the ghost state size if necessary.
1887 */
1888 if (arc_no_grow &&
1889 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1890 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1891 arc_mru_ghost->arcs_size - arc_c;
1892
1893 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1894 int64_t todelete =
1895 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1896 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1897 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1898 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1899 arc_mru_ghost->arcs_size +
1900 arc_mfu_ghost->arcs_size - arc_c);
1901 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1902 }
1903 }
1904
1905 return (stolen);
1906 }
1907
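/*
 * Worked example (illustrative numbers, not from a real system) of the
 * ghost-state adjustment at the end of arc_evict() above: with arc_no_grow
 * set, arc_c = 1000MB, arc_anon = 100MB, arc_mru = 500MB,
 * arc_mru_ghost = 600MB and arc_mfu_ghost = 500MB, the ghost lists total
 * 1100MB > arc_c, so the adjustment runs.  mru_over = 100 + 500 + 600 -
 * 1000 = 200MB, and up to MIN(arc_mru_ghost->arcs_lsize[type], 200MB) is
 * evicted from the MRU ghost list; if that branch does not apply (mru_over
 * <= 0, or no MRU-ghost bytes of this type), the MFU ghost list is trimmed
 * by the combined ghost excess, 1100 - 1000 = 100MB, instead.
 */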
1908 /*
1909 * Remove buffers from the list until we've removed the specified number of
1910 * bytes. Destroy the buffers that are removed.
1911 */
1912 static void
1913 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1914 {
1915 arc_buf_hdr_t *ab, *ab_prev;
1916 arc_buf_hdr_t marker = { 0 };
1917 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1918 kmutex_t *hash_lock;
1919 uint64_t bytes_deleted = 0;
1920 uint64_t bufs_skipped = 0;
1921
1922 ASSERT(GHOST_STATE(state));
1923 top:
1924 mutex_enter(&state->arcs_mtx);
1925 for (ab = list_tail(list); ab; ab = ab_prev) {
1926 ab_prev = list_prev(list, ab);
1927 if (spa && ab->b_spa != spa)
1928 continue;
1929
1930 /* ignore markers */
1931 if (ab->b_spa == 0)
1932 continue;
1933
1934 hash_lock = HDR_LOCK(ab);
1935 /* caller may be trying to modify this buffer, skip it */
1936 if (MUTEX_HELD(hash_lock))
1937 continue;
1938 if (mutex_tryenter(hash_lock)) {
1939 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1940 ASSERT(ab->b_buf == NULL);
1941 ARCSTAT_BUMP(arcstat_deleted);
1942 bytes_deleted += ab->b_size;
1943
1944 if (ab->b_l2hdr != NULL) {
1945 /*
1946 * This buffer is cached on the 2nd Level ARC;
1947 * don't destroy the header.
1948 */
1949 arc_change_state(arc_l2c_only, ab, hash_lock);
1950 mutex_exit(hash_lock);
1951 } else {
1952 arc_change_state(arc_anon, ab, hash_lock);
1953 mutex_exit(hash_lock);
1954 arc_hdr_destroy(ab);
1955 }
1956
1957 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1958 if (bytes >= 0 && bytes_deleted >= bytes)
1959 break;
1960 } else if (bytes < 0) {
1961 /*
1962 * Insert a list marker and then wait for the
1963 * hash lock to become available. Once it's
1964 * available, restart from where we left off.
1965 */
1966 list_insert_after(list, ab, &marker);
1967 mutex_exit(&state->arcs_mtx);
1968 mutex_enter(hash_lock);
1969 mutex_exit(hash_lock);
1970 mutex_enter(&state->arcs_mtx);
1971 ab_prev = list_prev(list, &marker);
1972 list_remove(list, &marker);
1973 } else
1974 bufs_skipped += 1;
1975 }
1976 mutex_exit(&state->arcs_mtx);
1977
1978 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1979 (bytes < 0 || bytes_deleted < bytes)) {
1980 list = &state->arcs_list[ARC_BUFC_METADATA];
1981 goto top;
1982 }
1983
1984 if (bufs_skipped) {
1985 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1986 ASSERT(bytes >= 0);
1987 }
1988
1989 if (bytes_deleted < bytes)
1990 dprintf("only deleted %lld bytes from %p",
1991 (longlong_t)bytes_deleted, state);
1992 }
1993
1994 static void
1995 arc_adjust(void)
2808 /*
2809 * "Read" the block at the specified DVA (in bp) via the
2810 * cache. If the block is found in the cache, invoke the provided
2811 * callback immediately and return. Note that the `zio' parameter
2812 * in the callback will be NULL in this case, since no IO was
2813 * required. If the block is not in the cache, pass the read request
2814 * on to the spa with a substitute callback function, so that the
2815 * requested block will be added to the cache.
2816 *
2817 * If a read request arrives for a block that has a read in-progress,
2818 * either wait for the in-progress read to complete (and return the
2819 * results); or, if this is a read with a "done" func, add a record
2820 * to the read to invoke the "done" func when the read completes,
2821 * and return; or just return.
2822 *
2823 * arc_read_done() will invoke all the requested "done" functions
2824 * for readers of this block.
2825 */
2826 int
2827 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2828 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2829 const zbookmark_t *zb)
2830 {
2831 arc_buf_hdr_t *hdr;
2832 arc_buf_t *buf = NULL;
2833 kmutex_t *hash_lock;
2834 zio_t *rzio;
2835 uint64_t guid = spa_load_guid(spa);
2836
2837 top:
2838 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2839 &hash_lock);
2840 if (hdr && hdr->b_datacnt > 0) {
2841
2842 *arc_flags |= ARC_CACHED;
2843
2844 if (HDR_IO_IN_PROGRESS(hdr)) {
2845
2846 if (*arc_flags & ARC_WAIT) {
2847 cv_wait(&hdr->b_cv, hash_lock);
2848 mutex_exit(hash_lock);
3411 callback->awcb_ready(zio, buf, callback->awcb_private);
3412
3413 /*
3414 * If the IO is already in progress, then this is a re-write
3415 * attempt, so we need to thaw and re-compute the cksum.
3416 * It is the responsibility of the callback to handle the
3417 * accounting for any re-write attempt.
3418 */
3419 if (HDR_IO_IN_PROGRESS(hdr)) {
3420 mutex_enter(&hdr->b_freeze_lock);
3421 if (hdr->b_freeze_cksum != NULL) {
3422 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3423 hdr->b_freeze_cksum = NULL;
3424 }
3425 mutex_exit(&hdr->b_freeze_lock);
3426 }
3427 arc_cksum_compute(buf, B_FALSE);
3428 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3429 }
3430
3431 static void
3432 arc_write_done(zio_t *zio)
3433 {
3434 arc_write_callback_t *callback = zio->io_private;
3435 arc_buf_t *buf = callback->awcb_buf;
3436 arc_buf_hdr_t *hdr = buf->b_hdr;
3437
3438 ASSERT(hdr->b_acb == NULL);
3439
3440 if (zio->io_error == 0) {
3441 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3442 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3443 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3444 } else {
3445 ASSERT(BUF_EMPTY(hdr));
3446 }
3447
3448 /*
3449 * If the block to be written was all-zero, we may have
3450 * compressed it away. In this case no write was performed
3451 * so there will be no dva/birth/checksum. The buffer must
3491 }
3492 }
3493 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3494 /* if it's not anon, we are doing a scrub */
3495 if (!exists && hdr->b_state == arc_anon)
3496 arc_access(hdr, hash_lock);
3497 mutex_exit(hash_lock);
3498 } else {
3499 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3500 }
3501
3502 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3503 callback->awcb_done(zio, buf, callback->awcb_private);
3504
3505 kmem_free(callback, sizeof (arc_write_callback_t));
3506 }
3507
3508 zio_t *
3509 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3510 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3511 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3512 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3513 {
3514 arc_buf_hdr_t *hdr = buf->b_hdr;
3515 arc_write_callback_t *callback;
3516 zio_t *zio;
3517
3518 ASSERT(ready != NULL);
3519 ASSERT(done != NULL);
3520 ASSERT(!HDR_IO_ERROR(hdr));
3521 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3522 ASSERT(hdr->b_acb == NULL);
3523 if (l2arc)
3524 hdr->b_flags |= ARC_L2CACHE;
3525 if (l2arc_compress)
3526 hdr->b_flags |= ARC_L2COMPRESS;
3527 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3528 callback->awcb_ready = ready;
3529 callback->awcb_done = done;
3530 callback->awcb_private = private;
3531 callback->awcb_buf = buf;
3532
3533 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3534 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3535
3536 return (zio);
3537 }
3538
3539 static int
3540 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3541 {
3542 #ifdef _KERNEL
3543 uint64_t available_memory = ptob(freemem);
3544 static uint64_t page_load = 0;
3545 static uint64_t last_txg = 0;
3546
3547 #if defined(__i386)
3548 available_memory =
3549 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3550 #endif
3551 if (available_memory >= zfs_write_limit_max)
3552 return (0);
3553
3554 if (txg > last_txg) {
3555 last_txg = txg;
3556 page_load = 0;
3557 }
3558 /*
3559 * If we are in pageout, we know that memory is already tight and
3560 * the arc is already going to be evicting, so we just want to
3561 * continue to let page writes occur as quickly as possible.
3562 */
3563 if (curproc == proc_pageout) {
3564 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3565 return (SET_ERROR(ERESTART));
3566 /* Note: reserve is inflated, so we deflate */
3567 page_load += reserve / 8;
3568 return (0);
3569 } else if (page_load > 0 && arc_reclaim_needed()) {
3570 /* memory is low, delay before restarting */
3571 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3572 return (SET_ERROR(EAGAIN));
3573 }
3574 page_load = 0;
3575
3576 if (arc_size > arc_c_min) {
3577 uint64_t evictable_memory =
3578 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3579 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3580 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3581 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3582 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3583 }
3584
3585 if (inflight_data > available_memory / 4) {
3586 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3587 return (SET_ERROR(ERESTART));
3588 }
3589 #endif
3590 return (0);
3591 }
3592
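/*
 * Worked example (illustrative numbers, not from a real system) of the
 * throttle decision above: suppose ptob(freemem) = 600MB, which is below
 * zfs_write_limit_max, arc_size exceeds arc_c_min by 400MB, and the MRU/MFU
 * evictable lists hold 300MB.  available_memory becomes 600 + MIN(300, 400)
 * = 900MB, so inflight_data of 250MB (> 900/4 = 225MB) returns ERESTART and
 * bumps arcstat_memory_throttle_count, while 200MB of dirty data would be
 * allowed through.
 */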
3593 void
3594 arc_tempreserve_clear(uint64_t reserve)
3595 {
3596 atomic_add_64(&arc_tempreserve, -reserve);
3597 ASSERT((int64_t)arc_tempreserve >= 0);
3598 }
3599
3600 int
3601 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3602 {
3603 int error;
3604 uint64_t anon_size;
3605
3606 #ifdef ZFS_DEBUG
3607 /*
3608 * Once in a while, fail for no reason. Everything should cope.
3609 */
3610 if (spa_get_random(10000) == 0) {
3611 dprintf("forcing random failure\n");
3612 return (SET_ERROR(ERESTART));
3613 }
3614 #endif
3615 if (reserve > arc_c/4 && !arc_no_grow)
3616 arc_c = MIN(arc_c_max, reserve * 4);
3617 if (reserve > arc_c)
3618 return (SET_ERROR(ENOMEM));
3619
3620 /*
3621 * Don't count loaned bufs as in flight dirty data to prevent long
3622 * network delays from blocking transactions that are ready to be
3623 * assigned to a txg.
3624 */
3625 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3626
3627 /*
3628 * Writes will, almost always, require additional memory allocations
3629 * in order to compress/encrypt/etc the data. We therefore need to
3630 * make sure that there is sufficient available memory for this.
3631 */
3632 if (error = arc_memory_throttle(reserve, anon_size, txg))
3633 return (error);
3634
3635 /*
3636 * Throttle writes when the amount of dirty data in the cache
3637 * gets too large. We try to keep the cache less than half full
3638 * of dirty blocks so that our sync times don't grow too large.
3639 * Note: if two requests come in concurrently, we might let them
3640 * both succeed, when one of them should fail. Not a huge deal.
3641 */
3642
3643 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3644 anon_size > arc_c / 4) {
3645 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3646 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3647 arc_tempreserve>>10,
3648 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3649 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3650 reserve>>10, arc_c>>10);
3651 return (SET_ERROR(ERESTART));
3652 }
3761
3762 arc_thread_exit = 0;
3763 arc_eviction_list = NULL;
3764 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3765 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3766
3767 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3768 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3769
3770 if (arc_ksp != NULL) {
3771 arc_ksp->ks_data = &arc_stats;
3772 kstat_install(arc_ksp);
3773 }
3774
3775 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776 TS_RUN, minclsyspri);
3777
3778 arc_dead = FALSE;
3779 arc_warm = B_FALSE;
3780
3781 if (zfs_write_limit_max == 0)
3782 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3783 else
3784 zfs_write_limit_shift = 0;
3785 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3786 }
3787
3788 void
3789 arc_fini(void)
3790 {
3791 mutex_enter(&arc_reclaim_thr_lock);
3792 arc_thread_exit = 1;
3793 while (arc_thread_exit != 0)
3794 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3795 mutex_exit(&arc_reclaim_thr_lock);
3796
3797 arc_flush(NULL);
3798
3799 arc_dead = TRUE;
3800
3801 if (arc_ksp != NULL) {
3802 kstat_delete(arc_ksp);
3803 arc_ksp = NULL;
3804 }
3805
3806 mutex_destroy(&arc_eviction_mtx);
3807 mutex_destroy(&arc_reclaim_thr_lock);
3808 cv_destroy(&arc_reclaim_thr_cv);
3809
3810 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3811 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3812 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3813 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3814 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3815 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3816 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3817 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3818
3819 mutex_destroy(&arc_anon->arcs_mtx);
3820 mutex_destroy(&arc_mru->arcs_mtx);
3821 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3822 mutex_destroy(&arc_mfu->arcs_mtx);
3823 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3824 mutex_destroy(&arc_l2c_only->arcs_mtx);
3825
3826 mutex_destroy(&zfs_write_limit_lock);
3827
3828 buf_fini();
3829
3830 ASSERT(arc_loaned_bytes == 0);
3831 }
3832
3833 /*
3834 * Level 2 ARC
3835 *
3836 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3837 * It uses dedicated storage devices to hold cached data, which are populated
3838 * using large infrequent writes. The main role of this cache is to boost
3839 * the performance of random read workloads. The intended L2ARC devices
3840 * include short-stroked disks, solid state disks, and other media with
3841 * substantially faster read latency than disk.
3842 *
3843 * +-----------------------+
3844 * | ARC |
3845 * +-----------------------+
3846 * | ^ ^
3847 * | | |
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #include <sys/dsl_pool.h>
131 #ifdef _KERNEL
132 #include <sys/vmsystm.h>
133 #include <vm/anon.h>
134 #include <sys/fs/swapnode.h>
135 #include <sys/dnlc.h>
136 #endif
137 #include <sys/callb.h>
138 #include <sys/kstat.h>
139 #include <zfs_fletcher.h>
140
141 #ifndef _KERNEL
142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
143 boolean_t arc_watch = B_FALSE;
144 int arc_procfd;
145 #endif
146
147 static kmutex_t arc_reclaim_thr_lock;
148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
149 static uint8_t arc_thread_exit;
150
151 #define ARC_REDUCE_DNLC_PERCENT 3
152 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
153
154 typedef enum arc_reclaim_strategy {
155 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
156 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
157 } arc_reclaim_strategy_t;
158
159 /*
160 * The number of iterations through arc_evict_*() before we
161 * drop & reacquire the lock.
162 */
163 int arc_evict_iterations = 100;
164
165 /* number of seconds before growing cache again */
166 static int arc_grow_retry = 60;
167
168 /* shift of arc_c for calculating both min and max arc_p */
169 static int arc_p_min_shift = 4;
170
171 /* log2(fraction of arc to reclaim) */
172 static int arc_shrink_shift = 5;
173
174 /*
175 * minimum lifespan of a prefetch block in clock ticks
176 * (initialized in arc_init())
177 */
178 static int arc_min_prefetch_lifespan;
179
180 /*
181 * If this percent of memory is free, don't throttle.
182 */
183 int arc_lotsfree_percent = 10;
184
185 static int arc_dead;
186
187 /*
188 * The arc has filled available memory and has now warmed up.
189 */
190 static boolean_t arc_warm;
191
192 /*
193 * These tunables are for performance analysis.
194 */
195 uint64_t zfs_arc_max;
196 uint64_t zfs_arc_min;
197 uint64_t zfs_arc_meta_limit = 0;
198 int zfs_arc_grow_retry = 0;
199 int zfs_arc_shrink_shift = 0;
200 int zfs_arc_p_min_shift = 0;
201 int zfs_disable_dup_eviction = 0;
202
203 /*
204 * Note that buffers can be in one of 6 states:
460 static uint64_t arc_tempreserve;
461 static uint64_t arc_loaned_bytes;
462
463 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
464
465 typedef struct arc_callback arc_callback_t;
466
467 struct arc_callback {
468 void *acb_private;
469 arc_done_func_t *acb_done;
470 arc_buf_t *acb_buf;
471 zio_t *acb_zio_dummy;
472 arc_callback_t *acb_next;
473 };
474
475 typedef struct arc_write_callback arc_write_callback_t;
476
477 struct arc_write_callback {
478 void *awcb_private;
479 arc_done_func_t *awcb_ready;
480 arc_done_func_t *awcb_physdone;
481 arc_done_func_t *awcb_done;
482 arc_buf_t *awcb_buf;
483 };
484
485 struct arc_buf_hdr {
486 /* protected by hash lock */
487 dva_t b_dva;
488 uint64_t b_birth;
489 uint64_t b_cksum0;
490
491 kmutex_t b_freeze_lock;
492 zio_cksum_t *b_freeze_cksum;
493 void *b_thawed;
494
495 arc_buf_hdr_t *b_hash_next;
496 arc_buf_t *b_buf;
497 uint32_t b_flags;
498 uint32_t b_datacnt;
499
500 arc_callback_t *b_acb;
1155 list_insert_head(&state->arcs_list[ab->b_type], ab);
1156 ASSERT(ab->b_datacnt > 0);
1157 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1158 mutex_exit(&state->arcs_mtx);
1159 }
1160 return (cnt);
1161 }
1162
1163 /*
1164 * Move the supplied buffer to the indicated state. The mutex
1165 * for the buffer must be held by the caller.
1166 */
1167 static void
1168 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1169 {
1170 arc_state_t *old_state = ab->b_state;
1171 int64_t refcnt = refcount_count(&ab->b_refcnt);
1172 uint64_t from_delta, to_delta;
1173
1174 ASSERT(MUTEX_HELD(hash_lock));
1175 ASSERT3P(new_state, !=, old_state);
1176 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1177 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1178 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1179
1180 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1181
1182 /*
1183 * If this buffer is evictable, transfer it from the
1184 * old state list to the new state list.
1185 */
1186 if (refcnt == 0) {
1187 if (old_state != arc_anon) {
1188 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1189 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1190
1191 if (use_mutex)
1192 mutex_enter(&old_state->arcs_mtx);
1193
1194 ASSERT(list_link_active(&ab->b_arc_node));
1195 list_remove(&old_state->arcs_list[ab->b_type], ab);
1770 * - look for a buffer to evict that is `bytes' long.
1771 * - return the data block from this buffer rather than freeing it.
1772 * This flag is used by callers that are trying to make space for a
1773 * new buffer in a full arc cache.
1774 *
1775 * This function makes a "best effort". It skips over any buffers
1776 * it can't get a hash_lock on, and so may not catch all candidates.
1777 * It may also return without evicting as much space as requested.
1778 */
1779 static void *
1780 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1781 arc_buf_contents_t type)
1782 {
1783 arc_state_t *evicted_state;
1784 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1785 arc_buf_hdr_t *ab, *ab_prev = NULL;
1786 list_t *list = &state->arcs_list[type];
1787 kmutex_t *hash_lock;
1788 boolean_t have_lock;
1789 void *stolen = NULL;
1790 arc_buf_hdr_t marker = { 0 };
1791 int count = 0;
1792
1793 ASSERT(state == arc_mru || state == arc_mfu);
1794
1795 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1796
1797 mutex_enter(&state->arcs_mtx);
1798 mutex_enter(&evicted_state->arcs_mtx);
1799
1800 for (ab = list_tail(list); ab; ab = ab_prev) {
1801 ab_prev = list_prev(list, ab);
1802 /* prefetch buffers have a minimum lifespan */
1803 if (HDR_IO_IN_PROGRESS(ab) ||
1804 (spa && ab->b_spa != spa) ||
1805 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1806 ddi_get_lbolt() - ab->b_arc_access <
1807 arc_min_prefetch_lifespan)) {
1808 skipped++;
1809 continue;
1810 }
1811 /* "lookahead" for better eviction candidate */
1812 if (recycle && ab->b_size != bytes &&
1813 ab_prev && ab_prev->b_size == bytes)
1814 continue;
1815
1816 /* ignore markers */
1817 if (ab->b_spa == 0)
1818 continue;
1819
1820 /*
1821 * It may take a long time to evict all the bufs requested.
1822 * To avoid blocking all arc activity, periodically drop
1823 * the arcs_mtx and give other threads a chance to run
1824 * before reacquiring the lock.
1825 *
1826 * If we are looking for a buffer to recycle, we are in
1827 * the hot code path, so don't sleep.
1828 */
1829 if (!recycle && count++ > arc_evict_iterations) {
1830 list_insert_after(list, ab, &marker);
1831 mutex_exit(&evicted_state->arcs_mtx);
1832 mutex_exit(&state->arcs_mtx);
1833 kpreempt(KPREEMPT_SYNC);
1834 mutex_enter(&state->arcs_mtx);
1835 mutex_enter(&evicted_state->arcs_mtx);
1836 ab_prev = list_prev(list, &marker);
1837 list_remove(list, &marker);
1838 count = 0;
1839 continue;
1840 }
1841
1842 hash_lock = HDR_LOCK(ab);
1843 have_lock = MUTEX_HELD(hash_lock);
1844 if (have_lock || mutex_tryenter(hash_lock)) {
1845 ASSERT0(refcount_count(&ab->b_refcnt));
1846 ASSERT(ab->b_datacnt > 0);
1847 while (ab->b_buf) {
1848 arc_buf_t *buf = ab->b_buf;
1849 if (!mutex_tryenter(&buf->b_evict_lock)) {
1850 missed += 1;
1851 break;
1852 }
1853 if (buf->b_data) {
1854 bytes_evicted += ab->b_size;
1855 if (recycle && ab->b_type == type &&
1856 ab->b_size == bytes &&
1857 !HDR_L2_WRITING(ab)) {
1858 stolen = buf->b_data;
1859 recycle = FALSE;
1860 }
1861 }
1903 break;
1904 } else {
1905 missed += 1;
1906 }
1907 }
1908
1909 mutex_exit(&evicted_state->arcs_mtx);
1910 mutex_exit(&state->arcs_mtx);
1911
1912 if (bytes_evicted < bytes)
1913 dprintf("only evicted %lld bytes from %x",
1914 (longlong_t)bytes_evicted, state);
1915
1916 if (skipped)
1917 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1918
1919 if (missed)
1920 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1921
1922 /*
1923 * Note: we have just evicted some data into the ghost state,
1924 * potentially putting the ghost size over the desired size. Rather
1925 * than evicting from the ghost list in this hot code path, leave
1926 * this chore to the arc_reclaim_thread().
1927 */
1928
1929 return (stolen);
1930 }
1931
1932 /*
1933 * Remove buffers from the list until we've removed the specified number of
1934 * bytes. Destroy the buffers that are removed.
1935 */
1936 static void
1937 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1938 {
1939 arc_buf_hdr_t *ab, *ab_prev;
1940 arc_buf_hdr_t marker = { 0 };
1941 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1942 kmutex_t *hash_lock;
1943 uint64_t bytes_deleted = 0;
1944 uint64_t bufs_skipped = 0;
1945 int count = 0;
1946
1947 ASSERT(GHOST_STATE(state));
1948 top:
1949 mutex_enter(&state->arcs_mtx);
1950 for (ab = list_tail(list); ab; ab = ab_prev) {
1951 ab_prev = list_prev(list, ab);
1952 if (ab->b_type > ARC_BUFC_NUMTYPES)
1953 panic("invalid ab=%p", (void *)ab);
1954 if (spa && ab->b_spa != spa)
1955 continue;
1956
1957 /* ignore markers */
1958 if (ab->b_spa == 0)
1959 continue;
1960
1961 hash_lock = HDR_LOCK(ab);
1962 /* caller may be trying to modify this buffer, skip it */
1963 if (MUTEX_HELD(hash_lock))
1964 continue;
1965
1966 /*
1967 * It may take a long time to evict all the bufs requested.
1968 * To avoid blocking all arc activity, periodically drop
1969 * the arcs_mtx and give other threads a chance to run
1970 * before reacquiring the lock.
1971 */
1972 if (count++ > arc_evict_iterations) {
1973 list_insert_after(list, ab, &marker);
1974 mutex_exit(&state->arcs_mtx);
1975 kpreempt(KPREEMPT_SYNC);
1976 mutex_enter(&state->arcs_mtx);
1977 ab_prev = list_prev(list, &marker);
1978 list_remove(list, &marker);
1979 count = 0;
1980 continue;
1981 }
1982 if (mutex_tryenter(hash_lock)) {
1983 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1984 ASSERT(ab->b_buf == NULL);
1985 ARCSTAT_BUMP(arcstat_deleted);
1986 bytes_deleted += ab->b_size;
1987
1988 if (ab->b_l2hdr != NULL) {
1989 /*
1990 * This buffer is cached on the 2nd Level ARC;
1991 * don't destroy the header.
1992 */
1993 arc_change_state(arc_l2c_only, ab, hash_lock);
1994 mutex_exit(hash_lock);
1995 } else {
1996 arc_change_state(arc_anon, ab, hash_lock);
1997 mutex_exit(hash_lock);
1998 arc_hdr_destroy(ab);
1999 }
2000
2001 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2002 if (bytes >= 0 && bytes_deleted >= bytes)
2003 break;
2004 } else if (bytes < 0) {
2005 /*
2006 * Insert a list marker and then wait for the
2007 * hash lock to become available. Once it's
2008 * available, restart from where we left off.
2009 */
2010 list_insert_after(list, ab, &marker);
2011 mutex_exit(&state->arcs_mtx);
2012 mutex_enter(hash_lock);
2013 mutex_exit(hash_lock);
2014 mutex_enter(&state->arcs_mtx);
2015 ab_prev = list_prev(list, &marker);
2016 list_remove(list, &marker);
2017 } else {
2018 bufs_skipped += 1;
2019 }
2020
2021 }
2022 mutex_exit(&state->arcs_mtx);
2023
2024 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2025 (bytes < 0 || bytes_deleted < bytes)) {
2026 list = &state->arcs_list[ARC_BUFC_METADATA];
2027 goto top;
2028 }
2029
2030 if (bufs_skipped) {
2031 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2032 ASSERT(bytes >= 0);
2033 }
2034
2035 if (bytes_deleted < bytes)
2036 dprintf("only deleted %lld bytes from %p",
2037 (longlong_t)bytes_deleted, state);
2038 }
2039
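/*
 * Illustrative sketch (not part of this file) of the list-marker technique
 * used by arc_evict() and arc_evict_ghost() above.  A stack-allocated
 * header with b_spa == 0 is inserted as a placeholder so that arcs_mtx can
 * be dropped and the scan resumed afterwards; the "ignore markers" checks
 * above skip such entries.  The helper name example_drop_and_resume() is
 * hypothetical and exists only for illustration.
 */
#if 0
static arc_buf_hdr_t *
example_drop_and_resume(arc_state_t *state, list_t *list, arc_buf_hdr_t *ab)
{
	arc_buf_hdr_t marker = { 0 };	/* b_spa == 0 identifies a marker */
	arc_buf_hdr_t *resume;

	list_insert_after(list, ab, &marker);	/* remember our position */
	mutex_exit(&state->arcs_mtx);		/* let other threads run */
	kpreempt(KPREEMPT_SYNC);		/* yield the CPU */
	mutex_enter(&state->arcs_mtx);		/* pick up where we left off */
	resume = list_prev(list, &marker);
	list_remove(list, &marker);
	return (resume);			/* becomes the new ab_prev */
}
#endif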
2040 static void
2041 arc_adjust(void)
2854 /*
2855 * "Read" the block at the specified DVA (in bp) via the
2856 * cache. If the block is found in the cache, invoke the provided
2857 * callback immediately and return. Note that the `zio' parameter
2858 * in the callback will be NULL in this case, since no IO was
2859 * required. If the block is not in the cache, pass the read request
2860 * on to the spa with a substitute callback function, so that the
2861 * requested block will be added to the cache.
2862 *
2863 * If a read request arrives for a block that has a read in-progress,
2864 * either wait for the in-progress read to complete (and return the
2865 * results); or, if this is a read with a "done" func, add a record
2866 * to the read to invoke the "done" func when the read completes,
2867 * and return; or just return.
2868 *
2869 * arc_read_done() will invoke all the requested "done" functions
2870 * for readers of this block.
2871 */
2872 int
2873 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2874 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2875 const zbookmark_t *zb)
2876 {
2877 arc_buf_hdr_t *hdr;
2878 arc_buf_t *buf = NULL;
2879 kmutex_t *hash_lock;
2880 zio_t *rzio;
2881 uint64_t guid = spa_load_guid(spa);
2882
2883 top:
2884 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2885 &hash_lock);
2886 if (hdr && hdr->b_datacnt > 0) {
2887
2888 *arc_flags |= ARC_CACHED;
2889
2890 if (HDR_IO_IN_PROGRESS(hdr)) {
2891
2892 if (*arc_flags & ARC_WAIT) {
2893 cv_wait(&hdr->b_cv, hash_lock);
2894 mutex_exit(hash_lock);
3457 callback->awcb_ready(zio, buf, callback->awcb_private);
3458
3459 /*
3460 * If the IO is already in progress, then this is a re-write
3461 * attempt, so we need to thaw and re-compute the cksum.
3462 * It is the responsibility of the callback to handle the
3463 * accounting for any re-write attempt.
3464 */
3465 if (HDR_IO_IN_PROGRESS(hdr)) {
3466 mutex_enter(&hdr->b_freeze_lock);
3467 if (hdr->b_freeze_cksum != NULL) {
3468 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3469 hdr->b_freeze_cksum = NULL;
3470 }
3471 mutex_exit(&hdr->b_freeze_lock);
3472 }
3473 arc_cksum_compute(buf, B_FALSE);
3474 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3475 }
3476
3477 /*
3478 * The SPA calls this callback for each physical write that happens on behalf
3479 * of a logical write. See the comment in dbuf_write_physdone() for details.
3480 */
3481 static void
3482 arc_write_physdone(zio_t *zio)
3483 {
3484 arc_write_callback_t *cb = zio->io_private;
3485 if (cb->awcb_physdone != NULL)
3486 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3487 }
3488
3489 static void
3490 arc_write_done(zio_t *zio)
3491 {
3492 arc_write_callback_t *callback = zio->io_private;
3493 arc_buf_t *buf = callback->awcb_buf;
3494 arc_buf_hdr_t *hdr = buf->b_hdr;
3495
3496 ASSERT(hdr->b_acb == NULL);
3497
3498 if (zio->io_error == 0) {
3499 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3500 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3501 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3502 } else {
3503 ASSERT(BUF_EMPTY(hdr));
3504 }
3505
3506 /*
3507 * If the block to be written was all-zero, we may have
3508 * compressed it away. In this case no write was performed
3509 * so there will be no dva/birth/checksum. The buffer must
3549 }
3550 }
3551 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3552 /* if it's not anon, we are doing a scrub */
3553 if (!exists && hdr->b_state == arc_anon)
3554 arc_access(hdr, hash_lock);
3555 mutex_exit(hash_lock);
3556 } else {
3557 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3558 }
3559
3560 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3561 callback->awcb_done(zio, buf, callback->awcb_private);
3562
3563 kmem_free(callback, sizeof (arc_write_callback_t));
3564 }
3565
3566 zio_t *
3567 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3568 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3569 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3570 arc_done_func_t *done, void *private, zio_priority_t priority,
3571 int zio_flags, const zbookmark_t *zb)
3572 {
3573 arc_buf_hdr_t *hdr = buf->b_hdr;
3574 arc_write_callback_t *callback;
3575 zio_t *zio;
3576
3577 ASSERT(ready != NULL);
3578 ASSERT(done != NULL);
3579 ASSERT(!HDR_IO_ERROR(hdr));
3580 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3581 ASSERT(hdr->b_acb == NULL);
3582 if (l2arc)
3583 hdr->b_flags |= ARC_L2CACHE;
3584 if (l2arc_compress)
3585 hdr->b_flags |= ARC_L2COMPRESS;
3586 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3587 callback->awcb_ready = ready;
3588 callback->awcb_physdone = physdone;
3589 callback->awcb_done = done;
3590 callback->awcb_private = private;
3591 callback->awcb_buf = buf;
3592
3593 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3594 arc_write_ready, arc_write_physdone, arc_write_done, callback,
3595 priority, zio_flags, zb);
3596
3597 return (zio);
3598 }
3599
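/*
 * Illustrative sketch (not part of this file) of how a caller might wire
 * the ready/physdone/done callbacks into arc_write().  The example_* names
 * are hypothetical; the callback signature matches the way awcb_ready,
 * awcb_physdone and awcb_done are invoked above, and the priority constant
 * is assumed to be one of the zio_priority_t values.
 */
#if 0
static void
example_ready(zio_t *zio, arc_buf_t *buf, void *priv)
{
	/* buffer contents are final here, just before the write is issued */
}

static void
example_physdone(zio_t *zio, arc_buf_t *buf, void *priv)
{
	/* one call per physical write; a natural place for dirty accounting */
}

static void
example_done(zio_t *zio, arc_buf_t *buf, void *priv)
{
	/* the logical write finished; zio->io_error carries the result */
}

static zio_t *
example_issue_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, const zio_prop_t *zp, const zbookmark_t *zb)
{
	return (arc_write(pio, spa, txg, bp, buf,
	    B_TRUE,			/* buffer is eligible for the L2ARC */
	    B_FALSE,			/* don't compress it in the L2ARC */
	    zp, example_ready, example_physdone, example_done,
	    NULL,			/* private argument for the callbacks */
	    ZIO_PRIORITY_ASYNC_WRITE, 0, zb));
}
#endif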
3600 static int
3601 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3602 {
3603 #ifdef _KERNEL
3604 uint64_t available_memory = ptob(freemem);
3605 static uint64_t page_load = 0;
3606 static uint64_t last_txg = 0;
3607
3608 #if defined(__i386)
3609 available_memory =
3610 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3611 #endif
3612
3613 if (freemem > physmem * arc_lotsfree_percent / 100)
3614 return (0);
3615
3616 if (txg > last_txg) {
3617 last_txg = txg;
3618 page_load = 0;
3619 }
3620 /*
3621 * If we are in pageout, we know that memory is already tight and
3622 * the arc is already going to be evicting, so we just want to
3623 * continue to let page writes occur as quickly as possible.
3624 */
3625 if (curproc == proc_pageout) {
3626 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3627 return (SET_ERROR(ERESTART));
3628 /* Note: reserve is inflated, so we deflate */
3629 page_load += reserve / 8;
3630 return (0);
3631 } else if (page_load > 0 && arc_reclaim_needed()) {
3632 /* memory is low, delay before restarting */
3633 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3634 return (SET_ERROR(EAGAIN));
3635 }
3636 page_load = 0;
3637 #endif
3638 return (0);
3639 }
3640
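/*
 * Worked example (illustrative numbers, not from a real system) of the
 * arc_lotsfree_percent check above: with physmem equivalent to 8GB and the
 * default arc_lotsfree_percent of 10, the throttle is skipped entirely as
 * long as more than ~800MB (physmem / 10) remains free; only below that
 * point do the pageout and page_load checks come into play.
 */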
3641 void
3642 arc_tempreserve_clear(uint64_t reserve)
3643 {
3644 atomic_add_64(&arc_tempreserve, -reserve);
3645 ASSERT((int64_t)arc_tempreserve >= 0);
3646 }
3647
3648 int
3649 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3650 {
3651 int error;
3652 uint64_t anon_size;
3653
3654 if (reserve > arc_c/4 && !arc_no_grow)
3655 arc_c = MIN(arc_c_max, reserve * 4);
3656 if (reserve > arc_c)
3657 return (SET_ERROR(ENOMEM));
3658
3659 /*
3660 * Don't count loaned bufs as in flight dirty data to prevent long
3661 * network delays from blocking transactions that are ready to be
3662 * assigned to a txg.
3663 */
3664 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3665
3666 /*
3667 * Writes will, almost always, require additional memory allocations
3668 * in order to compress/encrypt/etc the data. We therefore need to
3669 * make sure that there is sufficient available memory for this.
3670 */
3671 error = arc_memory_throttle(reserve, txg);
3672 if (error != 0)
3673 return (error);
3674
3675 /*
3676 * Throttle writes when the amount of dirty data in the cache
3677 * gets too large. We try to keep the cache less than half full
3678 * of dirty blocks so that our sync times don't grow too large.
3679 * Note: if two requests come in concurrently, we might let them
3680 * both succeed, when one of them should fail. Not a huge deal.
3681 */
3682
3683 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3684 anon_size > arc_c / 4) {
3685 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3686 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3687 arc_tempreserve>>10,
3688 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3689 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3690 reserve>>10, arc_c>>10);
3691 return (SET_ERROR(ERESTART));
3692 }
3801
3802 arc_thread_exit = 0;
3803 arc_eviction_list = NULL;
3804 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3805 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3806
3807 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3808 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3809
3810 if (arc_ksp != NULL) {
3811 arc_ksp->ks_data = &arc_stats;
3812 kstat_install(arc_ksp);
3813 }
3814
3815 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3816 TS_RUN, minclsyspri);
3817
3818 arc_dead = FALSE;
3819 arc_warm = B_FALSE;
3820
3821 /*
3822 * Calculate maximum amount of dirty data per pool.
3823 *
3824 * If it has been set by /etc/system, take that.
3825 * Otherwise, use a percentage of physical memory defined by
3826 * zfs_dirty_data_max_percent (default 10%) with a cap at
3827 * zfs_dirty_data_max_max (default 4GB).
3828 */
3829 if (zfs_dirty_data_max == 0) {
3830 zfs_dirty_data_max = physmem * PAGESIZE *
3831 zfs_dirty_data_max_percent / 100;
3832 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3833 zfs_dirty_data_max_max);
3834 }
3835 }
3836
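/*
 * Worked example (illustrative numbers, not from a real system) of the
 * zfs_dirty_data_max calculation above: on a machine with 16GB of physical
 * memory, the default zfs_dirty_data_max_percent of 10 yields about 1.6GB,
 * which is below the 4GB zfs_dirty_data_max_max cap and is used as is; on a
 * 64GB machine the 10% figure (6.4GB) would be capped to 4GB.  A value set
 * via /etc/system bypasses this calculation entirely.
 */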
3837 void
3838 arc_fini(void)
3839 {
3840 mutex_enter(&arc_reclaim_thr_lock);
3841 arc_thread_exit = 1;
3842 while (arc_thread_exit != 0)
3843 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3844 mutex_exit(&arc_reclaim_thr_lock);
3845
3846 arc_flush(NULL);
3847
3848 arc_dead = TRUE;
3849
3850 if (arc_ksp != NULL) {
3851 kstat_delete(arc_ksp);
3852 arc_ksp = NULL;
3853 }
3854
3855 mutex_destroy(&arc_eviction_mtx);
3856 mutex_destroy(&arc_reclaim_thr_lock);
3857 cv_destroy(&arc_reclaim_thr_cv);
3858
3859 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3860 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3861 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3862 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3863 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3864 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3865 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3866 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3867
3868 mutex_destroy(&arc_anon->arcs_mtx);
3869 mutex_destroy(&arc_mru->arcs_mtx);
3870 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3871 mutex_destroy(&arc_mfu->arcs_mtx);
3872 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3873 mutex_destroy(&arc_l2c_only->arcs_mtx);
3874
3875 buf_fini();
3876
3877 ASSERT(arc_loaned_bytes == 0);
3878 }
3879
3880 /*
3881 * Level 2 ARC
3882 *
3883 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3884 * It uses dedicated storage devices to hold cached data, which are populated
3885 * using large infrequent writes. The main role of this cache is to boost
3886 * the performance of random read workloads. The intended L2ARC devices
3887 * include short-stroked disks, solid state disks, and other media with
3888 * substantially faster read latency than disk.
3889 *
3890 * +-----------------------+
3891 * | ARC |
3892 * +-----------------------+
3893 * | ^ ^
3894 * | | |