110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139
140 #ifndef _KERNEL
141 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
142 boolean_t arc_watch = B_FALSE;
143 int arc_procfd;
144 #endif
145
146 static kmutex_t arc_reclaim_thr_lock;
147 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
148 static uint8_t arc_thread_exit;
149
150 extern int zfs_write_limit_shift;
151 extern uint64_t zfs_write_limit_max;
152 extern kmutex_t zfs_write_limit_lock;
153
154 #define ARC_REDUCE_DNLC_PERCENT 3
155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157 typedef enum arc_reclaim_strategy {
158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160 } arc_reclaim_strategy_t;
161
162 /* number of seconds before growing cache again */
163 static int arc_grow_retry = 60;
164
165 /* shift of arc_c for calculating both min and max arc_p */
166 static int arc_p_min_shift = 4;
167
168 /* log2(fraction of arc to reclaim) */
169 static int arc_shrink_shift = 5;
170
171 /*
172 * minimum lifespan of a prefetch block in clock ticks
173 * (initialized in arc_init())
174 */
175 static int arc_min_prefetch_lifespan;
176
177 static int arc_dead;
178
179 /*
180 * The arc has filled available memory and has now warmed up.
181 */
182 static boolean_t arc_warm;
183
184 /*
185 * These tunables are for performance analysis.
186 */
187 uint64_t zfs_arc_max;
188 uint64_t zfs_arc_min;
189 uint64_t zfs_arc_meta_limit = 0;
190 int zfs_arc_grow_retry = 0;
191 int zfs_arc_shrink_shift = 0;
192 int zfs_arc_p_min_shift = 0;
193 int zfs_disable_dup_eviction = 0;
194
195 /*
196 * Note that buffers can be in one of 6 states:
452 static uint64_t arc_tempreserve;
453 static uint64_t arc_loaned_bytes;
454
455 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
456
457 typedef struct arc_callback arc_callback_t;
458
459 struct arc_callback {
460 void *acb_private;
461 arc_done_func_t *acb_done;
462 arc_buf_t *acb_buf;
463 zio_t *acb_zio_dummy;
464 arc_callback_t *acb_next;
465 };
466
467 typedef struct arc_write_callback arc_write_callback_t;
468
469 struct arc_write_callback {
470 void *awcb_private;
471 arc_done_func_t *awcb_ready;
472 arc_done_func_t *awcb_done;
473 arc_buf_t *awcb_buf;
474 };
475
476 struct arc_buf_hdr {
477 /* protected by hash lock */
478 dva_t b_dva;
479 uint64_t b_birth;
480 uint64_t b_cksum0;
481
482 kmutex_t b_freeze_lock;
483 zio_cksum_t *b_freeze_cksum;
484 void *b_thawed;
485
486 arc_buf_hdr_t *b_hash_next;
487 arc_buf_t *b_buf;
488 uint32_t b_flags;
489 uint32_t b_datacnt;
490
491 arc_callback_t *b_acb;
1146 list_insert_head(&state->arcs_list[ab->b_type], ab);
1147 ASSERT(ab->b_datacnt > 0);
1148 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1149 mutex_exit(&state->arcs_mtx);
1150 }
1151 return (cnt);
1152 }
1153
1154 /*
1155 * Move the supplied buffer to the indicated state. The mutex
1156 * for the buffer must be held by the caller.
1157 */
1158 static void
1159 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1160 {
1161 arc_state_t *old_state = ab->b_state;
1162 int64_t refcnt = refcount_count(&ab->b_refcnt);
1163 uint64_t from_delta, to_delta;
1164
1165 ASSERT(MUTEX_HELD(hash_lock));
1166 ASSERT(new_state != old_state);
1167 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1168 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1169 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1170
1171 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1172
1173 /*
1174 * If this buffer is evictable, transfer it from the
1175 * old state list to the new state list.
1176 */
1177 if (refcnt == 0) {
1178 if (old_state != arc_anon) {
1179 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1180 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1181
1182 if (use_mutex)
1183 mutex_enter(&old_state->arcs_mtx);
1184
1185 ASSERT(list_link_active(&ab->b_arc_node));
1186 list_remove(&old_state->arcs_list[ab->b_type], ab);
1761 * - look for a buffer to evict that is `bytes' long.
1762 * - return the data block from this buffer rather than freeing it.
1763 * This flag is used by callers that are trying to make space for a
1764 * new buffer in a full arc cache.
1765 *
1766 * This function makes a "best effort". It skips over any buffers
1767 * it can't get a hash_lock on, and so may not catch all candidates.
1768 * It may also return without evicting as much space as requested.
1769 */
1770 static void *
1771 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1772 arc_buf_contents_t type)
1773 {
1774 arc_state_t *evicted_state;
1775 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1776 arc_buf_hdr_t *ab, *ab_prev = NULL;
1777 list_t *list = &state->arcs_list[type];
1778 kmutex_t *hash_lock;
1779 boolean_t have_lock;
1780 void *stolen = NULL;
1781
1782 ASSERT(state == arc_mru || state == arc_mfu);
1783
1784 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1785
1786 mutex_enter(&state->arcs_mtx);
1787 mutex_enter(&evicted_state->arcs_mtx);
1788
1789 for (ab = list_tail(list); ab; ab = ab_prev) {
1790 ab_prev = list_prev(list, ab);
1791 /* prefetch buffers have a minimum lifespan */
1792 if (HDR_IO_IN_PROGRESS(ab) ||
1793 (spa && ab->b_spa != spa) ||
1794 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1795 ddi_get_lbolt() - ab->b_arc_access <
1796 arc_min_prefetch_lifespan)) {
1797 skipped++;
1798 continue;
1799 }
1800 /* "lookahead" for better eviction candidate */
1801 if (recycle && ab->b_size != bytes &&
1802 ab_prev && ab_prev->b_size == bytes)
1803 continue;
1804 hash_lock = HDR_LOCK(ab);
1805 have_lock = MUTEX_HELD(hash_lock);
1806 if (have_lock || mutex_tryenter(hash_lock)) {
1807 ASSERT0(refcount_count(&ab->b_refcnt));
1808 ASSERT(ab->b_datacnt > 0);
1809 while (ab->b_buf) {
1810 arc_buf_t *buf = ab->b_buf;
1811 if (!mutex_tryenter(&buf->b_evict_lock)) {
1812 missed += 1;
1813 break;
1814 }
1815 if (buf->b_data) {
1816 bytes_evicted += ab->b_size;
1817 if (recycle && ab->b_type == type &&
1818 ab->b_size == bytes &&
1819 !HDR_L2_WRITING(ab)) {
1820 stolen = buf->b_data;
1821 recycle = FALSE;
1822 }
1823 }
1865 break;
1866 } else {
1867 missed += 1;
1868 }
1869 }
1870
1871 mutex_exit(&evicted_state->arcs_mtx);
1872 mutex_exit(&state->arcs_mtx);
1873
1874 if (bytes_evicted < bytes)
1875 dprintf("only evicted %lld bytes from %x",
1876 (longlong_t)bytes_evicted, state);
1877
1878 if (skipped)
1879 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1880
1881 if (missed)
1882 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1883
1884 /*
1885 * We have just evicted some data into the ghost state, so make
1886 * sure we also adjust the ghost state size if necessary.
1887 */
1888 if (arc_no_grow &&
1889 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1890 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1891 arc_mru_ghost->arcs_size - arc_c;
1892
1893 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1894 int64_t todelete =
1895 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1896 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1897 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1898 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1899 arc_mru_ghost->arcs_size +
1900 arc_mfu_ghost->arcs_size - arc_c);
1901 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1902 }
1903 }
1904
1905 return (stolen);
1906 }
1907
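/*
 * Worked example (illustrative numbers, not from a real system) of the
 * ghost-state adjustment at the end of arc_evict() above: with arc_no_grow
 * set, arc_c = 1000MB, arc_anon = 100MB, arc_mru = 500MB,
 * arc_mru_ghost = 600MB and arc_mfu_ghost = 500MB, the ghost lists total
 * 1100MB > arc_c, so the adjustment runs.  mru_over = 100 + 500 + 600 -
 * 1000 = 200MB, and up to MIN(arc_mru_ghost->arcs_lsize[type], 200MB) is
 * evicted from the MRU ghost list; if that branch does not apply (mru_over
 * <= 0, or no MRU-ghost bytes of this type), the MFU ghost list is trimmed
 * by the combined ghost excess, 1100 - 1000 = 100MB, instead.
 */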
1908 /*
1909 * Remove buffers from the list until we've removed the specified number of
1910 * bytes. Destroy the buffers that are removed.
1911 */
1912 static void
1913 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1914 {
1915 arc_buf_hdr_t *ab, *ab_prev;
1916 arc_buf_hdr_t marker = { 0 };
1917 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1918 kmutex_t *hash_lock;
1919 uint64_t bytes_deleted = 0;
1920 uint64_t bufs_skipped = 0;
1921
1922 ASSERT(GHOST_STATE(state));
1923 top:
1924 mutex_enter(&state->arcs_mtx);
1925 for (ab = list_tail(list); ab; ab = ab_prev) {
1926 ab_prev = list_prev(list, ab);
1927 if (spa && ab->b_spa != spa)
1928 continue;
1929
1930 /* ignore markers */
1931 if (ab->b_spa == 0)
1932 continue;
1933
1934 hash_lock = HDR_LOCK(ab);
1935 /* caller may be trying to modify this buffer, skip it */
1936 if (MUTEX_HELD(hash_lock))
1937 continue;
1938 if (mutex_tryenter(hash_lock)) {
1939 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1940 ASSERT(ab->b_buf == NULL);
1941 ARCSTAT_BUMP(arcstat_deleted);
1942 bytes_deleted += ab->b_size;
1943
1944 if (ab->b_l2hdr != NULL) {
1945 /*
1946 * This buffer is cached on the 2nd Level ARC;
1947 * don't destroy the header.
1948 */
1949 arc_change_state(arc_l2c_only, ab, hash_lock);
1950 mutex_exit(hash_lock);
1951 } else {
1952 arc_change_state(arc_anon, ab, hash_lock);
1953 mutex_exit(hash_lock);
1954 arc_hdr_destroy(ab);
1955 }
1956
1957 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1958 if (bytes >= 0 && bytes_deleted >= bytes)
1959 break;
1960 } else if (bytes < 0) {
1961 /*
1962 * Insert a list marker and then wait for the
1963 * hash lock to become available. Once it's
1964 * available, restart from where we left off.
1965 */
1966 list_insert_after(list, ab, &marker);
1967 mutex_exit(&state->arcs_mtx);
1968 mutex_enter(hash_lock);
1969 mutex_exit(hash_lock);
1970 mutex_enter(&state->arcs_mtx);
1971 ab_prev = list_prev(list, &marker);
1972 list_remove(list, &marker);
1973 } else
1974 bufs_skipped += 1;
1975 }
1976 mutex_exit(&state->arcs_mtx);
1977
1978 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1979 (bytes < 0 || bytes_deleted < bytes)) {
1980 list = &state->arcs_list[ARC_BUFC_METADATA];
1981 goto top;
1982 }
1983
1984 if (bufs_skipped) {
1985 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1986 ASSERT(bytes >= 0);
1987 }
1988
1989 if (bytes_deleted < bytes)
1990 dprintf("only deleted %lld bytes from %p",
1991 (longlong_t)bytes_deleted, state);
1992 }
1993
1994 static void
1995 arc_adjust(void)
2808 /*
2809 * "Read" the block at the specified DVA (in bp) via the
2810 * cache. If the block is found in the cache, invoke the provided
2811 * callback immediately and return. Note that the `zio' parameter
2812 * in the callback will be NULL in this case, since no IO was
2813 * required. If the block is not in the cache, pass the read request
2814 * on to the spa with a substitute callback function, so that the
2815 * requested block will be added to the cache.
2816 *
2817 * If a read request arrives for a block that has a read in-progress,
2818 * either wait for the in-progress read to complete (and return the
2819 * results); or, if this is a read with a "done" func, add a record
2820 * to the read to invoke the "done" func when the read completes,
2821 * and return; or just return.
2822 *
2823 * arc_read_done() will invoke all the requested "done" functions
2824 * for readers of this block.
2825 */
2826 int
2827 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2828 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2829 const zbookmark_t *zb)
2830 {
2831 arc_buf_hdr_t *hdr;
2832 arc_buf_t *buf = NULL;
2833 kmutex_t *hash_lock;
2834 zio_t *rzio;
2835 uint64_t guid = spa_load_guid(spa);
2836
2837 top:
2838 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2839 &hash_lock);
2840 if (hdr && hdr->b_datacnt > 0) {
2841
2842 *arc_flags |= ARC_CACHED;
2843
2844 if (HDR_IO_IN_PROGRESS(hdr)) {
2845
2846 if (*arc_flags & ARC_WAIT) {
2847 cv_wait(&hdr->b_cv, hash_lock);
2848 mutex_exit(hash_lock);
3411 callback->awcb_ready(zio, buf, callback->awcb_private);
3412
3413 /*
3414 * If the IO is already in progress, then this is a re-write
3415 * attempt, so we need to thaw and re-compute the cksum.
3416 * It is the responsibility of the callback to handle the
3417 * accounting for any re-write attempt.
3418 */
3419 if (HDR_IO_IN_PROGRESS(hdr)) {
3420 mutex_enter(&hdr->b_freeze_lock);
3421 if (hdr->b_freeze_cksum != NULL) {
3422 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3423 hdr->b_freeze_cksum = NULL;
3424 }
3425 mutex_exit(&hdr->b_freeze_lock);
3426 }
3427 arc_cksum_compute(buf, B_FALSE);
3428 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3429 }
3430
3431 static void
3432 arc_write_done(zio_t *zio)
3433 {
3434 arc_write_callback_t *callback = zio->io_private;
3435 arc_buf_t *buf = callback->awcb_buf;
3436 arc_buf_hdr_t *hdr = buf->b_hdr;
3437
3438 ASSERT(hdr->b_acb == NULL);
3439
3440 if (zio->io_error == 0) {
3441 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3442 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3443 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3444 } else {
3445 ASSERT(BUF_EMPTY(hdr));
3446 }
3447
3448 /*
3449 * If the block to be written was all-zero, we may have
3450 * compressed it away. In this case no write was performed
3451 * so there will be no dva/birth/checksum. The buffer must
3491 }
3492 }
3493 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3494 /* if it's not anon, we are doing a scrub */
3495 if (!exists && hdr->b_state == arc_anon)
3496 arc_access(hdr, hash_lock);
3497 mutex_exit(hash_lock);
3498 } else {
3499 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3500 }
3501
3502 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3503 callback->awcb_done(zio, buf, callback->awcb_private);
3504
3505 kmem_free(callback, sizeof (arc_write_callback_t));
3506 }
3507
3508 zio_t *
3509 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3510 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3511 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3512 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3513 {
3514 arc_buf_hdr_t *hdr = buf->b_hdr;
3515 arc_write_callback_t *callback;
3516 zio_t *zio;
3517
3518 ASSERT(ready != NULL);
3519 ASSERT(done != NULL);
3520 ASSERT(!HDR_IO_ERROR(hdr));
3521 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3522 ASSERT(hdr->b_acb == NULL);
3523 if (l2arc)
3524 hdr->b_flags |= ARC_L2CACHE;
3525 if (l2arc_compress)
3526 hdr->b_flags |= ARC_L2COMPRESS;
3527 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3528 callback->awcb_ready = ready;
3529 callback->awcb_done = done;
3530 callback->awcb_private = private;
3531 callback->awcb_buf = buf;
3532
3533 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3534 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3535
3536 return (zio);
3537 }
3538
3539 static int
3540 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3541 {
3542 #ifdef _KERNEL
3543 uint64_t available_memory = ptob(freemem);
3544 static uint64_t page_load = 0;
3545 static uint64_t last_txg = 0;
3546
3547 #if defined(__i386)
3548 available_memory =
3549 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3550 #endif
3551 if (available_memory >= zfs_write_limit_max)
3552 return (0);
3553
3554 if (txg > last_txg) {
3555 last_txg = txg;
3556 page_load = 0;
3557 }
3558 /*
3559 * If we are in pageout, we know that memory is already tight and
3560 * the arc is already going to be evicting, so we just want to
3561 * continue to let page writes occur as quickly as possible.
3562 */
3563 if (curproc == proc_pageout) {
3564 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3565 return (SET_ERROR(ERESTART));
3566 /* Note: reserve is inflated, so we deflate */
3567 page_load += reserve / 8;
3568 return (0);
3569 } else if (page_load > 0 && arc_reclaim_needed()) {
3570 /* memory is low, delay before restarting */
3571 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3572 return (SET_ERROR(EAGAIN));
3573 }
3574 page_load = 0;
3575
3576 if (arc_size > arc_c_min) {
3577 uint64_t evictable_memory =
3578 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3579 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3580 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3581 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3582 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3583 }
3584
3585 if (inflight_data > available_memory / 4) {
3586 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3587 return (SET_ERROR(ERESTART));
3588 }
3589 #endif
3590 return (0);
3591 }
3592
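/*
 * Worked example (illustrative numbers, not from a real system) of the
 * throttle decision above: suppose ptob(freemem) = 600MB, which is below
 * zfs_write_limit_max, arc_size exceeds arc_c_min by 400MB, and the MRU/MFU
 * evictable lists hold 300MB.  available_memory becomes 600 + MIN(300, 400)
 * = 900MB, so inflight_data of 250MB (> 900/4 = 225MB) returns ERESTART and
 * bumps arcstat_memory_throttle_count, while 200MB of dirty data would be
 * allowed through.
 */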
3593 void
3594 arc_tempreserve_clear(uint64_t reserve)
3595 {
3596 atomic_add_64(&arc_tempreserve, -reserve);
3597 ASSERT((int64_t)arc_tempreserve >= 0);
3598 }
3599
3600 int
3601 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3602 {
3603 int error;
3604 uint64_t anon_size;
3605
3606 #ifdef ZFS_DEBUG
3607 /*
3608 * Once in a while, fail for no reason. Everything should cope.
3609 */
3610 if (spa_get_random(10000) == 0) {
3611 dprintf("forcing random failure\n");
3612 return (SET_ERROR(ERESTART));
3613 }
3614 #endif
3615 if (reserve > arc_c/4 && !arc_no_grow)
3616 arc_c = MIN(arc_c_max, reserve * 4);
3617 if (reserve > arc_c)
3618 return (SET_ERROR(ENOMEM));
3619
3620 /*
3621 * Don't count loaned bufs as in flight dirty data to prevent long
3622 * network delays from blocking transactions that are ready to be
3623 * assigned to a txg.
3624 */
3625 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3626
3627 /*
3628 * Writes will, almost always, require additional memory allocations
3629 * in order to compress/encrypt/etc the data. We therefore need to
3630 * make sure that there is sufficient available memory for this.
3631 */
3632 if (error = arc_memory_throttle(reserve, anon_size, txg))
3633 return (error);
3634
3635 /*
3636 * Throttle writes when the amount of dirty data in the cache
3637 * gets too large. We try to keep the cache less than half full
3638 * of dirty blocks so that our sync times don't grow too large.
3639 * Note: if two requests come in concurrently, we might let them
3640 * both succeed, when one of them should fail. Not a huge deal.
3641 */
3642
3643 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3644 anon_size > arc_c / 4) {
3645 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3646 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3647 arc_tempreserve>>10,
3648 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3649 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3650 reserve>>10, arc_c>>10);
3651 return (SET_ERROR(ERESTART));
3652 }
3761
3762 arc_thread_exit = 0;
3763 arc_eviction_list = NULL;
3764 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3765 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3766
3767 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3768 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3769
3770 if (arc_ksp != NULL) {
3771 arc_ksp->ks_data = &arc_stats;
3772 kstat_install(arc_ksp);
3773 }
3774
3775 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776 TS_RUN, minclsyspri);
3777
3778 arc_dead = FALSE;
3779 arc_warm = B_FALSE;
3780
3781 if (zfs_write_limit_max == 0)
3782 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3783 else
3784 zfs_write_limit_shift = 0;
3785 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3786 }
3787
3788 void
3789 arc_fini(void)
3790 {
3791 mutex_enter(&arc_reclaim_thr_lock);
3792 arc_thread_exit = 1;
3793 while (arc_thread_exit != 0)
3794 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3795 mutex_exit(&arc_reclaim_thr_lock);
3796
3797 arc_flush(NULL);
3798
3799 arc_dead = TRUE;
3800
3801 if (arc_ksp != NULL) {
3802 kstat_delete(arc_ksp);
3803 arc_ksp = NULL;
3804 }
3805
3806 mutex_destroy(&arc_eviction_mtx);
3807 mutex_destroy(&arc_reclaim_thr_lock);
3808 cv_destroy(&arc_reclaim_thr_cv);
3809
3810 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3811 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3812 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3813 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3814 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3815 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3816 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3817 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3818
3819 mutex_destroy(&arc_anon->arcs_mtx);
3820 mutex_destroy(&arc_mru->arcs_mtx);
3821 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3822 mutex_destroy(&arc_mfu->arcs_mtx);
3823 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3824 mutex_destroy(&arc_l2c_only->arcs_mtx);
3825
3826 mutex_destroy(&zfs_write_limit_lock);
3827
3828 buf_fini();
3829
3830 ASSERT(arc_loaned_bytes == 0);
3831 }
3832
3833 /*
3834 * Level 2 ARC
3835 *
3836 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3837 * It uses dedicated storage devices to hold cached data, which are populated
3838 * using large infrequent writes. The main role of this cache is to boost
3839 * the performance of random read workloads. The intended L2ARC devices
3840 * include short-stroked disks, solid state disks, and other media with
3841 * substantially faster read latency than disk.
3842 *
3843 * +-----------------------+
3844 * | ARC |
3845 * +-----------------------+
3846 * | ^ ^
3847 * | | |
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #include <sys/dsl_pool.h>
131 #ifdef _KERNEL
132 #include <sys/vmsystm.h>
133 #include <vm/anon.h>
134 #include <sys/fs/swapnode.h>
135 #include <sys/dnlc.h>
136 #endif
137 #include <sys/callb.h>
138 #include <sys/kstat.h>
139 #include <zfs_fletcher.h>
140
141 #ifndef _KERNEL
142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
143 boolean_t arc_watch = B_FALSE;
144 int arc_procfd;
145 #endif
146
147 static kmutex_t arc_reclaim_thr_lock;
148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
149 static uint8_t arc_thread_exit;
150
151 #define ARC_REDUCE_DNLC_PERCENT 3
152 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
153
154 typedef enum arc_reclaim_strategy {
155 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
156 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
157 } arc_reclaim_strategy_t;
158
159 /*
160 * The number of iterations through arc_evict_*() before we
161 * drop & reacquire the lock.
162 */
163 int arc_evict_iterations = 100;
164
165 /* number of seconds before growing cache again */
166 static int arc_grow_retry = 60;
167
168 /* shift of arc_c for calculating both min and max arc_p */
169 static int arc_p_min_shift = 4;
170
171 /* log2(fraction of arc to reclaim) */
172 static int arc_shrink_shift = 5;
173
174 /*
175 * minimum lifespan of a prefetch block in clock ticks
176 * (initialized in arc_init())
177 */
178 static int arc_min_prefetch_lifespan;
179
180 /*
181 * If this percent of memory is free, don't throttle.
182 */
183 int arc_lotsfree_percent = 10;
184
185 static int arc_dead;
186
187 /*
188 * The arc has filled available memory and has now warmed up.
189 */
190 static boolean_t arc_warm;
191
192 /*
193 * These tunables are for performance analysis.
194 */
195 uint64_t zfs_arc_max;
196 uint64_t zfs_arc_min;
197 uint64_t zfs_arc_meta_limit = 0;
198 int zfs_arc_grow_retry = 0;
199 int zfs_arc_shrink_shift = 0;
200 int zfs_arc_p_min_shift = 0;
201 int zfs_disable_dup_eviction = 0;
202
203 /*
204 * Note that buffers can be in one of 6 states:
460 static uint64_t arc_tempreserve;
461 static uint64_t arc_loaned_bytes;
462
463 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
464
465 typedef struct arc_callback arc_callback_t;
466
467 struct arc_callback {
468 void *acb_private;
469 arc_done_func_t *acb_done;
470 arc_buf_t *acb_buf;
471 zio_t *acb_zio_dummy;
472 arc_callback_t *acb_next;
473 };
474
475 typedef struct arc_write_callback arc_write_callback_t;
476
477 struct arc_write_callback {
478 void *awcb_private;
479 arc_done_func_t *awcb_ready;
480 arc_done_func_t *awcb_physdone;
481 arc_done_func_t *awcb_done;
482 arc_buf_t *awcb_buf;
483 };
484
485 struct arc_buf_hdr {
486 /* protected by hash lock */
487 dva_t b_dva;
488 uint64_t b_birth;
489 uint64_t b_cksum0;
490
491 kmutex_t b_freeze_lock;
492 zio_cksum_t *b_freeze_cksum;
493 void *b_thawed;
494
495 arc_buf_hdr_t *b_hash_next;
496 arc_buf_t *b_buf;
497 uint32_t b_flags;
498 uint32_t b_datacnt;
499
500 arc_callback_t *b_acb;
1155 list_insert_head(&state->arcs_list[ab->b_type], ab);
1156 ASSERT(ab->b_datacnt > 0);
1157 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1158 mutex_exit(&state->arcs_mtx);
1159 }
1160 return (cnt);
1161 }
1162
1163 /*
1164 * Move the supplied buffer to the indicated state. The mutex
1165 * for the buffer must be held by the caller.
1166 */
1167 static void
1168 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1169 {
1170 arc_state_t *old_state = ab->b_state;
1171 int64_t refcnt = refcount_count(&ab->b_refcnt);
1172 uint64_t from_delta, to_delta;
1173
1174 ASSERT(MUTEX_HELD(hash_lock));
1175 ASSERT3P(new_state, !=, old_state);
1176 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1177 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1178 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1179
1180 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1181
1182 /*
1183 * If this buffer is evictable, transfer it from the
1184 * old state list to the new state list.
1185 */
1186 if (refcnt == 0) {
1187 if (old_state != arc_anon) {
1188 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1189 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1190
1191 if (use_mutex)
1192 mutex_enter(&old_state->arcs_mtx);
1193
1194 ASSERT(list_link_active(&ab->b_arc_node));
1195 list_remove(&old_state->arcs_list[ab->b_type], ab);
1770 * - look for a buffer to evict that is `bytes' long.
1771 * - return the data block from this buffer rather than freeing it.
1772 * This flag is used by callers that are trying to make space for a
1773 * new buffer in a full arc cache.
1774 *
1775 * This function makes a "best effort". It skips over any buffers
1776 * it can't get a hash_lock on, and so may not catch all candidates.
1777 * It may also return without evicting as much space as requested.
1778 */
1779 static void *
1780 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1781 arc_buf_contents_t type)
1782 {
1783 arc_state_t *evicted_state;
1784 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1785 arc_buf_hdr_t *ab, *ab_prev = NULL;
1786 list_t *list = &state->arcs_list[type];
1787 kmutex_t *hash_lock;
1788 boolean_t have_lock;
1789 void *stolen = NULL;
1790 arc_buf_hdr_t marker = { 0 };
1791 int count = 0;
1792
1793 ASSERT(state == arc_mru || state == arc_mfu);
1794
1795 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1796
1797 mutex_enter(&state->arcs_mtx);
1798 mutex_enter(&evicted_state->arcs_mtx);
1799
1800 for (ab = list_tail(list); ab; ab = ab_prev) {
1801 ab_prev = list_prev(list, ab);
1802 /* prefetch buffers have a minimum lifespan */
1803 if (HDR_IO_IN_PROGRESS(ab) ||
1804 (spa && ab->b_spa != spa) ||
1805 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1806 ddi_get_lbolt() - ab->b_arc_access <
1807 arc_min_prefetch_lifespan)) {
1808 skipped++;
1809 continue;
1810 }
1811 /* "lookahead" for better eviction candidate */
1812 if (recycle && ab->b_size != bytes &&
1813 ab_prev && ab_prev->b_size == bytes)
1814 continue;
1815
1816 /* ignore markers */
1817 if (ab->b_spa == 0)
1818 continue;
1819
1820 /*
1821 * It may take a long time to evict all the bufs requested.
1822 * To avoid blocking all arc activity, periodically drop
1823 * the arcs_mtx and give other threads a chance to run
1824 * before reacquiring the lock.
1825 *
1826 * If we are looking for a buffer to recycle, we are in
1827 * the hot code path, so don't sleep.
1828 */
1829 if (!recycle && count++ > arc_evict_iterations) {
1830 list_insert_after(list, ab, &marker);
1831 mutex_exit(&evicted_state->arcs_mtx);
1832 mutex_exit(&state->arcs_mtx);
1833 kpreempt(KPREEMPT_SYNC);
1834 mutex_enter(&state->arcs_mtx);
1835 mutex_enter(&evicted_state->arcs_mtx);
1836 ab_prev = list_prev(list, &marker);
1837 list_remove(list, &marker);
1838 count = 0;
1839 continue;
1840 }
1841
1842 hash_lock = HDR_LOCK(ab);
1843 have_lock = MUTEX_HELD(hash_lock);
1844 if (have_lock || mutex_tryenter(hash_lock)) {
1845 ASSERT0(refcount_count(&ab->b_refcnt));
1846 ASSERT(ab->b_datacnt > 0);
1847 while (ab->b_buf) {
1848 arc_buf_t *buf = ab->b_buf;
1849 if (!mutex_tryenter(&buf->b_evict_lock)) {
1850 missed += 1;
1851 break;
1852 }
1853 if (buf->b_data) {
1854 bytes_evicted += ab->b_size;
1855 if (recycle && ab->b_type == type &&
1856 ab->b_size == bytes &&
1857 !HDR_L2_WRITING(ab)) {
1858 stolen = buf->b_data;
1859 recycle = FALSE;
1860 }
1861 }
1903 break;
1904 } else {
1905 missed += 1;
1906 }
1907 }
1908
1909 mutex_exit(&evicted_state->arcs_mtx);
1910 mutex_exit(&state->arcs_mtx);
1911
1912 if (bytes_evicted < bytes)
1913 dprintf("only evicted %lld bytes from %x",
1914 (longlong_t)bytes_evicted, state);
1915
1916 if (skipped)
1917 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1918
1919 if (missed)
1920 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1921
1922 /*
1923 * Note: we have just evicted some data into the ghost state,
1924 * potentially putting the ghost size over the desired size. Rather
1925 * than evicting from the ghost list in this hot code path, leave
1926 * this chore to the arc_reclaim_thread().
1927 */
1928
1929 return (stolen);
1930 }
1931
1932 /*
1933 * Remove buffers from the list until we've removed the specified number of
1934 * bytes. Destroy the buffers that are removed.
1935 */
1936 static void
1937 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1938 {
1939 arc_buf_hdr_t *ab, *ab_prev;
1940 arc_buf_hdr_t marker = { 0 };
1941 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1942 kmutex_t *hash_lock;
1943 uint64_t bytes_deleted = 0;
1944 uint64_t bufs_skipped = 0;
1945 int count = 0;
1946
1947 ASSERT(GHOST_STATE(state));
1948 top:
1949 mutex_enter(&state->arcs_mtx);
1950 for (ab = list_tail(list); ab; ab = ab_prev) {
1951 ab_prev = list_prev(list, ab);
1952 if (ab->b_type > ARC_BUFC_NUMTYPES)
1953 panic("invalid ab=%p", (void *)ab);
1954 if (spa && ab->b_spa != spa)
1955 continue;
1956
1957 /* ignore markers */
1958 if (ab->b_spa == 0)
1959 continue;
1960
1961 hash_lock = HDR_LOCK(ab);
1962 /* caller may be trying to modify this buffer, skip it */
1963 if (MUTEX_HELD(hash_lock))
1964 continue;
1965
1966 /*
1967 * It may take a long time to evict all the bufs requested.
1968 * To avoid blocking all arc activity, periodically drop
1969 * the arcs_mtx and give other threads a chance to run
1970 * before reacquiring the lock.
1971 */
1972 if (count++ > arc_evict_iterations) {
1973 list_insert_after(list, ab, &marker);
1974 mutex_exit(&state->arcs_mtx);
1975 kpreempt(KPREEMPT_SYNC);
1976 mutex_enter(&state->arcs_mtx);
1977 ab_prev = list_prev(list, &marker);
1978 list_remove(list, &marker);
1979 count = 0;
1980 continue;
1981 }
1982 if (mutex_tryenter(hash_lock)) {
1983 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1984 ASSERT(ab->b_buf == NULL);
1985 ARCSTAT_BUMP(arcstat_deleted);
1986 bytes_deleted += ab->b_size;
1987
1988 if (ab->b_l2hdr != NULL) {
1989 /*
1990 * This buffer is cached on the 2nd Level ARC;
1991 * don't destroy the header.
1992 */
1993 arc_change_state(arc_l2c_only, ab, hash_lock);
1994 mutex_exit(hash_lock);
1995 } else {
1996 arc_change_state(arc_anon, ab, hash_lock);
1997 mutex_exit(hash_lock);
1998 arc_hdr_destroy(ab);
1999 }
2000
2001 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2002 if (bytes >= 0 && bytes_deleted >= bytes)
2003 break;
2004 } else if (bytes < 0) {
2005 /*
2006 * Insert a list marker and then wait for the
2007 * hash lock to become available. Once it's
2008 * available, restart from where we left off.
2009 */
2010 list_insert_after(list, ab, &marker);
2011 mutex_exit(&state->arcs_mtx);
2012 mutex_enter(hash_lock);
2013 mutex_exit(hash_lock);
2014 mutex_enter(&state->arcs_mtx);
2015 ab_prev = list_prev(list, &marker);
2016 list_remove(list, &marker);
2017 } else {
2018 bufs_skipped += 1;
2019 }
2020
2021 }
2022 mutex_exit(&state->arcs_mtx);
2023
2024 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2025 (bytes < 0 || bytes_deleted < bytes)) {
2026 list = &state->arcs_list[ARC_BUFC_METADATA];
2027 goto top;
2028 }
2029
2030 if (bufs_skipped) {
2031 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2032 ASSERT(bytes >= 0);
2033 }
2034
2035 if (bytes_deleted < bytes)
2036 dprintf("only deleted %lld bytes from %p",
2037 (longlong_t)bytes_deleted, state);
2038 }
2039
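/*
 * Illustrative sketch (not part of this file) of the list-marker technique
 * used by arc_evict() and arc_evict_ghost() above.  A stack-allocated
 * header with b_spa == 0 is inserted as a placeholder so that arcs_mtx can
 * be dropped and the scan resumed afterwards; the "ignore markers" checks
 * above skip such entries.  The helper name example_drop_and_resume() is
 * hypothetical and exists only for illustration.
 */
#if 0
static arc_buf_hdr_t *
example_drop_and_resume(arc_state_t *state, list_t *list, arc_buf_hdr_t *ab)
{
	arc_buf_hdr_t marker = { 0 };	/* b_spa == 0 identifies a marker */
	arc_buf_hdr_t *resume;

	list_insert_after(list, ab, &marker);	/* remember our position */
	mutex_exit(&state->arcs_mtx);		/* let other threads run */
	kpreempt(KPREEMPT_SYNC);		/* yield the CPU */
	mutex_enter(&state->arcs_mtx);		/* pick up where we left off */
	resume = list_prev(list, &marker);
	list_remove(list, &marker);
	return (resume);			/* becomes the new ab_prev */
}
#endif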
2040 static void
2041 arc_adjust(void)
2854 /*
2855 * "Read" the block at the specified DVA (in bp) via the
2856 * cache. If the block is found in the cache, invoke the provided
2857 * callback immediately and return. Note that the `zio' parameter
2858 * in the callback will be NULL in this case, since no IO was
2859 * required. If the block is not in the cache, pass the read request
2860 * on to the spa with a substitute callback function, so that the
2861 * requested block will be added to the cache.
2862 *
2863 * If a read request arrives for a block that has a read in-progress,
2864 * either wait for the in-progress read to complete (and return the
2865 * results); or, if this is a read with a "done" func, add a record
2866 * to the read to invoke the "done" func when the read completes,
2867 * and return; or just return.
2868 *
2869 * arc_read_done() will invoke all the requested "done" functions
2870 * for readers of this block.
2871 */
2872 int
2873 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2874 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2875 const zbookmark_t *zb)
2876 {
2877 arc_buf_hdr_t *hdr;
2878 arc_buf_t *buf = NULL;
2879 kmutex_t *hash_lock;
2880 zio_t *rzio;
2881 uint64_t guid = spa_load_guid(spa);
2882
2883 top:
2884 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2885 &hash_lock);
2886 if (hdr && hdr->b_datacnt > 0) {
2887
2888 *arc_flags |= ARC_CACHED;
2889
2890 if (HDR_IO_IN_PROGRESS(hdr)) {
2891
2892 if (*arc_flags & ARC_WAIT) {
2893 cv_wait(&hdr->b_cv, hash_lock);
2894 mutex_exit(hash_lock);
3457 callback->awcb_ready(zio, buf, callback->awcb_private);
3458
3459 /*
3460 * If the IO is already in progress, then this is a re-write
3461 * attempt, so we need to thaw and re-compute the cksum.
3462 * It is the responsibility of the callback to handle the
3463 * accounting for any re-write attempt.
3464 */
3465 if (HDR_IO_IN_PROGRESS(hdr)) {
3466 mutex_enter(&hdr->b_freeze_lock);
3467 if (hdr->b_freeze_cksum != NULL) {
3468 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3469 hdr->b_freeze_cksum = NULL;
3470 }
3471 mutex_exit(&hdr->b_freeze_lock);
3472 }
3473 arc_cksum_compute(buf, B_FALSE);
3474 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3475 }
3476
3477 /*
3478 * The SPA calls this callback for each physical write that happens on behalf
3479 * of a logical write. See the comment in dbuf_write_physdone() for details.
3480 */
3481 static void
3482 arc_write_physdone(zio_t *zio)
3483 {
3484 arc_write_callback_t *cb = zio->io_private;
3485 if (cb->awcb_physdone != NULL)
3486 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3487 }
3488
3489 static void
3490 arc_write_done(zio_t *zio)
3491 {
3492 arc_write_callback_t *callback = zio->io_private;
3493 arc_buf_t *buf = callback->awcb_buf;
3494 arc_buf_hdr_t *hdr = buf->b_hdr;
3495
3496 ASSERT(hdr->b_acb == NULL);
3497
3498 if (zio->io_error == 0) {
3499 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3500 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3501 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3502 } else {
3503 ASSERT(BUF_EMPTY(hdr));
3504 }
3505
3506 /*
3507 * If the block to be written was all-zero, we may have
3508 * compressed it away. In this case no write was performed
3509 * so there will be no dva/birth/checksum. The buffer must
3549 }
3550 }
3551 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3552 /* if it's not anon, we are doing a scrub */
3553 if (!exists && hdr->b_state == arc_anon)
3554 arc_access(hdr, hash_lock);
3555 mutex_exit(hash_lock);
3556 } else {
3557 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3558 }
3559
3560 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3561 callback->awcb_done(zio, buf, callback->awcb_private);
3562
3563 kmem_free(callback, sizeof (arc_write_callback_t));
3564 }
3565
3566 zio_t *
3567 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3568 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3569 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3570 arc_done_func_t *done, void *private, zio_priority_t priority,
3571 int zio_flags, const zbookmark_t *zb)
3572 {
3573 arc_buf_hdr_t *hdr = buf->b_hdr;
3574 arc_write_callback_t *callback;
3575 zio_t *zio;
3576
3577 ASSERT(ready != NULL);
3578 ASSERT(done != NULL);
3579 ASSERT(!HDR_IO_ERROR(hdr));
3580 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3581 ASSERT(hdr->b_acb == NULL);
3582 if (l2arc)
3583 hdr->b_flags |= ARC_L2CACHE;
3584 if (l2arc_compress)
3585 hdr->b_flags |= ARC_L2COMPRESS;
3586 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3587 callback->awcb_ready = ready;
3588 callback->awcb_physdone = physdone;
3589 callback->awcb_done = done;
3590 callback->awcb_private = private;
3591 callback->awcb_buf = buf;
3592
3593 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3594 arc_write_ready, arc_write_physdone, arc_write_done, callback,
3595 priority, zio_flags, zb);
3596
3597 return (zio);
3598 }
3599
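/*
 * Illustrative sketch (not part of this file) of how a caller might wire
 * the ready/physdone/done callbacks into arc_write().  The example_* names
 * are hypothetical; the callback signature matches the way awcb_ready,
 * awcb_physdone and awcb_done are invoked above, and the priority constant
 * is assumed to be one of the zio_priority_t values.
 */
#if 0
static void
example_ready(zio_t *zio, arc_buf_t *buf, void *priv)
{
	/* buffer contents are final here, just before the write is issued */
}

static void
example_physdone(zio_t *zio, arc_buf_t *buf, void *priv)
{
	/* one call per physical write; a natural place for dirty accounting */
}

static void
example_done(zio_t *zio, arc_buf_t *buf, void *priv)
{
	/* the logical write finished; zio->io_error carries the result */
}

static zio_t *
example_issue_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, const zio_prop_t *zp, const zbookmark_t *zb)
{
	return (arc_write(pio, spa, txg, bp, buf,
	    B_TRUE,			/* buffer is eligible for the L2ARC */
	    B_FALSE,			/* don't compress it in the L2ARC */
	    zp, example_ready, example_physdone, example_done,
	    NULL,			/* private argument for the callbacks */
	    ZIO_PRIORITY_ASYNC_WRITE, 0, zb));
}
#endif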
3600 static int
3601 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3602 {
3603 #ifdef _KERNEL
3604 uint64_t available_memory = ptob(freemem);
3605 static uint64_t page_load = 0;
3606 static uint64_t last_txg = 0;
3607
3608 #if defined(__i386)
3609 available_memory =
3610 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3611 #endif
3612
3613 if (freemem > physmem * arc_lotsfree_percent / 100)
3614 return (0);
3615
3616 if (txg > last_txg) {
3617 last_txg = txg;
3618 page_load = 0;
3619 }
3620 /*
3621 * If we are in pageout, we know that memory is already tight and
3622 * the arc is already going to be evicting, so we just want to
3623 * continue to let page writes occur as quickly as possible.
3624 */
3625 if (curproc == proc_pageout) {
3626 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3627 return (SET_ERROR(ERESTART));
3628 /* Note: reserve is inflated, so we deflate */
3629 page_load += reserve / 8;
3630 return (0);
3631 } else if (page_load > 0 && arc_reclaim_needed()) {
3632 /* memory is low, delay before restarting */
3633 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3634 return (SET_ERROR(EAGAIN));
3635 }
3636 page_load = 0;
3637 #endif
3638 return (0);
3639 }
3640
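/*
 * Worked example (illustrative numbers, not from a real system) of the
 * arc_lotsfree_percent check above: with physmem equivalent to 8GB and the
 * default arc_lotsfree_percent of 10, the throttle is skipped entirely as
 * long as more than ~800MB (physmem / 10) remains free; only below that
 * point do the pageout and page_load checks come into play.
 */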
3641 void
3642 arc_tempreserve_clear(uint64_t reserve)
3643 {
3644 atomic_add_64(&arc_tempreserve, -reserve);
3645 ASSERT((int64_t)arc_tempreserve >= 0);
3646 }
3647
3648 int
3649 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3650 {
3651 int error;
3652 uint64_t anon_size;
3653
3654 if (reserve > arc_c/4 && !arc_no_grow)
3655 arc_c = MIN(arc_c_max, reserve * 4);
3656 if (reserve > arc_c)
3657 return (SET_ERROR(ENOMEM));
3658
3659 /*
3660 * Don't count loaned bufs as in flight dirty data to prevent long
3661 * network delays from blocking transactions that are ready to be
3662 * assigned to a txg.
3663 */
3664 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3665
3666 /*
3667 * Writes will, almost always, require additional memory allocations
3668 * in order to compress/encrypt/etc the data. We therefore need to
3669 * make sure that there is sufficient available memory for this.
3670 */
3671 error = arc_memory_throttle(reserve, txg);
3672 if (error != 0)
3673 return (error);
3674
3675 /*
3676 * Throttle writes when the amount of dirty data in the cache
3677 * gets too large. We try to keep the cache less than half full
3678 * of dirty blocks so that our sync times don't grow too large.
3679 * Note: if two requests come in concurrently, we might let them
3680 * both succeed, when one of them should fail. Not a huge deal.
3681 */
3682
3683 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3684 anon_size > arc_c / 4) {
3685 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3686 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3687 arc_tempreserve>>10,
3688 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3689 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3690 reserve>>10, arc_c>>10);
3691 return (SET_ERROR(ERESTART));
3692 }
3801
3802 arc_thread_exit = 0;
3803 arc_eviction_list = NULL;
3804 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3805 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3806
3807 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3808 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3809
3810 if (arc_ksp != NULL) {
3811 arc_ksp->ks_data = &arc_stats;
3812 kstat_install(arc_ksp);
3813 }
3814
3815 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3816 TS_RUN, minclsyspri);
3817
3818 arc_dead = FALSE;
3819 arc_warm = B_FALSE;
3820
3821 /*
3822 * Calculate maximum amount of dirty data per pool.
3823 *
3824 * If it has been set by /etc/system, take that.
3825 * Otherwise, use a percentage of physical memory defined by
3826 * zfs_dirty_data_max_percent (default 10%) with a cap at
3827 * zfs_dirty_data_max_max (default 4GB).
3828 */
3829 if (zfs_dirty_data_max == 0) {
3830 zfs_dirty_data_max = physmem * PAGESIZE *
3831 zfs_dirty_data_max_percent / 100;
3832 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3833 zfs_dirty_data_max_max);
3834 }
3835 }
3836
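/*
 * Worked example (illustrative numbers, not from a real system) of the
 * zfs_dirty_data_max calculation above: on a machine with 16GB of physical
 * memory, the default zfs_dirty_data_max_percent of 10 yields about 1.6GB,
 * which is below the 4GB zfs_dirty_data_max_max cap and is used as is; on a
 * 64GB machine the 10% figure (6.4GB) would be capped to 4GB.  A value set
 * via /etc/system bypasses this calculation entirely.
 */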
3837 void
3838 arc_fini(void)
3839 {
3840 mutex_enter(&arc_reclaim_thr_lock);
3841 arc_thread_exit = 1;
3842 while (arc_thread_exit != 0)
3843 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3844 mutex_exit(&arc_reclaim_thr_lock);
3845
3846 arc_flush(NULL);
3847
3848 arc_dead = TRUE;
3849
3850 if (arc_ksp != NULL) {
3851 kstat_delete(arc_ksp);
3852 arc_ksp = NULL;
3853 }
3854
3855 mutex_destroy(&arc_eviction_mtx);
3856 mutex_destroy(&arc_reclaim_thr_lock);
3857 cv_destroy(&arc_reclaim_thr_cv);
3858
3859 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3860 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3861 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3862 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3863 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3864 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3865 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3866 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3867
3868 mutex_destroy(&arc_anon->arcs_mtx);
3869 mutex_destroy(&arc_mru->arcs_mtx);
3870 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3871 mutex_destroy(&arc_mfu->arcs_mtx);
3872 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3873 mutex_destroy(&arc_l2c_only->arcs_mtx);
3874
3875 buf_fini();
3876
3877 ASSERT(arc_loaned_bytes == 0);
3878 }
3879
3880 /*
3881 * Level 2 ARC
3882 *
3883 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3884 * It uses dedicated storage devices to hold cached data, which are populated
3885 * using large infrequent writes. The main role of this cache is to boost
3886 * the performance of random read workloads. The intended L2ARC devices
3887 * include short-stroked disks, solid state disks, and other media with
3888 * substantially faster read latency than disk.
3889 *
3890 * +-----------------------+
3891 * | ARC |
3892 * +-----------------------+
3893 * | ^ ^
3894 * | | |