1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
  40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable
  44  * when there are no external references active.  This makes
  45  * eviction far more problematic:  we choose to evict the evictable
  46  * blocks that are the "lowest" in the list.
  47  *
  48  * There are times when it is not possible to evict the requested
  49  * space.  In these circumstances we are unable to adjust the cache
  50  * size.  To prevent the cache from growing unbounded at these times, we
  51  * implement a "cache throttle" that slows the flow of new data
  52  * into the cache until we can make space available.
  53  *
  54  * 2. The Megiddo and Modha model assumes a fixed cache size.
  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
  63  * when adjusting the cache size following a cache miss, it's simply
  64  * a matter of choosing a single page to evict.  In our model, we
  65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
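
/*
 * Illustratively (a sketch only, not the actual arc_evict() code): to free
 * "bytes" worth of space from a list of variable-sized blocks, eviction
 * walks from the tail of an evictable list and accumulates unreferenced
 * blocks until the running total covers the request:
 *
 *	uint64_t freed = 0;
 *	arc_buf_hdr_t *ab, *prev;
 *	for (ab = list_tail(list); ab != NULL && freed < bytes; ab = prev) {
 *		prev = list_prev(list, ab);
 *		if (refcount_count(&ab->b_refcnt) == 0) {
 *			freed += ab->b_size;
 *			... evict ab ...
 *		}
 *	}
 *
 * The real code must additionally honor the locking rules and skip buffers
 * whose hash lock cannot be taken, as described below.
 */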
  73 
  74 /*
  75  * The locking model:
  76  *
  77  * A new reference to a cache buffer can be obtained in two
  78  * ways: 1) via a hash table lookup using the DVA as a key,
  79  * or 2) via one of the ARC lists.  The arc_read() interface
  80  * uses method 1, while the internal arc algorithms for
  81  * adjusting the cache use method 2.  We therefore provide two
  82  * types of locks: 1) the hash table lock array, and 2) the
  83  * arc list locks.
  84  *
  85  * Buffers do not have their own mutexes, rather they rely on the
  86  * hash table mutexes for the bulk of their protection (i.e. most
  87  * fields in the arc_buf_hdr_t are protected by these mutexes).
  88  *
  89  * buf_hash_find() returns the appropriate mutex (held) when it
  90  * locates the requested buffer in the hash table.  It returns
  91  * NULL for the mutex if the buffer was not in the table.
  92  *
  93  * buf_hash_remove() expects the appropriate hash mutex to be
  94  * already held before it is invoked.
  95  *
  96  * Each arc state also has a mutex which is used to protect the
  97  * buffer list associated with the state.  When attempting to
  98  * obtain a hash table lock while holding an arc list lock, you
  99  * must use mutex_tryenter() to avoid deadlock.  Also note that
 100  * the active state mutex must be held before the ghost state mutex.
 101  *
 102  * Arc buffers may have an associated eviction callback function.
 103  * This function will be invoked prior to removing the buffer (e.g.
 104  * in arc_do_user_evicts()).  Note however that the data associated
 105  * with the buffer may be evicted prior to the callback.  The callback
 106  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 107  * the users of callbacks must ensure that their private data is
 108  * protected from simultaneous callbacks from arc_clear_callback()
 109  * and arc_do_user_evicts().
 110  *
 111  * Note that the majority of the performance stats are manipulated
 112  * with atomic operations.
 113  *
 114  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 115  *
 116  *      - L2ARC buflist creation
 117  *      - L2ARC buflist eviction
 118  *      - L2ARC write completion, which walks L2ARC buflists
 119  *      - ARC header destruction, as it removes from L2ARC buflists
 120  *      - ARC header release, as it removes from L2ARC buflists
 121  */
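
/*
 * To illustrate the list-lock -> hash-lock ordering rule above (a
 * hypothetical fragment, not copied from arc_evict()): while walking an
 * arc list with that state's arcs_mtx held, the hash lock may only be
 * taken with mutex_tryenter(), and the buffer is skipped on failure:
 *
 *	kmutex_t *hash_lock = HDR_LOCK(ab);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;
 *	}
 *	... ab may now be examined or evicted ...
 *	mutex_exit(hash_lock);
 */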
 122 
 123 #include <sys/spa.h>
 124 #include <sys/zio.h>
 125 #include <sys/zio_compress.h>
 126 #include <sys/zfs_context.h>
 127 #include <sys/arc.h>
 128 #include <sys/refcount.h>
 129 #include <sys/vdev.h>
 130 #include <sys/vdev_impl.h>
 131 #include <sys/dsl_pool.h>
 132 #ifdef _KERNEL
 133 #include <sys/vmsystm.h>
 134 #include <vm/anon.h>
 135 #include <sys/fs/swapnode.h>
 136 #include <sys/dnlc.h>
 137 #endif
 138 #include <sys/callb.h>
 139 #include <sys/kstat.h>
 140 #include <zfs_fletcher.h>
 141 
 142 #ifndef _KERNEL
 143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 144 boolean_t arc_watch = B_FALSE;
 145 int arc_procfd;
 146 #endif
 147 
 148 static kmutex_t         arc_reclaim_thr_lock;
 149 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 150 static uint8_t          arc_thread_exit;
 151 
 152 #define ARC_REDUCE_DNLC_PERCENT 3
 153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154 
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159 
 160 /*
 161  * The number of iterations through arc_evict_*() before we
 162  * drop & reacquire the lock.
 163  */
 164 int arc_evict_iterations = 100;
 165 
 166 /* number of seconds before growing cache again */
 167 static int              arc_grow_retry = 60;
 168 
 169 /* shift of arc_c for calculating both min and max arc_p */
 170 static int              arc_p_min_shift = 4;
 171 
 172 /* log2(fraction of arc to reclaim) */
 173 static int              arc_shrink_shift = 5;
 174 
 175 /*
 176  * minimum lifespan of a prefetch block in clock ticks
 177  * (initialized in arc_init())
 178  */
 179 static int              arc_min_prefetch_lifespan;
 180 
 181 /*
 182  * If this percent of memory is free, don't throttle.
 183  */
 184 int arc_lotsfree_percent = 10;
 185 
 186 static int arc_dead;
 187 
 188 /*
 189  * The arc has filled available memory and has now warmed up.
 190  */
 191 static boolean_t arc_warm;
 192 
 193 /*
 194  * These tunables are for performance analysis.
 195  */
 196 uint64_t zfs_arc_max;
 197 uint64_t zfs_arc_min;
 198 uint64_t zfs_arc_meta_limit = 0;
 199 int zfs_arc_grow_retry = 0;
 200 int zfs_arc_shrink_shift = 0;
 201 int zfs_arc_p_min_shift = 0;
 202 int zfs_disable_dup_eviction = 0;
 203 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 204 
 205 /*
 206  * Note that buffers can be in one of 6 states:
 207  *      ARC_anon        - anonymous (discussed below)
 208  *      ARC_mru         - recently used, currently cached
 209  *      ARC_mru_ghost   - recently used, no longer in cache
 210  *      ARC_mfu         - frequently used, currently cached
 211  *      ARC_mfu_ghost   - frequently used, no longer in cache
 212  *      ARC_l2c_only    - exists in L2ARC but not other states
 213  * When there are no active references to the buffer, they are
 214  * linked onto a list in one of these arc states.  These are
 215  * the only buffers that can be evicted or deleted.  Within each
 216  * state there are multiple lists, one for meta-data and one for
 217  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 218  * etc.) is tracked separately so that it can be managed more
 219  * explicitly: favored over data, limited explicitly.
 220  *
 221  * Anonymous buffers are buffers that are not associated with
 222  * a DVA.  These are buffers that hold dirty block copies
 223  * before they are written to stable storage.  By definition,
 224  * they are "ref'd" and are considered part of arc_mru
 225  * that cannot be freed.  Generally, they will acquire a DVA
 226  * as they are written and migrate onto the arc_mru list.
 227  *
 228  * The ARC_l2c_only state is for buffers that are in the second
 229  * level ARC but no longer in any of the ARC_m* lists.  The second
 230  * level ARC itself may also contain buffers that are in any of
 231  * the ARC_m* states - meaning that a buffer can exist in two
 232  * places.  The reason for the ARC_l2c_only state is to keep the
 233  * buffer header in the hash table, so that reads that hit the
 234  * second level ARC benefit from these fast lookups.
 235  */
 236 
 237 typedef struct arc_state {
 238         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 239         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 240         uint64_t arcs_size;     /* total amount of data in this state */
 241         kmutex_t arcs_mtx;
 242 } arc_state_t;
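
/*
 * Note, informally, the accounting relationship implied above: arcs_lsize[]
 * counts only buffers linked on arcs_list[] (i.e. evictable), while
 * arcs_size counts everything in the state, so for any state
 *
 *	arcs_lsize[ARC_BUFC_DATA] + arcs_lsize[ARC_BUFC_METADATA] <= arcs_size
 *
 * and the difference is the referenced (un-evictable) data.
 */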
 243 
 244 /* The 6 states: */
 245 static arc_state_t ARC_anon;
 246 static arc_state_t ARC_mru;
 247 static arc_state_t ARC_mru_ghost;
 248 static arc_state_t ARC_mfu;
 249 static arc_state_t ARC_mfu_ghost;
 250 static arc_state_t ARC_l2c_only;
 251 
 252 typedef struct arc_stats {
 253         kstat_named_t arcstat_hits;
 254         kstat_named_t arcstat_misses;
 255         kstat_named_t arcstat_demand_data_hits;
 256         kstat_named_t arcstat_demand_data_misses;
 257         kstat_named_t arcstat_demand_metadata_hits;
 258         kstat_named_t arcstat_demand_metadata_misses;
 259         kstat_named_t arcstat_prefetch_data_hits;
 260         kstat_named_t arcstat_prefetch_data_misses;
 261         kstat_named_t arcstat_prefetch_metadata_hits;
 262         kstat_named_t arcstat_prefetch_metadata_misses;
 263         kstat_named_t arcstat_mru_hits;
 264         kstat_named_t arcstat_mru_ghost_hits;
 265         kstat_named_t arcstat_mfu_hits;
 266         kstat_named_t arcstat_mfu_ghost_hits;
 267         kstat_named_t arcstat_deleted;
 268         kstat_named_t arcstat_recycle_miss;
 269         /*
 270          * Number of buffers that could not be evicted because the hash lock
 271          * was held by another thread.  The lock may not necessarily be held
 272          * by something using the same buffer, since hash locks are shared
 273          * by multiple buffers.
 274          */
 275         kstat_named_t arcstat_mutex_miss;
 276         /*
 277          * Number of buffers skipped because they have I/O in progress, are
 278          * indirect prefetch buffers that have not lived long enough, or are
 279          * not from the spa we're trying to evict from.
 280          */
 281         kstat_named_t arcstat_evict_skip;
 282         kstat_named_t arcstat_evict_l2_cached;
 283         kstat_named_t arcstat_evict_l2_eligible;
 284         kstat_named_t arcstat_evict_l2_ineligible;
 285         kstat_named_t arcstat_hash_elements;
 286         kstat_named_t arcstat_hash_elements_max;
 287         kstat_named_t arcstat_hash_collisions;
 288         kstat_named_t arcstat_hash_chains;
 289         kstat_named_t arcstat_hash_chain_max;
 290         kstat_named_t arcstat_p;
 291         kstat_named_t arcstat_c;
 292         kstat_named_t arcstat_c_min;
 293         kstat_named_t arcstat_c_max;
 294         kstat_named_t arcstat_size;
 295         kstat_named_t arcstat_hdr_size;
 296         kstat_named_t arcstat_data_size;
 297         kstat_named_t arcstat_other_size;
 298         kstat_named_t arcstat_l2_hits;
 299         kstat_named_t arcstat_l2_misses;
 300         kstat_named_t arcstat_l2_feeds;
 301         kstat_named_t arcstat_l2_rw_clash;
 302         kstat_named_t arcstat_l2_read_bytes;
 303         kstat_named_t arcstat_l2_write_bytes;
 304         kstat_named_t arcstat_l2_writes_sent;
 305         kstat_named_t arcstat_l2_writes_done;
 306         kstat_named_t arcstat_l2_writes_error;
 307         kstat_named_t arcstat_l2_writes_hdr_miss;
 308         kstat_named_t arcstat_l2_evict_lock_retry;
 309         kstat_named_t arcstat_l2_evict_reading;
 310         kstat_named_t arcstat_l2_free_on_write;
 311         kstat_named_t arcstat_l2_abort_lowmem;
 312         kstat_named_t arcstat_l2_cksum_bad;
 313         kstat_named_t arcstat_l2_io_error;
 314         kstat_named_t arcstat_l2_size;
 315         kstat_named_t arcstat_l2_asize;
 316         kstat_named_t arcstat_l2_hdr_size;
 317         kstat_named_t arcstat_l2_compress_successes;
 318         kstat_named_t arcstat_l2_compress_zeros;
 319         kstat_named_t arcstat_l2_compress_failures;
 320         kstat_named_t arcstat_memory_throttle_count;
 321         kstat_named_t arcstat_duplicate_buffers;
 322         kstat_named_t arcstat_duplicate_buffers_size;
 323         kstat_named_t arcstat_duplicate_reads;
 324         kstat_named_t arcstat_meta_used;
 325         kstat_named_t arcstat_meta_limit;
 326         kstat_named_t arcstat_meta_max;
 327 } arc_stats_t;
 328 
 329 static arc_stats_t arc_stats = {
 330         { "hits",                       KSTAT_DATA_UINT64 },
 331         { "misses",                     KSTAT_DATA_UINT64 },
 332         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 333         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 334         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 335         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 336         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 337         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 338         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 339         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 340         { "mru_hits",                   KSTAT_DATA_UINT64 },
 341         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 342         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 343         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 344         { "deleted",                    KSTAT_DATA_UINT64 },
 345         { "recycle_miss",               KSTAT_DATA_UINT64 },
 346         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 347         { "evict_skip",                 KSTAT_DATA_UINT64 },
 348         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 349         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 350         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 351         { "hash_elements",              KSTAT_DATA_UINT64 },
 352         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 353         { "hash_collisions",            KSTAT_DATA_UINT64 },
 354         { "hash_chains",                KSTAT_DATA_UINT64 },
 355         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 356         { "p",                          KSTAT_DATA_UINT64 },
 357         { "c",                          KSTAT_DATA_UINT64 },
 358         { "c_min",                      KSTAT_DATA_UINT64 },
 359         { "c_max",                      KSTAT_DATA_UINT64 },
 360         { "size",                       KSTAT_DATA_UINT64 },
 361         { "hdr_size",                   KSTAT_DATA_UINT64 },
 362         { "data_size",                  KSTAT_DATA_UINT64 },
 363         { "other_size",                 KSTAT_DATA_UINT64 },
 364         { "l2_hits",                    KSTAT_DATA_UINT64 },
 365         { "l2_misses",                  KSTAT_DATA_UINT64 },
 366         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 367         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 368         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 369         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 370         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 371         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 372         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 373         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 374         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 375         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 376         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 377         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 378         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 379         { "l2_io_error",                KSTAT_DATA_UINT64 },
 380         { "l2_size",                    KSTAT_DATA_UINT64 },
 381         { "l2_asize",                   KSTAT_DATA_UINT64 },
 382         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 383         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 384         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 385         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 386         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 387         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 388         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 389         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 390         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 391         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 392         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 393 };
 394 
 395 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 396 
 397 #define ARCSTAT_INCR(stat, val) \
 398         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 399 
 400 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 401 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 402 
 403 #define ARCSTAT_MAX(stat, val) {                                        \
 404         uint64_t m;                                                     \
 405         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 406             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 407                 continue;                                               \
 408 }
 409 
 410 #define ARCSTAT_MAXSTAT(stat) \
 411         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
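
/*
 * ARCSTAT_MAX() is a lock-free "running maximum": the compare-and-swap loop
 * retries until either the stored value is already >= val or the swap
 * succeeds.  For example, buf_hash_insert() below uses
 *
 *	ARCSTAT_MAX(arcstat_hash_chain_max, i);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 *
 * the latter expanding to ARCSTAT_MAX(arcstat_hash_elements_max, <current
 * element count>).
 */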
 412 
 413 /*
 414  * We define a macro to allow ARC hits/misses to be easily broken down by
 415  * two separate conditions, giving a total of four different subtypes for
 416  * each of hits and misses (so eight statistics total).
 417  */
 418 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 419         if (cond1) {                                                    \
 420                 if (cond2) {                                            \
 421                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 422                 } else {                                                \
 423                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 424                 }                                                       \
 425         } else {                                                        \
 426                 if (cond2) {                                            \
 427                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 428                 } else {                                                \
 429                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 430                 }                                                       \
 431         }
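
/*
 * For example, a demand read of metadata that hits would typically be
 * recorded with a call of the form (illustrative, mirroring arc_read()):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
 *	    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */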
 432 
 433 kstat_t                 *arc_ksp;
 434 static arc_state_t      *arc_anon;
 435 static arc_state_t      *arc_mru;
 436 static arc_state_t      *arc_mru_ghost;
 437 static arc_state_t      *arc_mfu;
 438 static arc_state_t      *arc_mfu_ghost;
 439 static arc_state_t      *arc_l2c_only;
 440 
 441 /*
 442  * There are several ARC variables that are critical to export as kstats --
 443  * but we don't want to have to grovel around in the kstat whenever we wish to
 444  * manipulate them.  For these variables, we therefore define them to be in
 445  * terms of the statistic variable.  This assures that we are not introducing
 446  * the possibility of inconsistency by having shadow copies of the variables,
 447  * while still allowing the code to be readable.
 448  */
 449 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 450 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 451 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 452 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 453 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 454 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 455 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 456 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 457 
 458 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 459         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 460 
 461 static int              arc_no_grow;    /* Don't try to grow cache size */
 462 static uint64_t         arc_tempreserve;
 463 static uint64_t         arc_loaned_bytes;
 464 
 465 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 466 
 467 typedef struct arc_callback arc_callback_t;
 468 
 469 struct arc_callback {
 470         void                    *acb_private;
 471         arc_done_func_t         *acb_done;
 472         arc_buf_t               *acb_buf;
 473         zio_t                   *acb_zio_dummy;
 474         arc_callback_t          *acb_next;
 475 };
 476 
 477 typedef struct arc_write_callback arc_write_callback_t;
 478 
 479 struct arc_write_callback {
 480         void            *awcb_private;
 481         arc_done_func_t *awcb_ready;
 482         arc_done_func_t *awcb_physdone;
 483         arc_done_func_t *awcb_done;
 484         arc_buf_t       *awcb_buf;
 485 };
 486 
 487 struct arc_buf_hdr {
 488         /* protected by hash lock */
 489         dva_t                   b_dva;
 490         uint64_t                b_birth;
 491         uint64_t                b_cksum0;
 492 
 493         kmutex_t                b_freeze_lock;
 494         zio_cksum_t             *b_freeze_cksum;
 495         void                    *b_thawed;
 496 
 497         arc_buf_hdr_t           *b_hash_next;
 498         arc_buf_t               *b_buf;
 499         uint32_t                b_flags;
 500         uint32_t                b_datacnt;
 501 
 502         arc_callback_t          *b_acb;
 503         kcondvar_t              b_cv;
 504 
 505         /* immutable */
 506         arc_buf_contents_t      b_type;
 507         uint64_t                b_size;
 508         uint64_t                b_spa;
 509 
 510         /* protected by arc state mutex */
 511         arc_state_t             *b_state;
 512         list_node_t             b_arc_node;
 513 
 514         /* updated atomically */
 515         clock_t                 b_arc_access;
 516 
 517         /* self protecting */
 518         refcount_t              b_refcnt;
 519 
 520         l2arc_buf_hdr_t         *b_l2hdr;
 521         list_node_t             b_l2node;
 522 };
 523 
 524 static arc_buf_t *arc_eviction_list;
 525 static kmutex_t arc_eviction_mtx;
 526 static arc_buf_hdr_t arc_eviction_hdr;
 527 static void arc_get_data_buf(arc_buf_t *buf);
 528 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 529 static int arc_evict_needed(arc_buf_contents_t type);
 530 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 531 static void arc_buf_watch(arc_buf_t *buf);
 532 
 533 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 534 
 535 #define GHOST_STATE(state)      \
 536         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 537         (state) == arc_l2c_only)
 538 
 539 /*
 540  * Private ARC flags.  These flags are private ARC only flags that will show up
 541  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 542  * be passed in as arc_flags in things like arc_read.  However, the private
 543  * flags below should never be passed in and are only set by ARC code.
 544  * When adding new public flags, make sure not to smash the private ones.
 545  */
 546 
 547 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
 548 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
 549 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
 550 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
 551 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
 552 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
 553 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
 554 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
 555 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
 556 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
 557 
 558 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 559 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 560 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 561 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 562 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 563 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 564 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 565 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 566 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
 567                                     (hdr)->b_l2hdr != NULL)
 568 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 569 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 570 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 571 
 572 /*
 573  * Other sizes
 574  */
 575 
 576 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 577 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 578 
 579 /*
 580  * Hash table routines
 581  */
 582 
 583 struct ht_table {
 584         arc_buf_hdr_t   *hdr;
 585         kmutex_t        lock;
 586 };
 587 
 588 typedef struct buf_hash_table {
 589         uint64_t ht_mask;
 590         struct ht_table *ht_table;
 591 } buf_hash_table_t;
 592 
 593 #pragma align 64(buf_hash_table)
 594 static buf_hash_table_t buf_hash_table;
 595 
 596 #define BUF_HASH_INDEX(spa, dva, birth) \
 597         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 598 #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock)
 599 #define HDR_LOCK(hdr) \
 600         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 601 
 602 uint64_t zfs_crc64_table[256];
 603 
 604 /*
 605  * Level 2 ARC
 606  */
 607 
 608 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 609 #define L2ARC_HEADROOM          2                       /* num of writes */
 610 /*
 611  * If we discover any compressed buffers during an ARC scan, we boost
 612  * our headroom for the next scanning cycle by this percentage multiple.
 613  */
 614 #define L2ARC_HEADROOM_BOOST    200
 615 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 616 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 617 
 618 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 619 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 620 
 621 /* L2ARC Performance Tunables */
 622 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 623 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 624 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 625 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 626 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 627 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 628 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 629 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 630 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 631 
 632 /*
 633  * L2ARC Internals
 634  */
 635 typedef struct l2arc_dev {
 636         vdev_t                  *l2ad_vdev;     /* vdev */
 637         spa_t                   *l2ad_spa;      /* spa */
 638         uint64_t                l2ad_hand;      /* next write location */
 639         uint64_t                l2ad_start;     /* first addr on device */
 640         uint64_t                l2ad_end;       /* last addr on device */
 641         uint64_t                l2ad_evict;     /* last addr eviction reached */
 642         boolean_t               l2ad_first;     /* first sweep through */
 643         boolean_t               l2ad_writing;   /* currently writing */
 644         list_t                  *l2ad_buflist;  /* buffer list */
 645         list_node_t             l2ad_node;      /* device list node */
 646 } l2arc_dev_t;
 647 
 648 static list_t L2ARC_dev_list;                   /* device list */
 649 static list_t *l2arc_dev_list;                  /* device list pointer */
 650 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 651 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 652 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 653 static list_t L2ARC_free_on_write;              /* free after write buf list */
 654 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 655 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 656 static uint64_t l2arc_ndev;                     /* number of devices */
 657 
 658 typedef struct l2arc_read_callback {
 659         arc_buf_t               *l2rcb_buf;             /* read buffer */
 660         spa_t                   *l2rcb_spa;             /* spa */
 661         blkptr_t                l2rcb_bp;               /* original blkptr */
 662         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 663         int                     l2rcb_flags;            /* original flags */
 664         enum zio_compress       l2rcb_compress;         /* applied compress */
 665 } l2arc_read_callback_t;
 666 
 667 typedef struct l2arc_write_callback {
 668         l2arc_dev_t     *l2wcb_dev;             /* device info */
 669         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 670 } l2arc_write_callback_t;
 671 
 672 struct l2arc_buf_hdr {
 673         /* protected by arc_buf_hdr  mutex */
 674         l2arc_dev_t             *b_dev;         /* L2ARC device */
 675         uint64_t                b_daddr;        /* disk address, offset byte */
 676         /* compression applied to buffer data */
 677         enum zio_compress       b_compress;
 678         /* real alloc'd buffer size depending on b_compress applied */
 679         int                     b_asize;
 680         /* temporary buffer holder for in-flight compressed data */
 681         void                    *b_tmp_cdata;
 682 };
 683 
 684 typedef struct l2arc_data_free {
 685         /* protected by l2arc_free_on_write_mtx */
 686         void            *l2df_data;
 687         size_t          l2df_size;
 688         void            (*l2df_func)(void *, size_t);
 689         list_node_t     l2df_list_node;
 690 } l2arc_data_free_t;
 691 
 692 static kmutex_t l2arc_feed_thr_lock;
 693 static kcondvar_t l2arc_feed_thr_cv;
 694 static uint8_t l2arc_thread_exit;
 695 
 696 static void l2arc_read_done(zio_t *zio);
 697 static void l2arc_hdr_stat_add(void);
 698 static void l2arc_hdr_stat_remove(void);
 699 
 700 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 701 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 702     enum zio_compress c);
 703 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 704 
 705 static uint64_t
 706 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 707 {
 708         uint8_t *vdva = (uint8_t *)dva;
 709         uint64_t crc = -1ULL;
 710         int i;
 711 
 712         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 713 
 714         for (i = 0; i < sizeof (dva_t); i++)
 715                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 716 
 717         crc ^= (spa>>8) ^ birth;
 718 
 719         return (crc);
 720 }
 721 
 722 #define BUF_EMPTY(buf)                                          \
 723         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 724         (buf)->b_dva.dva_word[1] == 0 &&                     \
 725         (buf)->b_cksum0 == 0)
 726 
 727 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 728         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 729         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 730         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 731 
 732 static void
 733 buf_discard_identity(arc_buf_hdr_t *hdr)
 734 {
 735         hdr->b_dva.dva_word[0] = 0;
 736         hdr->b_dva.dva_word[1] = 0;
 737         hdr->b_birth = 0;
 738         hdr->b_cksum0 = 0;
 739 }
 740 
 741 static arc_buf_hdr_t *
 742 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 743 {
 744         const dva_t *dva = BP_IDENTITY(bp);
 745         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 746         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 747         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 748         arc_buf_hdr_t *buf;
 749 
 750         mutex_enter(hash_lock);
 751         for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
 752             buf = buf->b_hash_next) {
 753                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 754                         *lockp = hash_lock;
 755                         return (buf);
 756                 }
 757         }
 758         mutex_exit(hash_lock);
 759         *lockp = NULL;
 760         return (NULL);
 761 }
 762 
 763 /*
 764  * Insert an entry into the hash table.  If there is already an element
 765  * equal to buf in the hash table, then the already existing element
 766  * will be returned and the new element will not be inserted.
 767  * Otherwise returns NULL.
 768  */
 769 static arc_buf_hdr_t *
 770 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 771 {
 772         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 773         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 774         arc_buf_hdr_t *fbuf;
 775         uint32_t i;
 776 
 777         ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
 778         ASSERT(buf->b_birth != 0);
 779         ASSERT(!HDR_IN_HASH_TABLE(buf));
 780         *lockp = hash_lock;
 781         mutex_enter(hash_lock);
 782         for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
 783             fbuf = fbuf->b_hash_next, i++) {
 784                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 785                         return (fbuf);
 786         }
 787 
 788         buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
 789         buf_hash_table.ht_table[idx].hdr = buf;
 790         buf->b_flags |= ARC_IN_HASH_TABLE;
 791 
 792         /* collect some hash table performance data */
 793         if (i > 0) {
 794                 ARCSTAT_BUMP(arcstat_hash_collisions);
 795                 if (i == 1)
 796                         ARCSTAT_BUMP(arcstat_hash_chains);
 797 
 798                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 799         }
 800 
 801         ARCSTAT_BUMP(arcstat_hash_elements);
 802         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 803 
 804         return (NULL);
 805 }
 806 
 807 static void
 808 buf_hash_remove(arc_buf_hdr_t *buf)
 809 {
 810         arc_buf_hdr_t *fbuf, **bufp;
 811         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 812 
 813         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 814         ASSERT(HDR_IN_HASH_TABLE(buf));
 815 
 816         bufp = &buf_hash_table.ht_table[idx].hdr;
 817         while ((fbuf = *bufp) != buf) {
 818                 ASSERT(fbuf != NULL);
 819                 bufp = &fbuf->b_hash_next;
 820         }
 821         *bufp = buf->b_hash_next;
 822         buf->b_hash_next = NULL;
 823         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 824 
 825         /* collect some hash table performance data */
 826         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 827 
 828         if (buf_hash_table.ht_table[idx].hdr &&
 829             buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
 830                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 831 }
 832 
 833 /*
 834  * Global data structures and functions for the buf kmem cache.
 835  */
 836 static kmem_cache_t *hdr_cache;
 837 static kmem_cache_t *buf_cache;
 838 
 839 static void
 840 buf_fini(void)
 841 {
 842         int i;
 843 
 844         for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
 845                 mutex_destroy(&buf_hash_table.ht_table[i].lock);
 846         kmem_free(buf_hash_table.ht_table,
 847             (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
 848         kmem_cache_destroy(hdr_cache);
 849         kmem_cache_destroy(buf_cache);
 850 }
 851 
 852 /*
 853  * Constructor callback - called when the cache is empty
 854  * and a new buf is requested.
 855  */
 856 /* ARGSUSED */
 857 static int
 858 hdr_cons(void *vbuf, void *unused, int kmflag)
 859 {
 860         arc_buf_hdr_t *buf = vbuf;
 861 
 862         bzero(buf, sizeof (arc_buf_hdr_t));
 863         refcount_create(&buf->b_refcnt);
 864         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 865         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 866         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 867 
 868         return (0);
 869 }
 870 
 871 /* ARGSUSED */
 872 static int
 873 buf_cons(void *vbuf, void *unused, int kmflag)
 874 {
 875         arc_buf_t *buf = vbuf;
 876 
 877         bzero(buf, sizeof (arc_buf_t));
 878         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 879         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 880 
 881         return (0);
 882 }
 883 
 884 /*
 885  * Destructor callback - called when a cached buf is
 886  * no longer required.
 887  */
 888 /* ARGSUSED */
 889 static void
 890 hdr_dest(void *vbuf, void *unused)
 891 {
 892         arc_buf_hdr_t *buf = vbuf;
 893 
 894         ASSERT(BUF_EMPTY(buf));
 895         refcount_destroy(&buf->b_refcnt);
 896         cv_destroy(&buf->b_cv);
 897         mutex_destroy(&buf->b_freeze_lock);
 898         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 899 }
 900 
 901 /* ARGSUSED */
 902 static void
 903 buf_dest(void *vbuf, void *unused)
 904 {
 905         arc_buf_t *buf = vbuf;
 906 
 907         mutex_destroy(&buf->b_evict_lock);
 908         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 909 }
 910 
 911 /*
 912  * Reclaim callback -- invoked when memory is low.
 913  */
 914 /* ARGSUSED */
 915 static void
 916 hdr_recl(void *unused)
 917 {
 918         dprintf("hdr_recl called\n");
 919         /*
 920          * umem calls the reclaim func when we destroy the buf cache,
 921          * which is after we do arc_fini().
 922          */
 923         if (!arc_dead)
 924                 cv_signal(&arc_reclaim_thr_cv);
 925 }
 926 
 927 static void
 928 buf_init(void)
 929 {
 930         uint64_t *ct;
 931         uint64_t hsize = 1ULL << 12;
 932         int i, j;
 933 
 934         /*
 935          * The hash table is big enough to fill all of physical memory
 936          * with an average block size of zfs_arc_average_blocksize (default 8K).
 937          * By default, the table will take up
 938          * totalmem * sizeof (struct ht_table) / 8K of kernel memory.
 939          */
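        /*
         * For example, with 8-byte pointers, the stock 8K average block
         * size and 8 GB of physical memory, the loop below settles on
         * hsize = 2^20 buckets, so the table consumes
         * 2^20 * sizeof (struct ht_table) bytes -- 16 MB when the
         * per-bucket kmutex_t is a single 8-byte word, more under DEBUG.
         */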
 940         while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
 941                 hsize <<= 1;
 942 retry:
 943         buf_hash_table.ht_mask = hsize - 1;
 944         buf_hash_table.ht_table =
 945             kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
 946         if (buf_hash_table.ht_table == NULL) {
 947                 ASSERT(hsize > (1ULL << 8));
 948                 hsize >>= 1;
 949                 goto retry;
 950         }
 951 
 952         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 953             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 954         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 955             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 956 
 957         for (i = 0; i < 256; i++)
 958                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 959                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 960 
 961         for (i = 0; i < hsize; i++) {
 962                 mutex_init(&buf_hash_table.ht_table[i].lock,
 963                     NULL, MUTEX_DEFAULT, NULL);
 964         }
 965 }
 966 
 967 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 968 
 969 static void
 970 arc_cksum_verify(arc_buf_t *buf)
 971 {
 972         zio_cksum_t zc;
 973 
 974         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 975                 return;
 976 
 977         mutex_enter(&buf->b_hdr->b_freeze_lock);
 978         if (buf->b_hdr->b_freeze_cksum == NULL ||
 979             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 980                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 981                 return;
 982         }
 983         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 984         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 985                 panic("buffer modified while frozen!");
 986         mutex_exit(&buf->b_hdr->b_freeze_lock);
 987 }
 988 
 989 static int
 990 arc_cksum_equal(arc_buf_t *buf)
 991 {
 992         zio_cksum_t zc;
 993         int equal;
 994 
 995         mutex_enter(&buf->b_hdr->b_freeze_lock);
 996         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 997         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 998         mutex_exit(&buf->b_hdr->b_freeze_lock);
 999 
1000         return (equal);
1001 }
1002 
1003 static void
1004 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1005 {
1006         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1007                 return;
1008 
1009         mutex_enter(&buf->b_hdr->b_freeze_lock);
1010         if (buf->b_hdr->b_freeze_cksum != NULL) {
1011                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1012                 return;
1013         }
1014         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1015         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1016             buf->b_hdr->b_freeze_cksum);
1017         mutex_exit(&buf->b_hdr->b_freeze_lock);
1018         arc_buf_watch(buf);
1019 }
1020 
1021 #ifndef _KERNEL
1022 typedef struct procctl {
1023         long cmd;
1024         prwatch_t prwatch;
1025 } procctl_t;
1026 #endif
1027 
1028 /* ARGSUSED */
1029 static void
1030 arc_buf_unwatch(arc_buf_t *buf)
1031 {
1032 #ifndef _KERNEL
1033         if (arc_watch) {
1034                 int result;
1035                 procctl_t ctl;
1036                 ctl.cmd = PCWATCH;
1037                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1038                 ctl.prwatch.pr_size = 0;
1039                 ctl.prwatch.pr_wflags = 0;
1040                 result = write(arc_procfd, &ctl, sizeof (ctl));
1041                 ASSERT3U(result, ==, sizeof (ctl));
1042         }
1043 #endif
1044 }
1045 
1046 /* ARGSUSED */
1047 static void
1048 arc_buf_watch(arc_buf_t *buf)
1049 {
1050 #ifndef _KERNEL
1051         if (arc_watch) {
1052                 int result;
1053                 procctl_t ctl;
1054                 ctl.cmd = PCWATCH;
1055                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1056                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1057                 ctl.prwatch.pr_wflags = WA_WRITE;
1058                 result = write(arc_procfd, &ctl, sizeof (ctl));
1059                 ASSERT3U(result, ==, sizeof (ctl));
1060         }
1061 #endif
1062 }
1063 
1064 void
1065 arc_buf_thaw(arc_buf_t *buf)
1066 {
1067         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1068                 if (buf->b_hdr->b_state != arc_anon)
1069                         panic("modifying non-anon buffer!");
1070                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1071                         panic("modifying buffer while i/o in progress!");
1072                 arc_cksum_verify(buf);
1073         }
1074 
1075         mutex_enter(&buf->b_hdr->b_freeze_lock);
1076         if (buf->b_hdr->b_freeze_cksum != NULL) {
1077                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1078                 buf->b_hdr->b_freeze_cksum = NULL;
1079         }
1080 
1081         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1082                 if (buf->b_hdr->b_thawed)
1083                         kmem_free(buf->b_hdr->b_thawed, 1);
1084                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1085         }
1086 
1087         mutex_exit(&buf->b_hdr->b_freeze_lock);
1088 
1089         arc_buf_unwatch(buf);
1090 }
1091 
1092 void
1093 arc_buf_freeze(arc_buf_t *buf)
1094 {
1095         kmutex_t *hash_lock;
1096 
1097         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1098                 return;
1099 
1100         hash_lock = HDR_LOCK(buf->b_hdr);
1101         mutex_enter(hash_lock);
1102 
1103         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1104             buf->b_hdr->b_state == arc_anon);
1105         arc_cksum_compute(buf, B_FALSE);
1106         mutex_exit(hash_lock);
1107 
1108 }
1109 
1110 static void
1111 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1112 {
1113         ASSERT(MUTEX_HELD(hash_lock));
1114 
1115         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1116             (ab->b_state != arc_anon)) {
1117                 uint64_t delta = ab->b_size * ab->b_datacnt;
1118                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1119                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1120 
1121                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1122                 mutex_enter(&ab->b_state->arcs_mtx);
1123                 ASSERT(list_link_active(&ab->b_arc_node));
1124                 list_remove(list, ab);
1125                 if (GHOST_STATE(ab->b_state)) {
1126                         ASSERT0(ab->b_datacnt);
1127                         ASSERT3P(ab->b_buf, ==, NULL);
1128                         delta = ab->b_size;
1129                 }
1130                 ASSERT(delta > 0);
1131                 ASSERT3U(*size, >=, delta);
1132                 atomic_add_64(size, -delta);
1133                 mutex_exit(&ab->b_state->arcs_mtx);
1134                 /* remove the prefetch flag if we get a reference */
1135                 if (ab->b_flags & ARC_PREFETCH)
1136                         ab->b_flags &= ~ARC_PREFETCH;
1137         }
1138 }
1139 
1140 static int
1141 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1142 {
1143         int cnt;
1144         arc_state_t *state = ab->b_state;
1145 
1146         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1147         ASSERT(!GHOST_STATE(state));
1148 
1149         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1150             (state != arc_anon)) {
1151                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1152 
1153                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1154                 mutex_enter(&state->arcs_mtx);
1155                 ASSERT(!list_link_active(&ab->b_arc_node));
1156                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1157                 ASSERT(ab->b_datacnt > 0);
1158                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1159                 mutex_exit(&state->arcs_mtx);
1160         }
1161         return (cnt);
1162 }
1163 
1164 /*
1165  * Move the supplied buffer to the indicated state.  The mutex
1166  * for the buffer must be held by the caller.
1167  */
1168 static void
1169 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1170 {
1171         arc_state_t *old_state = ab->b_state;
1172         int64_t refcnt = refcount_count(&ab->b_refcnt);
1173         uint64_t from_delta, to_delta;
1174 
1175         ASSERT(MUTEX_HELD(hash_lock));
1176         ASSERT3P(new_state, !=, old_state);
1177         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1178         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1179         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1180 
1181         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1182 
1183         /*
1184          * If this buffer is evictable, transfer it from the
1185          * old state list to the new state list.
1186          */
1187         if (refcnt == 0) {
1188                 if (old_state != arc_anon) {
1189                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1190                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1191 
1192                         if (use_mutex)
1193                                 mutex_enter(&old_state->arcs_mtx);
1194 
1195                         ASSERT(list_link_active(&ab->b_arc_node));
1196                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1197 
1198                         /*
1199                          * If prefetching out of the ghost cache,
1200                          * we will have a non-zero datacnt.
1201                          */
1202                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1203                                 /* ghost elements have a ghost size */
1204                                 ASSERT(ab->b_buf == NULL);
1205                                 from_delta = ab->b_size;
1206                         }
1207                         ASSERT3U(*size, >=, from_delta);
1208                         atomic_add_64(size, -from_delta);
1209 
1210                         if (use_mutex)
1211                                 mutex_exit(&old_state->arcs_mtx);
1212                 }
1213                 if (new_state != arc_anon) {
1214                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1215                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1216 
1217                         if (use_mutex)
1218                                 mutex_enter(&new_state->arcs_mtx);
1219 
1220                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1221 
1222                         /* ghost elements have a ghost size */
1223                         if (GHOST_STATE(new_state)) {
1224                                 ASSERT(ab->b_datacnt == 0);
1225                                 ASSERT(ab->b_buf == NULL);
1226                                 to_delta = ab->b_size;
1227                         }
1228                         atomic_add_64(size, to_delta);
1229 
1230                         if (use_mutex)
1231                                 mutex_exit(&new_state->arcs_mtx);
1232                 }
1233         }
1234 
1235         ASSERT(!BUF_EMPTY(ab));
1236         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1237                 buf_hash_remove(ab);
1238 
1239         /* adjust state sizes */
1240         if (to_delta)
1241                 atomic_add_64(&new_state->arcs_size, to_delta);
1242         if (from_delta) {
1243                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1244                 atomic_add_64(&old_state->arcs_size, -from_delta);
1245         }
1246         ab->b_state = new_state;
1247 
1248         /* adjust l2arc hdr stats */
1249         if (new_state == arc_l2c_only)
1250                 l2arc_hdr_stat_add();
1251         else if (old_state == arc_l2c_only)
1252                 l2arc_hdr_stat_remove();
1253 }
1254 
1255 void
1256 arc_space_consume(uint64_t space, arc_space_type_t type)
1257 {
1258         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259 
1260         switch (type) {
1261         case ARC_SPACE_DATA:
1262                 ARCSTAT_INCR(arcstat_data_size, space);
1263                 break;
1264         case ARC_SPACE_OTHER:
1265                 ARCSTAT_INCR(arcstat_other_size, space);
1266                 break;
1267         case ARC_SPACE_HDRS:
1268                 ARCSTAT_INCR(arcstat_hdr_size, space);
1269                 break;
1270         case ARC_SPACE_L2HDRS:
1271                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272                 break;
1273         }
1274 
1275         ARCSTAT_INCR(arcstat_meta_used, space);
1276         atomic_add_64(&arc_size, space);
1277 }
1278 
1279 void
1280 arc_space_return(uint64_t space, arc_space_type_t type)
1281 {
1282         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283 
1284         switch (type) {
1285         case ARC_SPACE_DATA:
1286                 ARCSTAT_INCR(arcstat_data_size, -space);
1287                 break;
1288         case ARC_SPACE_OTHER:
1289                 ARCSTAT_INCR(arcstat_other_size, -space);
1290                 break;
1291         case ARC_SPACE_HDRS:
1292                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1293                 break;
1294         case ARC_SPACE_L2HDRS:
1295                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1296                 break;
1297         }
1298 
1299         ASSERT(arc_meta_used >= space);
1300         if (arc_meta_max < arc_meta_used)
1301                 arc_meta_max = arc_meta_used;
1302         ARCSTAT_INCR(arcstat_meta_used, -space);
1303         ASSERT(arc_size >= space);
1304         atomic_add_64(&arc_size, -space);
1305 }
1306 
1307 void *
1308 arc_data_buf_alloc(uint64_t size)
1309 {
1310         if (arc_evict_needed(ARC_BUFC_DATA))
1311                 cv_signal(&arc_reclaim_thr_cv);
1312         atomic_add_64(&arc_size, size);
1313         return (zio_data_buf_alloc(size));
1314 }
1315 
1316 void
1317 arc_data_buf_free(void *buf, uint64_t size)
1318 {
1319         zio_data_buf_free(buf, size);
1320         ASSERT(arc_size >= size);
1321         atomic_add_64(&arc_size, -size);
1322 }
1323 
1324 arc_buf_t *
1325 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1326 {
1327         arc_buf_hdr_t *hdr;
1328         arc_buf_t *buf;
1329 
1330         ASSERT3U(size, >, 0);
1331         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1332         ASSERT(BUF_EMPTY(hdr));
1333         hdr->b_size = size;
1334         hdr->b_type = type;
1335         hdr->b_spa = spa_load_guid(spa);
1336         hdr->b_state = arc_anon;
1337         hdr->b_arc_access = 0;
1338         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1339         buf->b_hdr = hdr;
1340         buf->b_data = NULL;
1341         buf->b_efunc = NULL;
1342         buf->b_private = NULL;
1343         buf->b_next = NULL;
1344         hdr->b_buf = buf;
1345         arc_get_data_buf(buf);
1346         hdr->b_datacnt = 1;
1347         hdr->b_flags = 0;
1348         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349         (void) refcount_add(&hdr->b_refcnt, tag);
1350 
1351         return (buf);
1352 }
1353 
1354 static char *arc_onloan_tag = "onloan";
1355 
1356 /*
1357  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1358  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1359  * buffers must be returned to the arc before they can be used by the DMU or
1360  * freed.
1361  */
1362 arc_buf_t *
1363 arc_loan_buf(spa_t *spa, int size)
1364 {
1365         arc_buf_t *buf;
1366 
1367         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1368 
1369         atomic_add_64(&arc_loaned_bytes, size);
1370         return (buf);
1371 }
1372 
1373 /*
1374  * Return a loaned arc buffer to the arc.
1375  */
1376 void
1377 arc_return_buf(arc_buf_t *buf, void *tag)
1378 {
1379         arc_buf_hdr_t *hdr = buf->b_hdr;
1380 
1381         ASSERT(buf->b_data != NULL);
1382         (void) refcount_add(&hdr->b_refcnt, tag);
1383         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1384 
1385         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1386 }
1387 
1388 /* Detach an arc_buf from a dbuf (tag) */
1389 void
1390 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1391 {
1392         arc_buf_hdr_t *hdr;
1393 
1394         ASSERT(buf->b_data != NULL);
1395         hdr = buf->b_hdr;
1396         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1397         (void) refcount_remove(&hdr->b_refcnt, tag);
1398         buf->b_efunc = NULL;
1399         buf->b_private = NULL;
1400 
1401         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1402 }
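
/*
 * Illustrative sketch of the loaning protocol above; not part of the
 * build, and 'db' stands for a hypothetical caller-side tag (e.g. a
 * dbuf pointer):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, blksz);
 *	...fill in abuf->b_data, up to blksz bytes...
 *	arc_return_buf(abuf, db);
 *
 * arc_loan_buf() charges the buffer to arc_loaned_bytes and tags it with
 * arc_onloan_tag; arc_return_buf() moves the reference from the loan tag
 * to 'db' and removes the charge.  arc_loan_inuse_buf() is the inverse of
 * arc_return_buf(): it detaches a buffer already owned by 'db' and puts
 * it back on loan.
 */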
1403 
1404 static arc_buf_t *
1405 arc_buf_clone(arc_buf_t *from)
1406 {
1407         arc_buf_t *buf;
1408         arc_buf_hdr_t *hdr = from->b_hdr;
1409         uint64_t size = hdr->b_size;
1410 
1411         ASSERT(hdr->b_state != arc_anon);
1412 
1413         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1414         buf->b_hdr = hdr;
1415         buf->b_data = NULL;
1416         buf->b_efunc = NULL;
1417         buf->b_private = NULL;
1418         buf->b_next = hdr->b_buf;
1419         hdr->b_buf = buf;
1420         arc_get_data_buf(buf);
1421         bcopy(from->b_data, buf->b_data, size);
1422 
1423         /*
1424          * This buffer already exists in the arc so create a duplicate
1425          * copy for the caller.  If the buffer is associated with user data
1426          * then track the size and number of duplicates.  These stats will be
1427          * updated as duplicate buffers are created and destroyed.
1428          */
1429         if (hdr->b_type == ARC_BUFC_DATA) {
1430                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1431                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1432         }
1433         hdr->b_datacnt += 1;
1434         return (buf);
1435 }
1436 
1437 void
1438 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1439 {
1440         arc_buf_hdr_t *hdr;
1441         kmutex_t *hash_lock;
1442 
1443         /*
1444          * Check to see if this buffer has been evicted.  Callers
1445          * must verify b_data != NULL afterwards to know whether
1446          * the add_ref was successful.
1447          */
1448         mutex_enter(&buf->b_evict_lock);
1449         if (buf->b_data == NULL) {
1450                 mutex_exit(&buf->b_evict_lock);
1451                 return;
1452         }
1453         hash_lock = HDR_LOCK(buf->b_hdr);
1454         mutex_enter(hash_lock);
1455         hdr = buf->b_hdr;
1456         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1457         mutex_exit(&buf->b_evict_lock);
1458 
1459         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1460         add_reference(hdr, hash_lock, tag);
1461         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1462         arc_access(hdr, hash_lock);
1463         mutex_exit(hash_lock);
1464         ARCSTAT_BUMP(arcstat_hits);
1465         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1466             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1467             data, metadata, hits);
1468 }
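
/*
 * Caller-side sketch of the contract noted above (illustrative only):
 * arc_buf_add_ref() returns silently when the buffer has already been
 * evicted, so the caller learns the outcome from b_data:
 *
 *	arc_buf_add_ref(buf, tag);
 *	if (buf->b_data == NULL) {
 *		... reference was not taken; re-read the block ...
 *	}
 */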
1469 
1470 /*
1471  * Free the arc data buffer.  If it is an l2arc write in progress,
1472  * the buffer is placed on l2arc_free_on_write to be freed later.
1473  */
1474 static void
1475 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1476 {
1477         arc_buf_hdr_t *hdr = buf->b_hdr;
1478 
1479         if (HDR_L2_WRITING(hdr)) {
1480                 l2arc_data_free_t *df;
1481                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1482                 df->l2df_data = buf->b_data;
1483                 df->l2df_size = hdr->b_size;
1484                 df->l2df_func = free_func;
1485                 mutex_enter(&l2arc_free_on_write_mtx);
1486                 list_insert_head(l2arc_free_on_write, df);
1487                 mutex_exit(&l2arc_free_on_write_mtx);
1488                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1489         } else {
1490                 free_func(buf->b_data, hdr->b_size);
1491         }
1492 }
1493 
1494 /*
1495  * Free up buf->b_data and, if 'remove' is set, pull the
1496  * arc_buf_t off of the arc_buf_hdr_t's list and free it.
1497  */
1498 static void
1499 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1500 {
1501         arc_buf_t **bufp;
1502 
1503         /* free up data associated with the buf */
1504         if (buf->b_data) {
1505                 arc_state_t *state = buf->b_hdr->b_state;
1506                 uint64_t size = buf->b_hdr->b_size;
1507                 arc_buf_contents_t type = buf->b_hdr->b_type;
1508 
1509                 arc_cksum_verify(buf);
1510                 arc_buf_unwatch(buf);
1511 
1512                 if (!recycle) {
1513                         if (type == ARC_BUFC_METADATA) {
1514                                 arc_buf_data_free(buf, zio_buf_free);
1515                                 arc_space_return(size, ARC_SPACE_DATA);
1516                         } else {
1517                                 ASSERT(type == ARC_BUFC_DATA);
1518                                 arc_buf_data_free(buf, zio_data_buf_free);
1519                                 ARCSTAT_INCR(arcstat_data_size, -size);
1520                                 atomic_add_64(&arc_size, -size);
1521                         }
1522                 }
1523                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1524                         uint64_t *cnt = &state->arcs_lsize[type];
1525 
1526                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1527                         ASSERT(state != arc_anon);
1528 
1529                         ASSERT3U(*cnt, >=, size);
1530                         atomic_add_64(cnt, -size);
1531                 }
1532                 ASSERT3U(state->arcs_size, >=, size);
1533                 atomic_add_64(&state->arcs_size, -size);
1534                 buf->b_data = NULL;
1535 
1536                 /*
1537                  * If we're destroying a duplicate buffer make sure
1538                  * that the appropriate statistics are updated.
1539                  */
1540                 if (buf->b_hdr->b_datacnt > 1 &&
1541                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1542                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1543                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1544                 }
1545                 ASSERT(buf->b_hdr->b_datacnt > 0);
1546                 buf->b_hdr->b_datacnt -= 1;
1547         }
1548 
1549         /* only remove the buf if requested */
1550         if (!remove)
1551                 return;
1552 
1553         /* remove the buf from the hdr list */
1554         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1555                 continue;
1556         *bufp = buf->b_next;
1557         buf->b_next = NULL;
1558 
1559         ASSERT(buf->b_efunc == NULL);
1560 
1561         /* clean up the buf */
1562         buf->b_hdr = NULL;
1563         kmem_cache_free(buf_cache, buf);
1564 }
1565 
1566 static void
1567 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1568 {
1569         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1570         ASSERT3P(hdr->b_state, ==, arc_anon);
1571         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1572         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1573 
1574         if (l2hdr != NULL) {
1575                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1576                 /*
1577                  * To prevent arc_free() and l2arc_evict() from
1578                  * attempting to free the same buffer at the same time,
1579                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1580                  * give it priority.  l2arc_evict() can't destroy this
1581                  * header while we are waiting on l2arc_buflist_mtx.
1582                  *
1583                  * The hdr may be removed from l2ad_buflist before we
1584                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1585                  */
1586                 if (!buflist_held) {
1587                         mutex_enter(&l2arc_buflist_mtx);
1588                         l2hdr = hdr->b_l2hdr;
1589                 }
1590 
1591                 if (l2hdr != NULL) {
1592                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1593                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1594                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1595                         if (l2hdr->b_dev->l2ad_vdev)
1596                                 vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1597                                     -l2hdr->b_asize, 0, 0);
1598                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1599                         if (hdr->b_state == arc_l2c_only)
1600                                 l2arc_hdr_stat_remove();
1601                         hdr->b_l2hdr = NULL;
1602                 }
1603 
1604                 if (!buflist_held)
1605                         mutex_exit(&l2arc_buflist_mtx);
1606         }
1607 
1608         if (!BUF_EMPTY(hdr)) {
1609                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1610                 buf_discard_identity(hdr);
1611         }
1612         while (hdr->b_buf) {
1613                 arc_buf_t *buf = hdr->b_buf;
1614 
1615                 if (buf->b_efunc) {
1616                         mutex_enter(&arc_eviction_mtx);
1617                         mutex_enter(&buf->b_evict_lock);
1618                         ASSERT(buf->b_hdr != NULL);
1619                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1620                         hdr->b_buf = buf->b_next;
1621                         buf->b_hdr = &arc_eviction_hdr;
1622                         buf->b_next = arc_eviction_list;
1623                         arc_eviction_list = buf;
1624                         mutex_exit(&buf->b_evict_lock);
1625                         mutex_exit(&arc_eviction_mtx);
1626                 } else {
1627                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1628                 }
1629         }
1630         if (hdr->b_freeze_cksum != NULL) {
1631                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1632                 hdr->b_freeze_cksum = NULL;
1633         }
1634         if (hdr->b_thawed) {
1635                 kmem_free(hdr->b_thawed, 1);
1636                 hdr->b_thawed = NULL;
1637         }
1638 
1639         ASSERT(!list_link_active(&hdr->b_arc_node));
1640         ASSERT3P(hdr->b_hash_next, ==, NULL);
1641         ASSERT3P(hdr->b_acb, ==, NULL);
1642         kmem_cache_free(hdr_cache, hdr);
1643 }
1644 
1645 void
1646 arc_buf_free(arc_buf_t *buf, void *tag)
1647 {
1648         arc_buf_hdr_t *hdr = buf->b_hdr;
1649         int hashed = hdr->b_state != arc_anon;
1650 
1651         ASSERT(buf->b_efunc == NULL);
1652         ASSERT(buf->b_data != NULL);
1653 
1654         if (hashed) {
1655                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1656 
1657                 mutex_enter(hash_lock);
1658                 hdr = buf->b_hdr;
1659                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1660 
1661                 (void) remove_reference(hdr, hash_lock, tag);
1662                 if (hdr->b_datacnt > 1) {
1663                         arc_buf_destroy(buf, FALSE, TRUE);
1664                 } else {
1665                         ASSERT(buf == hdr->b_buf);
1666                         ASSERT(buf->b_efunc == NULL);
1667                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1668                 }
1669                 mutex_exit(hash_lock);
1670         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1671                 int destroy_hdr;
1672                 /*
1673                  * We are in the middle of an async write.  Don't destroy
1674                  * this buffer unless the write completes before we finish
1675                  * decrementing the reference count.
1676                  */
1677                 mutex_enter(&arc_eviction_mtx);
1678                 (void) remove_reference(hdr, NULL, tag);
1679                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1680                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1681                 mutex_exit(&arc_eviction_mtx);
1682                 if (destroy_hdr)
1683                         arc_hdr_destroy(hdr);
1684         } else {
1685                 if (remove_reference(hdr, NULL, tag) > 0)
1686                         arc_buf_destroy(buf, FALSE, TRUE);
1687                 else
1688                         arc_hdr_destroy(hdr);
1689         }
1690 }
1691 
1692 boolean_t
1693 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1694 {
1695         arc_buf_hdr_t *hdr = buf->b_hdr;
1696         kmutex_t *hash_lock = HDR_LOCK(hdr);
1697         boolean_t no_callback = (buf->b_efunc == NULL);
1698 
1699         if (hdr->b_state == arc_anon) {
1700                 ASSERT(hdr->b_datacnt == 1);
1701                 arc_buf_free(buf, tag);
1702                 return (no_callback);
1703         }
1704 
1705         mutex_enter(hash_lock);
1706         hdr = buf->b_hdr;
1707         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1708         ASSERT(hdr->b_state != arc_anon);
1709         ASSERT(buf->b_data != NULL);
1710 
1711         (void) remove_reference(hdr, hash_lock, tag);
1712         if (hdr->b_datacnt > 1) {
1713                 if (no_callback)
1714                         arc_buf_destroy(buf, FALSE, TRUE);
1715         } else if (no_callback) {
1716                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1717                 ASSERT(buf->b_efunc == NULL);
1718                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1719         }
1720         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1721             refcount_is_zero(&hdr->b_refcnt));
1722         mutex_exit(hash_lock);
1723         return (no_callback);
1724 }
1725 
1726 int
1727 arc_buf_size(arc_buf_t *buf)
1728 {
1729         return (buf->b_hdr->b_size);
1730 }
1731 
1732 /*
1733  * Called from the DMU to determine if the current buffer should be
1734  * evicted. In order to ensure proper locking, the eviction must be initiated
1735  * from the DMU. Return true if the buffer is associated with user data and
1736  * duplicate buffers still exist.
1737  */
1738 boolean_t
1739 arc_buf_eviction_needed(arc_buf_t *buf)
1740 {
1741         arc_buf_hdr_t *hdr;
1742         boolean_t evict_needed = B_FALSE;
1743 
1744         if (zfs_disable_dup_eviction)
1745                 return (B_FALSE);
1746 
1747         mutex_enter(&buf->b_evict_lock);
1748         hdr = buf->b_hdr;
1749         if (hdr == NULL) {
1750                 /*
1751                  * We are in arc_do_user_evicts(); let that function
1752                  * perform the eviction.
1753                  */
1754                 ASSERT(buf->b_data == NULL);
1755                 mutex_exit(&buf->b_evict_lock);
1756                 return (B_FALSE);
1757         } else if (buf->b_data == NULL) {
1758                 /*
1759                  * We have already been added to the arc eviction list;
1760                  * recommend eviction.
1761                  */
1762                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1763                 mutex_exit(&buf->b_evict_lock);
1764                 return (B_TRUE);
1765         }
1766 
1767         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1768                 evict_needed = B_TRUE;
1769 
1770         mutex_exit(&buf->b_evict_lock);
1771         return (evict_needed);
1772 }
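
/*
 * Illustrative caller sketch (not part of the build; 'db' is a
 * hypothetical DMU-side handle holding an arc_buf_t in db_buf):
 *
 *	if (arc_buf_eviction_needed(db->db_buf))
 *		... have the DMU evict its copy of the buffer ...
 *	else
 *		... keep the buffer cached ...
 *
 * A B_TRUE return only recommends eviction; the DMU initiates it so
 * that the proper locks are held.
 */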
1773 
1774 int zfs_fastflush = 1;
1775 
1776 /*
1777  * Evict buffers from list until we've removed the specified number of
1778  * bytes.  Move the removed buffers to the appropriate evict state.
1779  * If the recycle flag is set, then attempt to "recycle" a buffer:
1780  * - look for a buffer to evict that is `bytes' long.
1781  * - return the data block from this buffer rather than freeing it.
1782  * This flag is used by callers that are trying to make space for a
1783  * new buffer in a full arc cache.
1784  *
1785  * This function makes a "best effort".  It skips over any buffers
1786  * it can't get a hash_lock on, and so may not catch all candidates.
1787  * It may also return without evicting as much space as requested.
1788  */
1789 static void *
1790 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1791     arc_buf_contents_t type)
1792 {
1793         arc_state_t *evicted_state;
1794         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1795         arc_buf_hdr_t *ab, *ab_prev = NULL;
1796         list_t *list = &state->arcs_list[type];
1797         kmutex_t *hash_lock;
1798         boolean_t have_lock;
1799         void *stolen = NULL;
1800         arc_buf_hdr_t marker = { 0 };
1801         int count = 0;
1802 
1803         ASSERT(state == arc_mru || state == arc_mfu);
1804 
1805         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1806 
1807         mutex_enter(&state->arcs_mtx);
1808         mutex_enter(&evicted_state->arcs_mtx);
1809 
1810         for (ab = list_tail(list); ab; ab = ab_prev) {
1811                 ab_prev = list_prev(list, ab);
1812                 /* prefetch buffers have a minimum lifespan */
1813                 if (HDR_IO_IN_PROGRESS(ab) ||
1814                     (spa && ab->b_spa != spa) ||
1815                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1816                     ddi_get_lbolt() - ab->b_arc_access <
1817                     arc_min_prefetch_lifespan)) {
1818                         skipped++;
1819                         continue;
1820                 }
1821                 /* "lookahead" for better eviction candidate */
1822                 if (recycle && ab->b_size != bytes &&
1823                     ab_prev && ab_prev->b_size == bytes)
1824                         continue;
1825 
1826                 /* ignore markers */
1827                 if (ab->b_spa == 0)
1828                         continue;
1829 
1830                 /*
1831                  * It may take a long time to evict all the bufs requested.
1832                  * To avoid blocking all arc activity, periodically drop
1833                  * the arcs_mtx and give other threads a chance to run
1834                  * before reacquiring the lock.
1835                  *
1836                  * If we are looking for a buffer to recycle, we are in
1837                  * the hot code path, so don't sleep.
1838                  */
1839                 if (!recycle && count++ > arc_evict_iterations) {
1840                         list_insert_after(list, ab, &marker);
1841                         mutex_exit(&evicted_state->arcs_mtx);
1842                         mutex_exit(&state->arcs_mtx);
1843                         kpreempt(KPREEMPT_SYNC);
1844                         mutex_enter(&state->arcs_mtx);
1845                         mutex_enter(&evicted_state->arcs_mtx);
1846                         ab_prev = list_prev(list, &marker);
1847                         list_remove(list, &marker);
1848                         count = 0;
1849                         continue;
1850                 }
1851 
1852                 hash_lock = HDR_LOCK(ab);
1853                 have_lock = MUTEX_HELD(hash_lock);
1854                 if (have_lock || mutex_tryenter(hash_lock)) {
1855                         ASSERT0(refcount_count(&ab->b_refcnt));
1856                         ASSERT(ab->b_datacnt > 0);
1857                         while (ab->b_buf) {
1858                                 arc_buf_t *buf = ab->b_buf;
1859                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1860                                         missed += 1;
1861                                         break;
1862                                 }
1863                                 if (buf->b_data) {
1864                                         bytes_evicted += ab->b_size;
1865                                         if (recycle && ab->b_type == type &&
1866                                             ab->b_size == bytes &&
1867                                             !HDR_L2_WRITING(ab)) {
1868                                                 stolen = buf->b_data;
1869                                                 recycle = FALSE;
1870                                         }
1871                                 }
1872                                 if (buf->b_efunc) {
1873                                         mutex_enter(&arc_eviction_mtx);
1874                                         arc_buf_destroy(buf,
1875                                             buf->b_data == stolen, FALSE);
1876                                         ab->b_buf = buf->b_next;
1877                                         buf->b_hdr = &arc_eviction_hdr;
1878                                         buf->b_next = arc_eviction_list;
1879                                         arc_eviction_list = buf;
1880                                         mutex_exit(&arc_eviction_mtx);
1881                                         mutex_exit(&buf->b_evict_lock);
1882                                 } else {
1883                                         mutex_exit(&buf->b_evict_lock);
1884                                         arc_buf_destroy(buf,
1885                                             buf->b_data == stolen, TRUE);
1886                                 }
1887                         }
1888 
1889                         if (ab->b_l2hdr) {
1890                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1891                                     ab->b_size);
1892                         } else {
1893                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1894                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1895                                             ab->b_size);
1896                                 } else {
1897                                         ARCSTAT_INCR(
1898                                             arcstat_evict_l2_ineligible,
1899                                             ab->b_size);
1900                                 }
1901                         }
1902 
1903                         if (ab->b_datacnt == 0) {
1904                                 arc_change_state(evicted_state, ab, hash_lock);
1905                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1906                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1907                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1908                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1909                         }
1910                         if (!have_lock)
1911                                 mutex_exit(hash_lock);
1912                         if (bytes >= 0 && bytes_evicted >= bytes)
1913                                 break;
1914                 } else {
1915                         missed += 1;
1916                 }
1917         }
1918 
1919         mutex_exit(&evicted_state->arcs_mtx);
1920         mutex_exit(&state->arcs_mtx);
1921 
1922         if (bytes_evicted < bytes)
1923                 dprintf("only evicted %lld bytes from %x",
1924                     (longlong_t)bytes_evicted, state);
1925 
1926         if (skipped)
1927                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1928 
1929         if (missed)
1930                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1931 
1932         /*
1933          * Note: we have just evicted some data into the ghost state,
1934          * potentially putting the ghost size over the desired size.  Rather
1935          * than evicting from the ghost list in this hot code path, leave
1936          * this chore to the arc_reclaim_thread().
1937          */
1938 
1939         return (stolen);
1940 }
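
/*
 * Usage sketch for the recycle path (illustrative; arc_get_data_buf()
 * below is the in-tree caller):
 *
 *	void *data = arc_evict(state, NULL, size, TRUE, type);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(size);
 *
 * A non-NULL return is the stolen b_data block of an evicted buffer of
 * exactly 'size' bytes, handed to the caller instead of being freed; on
 * a NULL return the caller allocates a fresh buffer (zio_buf_alloc() or
 * zio_data_buf_alloc() depending on the type, as arc_get_data_buf()
 * does) and arcstat_recycle_miss is bumped.
 */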
1941 
1942 /*
1943  * Remove buffers from list until we've removed the specified number of
1944  * bytes.  Destroy the buffers that are removed.
1945  */
1946 static void
1947 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1948 {
1949         arc_buf_hdr_t *ab, *ab_prev;
1950         arc_buf_hdr_t marker = { 0 };
1951         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1952         kmutex_t *hash_lock;
1953         uint64_t bytes_deleted = 0;
1954         uint64_t bufs_skipped = 0;
1955         int count = 0;
1956 
1957         ASSERT(GHOST_STATE(state));
1958 top:
1959         mutex_enter(&state->arcs_mtx);
1960         for (ab = list_tail(list); ab; ab = ab_prev) {
1961                 ab_prev = list_prev(list, ab);
1962                 if (ab->b_type > ARC_BUFC_NUMTYPES)
1963                         panic("invalid ab=%p", (void *)ab);
1964                 if (spa && ab->b_spa != spa)
1965                         continue;
1966 
1967                 /* ignore markers */
1968                 if (ab->b_spa == 0)
1969                         continue;
1970 
1971                 hash_lock = HDR_LOCK(ab);
1972                 /* caller may be trying to modify this buffer, skip it */
1973                 if (MUTEX_HELD(hash_lock))
1974                         continue;
1975 
1976                 /*
1977                  * It may take a long time to evict all the bufs requested.
1978                  * To avoid blocking all arc activity, periodically drop
1979                  * the arcs_mtx and give other threads a chance to run
1980                  * before reacquiring the lock.
1981                  */
1982                 if (count++ > arc_evict_iterations) {
1983                         list_insert_after(list, ab, &marker);
1984                         mutex_exit(&state->arcs_mtx);
1985                         kpreempt(KPREEMPT_SYNC);
1986                         mutex_enter(&state->arcs_mtx);
1987                         ab_prev = list_prev(list, &marker);
1988                         list_remove(list, &marker);
1989                         count = 0;
1990                         continue;
1991                 }
1992                 if (mutex_tryenter(hash_lock)) {
1993                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1994                         ASSERT(ab->b_buf == NULL);
1995                         ARCSTAT_BUMP(arcstat_deleted);
1996                         bytes_deleted += ab->b_size;
1997 
1998                         if (ab->b_l2hdr != NULL) {
1999                                 /*
2000                                  * This buffer is cached on the 2nd Level ARC;
2001                                  * don't destroy the header.
2002                                  */
2003                                 arc_change_state(arc_l2c_only, ab, hash_lock);
2004                                 mutex_exit(hash_lock);
2005                         } else {
2006                                 arc_change_state(arc_anon, ab, hash_lock);
2007                                 mutex_exit(hash_lock);
2008                                 arc_hdr_destroy(ab);
2009                         }
2010 
2011                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2012                         if (bytes >= 0 && bytes_deleted >= bytes)
2013                                 break;
2014                 } else if (bytes < 0) {
2015                         /*
2016                          * Insert a list marker and then wait for the
2017                          * hash lock to become available. Once it's
2018                          * available, restart from where we left off.
2019                          */
2020                         list_insert_after(list, ab, &marker);
2021                         mutex_exit(&state->arcs_mtx);
2022                         mutex_enter(hash_lock);
2023                         mutex_exit(hash_lock);
2024                         mutex_enter(&state->arcs_mtx);
2025                         ab_prev = list_prev(list, &marker);
2026                         list_remove(list, &marker);
2027                 } else {
2028                         bufs_skipped += 1;
2029                 }
2030 
2031         }
2032         mutex_exit(&state->arcs_mtx);
2033 
2034         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2035             (bytes < 0 || bytes_deleted < bytes)) {
2036                 list = &state->arcs_list[ARC_BUFC_METADATA];
2037                 goto top;
2038         }
2039 
2040         if (bufs_skipped) {
2041                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2042                 ASSERT(bytes >= 0);
2043         }
2044 
2045         if (bytes_deleted < bytes)
2046                 dprintf("only deleted %lld bytes from %p",
2047                     (longlong_t)bytes_deleted, state);
2048 }
2049 
2050 static void
2051 arc_adjust(void)
2052 {
2053         int64_t adjustment, delta;
2054 
2055         /*
2056          * Adjust MRU size
2057          */
2058 
2059         adjustment = MIN((int64_t)(arc_size - arc_c),
2060             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2061             arc_p));
2062 
2063         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2064                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2065                 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2066                 adjustment -= delta;
2067         }
2068 
2069         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2070                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2071                 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2072                     ARC_BUFC_METADATA);
2073         }
2074 
2075         /*
2076          * Adjust MFU size
2077          */
2078 
2079         adjustment = arc_size - arc_c;
2080 
2081         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2082                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2083                 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2084                 adjustment -= delta;
2085         }
2086 
2087         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2088                 int64_t delta = MIN(adjustment,
2089                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2090                 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2091                     ARC_BUFC_METADATA);
2092         }
2093 
2094         /*
2095          * Adjust ghost lists
2096          */
2097 
2098         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2099 
2100         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2101                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2102                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2103         }
2104 
2105         adjustment =
2106             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2107 
2108         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2109                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2110                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2111         }
2112 }
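
/*
 * Worked example of the MRU adjustment above (all figures illustrative):
 * with arc_size = 10GB, arc_c = 8GB, arc_p = 4GB, and
 * arc_anon + arc_mru + arc_meta_used = 7GB,
 *
 *	adjustment = MIN(10GB - 8GB, 7GB - 4GB) = 2GB
 *
 * so up to 2GB is evicted from the MRU list, data first and then
 * metadata.  The MFU pass then recomputes arc_size - arc_c and evicts
 * whatever excess remains, and the ghost lists are trimmed last.
 */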
2113 
2114 static void
2115 arc_do_user_evicts(void)
2116 {
2117         mutex_enter(&arc_eviction_mtx);
2118         while (arc_eviction_list != NULL) {
2119                 arc_buf_t *buf = arc_eviction_list;
2120                 arc_eviction_list = buf->b_next;
2121                 mutex_enter(&buf->b_evict_lock);
2122                 buf->b_hdr = NULL;
2123                 mutex_exit(&buf->b_evict_lock);
2124                 mutex_exit(&arc_eviction_mtx);
2125 
2126                 if (buf->b_efunc != NULL)
2127                         VERIFY0(buf->b_efunc(buf->b_private));
2128 
2129                 buf->b_efunc = NULL;
2130                 buf->b_private = NULL;
2131                 kmem_cache_free(buf_cache, buf);
2132                 mutex_enter(&arc_eviction_mtx);
2133         }
2134         mutex_exit(&arc_eviction_mtx);
2135 }
2136 
2137 typedef struct arc_async_flush_data {
2138         uint64_t        aaf_guid;
2139 } arc_async_flush_data_t;
2140 
2141 static taskq_t *arc_flush_taskq;
2142 
2143 static void
2144 _arc_flush(uint64_t guid)
2145 {
2146         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2147                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2148                 if (guid)
2149                         break;
2150         }
2151         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2152                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2153                 if (guid)
2154                         break;
2155         }
2156         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2157                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2158                 if (guid)
2159                         break;
2160         }
2161         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2162                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2163                 if (guid)
2164                         break;
2165         }
2166 
2167         arc_evict_ghost(arc_mru_ghost, guid, -1);
2168         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2169 
2170         mutex_enter(&arc_reclaim_thr_lock);
2171         arc_do_user_evicts();
2172         mutex_exit(&arc_reclaim_thr_lock);
2173 }
2174 
2175 static void
2176 arc_flush_task(void *arg)
2177 {
2178         arc_async_flush_data_t *aaf = (arc_async_flush_data_t *)arg;
2179         _arc_flush(aaf->aaf_guid);
2180         kmem_free(aaf, sizeof (arc_async_flush_data_t));
2181 }
2182 
2183 /*
2184  * Flush all *evictable* data from the cache for the given spa.
2185  * NOTE: this will not touch "active" (i.e. referenced) data.
2186  */
2187 void
2188 arc_flush(spa_t *spa)
2189 {
2190         uint64_t guid = 0;
2191         boolean_t async_flush = (spa ? zfs_fastflush : FALSE);
2192         arc_async_flush_data_t *aaf = NULL;
2193 
2194         if (spa) {
2195                 guid = spa_load_guid(spa);
2196                 if (async_flush) {
2197                         aaf = kmem_alloc(sizeof (arc_async_flush_data_t),
2198                             KM_SLEEP);
2199                         aaf->aaf_guid = guid;
2200                 }
2201         }
2202 
2203         /*
2204          * Try to flush per-spa remaining ARC ghost buffers and buffers in
2205          * arc_eviction_list asynchronously while a pool is being closed.
2206          * An ARC buffer is bound to a spa only by its guid, so a buffer can
2207          * exist even after the pool is gone. If the asynchronous flush cannot
2208          * be dispatched we fall back to a regular (synchronous) one.
2209          * NOTE: It is not a problem if the asynchronous flush has not yet
2210          * finished when the pool is imported again, even when the guids before
2211          * and after export/import are the same: we can evict only unreferenced
2212          * buffers, and the rest are skipped.
2213          */
2214         if (!async_flush || (taskq_dispatch(arc_flush_taskq, arc_flush_task,
2215             aaf, TQ_NOSLEEP) == NULL)) {
2216                 _arc_flush(guid);
2217                 ASSERT(spa || arc_eviction_list == NULL);
2218                 if (async_flush)
2219                         kmem_free(aaf, sizeof (arc_async_flush_data_t));
2220         }
2221 }
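
/*
 * Usage sketch (illustrative): a pool teardown path flushes only its own
 * evictable buffers, while a NULL spa flushes everything synchronously:
 *
 *	arc_flush(spa);		asynchronous via arc_flush_taskq when
 *				zfs_fastflush is set, otherwise synchronous
 *	arc_flush(NULL);	all pools, always synchronous
 */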
2222 
2223 void
2224 arc_shrink(void)
2225 {
2226         if (arc_c > arc_c_min) {
2227                 uint64_t to_free;
2228 
2229 #ifdef _KERNEL
2230                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2231 #else
2232                 to_free = arc_c >> arc_shrink_shift;
2233 #endif
2234                 if (arc_c > arc_c_min + to_free)
2235                         atomic_add_64(&arc_c, -to_free);
2236                 else
2237                         arc_c = arc_c_min;
2238 
2239                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2240                 if (arc_c > arc_size)
2241                         arc_c = MAX(arc_size, arc_c_min);
2242                 if (arc_p > arc_c)
2243                         arc_p = (arc_c >> 1);
2244                 ASSERT(arc_c >= arc_c_min);
2245                 ASSERT((int64_t)arc_p >= 0);
2246         }
2247 
2248         if (arc_size > arc_c)
2249                 arc_adjust();
2250 }
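
/*
 * Worked example (illustrative figures): with arc_c = 8GB and, say,
 * arc_shrink_shift = 5, the non-kernel branch computes
 *
 *	to_free = arc_c >> 5 = 256MB
 *
 * In the kernel, ptob(needfree) is used instead when the VM system is
 * asking for more than that.  arc_c never drops below arc_c_min, and
 * arc_p is scaled back by the same shift.
 */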
2251 
2252 /*
2253  * Determine if the system is under memory pressure and is asking
2254  * to reclaim memory. A return value of 1 indicates that the system
2255  * is under memory pressure and that the arc should adjust accordingly.
2256  */
2257 static int
2258 arc_reclaim_needed(void)
2259 {
2260         uint64_t extra;
2261 
2262 #ifdef _KERNEL
2263 
2264         if (needfree)
2265                 return (1);
2266 
2267         /*
2268          * take 'desfree' extra pages, so we reclaim sooner, rather than later
2269          */
2270         extra = desfree;
2271 
2272         /*
2273          * check that we're out of range of the pageout scanner.  It starts to
2274          * schedule paging if freemem is less than lotsfree plus needfree.
2275          * lotsfree is the high-water mark for pageout, and needfree is the
2276          * number of needed free pages.  We add extra pages here to make sure
2277          * the scanner doesn't start up while we're freeing memory.
2278          */
2279         if (freemem < lotsfree + needfree + extra)
2280                 return (1);
2281 
2282         /*
2283          * check to make sure that swapfs has enough space so that anon
2284          * reservations can still succeed. anon_resvmem() checks that the
2285          * availrmem is greater than swapfs_minfree plus the number of reserved
2286          * swap pages.  We also add a bit of extra here just to prevent
2287          * circumstances from getting really dire.
2288          */
2289         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2290                 return (1);
2291 
2292         /*
2293          * Check that we have enough availrmem that memory locking (e.g., via
2294          * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2295          * stores the number of pages that cannot be locked; when availrmem
2296          * drops below pages_pp_maximum, page locking mechanisms such as
2297          * page_pp_lock() will fail.)
2298          */
2299         if (availrmem <= pages_pp_maximum)
2300                 return (1);
2301 
2302 #if defined(__i386)
2303         /*
2304          * If we're on an i386 platform, it's possible that we'll exhaust the
2305          * kernel heap space before we ever run out of available physical
2306          * memory.  Most checks of the size of the heap_area compare against
2307          * tune.t_minarmem, which is the minimum available real memory that we
2308          * can have in the system.  However, this is generally fixed at 25 pages
2309          * which is so low that it's useless.  Instead, we compare against the
2310          * total heap size, and reclaim if more than 3/4 of the heap is
2311          * allocated (or, equivalently, if less than 1/4 of the heap is
2312          * free).
2313          */
2314         if (vmem_size(heap_arena, VMEM_FREE) <
2315             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2316                 return (1);
2317 #endif
2318 
2319         /*
2320          * If zio data pages are being allocated out of a separate heap segment,
2321          * then enforce that the size of available vmem for this arena remains
2322          * above about 1/16th free.
2323          *
2324          * Note: The 1/16th arena free requirement was put in place
2325          * to aggressively evict memory from the arc in order to avoid
2326          * memory fragmentation issues.
2327          */
2328         if (zio_arena != NULL &&
2329             vmem_size(zio_arena, VMEM_FREE) <
2330             (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2331                 return (1);
2332 #else
2333         if (spa_get_random(100) == 0)
2334                 return (1);
2335 #endif
2336         return (0);
2337 }
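
/*
 * Worked example of the pageout check above (illustrative page counts):
 * with lotsfree = 4000 pages, needfree = 0 and extra = desfree = 2000
 * pages, reclaim is signalled as soon as
 *
 *	freemem < 4000 + 0 + 2000 = 6000 pages
 *
 * i.e. comfortably before the pageout scanner itself would start.
 */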
2338 
2339 static void
2340 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2341 {
2342         size_t                  i;
2343         kmem_cache_t            *prev_cache = NULL;
2344         kmem_cache_t            *prev_data_cache = NULL;
2345         extern kmem_cache_t     *zio_buf_cache[];
2346         extern kmem_cache_t     *zio_data_buf_cache[];
2347         extern kmem_cache_t     *range_seg_cache;
2348 
2349 #ifdef _KERNEL
2350         if (arc_meta_used >= arc_meta_limit) {
2351                 /*
2352                  * We are exceeding our meta-data cache limit.
2353                  * Purge some DNLC entries to release holds on meta-data.
2354                  */
2355                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2356         }
2357 #if defined(__i386)
2358         /*
2359          * Reclaim unused memory from all kmem caches.
2360          */
2361         kmem_reap();
2362 #endif
2363 #endif
2364 
2365         /*
2366          * An aggressive reclamation will shrink the cache size as well as
2367          * reap free buffers from the arc kmem caches.
2368          */
2369         if (strat == ARC_RECLAIM_AGGR)
2370                 arc_shrink();
2371 
2372         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2373                 if (zio_buf_cache[i] != prev_cache) {
2374                         prev_cache = zio_buf_cache[i];
2375                         kmem_cache_reap_now(zio_buf_cache[i]);
2376                 }
2377                 if (zio_data_buf_cache[i] != prev_data_cache) {
2378                         prev_data_cache = zio_data_buf_cache[i];
2379                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2380                 }
2381         }
2382         kmem_cache_reap_now(buf_cache);
2383         kmem_cache_reap_now(hdr_cache);
2384         kmem_cache_reap_now(range_seg_cache);
2385 
2386         /*
2387          * Ask the vmem arena to reclaim unused memory from its
2388          * quantum caches.
2389          */
2390         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2391                 vmem_qcache_reap(zio_arena);
2392 }
2393 
2394 static void
2395 arc_reclaim_thread(void)
2396 {
2397         clock_t                 growtime = 0;
2398         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2399         callb_cpr_t             cpr;
2400 
2401         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2402 
2403         mutex_enter(&arc_reclaim_thr_lock);
2404         while (arc_thread_exit == 0) {
2405                 if (arc_reclaim_needed()) {
2406 
2407                         if (arc_no_grow) {
2408                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2409                                         last_reclaim = ARC_RECLAIM_AGGR;
2410                                 } else {
2411                                         last_reclaim = ARC_RECLAIM_CONS;
2412                                 }
2413                         } else {
2414                                 arc_no_grow = TRUE;
2415                                 last_reclaim = ARC_RECLAIM_AGGR;
2416                                 membar_producer();
2417                         }
2418 
2419                         /* reset the growth delay for every reclaim */
2420                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2421 
2422                         arc_kmem_reap_now(last_reclaim);
2423                         arc_warm = B_TRUE;
2424 
2425                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2426                         arc_no_grow = FALSE;
2427                 }
2428 
2429                 arc_adjust();
2430 
2431                 if (arc_eviction_list != NULL)
2432                         arc_do_user_evicts();
2433 
2434                 /* block until needed, or one second, whichever is shorter */
2435                 CALLB_CPR_SAFE_BEGIN(&cpr);
2436                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2437                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2438                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2439         }
2440 
2441         arc_thread_exit = 0;
2442         cv_broadcast(&arc_reclaim_thr_cv);
2443         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2444         thread_exit();
2445 }
2446 
2447 /*
2448  * Adapt arc info given the number of bytes we are trying to add and
2449  * the state that we are coming from.  This function is only called
2450  * when we are adding new content to the cache.
2451  */
2452 static void
2453 arc_adapt(int bytes, arc_state_t *state)
2454 {
2455         int mult;
2456         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2457 
2458         if (state == arc_l2c_only)
2459                 return;
2460 
2461         ASSERT(bytes > 0);
2462         /*
2463          * Adapt the target size of the MRU list:
2464          *      - if we just hit in the MRU ghost list, then increase
2465          *        the target size of the MRU list.
2466          *      - if we just hit in the MFU ghost list, then increase
2467          *        the target size of the MFU list by decreasing the
2468          *        target size of the MRU list.
2469          */
2470         if (state == arc_mru_ghost) {
2471                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2472                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2473                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2474 
2475                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2476         } else if (state == arc_mfu_ghost) {
2477                 uint64_t delta;
2478 
2479                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2480                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2481                 mult = MIN(mult, 10);
2482 
2483                 delta = MIN(bytes * mult, arc_p);
2484                 arc_p = MAX(arc_p_min, arc_p - delta);
2485         }
2486         ASSERT((int64_t)arc_p >= 0);
2487 
2488         if (arc_reclaim_needed()) {
2489                 cv_signal(&arc_reclaim_thr_cv);
2490                 return;
2491         }
2492 
2493         if (arc_no_grow)
2494                 return;
2495 
2496         if (arc_c >= arc_c_max)
2497                 return;
2498 
2499         /*
2500          * If we're within (2 * maxblocksize) bytes of the target
2501          * cache size, increment the target cache size
2502          */
2503         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2504                 atomic_add_64(&arc_c, (int64_t)bytes);
2505                 if (arc_c > arc_c_max)
2506                         arc_c = arc_c_max;
2507                 else if (state == arc_anon)
2508                         atomic_add_64(&arc_p, (int64_t)bytes);
2509                 if (arc_p > arc_c)
2510                         arc_p = arc_c;
2511         }
2512         ASSERT((int64_t)arc_p >= 0);
2513 }
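
/*
 * Worked example of the ghost-hit adaptation above (illustrative sizes):
 * on an arc_mru_ghost hit with bytes = 128K,
 * arc_mru_ghost->arcs_size = 1GB and arc_mfu_ghost->arcs_size = 3GB,
 *
 *	mult  = MIN(3GB / 1GB, 10) = 3
 *	arc_p = MIN(arc_c - arc_p_min, arc_p + 128K * 3)
 *
 * so arc_p grows by 384K in favor of the MRU side.  An arc_mfu_ghost hit
 * applies the symmetric decrease, bounded below by arc_p_min.
 */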
2514 
2515 /*
2516  * Check if the cache has reached its limits and eviction is required
2517  * prior to insert.
2518  */
2519 static int
2520 arc_evict_needed(arc_buf_contents_t type)
2521 {
2522         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2523                 return (1);
2524 
2525         if (arc_reclaim_needed())
2526                 return (1);
2527 
2528         return (arc_size > arc_c);
2529 }
2530 
2531 /*
2532  * The buffer, supplied as the first argument, needs a data block.
2533  * So, if we are at cache max, determine which cache should be victimized.
2534  * We have the following cases:
2535  *
2536  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2537  * In this situation if we're out of space, but the resident size of the MFU is
2538  * under the limit, victimize the MFU cache to satisfy this insertion request.
2539  *
2540  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2541  * Here, we've used up all of the available space for the MRU, so we need to
2542  * evict from our own cache instead.  Evict from the set of resident MRU
2543  * entries.
2544  *
2545  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2546  * c minus p represents the MFU space in the cache, since p is the size of the
2547  * cache that is dedicated to the MRU.  In this situation there's still space on
2548  * the MFU side, so the MRU side needs to be victimized.
2549  *
2550  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2551  * MFU's resident set is consuming more space than it has been allotted.  In
2552  * this situation, we must victimize our own cache, the MFU, for this insertion.
2553  */
2554 static void
2555 arc_get_data_buf(arc_buf_t *buf)
2556 {
2557         arc_state_t             *state = buf->b_hdr->b_state;
2558         uint64_t                size = buf->b_hdr->b_size;
2559         arc_buf_contents_t      type = buf->b_hdr->b_type;
2560 
2561         arc_adapt(size, state);
2562 
2563         /*
2564          * We have not yet reached cache maximum size,
2565          * just allocate a new buffer.
2566          */
2567         if (!arc_evict_needed(type)) {
2568                 if (type == ARC_BUFC_METADATA) {
2569                         buf->b_data = zio_buf_alloc(size);
2570                         arc_space_consume(size, ARC_SPACE_DATA);
2571                 } else {
2572                         ASSERT(type == ARC_BUFC_DATA);
2573                         buf->b_data = zio_data_buf_alloc(size);
2574                         ARCSTAT_INCR(arcstat_data_size, size);
2575                         atomic_add_64(&arc_size, size);
2576                 }
2577                 goto out;
2578         }
2579 
2580         /*
2581          * If we are prefetching from the mfu ghost list, this buffer
2582          * will end up on the mru list; so steal space from there.
2583          */
2584         if (state == arc_mfu_ghost)
2585                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2586         else if (state == arc_mru_ghost)
2587                 state = arc_mru;
2588 
2589         if (state == arc_mru || state == arc_anon) {
2590                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2591                 state = (arc_mfu->arcs_lsize[type] >= size &&
2592                     arc_p > mru_used) ? arc_mfu : arc_mru;
2593         } else {
2594                 /* MFU cases */
2595                 uint64_t mfu_space = arc_c - arc_p;
2596                 state =  (arc_mru->arcs_lsize[type] >= size &&
2597                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2598         }
2599         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2600                 if (type == ARC_BUFC_METADATA) {
2601                         buf->b_data = zio_buf_alloc(size);
2602                         arc_space_consume(size, ARC_SPACE_DATA);
2603                 } else {
2604                         ASSERT(type == ARC_BUFC_DATA);
2605                         buf->b_data = zio_data_buf_alloc(size);
2606                         ARCSTAT_INCR(arcstat_data_size, size);
2607                         atomic_add_64(&arc_size, size);
2608                 }
2609                 ARCSTAT_BUMP(arcstat_recycle_miss);
2610         }
2611         ASSERT(buf->b_data != NULL);
2612 out:
2613         /*
2614          * Update the state size.  Note that ghost states have a
2615          * "ghost size" and so don't need to be updated.
2616          */
2617         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2618                 arc_buf_hdr_t *hdr = buf->b_hdr;
2619 
2620                 atomic_add_64(&hdr->b_state->arcs_size, size);
2621                 if (list_link_active(&hdr->b_arc_node)) {
2622                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2623                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2624                 }
2625                 /*
2626                  * If we are growing the cache, and we are adding anonymous
2627                  * data, and we have outgrown arc_p, update arc_p
2628                  */
2629                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2630                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2631                         arc_p = MIN(arc_c, arc_p + size);
2632         }
2633 }
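
/*
 * Worked example of the victim selection above (illustrative sizes):
 * inserting a 128K buffer for the MRU with arc_p = 4GB,
 * arc_anon->arcs_size + arc_mru->arcs_size = 3GB, and at least 128K of
 * evictable data on the MFU list, case 1 applies and the MFU side is
 * recycled:
 *
 *	state = arc_mfu;
 *	buf->b_data = arc_evict(arc_mfu, NULL, 128K, TRUE, type);
 *
 * Had arc_anon + arc_mru already reached arc_p, case 2 would apply and
 * the MRU list would be victimized instead.
 */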
2634 
2635 /*
2636  * This routine is called whenever a buffer is accessed.
2637  * NOTE: the hash lock is dropped in this function.
2638  */
2639 static void
2640 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2641 {
2642         clock_t now;
2643 
2644         ASSERT(MUTEX_HELD(hash_lock));
2645 
2646         if (buf->b_state == arc_anon) {
2647                 /*
2648                  * This buffer is not in the cache, and does not
2649                  * appear in our "ghost" list.  Add the new buffer
2650                  * to the MRU state.
2651                  */
2652 
2653                 ASSERT(buf->b_arc_access == 0);
2654                 buf->b_arc_access = ddi_get_lbolt();
2655                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2656                 arc_change_state(arc_mru, buf, hash_lock);
2657 
2658         } else if (buf->b_state == arc_mru) {
2659                 now = ddi_get_lbolt();
2660 
2661                 /*
2662                  * If this buffer is here because of a prefetch, then either:
2663                  * - clear the flag if this is a "referencing" read
2664                  *   (any subsequent access will bump this into the MFU state).
2665                  * or
2666                  * - move the buffer to the head of the list if this is
2667                  *   another prefetch (to make it less likely to be evicted).
2668                  */
2669                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2670                         if (refcount_count(&buf->b_refcnt) == 0) {
2671                                 ASSERT(list_link_active(&buf->b_arc_node));
2672                         } else {
2673                                 buf->b_flags &= ~ARC_PREFETCH;
2674                                 ARCSTAT_BUMP(arcstat_mru_hits);
2675                         }
2676                         buf->b_arc_access = now;
2677                         return;
2678                 }
2679 
2680                 /*
2681                  * This buffer has been "accessed" only once so far,
2682                  * but it is still in the cache. Move it to the MFU
2683                  * state.
2684                  */
2685                 if (now > buf->b_arc_access + ARC_MINTIME) {
2686                         /*
2687                           * More than ARC_MINTIME has passed since we
2688                          * instantiated this buffer.  Move it to the
2689                          * most frequently used state.
2690                          */
2691                         buf->b_arc_access = now;
2692                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2693                         arc_change_state(arc_mfu, buf, hash_lock);
2694                 }
2695                 ARCSTAT_BUMP(arcstat_mru_hits);
2696         } else if (buf->b_state == arc_mru_ghost) {
2697                 arc_state_t     *new_state;
2698                 /*
2699                  * This buffer has been "accessed" recently, but was
2700                  * evicted from the cache.  Move it back to the MFU
2701                  * state, or to the MRU state if this is a prefetch.
2702                  */
2703 
2704                 if (buf->b_flags & ARC_PREFETCH) {
2705                         new_state = arc_mru;
2706                         if (refcount_count(&buf->b_refcnt) > 0)
2707                                 buf->b_flags &= ~ARC_PREFETCH;
2708                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2709                 } else {
2710                         new_state = arc_mfu;
2711                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2712                 }
2713 
2714                 buf->b_arc_access = ddi_get_lbolt();
2715                 arc_change_state(new_state, buf, hash_lock);
2716 
2717                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2718         } else if (buf->b_state == arc_mfu) {
2719                 /*
2720                  * This buffer has been accessed more than once and is
2721                  * still in the cache.  Keep it in the MFU state.
2722                  *
2723                  * NOTE: an add_reference() that occurred when we did
2724                  * the arc_read() will have kicked this off the list.
2725                  * If it was a prefetch, we will explicitly move it to
2726                  * the head of the list now.
2727                  */
2728                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2729                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2730                         ASSERT(list_link_active(&buf->b_arc_node));
2731                 }
2732                 ARCSTAT_BUMP(arcstat_mfu_hits);
2733                 buf->b_arc_access = ddi_get_lbolt();
2734         } else if (buf->b_state == arc_mfu_ghost) {
2735                 arc_state_t     *new_state = arc_mfu;
2736                 /*
2737                  * This buffer has been accessed more than once but has
2738                  * been evicted from the cache.  Move it back to the
2739                  * MFU state.
2740                  */
2741 
2742                 if (buf->b_flags & ARC_PREFETCH) {
2743                         /*
2744                          * This is a prefetch access...
2745                          * move this block back to the MRU state.
2746                          */
2747                         ASSERT0(refcount_count(&buf->b_refcnt));
2748                         new_state = arc_mru;
2749                 }
2750 
2751                 buf->b_arc_access = ddi_get_lbolt();
2752                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2753                 arc_change_state(new_state, buf, hash_lock);
2754 
2755                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2756         } else if (buf->b_state == arc_l2c_only) {
2757                 /*
2758                  * This buffer is on the 2nd Level ARC.
2759                  */
2760 
2761                 buf->b_arc_access = ddi_get_lbolt();
2762                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2763                 arc_change_state(arc_mfu, buf, hash_lock);
2764         } else {
2765                 ASSERT(!"invalid arc state");
2766         }
2767 }
2768 
2769 /* a generic arc_done_func_t which you can use */
2770 /* ARGSUSED */
2771 void
2772 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2773 {
2774         if (zio == NULL || zio->io_error == 0)
2775                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2776         VERIFY(arc_buf_remove_ref(buf, arg));
2777 }
2778 
2779 /* a generic arc_done_func_t */
2780 void
2781 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2782 {
2783         arc_buf_t **bufp = arg;
2784         if (zio && zio->io_error) {
2785                 VERIFY(arc_buf_remove_ref(buf, arg));
2786                 *bufp = NULL;
2787         } else {
2788                 *bufp = buf;
2789                 ASSERT(buf->b_data);
2790         }
2791 }
2792 
2793 static void
2794 arc_read_done(zio_t *zio)
2795 {
2796         arc_buf_hdr_t   *hdr;
2797         arc_buf_t       *buf;
2798         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2799         kmutex_t        *hash_lock = NULL;
2800         arc_callback_t  *callback_list, *acb;
2801         int             freeable = FALSE;
2802 
2803         buf = zio->io_private;
2804         hdr = buf->b_hdr;
2805 
2806         /*
2807          * The hdr was inserted into hash-table and removed from lists
2808          * prior to starting I/O.  We should find this header, since
2809          * it's in the hash table, and it should be legit since it's
2810          * not possible to evict it during the I/O.  The only possible
2811          * reason for it not to be found is that it was freed during
2812          * the read.
2813          */
2814         if (HDR_IN_HASH_TABLE(hdr)) {
2815                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
2816                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
2817                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
2818                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
2819                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
2820 
2821                 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
2822                     &hash_lock);
2823 
2824                 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
2825                     hash_lock == NULL) ||
2826                     (found == hdr &&
2827                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2828                     (found == hdr && HDR_L2_READING(hdr)));
2829         }
2830 
2831         hdr->b_flags &= ~ARC_L2_EVICTED;
2832         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2833                 hdr->b_flags &= ~ARC_L2CACHE;
2834 
2835         /* byteswap if necessary */
2836         callback_list = hdr->b_acb;
2837         ASSERT(callback_list != NULL);
2838         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2839                 dmu_object_byteswap_t bswap =
2840                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2841                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2842                     byteswap_uint64_array :
2843                     dmu_ot_byteswap[bswap].ob_func;
2844                 func(buf->b_data, hdr->b_size);
2845         }
2846 
2847         arc_cksum_compute(buf, B_FALSE);
2848         arc_buf_watch(buf);
2849 
2850         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2851                 /*
2852                  * Only call arc_access on anonymous buffers.  This is because
2853                  * if we've issued an I/O for an evicted buffer, we've already
2854                  * called arc_access (to prevent any simultaneous readers from
2855                  * getting confused).
2856                  */
2857                 arc_access(hdr, hash_lock);
2858         }
2859 
2860         /* create copies of the data buffer for the callers */
2861         abuf = buf;
2862         for (acb = callback_list; acb; acb = acb->acb_next) {
2863                 if (acb->acb_done) {
2864                         if (abuf == NULL) {
2865                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2866                                 abuf = arc_buf_clone(buf);
2867                         }
2868                         acb->acb_buf = abuf;
2869                         abuf = NULL;
2870                 }
2871         }
2872         hdr->b_acb = NULL;
2873         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2874         ASSERT(!HDR_BUF_AVAILABLE(hdr));
2875         if (abuf == buf) {
2876                 ASSERT(buf->b_efunc == NULL);
2877                 ASSERT(hdr->b_datacnt == 1);
2878                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2879         }
2880 
2881         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2882 
2883         if (zio->io_error != 0) {
2884                 hdr->b_flags |= ARC_IO_ERROR;
2885                 if (hdr->b_state != arc_anon)
2886                         arc_change_state(arc_anon, hdr, hash_lock);
2887                 if (HDR_IN_HASH_TABLE(hdr))
2888                         buf_hash_remove(hdr);
2889                 freeable = refcount_is_zero(&hdr->b_refcnt);
2890         }
2891 
2892         /*
2893          * Broadcast before we drop the hash_lock to avoid the possibility
2894          * that the hdr (and hence the cv) might be freed before we get to
2895          * the cv_broadcast().
2896          */
2897         cv_broadcast(&hdr->b_cv);
2898 
2899         if (hash_lock) {
2900                 mutex_exit(hash_lock);
2901         } else {
2902                 /*
2903                  * This block was freed while we waited for the read to
2904                  * complete.  It has been removed from the hash table and
2905                  * moved to the anonymous state (so that it won't show up
2906                  * in the cache).
2907                  */
2908                 ASSERT3P(hdr->b_state, ==, arc_anon);
2909                 freeable = refcount_is_zero(&hdr->b_refcnt);
2910         }
2911 
2912         /* execute each callback and free its structure */
2913         while ((acb = callback_list) != NULL) {
2914                 if (acb->acb_done)
2915                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2916 
2917                 if (acb->acb_zio_dummy != NULL) {
2918                         acb->acb_zio_dummy->io_error = zio->io_error;
2919                         zio_nowait(acb->acb_zio_dummy);
2920                 }
2921 
2922                 callback_list = acb->acb_next;
2923                 kmem_free(acb, sizeof (arc_callback_t));
2924         }
2925 
2926         if (freeable)
2927                 arc_hdr_destroy(hdr);
2928 }
2929 
2930 /*
2931  * "Read" the block at the specified DVA (in bp) via the
2932  * cache.  If the block is found in the cache, invoke the provided
2933  * callback immediately and return.  Note that the `zio' parameter
2934  * in the callback will be NULL in this case, since no IO was
2935  * required.  If the block is not in the cache, pass the read request
2936  * on to the spa with a substitute callback function, so that the
2937  * requested block will be added to the cache.
2938  *
2939  * If a read request arrives for a block that has a read in-progress,
2940  * either wait for the in-progress read to complete (and return the
2941  * results); or, if this is a read with a "done" func, add a record
2942  * to the read to invoke the "done" func when the read completes,
2943  * and return; or just return.
2944  *
2945  * arc_read_done() will invoke all the requested "done" functions
2946  * for readers of this block.
2947  */
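     /*
      * As an illustration only (not a caller in this file), a synchronous
      * cached read might look like the sketch below.  The "spa", "bp" and
      * "zb" variables (spa_t *, blkptr_t *, zbookmark_phys_t *) are assumed
      * to have been set up by the caller; arc_getbuf_func() and the
      * ARC_WAIT flag are defined above.
      *
      *	arc_buf_t *buf = NULL;
      *	uint32_t aflags = ARC_WAIT;
      *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
      *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
      *	if (err == 0 && buf != NULL) {
      *		... use buf->b_data, then drop the reference we hold ...
      *		(void) arc_buf_remove_ref(buf, &buf);
      *	}
      *
      * With ARC_WAIT the call returns only once the data is available (or
      * the I/O has failed); on a cache hit the done callback is invoked
      * with a NULL zio, as described above.
      */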
2948 int
2949 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2950     void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2951     const zbookmark_phys_t *zb)
2952 {
2953         arc_buf_hdr_t *hdr = NULL;
2954         arc_buf_t *buf = NULL;
2955         kmutex_t *hash_lock = NULL;
2956         zio_t *rzio;
2957         uint64_t guid = spa_load_guid(spa);
2958 
2959         ASSERT(!BP_IS_EMBEDDED(bp) ||
2960             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
2961 
2962 top:
2963         if (!BP_IS_EMBEDDED(bp)) {
2964                 /*
2965                  * Embedded BP's have no DVA and require no I/O to "read";
2966                  * they are never hashed, so an anonymous arc buf backs them.
2967                  */
2968                 hdr = buf_hash_find(guid, bp, &hash_lock);
2969         }
2970 
2971         if (hdr != NULL && hdr->b_datacnt > 0) {
2972 
2973                 *arc_flags |= ARC_CACHED;
2974 
2975                 if (HDR_IO_IN_PROGRESS(hdr)) {
2976 
2977                         if (*arc_flags & ARC_WAIT) {
2978                                 cv_wait(&hdr->b_cv, hash_lock);
2979                                 mutex_exit(hash_lock);
2980                                 goto top;
2981                         }
2982                         ASSERT(*arc_flags & ARC_NOWAIT);
2983 
2984                         if (done) {
2985                                 arc_callback_t  *acb = NULL;
2986 
2987                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2988                                     KM_SLEEP);
2989                                 acb->acb_done = done;
2990                                 acb->acb_private = private;
2991                                 if (pio != NULL)
2992                                         acb->acb_zio_dummy = zio_null(pio,
2993                                             spa, NULL, NULL, NULL, zio_flags);
2994 
2995                                 ASSERT(acb->acb_done != NULL);
2996                                 acb->acb_next = hdr->b_acb;
2997                                 hdr->b_acb = acb;
2998                                 add_reference(hdr, hash_lock, private);
2999                                 mutex_exit(hash_lock);
3000                                 return (0);
3001                         }
3002                         mutex_exit(hash_lock);
3003                         return (0);
3004                 }
3005 
3006                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3007 
3008                 if (done) {
3009                         add_reference(hdr, hash_lock, private);
3010                         /*
3011                          * If this block is already in use, create a new
3012                          * copy of the data so that we will be guaranteed
3013                          * that arc_release() will always succeed.
3014                          */
3015                         buf = hdr->b_buf;
3016                         ASSERT(buf);
3017                         ASSERT(buf->b_data);
3018                         if (HDR_BUF_AVAILABLE(hdr)) {
3019                                 ASSERT(buf->b_efunc == NULL);
3020                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3021                         } else {
3022                                 buf = arc_buf_clone(buf);
3023                         }
3024 
3025                 } else if (*arc_flags & ARC_PREFETCH &&
3026                     refcount_count(&hdr->b_refcnt) == 0) {
3027                         hdr->b_flags |= ARC_PREFETCH;
3028                 }
3029                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3030                 arc_access(hdr, hash_lock);
3031                 if (*arc_flags & ARC_L2CACHE)
3032                         hdr->b_flags |= ARC_L2CACHE;
3033                 if (*arc_flags & ARC_L2COMPRESS)
3034                         hdr->b_flags |= ARC_L2COMPRESS;
3035                 mutex_exit(hash_lock);
3036                 ARCSTAT_BUMP(arcstat_hits);
3037                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3038                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3039                     data, metadata, hits);
3040 
3041                 if (done)
3042                         done(NULL, buf, private);
3043         } else {
3044                 uint64_t size = BP_GET_LSIZE(bp);
3045                 arc_callback_t *acb;
3046                 vdev_t *vd = NULL;
3047                 uint64_t addr = 0;
3048                 boolean_t devw = B_FALSE;
3049                 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3050                 uint64_t b_asize = 0;
3051 
3052                 if (hdr == NULL) {
3053                         /* this block is not in the cache */
3054                         arc_buf_hdr_t *exists = NULL;
3055                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3056                         buf = arc_buf_alloc(spa, size, private, type);
3057                         hdr = buf->b_hdr;
3058                         if (!BP_IS_EMBEDDED(bp)) {
3059                                 hdr->b_dva = *BP_IDENTITY(bp);
3060                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3061                                 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3062                                 exists = buf_hash_insert(hdr, &hash_lock);
3063                         }
3064                         if (exists != NULL) {
3065                                 /* somebody beat us to the hash insert */
3066                                 mutex_exit(hash_lock);
3067                                 buf_discard_identity(hdr);
3068                                 (void) arc_buf_remove_ref(buf, private);
3069                                 goto top; /* restart the IO request */
3070                         }
3071                         /* if this is a prefetch, we don't have a reference */
3072                         if (*arc_flags & ARC_PREFETCH) {
3073                                 (void) remove_reference(hdr, hash_lock,
3074                                     private);
3075                                 hdr->b_flags |= ARC_PREFETCH;
3076                         }
3077                         if (*arc_flags & ARC_L2CACHE)
3078                                 hdr->b_flags |= ARC_L2CACHE;
3079                         if (*arc_flags & ARC_L2COMPRESS)
3080                                 hdr->b_flags |= ARC_L2COMPRESS;
3081                         if (BP_GET_LEVEL(bp) > 0)
3082                                 hdr->b_flags |= ARC_INDIRECT;
3083                 } else {
3084                         /* this block is in the ghost cache */
3085                         ASSERT(GHOST_STATE(hdr->b_state));
3086                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3087                         ASSERT0(refcount_count(&hdr->b_refcnt));
3088                         ASSERT(hdr->b_buf == NULL);
3089 
3090                         /* if this is a prefetch, we don't have a reference */
3091                         if (*arc_flags & ARC_PREFETCH)
3092                                 hdr->b_flags |= ARC_PREFETCH;
3093                         else
3094                                 add_reference(hdr, hash_lock, private);
3095                         if (*arc_flags & ARC_L2CACHE)
3096                                 hdr->b_flags |= ARC_L2CACHE;
3097                         if (*arc_flags & ARC_L2COMPRESS)
3098                                 hdr->b_flags |= ARC_L2COMPRESS;
3099                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3100                         buf->b_hdr = hdr;
3101                         buf->b_data = NULL;
3102                         buf->b_efunc = NULL;
3103                         buf->b_private = NULL;
3104                         buf->b_next = NULL;
3105                         hdr->b_buf = buf;
3106                         ASSERT(hdr->b_datacnt == 0);
3107                         hdr->b_datacnt = 1;
3108                         arc_get_data_buf(buf);
3109                         arc_access(hdr, hash_lock);
3110                 }
3111 
3112                 ASSERT(!GHOST_STATE(hdr->b_state));
3113 
3114                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3115                 acb->acb_done = done;
3116                 acb->acb_private = private;
3117 
3118                 ASSERT(hdr->b_acb == NULL);
3119                 hdr->b_acb = acb;
3120                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3121 
3122                 if (hdr->b_l2hdr != NULL &&
3123                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3124                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3125                         addr = hdr->b_l2hdr->b_daddr;
3126                         b_compress = hdr->b_l2hdr->b_compress;
3127                         b_asize = hdr->b_l2hdr->b_asize;
3128                         /*
3129                          * Lock out device removal.
3130                          */
3131                         if (vdev_is_dead(vd) ||
3132                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3133                                 vd = NULL;
3134                 }
3135 
3136                 if (hash_lock != NULL)
3137                         mutex_exit(hash_lock);
3138 
3139                 /*
3140                  * At this point, we have a level 1 cache miss.  Try again in
3141                  * L2ARC if possible.
3142                  */
3143                 ASSERT3U(hdr->b_size, ==, size);
3144                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3145                     uint64_t, size, zbookmark_phys_t *, zb);
3146                 ARCSTAT_BUMP(arcstat_misses);
3147                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3148                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3149                     data, metadata, misses);
3150 
3151                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3152                         /*
3153                          * Read from the L2ARC if the following are true:
3154                          * 1. The L2ARC vdev was previously cached.
3155                          * 2. This buffer still has L2ARC metadata.
3156                          * 3. This buffer isn't currently writing to the L2ARC.
3157                          * 4. The L2ARC entry wasn't evicted, which may
3158                          *    also have invalidated the vdev.
3159                          * 5. This isn't a prefetch while l2arc_noprefetch is set.
3160                          */
3161                         if (hdr->b_l2hdr != NULL &&
3162                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3163                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3164                                 l2arc_read_callback_t *cb;
3165 
3166                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3167                                 ARCSTAT_BUMP(arcstat_l2_hits);
3168 
3169                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3170                                     KM_SLEEP);
3171                                 cb->l2rcb_buf = buf;
3172                                 cb->l2rcb_spa = spa;
3173                                 cb->l2rcb_bp = *bp;
3174                                 cb->l2rcb_zb = *zb;
3175                                 cb->l2rcb_flags = zio_flags;
3176                                 cb->l2rcb_compress = b_compress;
3177 
3178                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3179                                     addr + size < vd->vdev_psize -
3180                                     VDEV_LABEL_END_SIZE);
3181 
3182                                 /*
3183                                  * l2arc read.  The SCL_L2ARC lock will be
3184                                  * released by l2arc_read_done().
3185                                  * Issue a null zio if the underlying buffer
3186                                  * was squashed to zero size by compression.
3187                                  */
3188                                 if (b_compress == ZIO_COMPRESS_EMPTY) {
3189                                         rzio = zio_null(pio, spa, vd,
3190                                             l2arc_read_done, cb,
3191                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3192                                             ZIO_FLAG_CANFAIL |
3193                                             ZIO_FLAG_DONT_PROPAGATE |
3194                                             ZIO_FLAG_DONT_RETRY);
3195                                 } else {
3196                                         rzio = zio_read_phys(pio, vd, addr,
3197                                             b_asize, buf->b_data,
3198                                             ZIO_CHECKSUM_OFF,
3199                                             l2arc_read_done, cb, priority,
3200                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3201                                             ZIO_FLAG_CANFAIL |
3202                                             ZIO_FLAG_DONT_PROPAGATE |
3203                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
3204                                 }
3205                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3206                                     zio_t *, rzio);
3207                                 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3208 
3209                                 if (*arc_flags & ARC_NOWAIT) {
3210                                         zio_nowait(rzio);
3211                                         return (0);
3212                                 }
3213 
3214                                 ASSERT(*arc_flags & ARC_WAIT);
3215                                 if (zio_wait(rzio) == 0)
3216                                         return (0);
3217 
3218                                 /* l2arc read error; goto zio_read() */
3219                         } else {
3220                                 DTRACE_PROBE1(l2arc__miss,
3221                                     arc_buf_hdr_t *, hdr);
3222                                 ARCSTAT_BUMP(arcstat_l2_misses);
3223                                 if (HDR_L2_WRITING(hdr))
3224                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3225                                 spa_config_exit(spa, SCL_L2ARC, vd);
3226                         }
3227                 } else {
3228                         if (vd != NULL)
3229                                 spa_config_exit(spa, SCL_L2ARC, vd);
3230                         if (l2arc_ndev != 0) {
3231                                 DTRACE_PROBE1(l2arc__miss,
3232                                     arc_buf_hdr_t *, hdr);
3233                                 ARCSTAT_BUMP(arcstat_l2_misses);
3234                         }
3235                 }
3236 
3237                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3238                     arc_read_done, buf, priority, zio_flags, zb);
3239 
3240                 if (*arc_flags & ARC_WAIT)
3241                         return (zio_wait(rzio));
3242 
3243                 ASSERT(*arc_flags & ARC_NOWAIT);
3244                 zio_nowait(rzio);
3245         }
3246         return (0);
3247 }
3248 
3249 void
3250 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3251 {
3252         ASSERT(buf->b_hdr != NULL);
3253         ASSERT(buf->b_hdr->b_state != arc_anon);
3254         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3255         ASSERT(buf->b_efunc == NULL);
3256         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3257 
3258         buf->b_efunc = func;
3259         buf->b_private = private;
3260 }
3261 
3262 /*
3263  * Notify the arc that a block was freed, and thus will never be used again.
3264  */
3265 void
3266 arc_freed(spa_t *spa, const blkptr_t *bp)
3267 {
3268         arc_buf_hdr_t *hdr;
3269         kmutex_t *hash_lock;
3270         uint64_t guid = spa_load_guid(spa);
3271 
3272         ASSERT(!BP_IS_EMBEDDED(bp));
3273 
3274         hdr = buf_hash_find(guid, bp, &hash_lock);
3275         if (hdr == NULL)
3276                 return;
3277         if (HDR_BUF_AVAILABLE(hdr)) {
3278                 arc_buf_t *buf = hdr->b_buf;
3279                 add_reference(hdr, hash_lock, FTAG);
3280                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3281                 mutex_exit(hash_lock);
3282 
3283                 arc_release(buf, FTAG);
3284                 (void) arc_buf_remove_ref(buf, FTAG);
3285         } else {
3286                 mutex_exit(hash_lock);
3287         }
3288 
3289 }
3290 
3291 /*
3292  * Clear the user eviction callback set by arc_set_callback(), first calling
3293  * it if it exists.  Because the presence of a callback keeps an arc_buf
3294  * cached, clearing the callback may result in the arc_buf being destroyed.
3295  * However, it will not result in the *last* arc_buf being destroyed, so the
3296  * data will remain cached in the ARC.  We save local copies of the callback
3297  * and its argument so we can invoke the callback without holding any locks.
3298  *
3299  * It's possible that the callback is already in the process of being cleared
3300  * by another thread.  In this case we cannot clear the callback.
3301  *
3302  * Returns B_TRUE if the callback was successfully called and cleared.
3303  */
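     /*
      * Illustrative sketch of how this pairs with arc_set_callback(); the
      * names "my_evict_cb" and "my_state_t" are hypothetical, not part of
      * this file, and the real consumers of this interface live elsewhere
      * in ZFS:
      *
      *	static int
      *	my_evict_cb(void *arg)
      *	{
      *		my_state_t *ms = arg;
      *
      *		kmem_free(ms, sizeof (my_state_t));
      *		return (0);
      *	}
      *
      *	arc_set_callback(buf, my_evict_cb, ms);
      *	...
      *	if (arc_clear_callback(buf))
      *		ms = NULL;	(the callback has already run)
      */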
3304 boolean_t
3305 arc_clear_callback(arc_buf_t *buf)
3306 {
3307         arc_buf_hdr_t *hdr;
3308         kmutex_t *hash_lock;
3309         arc_evict_func_t *efunc = buf->b_efunc;
3310         void *private = buf->b_private;
3311 
3312         mutex_enter(&buf->b_evict_lock);
3313         hdr = buf->b_hdr;
3314         if (hdr == NULL) {
3315                 /*
3316                  * We are in arc_do_user_evicts().
3317                  */
3318                 ASSERT(buf->b_data == NULL);
3319                 mutex_exit(&buf->b_evict_lock);
3320                 return (B_FALSE);
3321         } else if (buf->b_data == NULL) {
3322                 /*
3323                  * We are on the eviction list; process this buffer now
3324                  * but let arc_do_user_evicts() do the reaping.
3325                  */
3326                 buf->b_efunc = NULL;
3327                 mutex_exit(&buf->b_evict_lock);
3328                 VERIFY0(efunc(private));
3329                 return (B_TRUE);
3330         }
3331         hash_lock = HDR_LOCK(hdr);
3332         mutex_enter(hash_lock);
3333         hdr = buf->b_hdr;
3334         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3335 
3336         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3337         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3338 
3339         buf->b_efunc = NULL;
3340         buf->b_private = NULL;
3341 
3342         if (hdr->b_datacnt > 1) {
3343                 mutex_exit(&buf->b_evict_lock);
3344                 arc_buf_destroy(buf, FALSE, TRUE);
3345         } else {
3346                 ASSERT(buf == hdr->b_buf);
3347                 hdr->b_flags |= ARC_BUF_AVAILABLE;
3348                 mutex_exit(&buf->b_evict_lock);
3349         }
3350 
3351         mutex_exit(hash_lock);
3352         VERIFY0(efunc(private));
3353         return (B_TRUE);
3354 }
3355 
3356 /*
3357  * Release this buffer from the cache, making it an anonymous buffer.  This
3358  * must be done after a read and prior to modifying the buffer contents.
3359  * If the buffer has more than one reference, we must make
3360  * a new hdr for the buffer.
3361  */
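     /*
      * A minimal sketch of the intended usage (illustrative only; "tag" is
      * whatever reference holder the caller already passed to arc_read()):
      *
      *	arc_release(buf, tag);
      *	ASSERT(arc_released(buf));
      *	... buf->b_data is now anonymous and may be modified in place,
      *	    typically before handing the buffer to arc_write() ...
      */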
3362 void
3363 arc_release(arc_buf_t *buf, void *tag)
3364 {
3365         arc_buf_hdr_t *hdr;
3366         kmutex_t *hash_lock = NULL;
3367         l2arc_buf_hdr_t *l2hdr;
3368         uint64_t buf_size;
3369 
3370         /*
3371          * It would be nice to assert that if it's DMU metadata (level >
3372          * 0 || it's the dnode file), then it must be syncing context.
3373          * But we don't know that information at this level.
3374          */
3375 
3376         mutex_enter(&buf->b_evict_lock);
3377         hdr = buf->b_hdr;
3378 
3379         /* this buffer is not on any list */
3380         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3381 
3382         if (hdr->b_state == arc_anon) {
3383                 /* this buffer is already released */
3384                 ASSERT(buf->b_efunc == NULL);
3385         } else {
3386                 hash_lock = HDR_LOCK(hdr);
3387                 mutex_enter(hash_lock);
3388                 hdr = buf->b_hdr;
3389                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3390         }
3391 
3392         l2hdr = hdr->b_l2hdr;
3393         if (l2hdr) {
3394                 mutex_enter(&l2arc_buflist_mtx);
3395                 hdr->b_l2hdr = NULL;
3396                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3397         }
3398         buf_size = hdr->b_size;
3399 
3400         /*
3401          * Do we have more than one buf?
3402          */
3403         if (hdr->b_datacnt > 1) {
3404                 arc_buf_hdr_t *nhdr;
3405                 arc_buf_t **bufp;
3406                 uint64_t blksz = hdr->b_size;
3407                 uint64_t spa = hdr->b_spa;
3408                 arc_buf_contents_t type = hdr->b_type;
3409                 uint32_t flags = hdr->b_flags;
3410 
3411                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3412                 /*
3413                  * Pull the data off of this hdr and attach it to
3414                  * a new anonymous hdr.
3415                  */
3416                 (void) remove_reference(hdr, hash_lock, tag);
3417                 bufp = &hdr->b_buf;
3418                 while (*bufp != buf)
3419                         bufp = &(*bufp)->b_next;
3420                 *bufp = buf->b_next;
3421                 buf->b_next = NULL;
3422 
3423                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3424                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3425                 if (refcount_is_zero(&hdr->b_refcnt)) {
3426                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3427                         ASSERT3U(*size, >=, hdr->b_size);
3428                         atomic_add_64(size, -hdr->b_size);
3429                 }
3430 
3431                 /*
3432                  * We're releasing a duplicate user data buffer, update
3433                  * our statistics accordingly.
3434                  */
3435                 if (hdr->b_type == ARC_BUFC_DATA) {
3436                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3437                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3438                             -hdr->b_size);
3439                 }
3440                 hdr->b_datacnt -= 1;
3441                 arc_cksum_verify(buf);
3442                 arc_buf_unwatch(buf);
3443 
3444                 mutex_exit(hash_lock);
3445 
3446                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3447                 nhdr->b_size = blksz;
3448                 nhdr->b_spa = spa;
3449                 nhdr->b_type = type;
3450                 nhdr->b_buf = buf;
3451                 nhdr->b_state = arc_anon;
3452                 nhdr->b_arc_access = 0;
3453                 nhdr->b_flags = flags & ARC_L2_WRITING;
3454                 nhdr->b_l2hdr = NULL;
3455                 nhdr->b_datacnt = 1;
3456                 nhdr->b_freeze_cksum = NULL;
3457                 (void) refcount_add(&nhdr->b_refcnt, tag);
3458                 buf->b_hdr = nhdr;
3459                 mutex_exit(&buf->b_evict_lock);
3460                 atomic_add_64(&arc_anon->arcs_size, blksz);
3461         } else {
3462                 mutex_exit(&buf->b_evict_lock);
3463                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3464                 ASSERT(!list_link_active(&hdr->b_arc_node));
3465                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3466                 if (hdr->b_state != arc_anon)
3467                         arc_change_state(arc_anon, hdr, hash_lock);
3468                 hdr->b_arc_access = 0;
3469                 if (hash_lock)
3470                         mutex_exit(hash_lock);
3471 
3472                 buf_discard_identity(hdr);
3473                 arc_buf_thaw(buf);
3474         }
3475         buf->b_efunc = NULL;
3476         buf->b_private = NULL;
3477 
3478         if (l2hdr) {
3479                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3480                 if (l2hdr->b_dev->l2ad_vdev)
3481                         vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3482                             -l2hdr->b_asize, 0, 0);
3483                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3484                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3485                 mutex_exit(&l2arc_buflist_mtx);
3486         }
3487 }
3488 
3489 int
3490 arc_released(arc_buf_t *buf)
3491 {
3492         int released;
3493 
3494         mutex_enter(&buf->b_evict_lock);
3495         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3496         mutex_exit(&buf->b_evict_lock);
3497         return (released);
3498 }
3499 
3500 #ifdef ZFS_DEBUG
3501 int
3502 arc_referenced(arc_buf_t *buf)
3503 {
3504         int referenced;
3505 
3506         mutex_enter(&buf->b_evict_lock);
3507         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3508         mutex_exit(&buf->b_evict_lock);
3509         return (referenced);
3510 }
3511 #endif
3512 
3513 static void
3514 arc_write_ready(zio_t *zio)
3515 {
3516         arc_write_callback_t *callback = zio->io_private;
3517         arc_buf_t *buf = callback->awcb_buf;
3518         arc_buf_hdr_t *hdr = buf->b_hdr;
3519 
3520         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3521         callback->awcb_ready(zio, buf, callback->awcb_private);
3522 
3523         /*
3524          * If the IO is already in progress, then this is a re-write
3525          * attempt, so we need to thaw and re-compute the cksum.
3526          * It is the responsibility of the callback to handle the
3527          * accounting for any re-write attempt.
3528          */
3529         if (HDR_IO_IN_PROGRESS(hdr)) {
3530                 mutex_enter(&hdr->b_freeze_lock);
3531                 if (hdr->b_freeze_cksum != NULL) {
3532                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3533                         hdr->b_freeze_cksum = NULL;
3534                 }
3535                 mutex_exit(&hdr->b_freeze_lock);
3536         }
3537         arc_cksum_compute(buf, B_FALSE);
3538         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3539 }
3540 
3541 /*
3542  * The SPA calls this callback for each physical write that happens on behalf
3543  * of a logical write.  See the comment in dbuf_write_physdone() for details.
3544  */
3545 static void
3546 arc_write_physdone(zio_t *zio)
3547 {
3548         arc_write_callback_t *cb = zio->io_private;
3549         if (cb->awcb_physdone != NULL)
3550                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3551 }
3552 
3553 static void
3554 arc_write_done(zio_t *zio)
3555 {
3556         arc_write_callback_t *callback = zio->io_private;
3557         arc_buf_t *buf = callback->awcb_buf;
3558         arc_buf_hdr_t *hdr = buf->b_hdr;
3559 
3560         ASSERT(hdr->b_acb == NULL);
3561 
3562         if (zio->io_error == 0) {
3563                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3564                         buf_discard_identity(hdr);
3565                 } else {
3566                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3567                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3568                         hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3569                 }
3570         } else {
3571                 ASSERT(BUF_EMPTY(hdr));
3572         }
3573 
3574         /*
3575          * If the block to be written was all-zero or compressed enough to be
3576          * embedded in the BP, no write was performed so there will be no
3577          * dva/birth/checksum.  The buffer must therefore remain anonymous
3578          * (and uncached).
3579          */
3580         if (!BUF_EMPTY(hdr)) {
3581                 arc_buf_hdr_t *exists;
3582                 kmutex_t *hash_lock;
3583 
3584                 ASSERT(zio->io_error == 0);
3585 
3586                 arc_cksum_verify(buf);
3587 
3588                 exists = buf_hash_insert(hdr, &hash_lock);
3589                 if (exists) {
3590                         /*
3591                          * This can only happen if we overwrite for
3592                          * sync-to-convergence, because we remove
3593                          * buffers from the hash table when we arc_free().
3594                          */
3595                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3596                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3597                                         panic("bad overwrite, hdr=%p exists=%p",
3598                                             (void *)hdr, (void *)exists);
3599                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3600                                 arc_change_state(arc_anon, exists, hash_lock);
3601                                 mutex_exit(hash_lock);
3602                                 arc_hdr_destroy(exists);
3603                                 exists = buf_hash_insert(hdr, &hash_lock);
3604                                 ASSERT3P(exists, ==, NULL);
3605                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3606                                 /* nopwrite */
3607                                 ASSERT(zio->io_prop.zp_nopwrite);
3608                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3609                                         panic("bad nopwrite, hdr=%p exists=%p",
3610                                             (void *)hdr, (void *)exists);
3611                         } else {
3612                                 /* Dedup */
3613                                 ASSERT(hdr->b_datacnt == 1);
3614                                 ASSERT(hdr->b_state == arc_anon);
3615                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3616                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3617                         }
3618                 }
3619                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3620                 /* if it's not anon, we are doing a scrub */
3621                 if (!exists && hdr->b_state == arc_anon)
3622                         arc_access(hdr, hash_lock);
3623                 mutex_exit(hash_lock);
3624         } else {
3625                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3626         }
3627 
3628         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3629         callback->awcb_done(zio, buf, callback->awcb_private);
3630 
3631         kmem_free(callback, sizeof (arc_write_callback_t));
3632 }
3633 
3634 zio_t *
3635 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3636     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3637     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3638     arc_done_func_t *done, void *private, zio_priority_t priority,
3639     int zio_flags, const zbookmark_phys_t *zb)
3640 {
3641         arc_buf_hdr_t *hdr = buf->b_hdr;
3642         arc_write_callback_t *callback;
3643         zio_t *zio;
3644 
3645         ASSERT(ready != NULL);
3646         ASSERT(done != NULL);
3647         ASSERT(!HDR_IO_ERROR(hdr));
3648         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3649         ASSERT(hdr->b_acb == NULL);
3650         if (l2arc)
3651                 hdr->b_flags |= ARC_L2CACHE;
3652         if (l2arc_compress)
3653                 hdr->b_flags |= ARC_L2COMPRESS;
3654         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3655         callback->awcb_ready = ready;
3656         callback->awcb_physdone = physdone;
3657         callback->awcb_done = done;
3658         callback->awcb_private = private;
3659         callback->awcb_buf = buf;
3660 
3661         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3662             arc_write_ready, arc_write_physdone, arc_write_done, callback,
3663             priority, zio_flags, zb);
3664 
3665         return (zio);
3666 }
3667 
3668 static int
3669 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3670 {
3671 #ifdef _KERNEL
3672         uint64_t available_memory = ptob(freemem);
3673         static uint64_t page_load = 0;
3674         static uint64_t last_txg = 0;
3675 
3676 #if defined(__i386)
3677         available_memory =
3678             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3679 #endif
3680 
3681         if (freemem > physmem * arc_lotsfree_percent / 100)
3682                 return (0);
3683 
3684         if (txg > last_txg) {
3685                 last_txg = txg;
3686                 page_load = 0;
3687         }
3688         /*
3689          * If we are in pageout, we know that memory is already tight
3690          * and the ARC is already evicting, so we just want to let
3691          * page writes continue to occur as quickly as possible.
3692          */
3693         if (curproc == proc_pageout) {
3694                 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3695                         return (SET_ERROR(ERESTART));
3696                 /* Note: reserve is inflated, so we deflate */
3697                 page_load += reserve / 8;
3698                 return (0);
3699         } else if (page_load > 0 && arc_reclaim_needed()) {
3700                 /* memory is low, delay before restarting */
3701                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3702                 return (SET_ERROR(EAGAIN));
3703         }
3704         page_load = 0;
3705 #endif
3706         return (0);
3707 }
3708 
3709 void
3710 arc_tempreserve_clear(uint64_t reserve)
3711 {
3712         atomic_add_64(&arc_tempreserve, -reserve);
3713         ASSERT((int64_t)arc_tempreserve >= 0);
3714 }
3715 
3716 int
3717 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3718 {
3719         int error;
3720         uint64_t anon_size;
3721 
3722         if (reserve > arc_c/4 && !arc_no_grow)
3723                 arc_c = MIN(arc_c_max, reserve * 4);
3724         if (reserve > arc_c)
3725                 return (SET_ERROR(ENOMEM));
3726 
3727         /*
3728          * Don't count loaned bufs as in flight dirty data to prevent long
3729          * network delays from blocking transactions that are ready to be
3730          * assigned to a txg.
3731          */
3732         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3733 
3734         /*
3735          * Writes will, almost always, require additional memory allocations
3736          * in order to compress/encrypt/etc the data.  We therefore need to
3737          * make sure that there is sufficient available memory for this.
3738          */
3739         error = arc_memory_throttle(reserve, txg);
3740         if (error != 0)
3741                 return (error);
3742 
3743         /*
3744          * Throttle writes when the amount of dirty data in the cache
3745          * gets too large.  We try to keep the cache less than half full
3746          * of dirty blocks so that our sync times don't grow too large.
3747          * Note: if two requests come in concurrently, we might let them
3748          * both succeed, when one of them should fail.  Not a huge deal.
3749          */
3750 
3751         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3752             anon_size > arc_c / 4) {
3753                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3754                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3755                     arc_tempreserve>>10,
3756                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3757                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3758                     reserve>>10, arc_c>>10);
3759                 return (SET_ERROR(ERESTART));
3760         }
3761         atomic_add_64(&arc_tempreserve, reserve);
3762         return (0);
3763 }
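     /*
      * Illustrative only: the expected pairing of the two functions above,
      * as a caller (in practice reached via the DMU transaction-assign
      * path) might use them; "dirty_bytes" is a hypothetical name.
      *
      *	error = arc_tempreserve_space(dirty_bytes, txg);
      *	if (error != 0)
      *		return (error);	(e.g. ERESTART: back off and retry)
      *	... stage the dirty data ...
      *	arc_tempreserve_clear(dirty_bytes);
      */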
3764 
3765 /* Tuneable, default is 64, which is essentially arbitrary */
3766 int zfs_flush_ntasks = 64;
3767 
3768 void
3769 arc_init(void)
3770 {
3771         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3772         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3773 
3774         /* Convert seconds to clock ticks */
3775         arc_min_prefetch_lifespan = 1 * hz;
3776 
3777         /* Start out with 1/8 of all memory */
3778         arc_c = physmem * PAGESIZE / 8;
3779 
3780 #ifdef _KERNEL
3781         /*
3782          * On architectures where the physical memory can be larger
3783          * than the addressable space (intel in 32-bit mode), we may
3784          * need to limit the cache to 1/8 of VM size.
3785          */
3786         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3787 #endif
3788 
3789         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3790         arc_c_min = MAX(arc_c / 4, 64<<20);
3791         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3792         if (arc_c * 8 >= 1<<30)
3793                 arc_c_max = (arc_c * 8) - (1<<30);
3794         else
3795                 arc_c_max = arc_c_min;
3796         arc_c_max = MAX(arc_c * 6, arc_c_max);
3797 
3798         /*
3799          * Allow the tunables to override our calculations if they are
3800          * reasonable (i.e., over 64MB).
3801          */
3802         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3803                 arc_c_max = zfs_arc_max;
3804         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3805                 arc_c_min = zfs_arc_min;
3806 
3807         arc_c = arc_c_max;
3808         arc_p = (arc_c >> 1);
3809 
3810         /* limit meta-data to 1/4 of the arc capacity */
3811         arc_meta_limit = arc_c_max / 4;
3812 
3813         /* Allow the tunable to override if it is reasonable */
3814         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3815                 arc_meta_limit = zfs_arc_meta_limit;
3816 
3817         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3818                 arc_c_min = arc_meta_limit / 2;
3819 
3820         if (zfs_arc_grow_retry > 0)
3821                 arc_grow_retry = zfs_arc_grow_retry;
3822 
3823         if (zfs_arc_shrink_shift > 0)
3824                 arc_shrink_shift = zfs_arc_shrink_shift;
3825 
3826         if (zfs_arc_p_min_shift > 0)
3827                 arc_p_min_shift = zfs_arc_p_min_shift;
3828 
3829         /* if kmem_flags are set, lets try to use less memory */
3830         if (kmem_debugging())
3831                 arc_c = arc_c / 2;
3832         if (arc_c < arc_c_min)
3833                 arc_c = arc_c_min;
3834 
3835         arc_anon = &ARC_anon;
3836         arc_mru = &ARC_mru;
3837         arc_mru_ghost = &ARC_mru_ghost;
3838         arc_mfu = &ARC_mfu;
3839         arc_mfu_ghost = &ARC_mfu_ghost;
3840         arc_l2c_only = &ARC_l2c_only;
3841         arc_size = 0;
3842 
3843         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3844         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3845         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3846         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3847         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3848         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3849 
3850         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3851             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3852         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3853             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3854         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3855             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3856         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3857             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3858         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3859             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3860         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3861             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3862         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3863             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3864         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3865             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3866         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3867             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3868         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3869             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3870 
3871         arc_flush_taskq = taskq_create("arc_flush_tq",
3872             max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
3873         buf_init();
3874 
3875         arc_thread_exit = 0;
3876         arc_eviction_list = NULL;
3877         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3878         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3879 
3880         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3881             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3882 
3883         if (arc_ksp != NULL) {
3884                 arc_ksp->ks_data = &arc_stats;
3885                 kstat_install(arc_ksp);
3886         }
3887 
3888         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3889             TS_RUN, minclsyspri);
3890 
3891         arc_dead = FALSE;
3892         arc_warm = B_FALSE;
3893 
3894         /*
3895          * Calculate maximum amount of dirty data per pool.
3896          *
3897          * If it has been set by /etc/system, take that.
3898          * Otherwise, use a percentage of physical memory defined by
3899          * zfs_dirty_data_max_percent (default 10%) with a cap at
3900          * zfs_dirty_data_max_max (default 4GB).
3901          */
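             /*
              * For example (illustrative numbers only): with 16 GB of
              * physical memory and the default zfs_dirty_data_max_percent
              * of 10, this computes roughly 1.6 GB, which is under the
              * 4 GB zfs_dirty_data_max_max cap and is therefore used as-is.
              */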
3902         if (zfs_dirty_data_max == 0) {
3903                 zfs_dirty_data_max = physmem * PAGESIZE *
3904                     zfs_dirty_data_max_percent / 100;
3905                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3906                     zfs_dirty_data_max_max);
3907         }
3908 }
3909 
3910 void
3911 arc_fini(void)
3912 {
3913         mutex_enter(&arc_reclaim_thr_lock);
3914         arc_thread_exit = 1;
3915         while (arc_thread_exit != 0)
3916                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3917         mutex_exit(&arc_reclaim_thr_lock);
3918 
3919         arc_flush(NULL);
3920 
3921         arc_dead = TRUE;
3922 
3923         if (arc_ksp != NULL) {
3924                 kstat_delete(arc_ksp);
3925                 arc_ksp = NULL;
3926         }
3927 
3928         mutex_destroy(&arc_eviction_mtx);
3929         mutex_destroy(&arc_reclaim_thr_lock);
3930         cv_destroy(&arc_reclaim_thr_cv);
3931 
3932         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3933         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3934         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3935         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3936         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3937         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3938         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3939         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3940 
3941         mutex_destroy(&arc_anon->arcs_mtx);
3942         mutex_destroy(&arc_mru->arcs_mtx);
3943         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3944         mutex_destroy(&arc_mfu->arcs_mtx);
3945         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3946         mutex_destroy(&arc_l2c_only->arcs_mtx);
3947 
3948         taskq_destroy(arc_flush_taskq);
3949         buf_fini();
3950 
3951         ASSERT(arc_loaned_bytes == 0);
3952 }
3953 
3954 /*
3955  * Level 2 ARC
3956  *
3957  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3958  * It uses dedicated storage devices to hold cached data, which are populated
3959  * using large infrequent writes.  The main role of this cache is to boost
3960  * the performance of random read workloads.  The intended L2ARC devices
3961  * include short-stroked disks, solid state disks, and other media with
3962  * substantially faster read latency than disk.
3963  *
3964  *                 +-----------------------+
3965  *                 |         ARC           |
3966  *                 +-----------------------+
3967  *                    |         ^     ^
3968  *                    |         |     |
3969  *      l2arc_feed_thread()    arc_read()
3970  *                    |         |     |
3971  *                    |  l2arc read   |
3972  *                    V         |     |
3973  *               +---------------+    |
3974  *               |     L2ARC     |    |
3975  *               +---------------+    |
3976  *                   |    ^           |
3977  *          l2arc_write() |           |
3978  *                   |    |           |
3979  *                   V    |           |
3980  *                 +-------+      +-------+
3981  *                 | vdev  |      | vdev  |
3982  *                 | cache |      | cache |
3983  *                 +-------+      +-------+
3984  *                 +=========+     .-----.
3985  *                 :  L2ARC  :    |-_____-|
3986  *                 : devices :    | Disks |
3987  *                 +=========+    `-_____-'
3988  *
3989  * Read requests are satisfied from the following sources, in order:
3990  *
3991  *      1) ARC
3992  *      2) vdev cache of L2ARC devices
3993  *      3) L2ARC devices
3994  *      4) vdev cache of disks
3995  *      5) disks
3996  *
3997  * Some L2ARC device types exhibit extremely slow write performance.
3998  * To accommodate this, there are some significant differences between
3999  * the L2ARC and traditional cache design:
4000  *
4001  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4002  * the ARC behave as usual, freeing buffers and placing headers on ghost
4003  * lists.  The ARC does not send buffers to the L2ARC during eviction as
4004  * this would add inflated write latencies for all ARC memory pressure.
4005  *
4006  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4007  * It does this by periodically scanning buffers from the eviction-end of
4008  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4009  * not already there. It scans until a headroom of buffers is satisfied,
4010  * which itself is a buffer for ARC eviction. If a compressible buffer is
4011  * found during scanning and selected for writing to an L2ARC device, we
4012  * temporarily boost scanning headroom during the next scan cycle to make
4013  * sure we adapt to compression effects (which might significantly reduce
4014  * the data volume we write to L2ARC). The thread that does this is
4015  * l2arc_feed_thread(), illustrated below; example sizes are included to
4016  * provide a better sense of ratio than this diagram:
4017  *
4018  *             head -->                        tail
4019  *              +---------------------+----------+
4020  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4021  *              +---------------------+----------+   |   o L2ARC eligible
4022  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4023  *              +---------------------+----------+   |
4024  *                   15.9 Gbytes      ^ 32 Mbytes    |
4025  *                                 headroom          |
4026  *                                            l2arc_feed_thread()
4027  *                                                   |
4028  *                       l2arc write hand <--[oooo]--'
4029  *                               |           8 Mbyte
4030  *                               |          write max
4031  *                               V
4032  *                +==============================+
4033  *      L2ARC dev |####|#|###|###|    |####| ... |
4034  *                +==============================+
4035  *                           32 Gbytes
4036  *
4037  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4038  * evicted, then the L2ARC has cached a buffer much sooner than it probably
4039  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4040  * safe to say that this is an uncommon case, since buffers at the end of
4041  * the ARC lists have moved there due to inactivity.
4042  *
4043  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4044  * then the L2ARC simply misses copying some buffers.  This serves as a
4045  * pressure valve to prevent heavy read workloads from both stalling the ARC
4046  * with waits and clogging the L2ARC with writes.  This also helps prevent
4047  * the potential for the L2ARC to churn if it attempts to cache content too
4048  * quickly, such as during backups of the entire pool.
4049  *
4050  * 5. After system boot and before the ARC has filled main memory, there are
4051  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4052  * lists can remain mostly static.  Instead of searching from the tail of
4053  * these lists as pictured, l2arc_feed_thread() will search from the list
4054  * heads for eligible buffers, greatly increasing its chance of finding them.
4055  *
4056  * The L2ARC device write speed is also boosted during this time so that
4057  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4058  * there are no L2ARC reads, and no fear of degrading read performance
4059  * through increased writes.
4060  *
4061  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4062  * the vdev queue can aggregate them into larger and fewer writes.  Each
4063  * device is written to in a rotor fashion, sweeping writes through
4064  * available space then repeating.
4065  *
4066  * 7. The L2ARC does not store dirty content.  It never needs to flush
4067  * write buffers back to disk based storage.
4068  *
4069  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4070  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4071  *
4072  * The performance of the L2ARC can be tweaked by a number of tunables, which
4073  * may be necessary for different workloads:
4074  *
4075  *      l2arc_write_max         max write bytes per interval
4076  *      l2arc_write_boost       extra write bytes during device warmup
4077  *      l2arc_noprefetch        skip caching prefetched buffers
4078  *      l2arc_headroom          number of max device writes to precache
4079  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4080  *                              scanning, we multiply headroom by this
4081  *                              percentage factor for the next scan cycle,
4082  *                              since more compressed buffers are likely to
4083  *                              be present
4084  *      l2arc_feed_secs         seconds between L2ARC writing
4085  *
4086  * Tunables may be removed or added as future performance improvements are
4087  * integrated, and also may become zpool properties.
4088  *
4089  * There are three key functions that control how the L2ARC warms up:
4090  *
4091  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4092  *      l2arc_write_size()      calculate how much to write
4093  *      l2arc_write_interval()  calculate sleep delay between writes
4094  *
4095  * These three functions determine what to write, how much, and how quickly
4096  * to send writes.
4097  */
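
/*
 * As a sketch of how these tunables might be adjusted (values illustrative
 * only; the "set zfs:<variable>" form is the usual /etc/system convention
 * for zfs module globals and is assumed here):
 *
 *	set zfs:l2arc_write_max = 0x4000000
 *	set zfs:l2arc_noprefetch = 0
 *	set zfs:l2arc_feed_secs = 2
 *
 * which would raise the per-interval write size to 64 MB, allow prefetched
 * buffers to be cached, and slow feeding to once every two seconds.
 * Per-dataset eligibility (see l2arc_write_eligible() below) is controlled
 * separately with the secondarycache dataset property, e.g.
 * "zfs set secondarycache=metadata <dataset>".
 */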
4098 
4099 static boolean_t
4100 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4101 {
4102         /*
4103          * A buffer is *not* eligible for the L2ARC if it:
4104          * 1. belongs to a different spa.
4105          * 2. is already cached on the L2ARC.
4106          * 3. has an I/O in progress (it may be an incomplete read).
4107          * 4. is flagged not eligible (zfs property).
4108          */
4109         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4110             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4111                 return (B_FALSE);
4112 
4113         return (B_TRUE);
4114 }
4115 
4116 static uint64_t
4117 l2arc_write_size(void)
4118 {
4119         uint64_t size;
4120 
4121         /*
4122          * Make sure our globals have meaningful values in case the user
4123          * altered them.
4124          */
4125         size = l2arc_write_max;
4126         if (size == 0) {
4127                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4128                     "be greater than zero, resetting it to the default (%d)",
4129                     L2ARC_WRITE_SIZE);
4130                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4131         }
4132 
4133         if (arc_warm == B_FALSE)
4134                 size += l2arc_write_boost;
4135 
4136         return (size);
4138 }
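
/*
 * For example, with l2arc_write_max at its 8 MB default (the "8 Mbyte
 * write max" in the diagram above) and assuming l2arc_write_boost carries
 * the same default, l2arc_write_size() returns 8 MB per interval once the
 * ARC is warm and 16 MB while arc_warm is still B_FALSE, which is what
 * lets the L2ARC devices fill faster right after boot.
 */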
4139 
4140 static clock_t
4141 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4142 {
4143         clock_t interval, next, now;
4144 
4145         /*
4146          * If the ARC lists are busy, increase our write rate; if the
4147          * lists are stale, idle back.  This is achieved by checking
4148          * how much we previously wrote - if it was more than half of
4149          * what we wanted, schedule the next write much sooner.
4150          */
4151         if (l2arc_feed_again && wrote > (wanted / 2))
4152                 interval = (hz * l2arc_feed_min_ms) / 1000;
4153         else
4154                 interval = hz * l2arc_feed_secs;
4155 
4156         now = ddi_get_lbolt();
4157         next = MAX(now, MIN(now + interval, began + interval));
4158 
4159         return (next);
4160 }
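
/*
 * A worked example of the interval math, assuming a 100 Hz clock (hz = 100)
 * and the typical defaults of l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200: when the previous pass wrote more than half of
 * what it wanted (and l2arc_feed_again is set), the next wakeup lands
 * 20 ticks (200 ms) after 'began'; otherwise it lands 100 ticks (1 s)
 * after 'began'.  The MAX(now, ...) clamp only matters when the write
 * itself took longer than the interval, in which case we simply wake up
 * again immediately rather than scheduling a wakeup in the past.
 */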
4161 
4162 static void
4163 l2arc_hdr_stat_add(void)
4164 {
4165         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4166         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4167 }
4168 
4169 static void
4170 l2arc_hdr_stat_remove(void)
4171 {
4172         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4173         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4174 }
4175 
4176 /*
4177  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4178  * If a device is returned, it is returned with the spa config lock held.
4179  */
4180 static l2arc_dev_t *
4181 l2arc_dev_get_next(void)
4182 {
4183         l2arc_dev_t *first, *next = NULL;
4184 
4185         /*
4186          * Lock out the removal of spas (spa_namespace_lock), then removal
4187          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4188          * both locks will be dropped and a spa config lock held instead.
4189          */
4190         mutex_enter(&spa_namespace_lock);
4191         mutex_enter(&l2arc_dev_mtx);
4192 
4193         /* if there are no vdevs, there is nothing to do */
4194         if (l2arc_ndev == 0)
4195                 goto out;
4196 
4197         first = NULL;
4198         next = l2arc_dev_last;
4199         do {
4200                 /* loop around the list looking for a non-faulted vdev */
4201                 if (next == NULL) {
4202                         next = list_head(l2arc_dev_list);
4203                 } else {
4204                         next = list_next(l2arc_dev_list, next);
4205                         if (next == NULL)
4206                                 next = list_head(l2arc_dev_list);
4207                 }
4208 
4209                 /* if we have come back to the start, bail out */
4210                 if (first == NULL)
4211                         first = next;
4212                 else if (next == first)
4213                         break;
4214 
4215         } while (vdev_is_dead(next->l2ad_vdev));
4216 
4217         /* if we were unable to find any usable vdevs, return NULL */
4218         if (vdev_is_dead(next->l2ad_vdev))
4219                 next = NULL;
4220 
4221         l2arc_dev_last = next;
4222 
4223 out:
4224         mutex_exit(&l2arc_dev_mtx);
4225 
4226         /*
4227          * Grab the config lock to prevent the 'next' device from being
4228          * removed while we are writing to it.
4229          */
4230         if (next != NULL)
4231                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4232         mutex_exit(&spa_namespace_lock);
4233 
4234         return (next);
4235 }
4236 
4237 /*
4238  * Free buffers that were tagged for destruction.
4239  */
4240 static void
4241 l2arc_do_free_on_write()
4242 {
4243         list_t *buflist;
4244         l2arc_data_free_t *df, *df_prev;
4245 
4246         mutex_enter(&l2arc_free_on_write_mtx);
4247         buflist = l2arc_free_on_write;
4248 
4249         for (df = list_tail(buflist); df; df = df_prev) {
4250                 df_prev = list_prev(buflist, df);
4251                 ASSERT(df->l2df_data != NULL);
4252                 ASSERT(df->l2df_func != NULL);
4253                 df->l2df_func(df->l2df_data, df->l2df_size);
4254                 list_remove(buflist, df);
4255                 kmem_free(df, sizeof (l2arc_data_free_t));
4256         }
4257 
4258         mutex_exit(&l2arc_free_on_write_mtx);
4259 }
4260 
4261 /*
4262  * A write to a cache device has completed.  Update all headers to allow
4263  * reads from these buffers to begin.
4264  */
4265 static void
4266 l2arc_write_done(zio_t *zio)
4267 {
4268         l2arc_write_callback_t *cb;
4269         l2arc_dev_t *dev;
4270         list_t *buflist;
4271         arc_buf_hdr_t *head, *ab, *ab_prev;
4272         l2arc_buf_hdr_t *abl2;
4273         kmutex_t *hash_lock;
4274         int64_t bytes_dropped = 0;
4275 
4276         cb = zio->io_private;
4277         ASSERT(cb != NULL);
4278         dev = cb->l2wcb_dev;
4279         ASSERT(dev != NULL);
4280         head = cb->l2wcb_head;
4281         ASSERT(head != NULL);
4282         buflist = dev->l2ad_buflist;
4283         ASSERT(buflist != NULL);
4284         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4285             l2arc_write_callback_t *, cb);
4286 
4287         if (zio->io_error != 0)
4288                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4289 
4290         mutex_enter(&l2arc_buflist_mtx);
4291 
4292         /*
4293          * All writes completed, or an error was hit.
4294          */
4295         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4296                 ab_prev = list_prev(buflist, ab);
4297                 abl2 = ab->b_l2hdr;
4298 
4299                 /*
4300                  * Release the temporary compressed buffer as soon as possible.
4301                  */
4302                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4303                         l2arc_release_cdata_buf(ab);
4304 
4305                 hash_lock = HDR_LOCK(ab);
4306                 if (!mutex_tryenter(hash_lock)) {
4307                         /*
4308                          * This buffer misses out.  It may be in a stage
4309                          * of eviction.  Its ARC_L2_WRITING flag will be
4310                          * left set, denying reads to this buffer.
4311                          */
4312                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4313                         continue;
4314                 }
4315 
4316                 if (zio->io_error != 0) {
4317                         /*
4318                          * Error - drop L2ARC entry.
4319                          */
4320                         list_remove(buflist, ab);
4321                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4322                         bytes_dropped += abl2->b_asize;
4323                         ab->b_l2hdr = NULL;
4324                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4325                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4326                 }
4327 
4328                 /*
4329                  * Allow ARC to begin reads to this L2ARC entry.
4330                  */
4331                 ab->b_flags &= ~ARC_L2_WRITING;
4332 
4333                 mutex_exit(hash_lock);
4334         }
4335 
4336         atomic_inc_64(&l2arc_writes_done);
4337         list_remove(buflist, head);
4338         kmem_cache_free(hdr_cache, head);
4339         mutex_exit(&l2arc_buflist_mtx);
4340 
4341         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4342 
4343         l2arc_do_free_on_write();
4344 
4345         kmem_free(cb, sizeof (l2arc_write_callback_t));
4346 }
4347 
4348 /*
4349  * A read to a cache device completed.  Validate buffer contents before
4350  * handing over to the regular ARC routines.
4351  */
4352 static void
4353 l2arc_read_done(zio_t *zio)
4354 {
4355         l2arc_read_callback_t *cb;
4356         arc_buf_hdr_t *hdr;
4357         arc_buf_t *buf;
4358         kmutex_t *hash_lock;
4359         int equal;
4360 
4361         ASSERT(zio->io_vd != NULL);
4362         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4363 
4364         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4365 
4366         cb = zio->io_private;
4367         ASSERT(cb != NULL);
4368         buf = cb->l2rcb_buf;
4369         ASSERT(buf != NULL);
4370 
4371         hash_lock = HDR_LOCK(buf->b_hdr);
4372         mutex_enter(hash_lock);
4373         hdr = buf->b_hdr;
4374         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4375 
4376         /*
4377          * If the buffer was compressed, decompress it first.
4378          */
4379         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4380                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4381         ASSERT(zio->io_data != NULL);
4382 
4383         /*
4384          * Check this survived the L2ARC journey.
4385          */
4386         equal = arc_cksum_equal(buf);
4387         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4388                 mutex_exit(hash_lock);
4389                 zio->io_private = buf;
4390                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4391                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4392                 arc_read_done(zio);
4393         } else {
4394                 mutex_exit(hash_lock);
4395                 /*
4396                  * Buffer didn't survive caching.  Increment stats and
4397                  * reissue to the original storage device.
4398                  */
4399                 if (zio->io_error != 0) {
4400                         ARCSTAT_BUMP(arcstat_l2_io_error);
4401                 } else {
4402                         zio->io_error = SET_ERROR(EIO);
4403                 }
4404                 if (!equal)
4405                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4406 
4407                 /*
4408                  * If there's no waiter, issue an async i/o to the primary
4409                  * storage now.  If there *is* a waiter, the caller must
4410                  * issue the i/o in a context where it's OK to block.
4411                  */
4412                 if (zio->io_waiter == NULL) {
4413                         zio_t *pio = zio_unique_parent(zio);
4414 
4415                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4416 
4417                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4418                             buf->b_data, zio->io_size, arc_read_done, buf,
4419                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4420                 }
4421         }
4422 
4423         kmem_free(cb, sizeof (l2arc_read_callback_t));
4424 }
4425 
4426 /*
4427  * This is the list priority from which the L2ARC will search for pages to
4428  * cache.  This is used within loops (0..3) to cycle through lists in the
4429  * desired order.  This order can have a significant effect on cache
4430  * performance.
4431  *
4432  * Currently the metadata lists are hit first, MFU then MRU, followed by
4433  * the data lists.  This function returns a locked list, and also returns
4434  * the lock pointer.
4435  */
4436 static list_t *
4437 l2arc_list_locked(int list_num, kmutex_t **lock)
4438 {
4439         list_t *list = NULL;
4440 
4441         ASSERT(list_num >= 0 && list_num <= 3);
4442 
4443         switch (list_num) {
4444         case 0:
4445                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4446                 *lock = &arc_mfu->arcs_mtx;
4447                 break;
4448         case 1:
4449                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4450                 *lock = &arc_mru->arcs_mtx;
4451                 break;
4452         case 2:
4453                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4454                 *lock = &arc_mfu->arcs_mtx;
4455                 break;
4456         case 3:
4457                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4458                 *lock = &arc_mru->arcs_mtx;
4459                 break;
4460         }
4461 
4462         ASSERT(!(MUTEX_HELD(*lock)));
4463         mutex_enter(*lock);
4464         return (list);
4465 }
4466 
4467 /*
4468  * Evict buffers from the device write hand to the distance specified in
4469  * bytes.  This distance may span populated buffers, or it may span nothing.
4470  * This is clearing a region on the L2ARC device ready for writing.
4471  * If the 'all' boolean is set, every buffer is evicted.
4472  */
4473 static void
4474 _l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all,
4475     boolean_t space_update)
4476 {
4477         list_t *buflist;
4478         l2arc_buf_hdr_t *abl2;
4479         arc_buf_hdr_t *ab, *ab_prev;
4480         kmutex_t *hash_lock;
4481         uint64_t taddr;
4482         int64_t bytes_evicted = 0;
4483 
4484         buflist = dev->l2ad_buflist;
4485 
4486         if (buflist == NULL)
4487                 return;
4488 
4489         if (!all && dev->l2ad_first) {
4490                 /*
4491                  * This is the first sweep through the device.  There is
4492                  * nothing to evict.
4493                  */
4494                 return;
4495         }
4496 
4497         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4498                 /*
4499                  * When nearing the end of the device, evict to the end
4500                  * before the device write hand jumps to the start.
4501                  */
4502                 taddr = dev->l2ad_end;
4503         } else {
4504                 taddr = dev->l2ad_hand + distance;
4505         }
4506         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4507             uint64_t, taddr, boolean_t, all);
4508 
4509 top:
4510         mutex_enter(&l2arc_buflist_mtx);
4511         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4512                 ab_prev = list_prev(buflist, ab);
4513 
4514                 hash_lock = HDR_LOCK(ab);
4515                 if (!mutex_tryenter(hash_lock)) {
4516                         /*
4517                          * Missed the hash lock.  Retry.
4518                          */
4519                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4520                         mutex_exit(&l2arc_buflist_mtx);
4521                         mutex_enter(hash_lock);
4522                         mutex_exit(hash_lock);
4523                         goto top;
4524                 }
4525 
4526                 if (HDR_L2_WRITE_HEAD(ab)) {
4527                         /*
4528                          * We hit a write head node.  Leave it for
4529                          * l2arc_write_done().
4530                          */
4531                         list_remove(buflist, ab);
4532                         mutex_exit(hash_lock);
4533                         continue;
4534                 }
4535 
4536                 if (!all && ab->b_l2hdr != NULL &&
4537                     (ab->b_l2hdr->b_daddr > taddr ||
4538                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4539                         /*
4540                          * We've evicted to the target address,
4541                          * or the end of the device.
4542                          */
4543                         mutex_exit(hash_lock);
4544                         break;
4545                 }
4546 
4547                 if (HDR_FREE_IN_PROGRESS(ab)) {
4548                         /*
4549                          * Already on the path to destruction.
4550                          */
4551                         mutex_exit(hash_lock);
4552                         continue;
4553                 }
4554 
4555                 if (ab->b_state == arc_l2c_only) {
4556                         ASSERT(!HDR_L2_READING(ab));
4557                         /*
4558                          * This doesn't exist in the ARC.  Destroy.
4559                          * arc_hdr_destroy() will call list_remove()
4560                          * and decrement arcstat_l2_size.
4561                          */
4562                         arc_change_state(arc_anon, ab, hash_lock);
4563                         arc_hdr_destroy(ab);
4564                 } else {
4565                         /*
4566                          * Invalidate issued or about to be issued
4567                          * reads, since we may be about to write
4568                          * over this location.
4569                          */
4570                         if (HDR_L2_READING(ab)) {
4571                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4572                                 ab->b_flags |= ARC_L2_EVICTED;
4573                         }
4574 
4575                         /*
4576                          * Tell ARC this no longer exists in L2ARC.
4577                          */
4578                         if (ab->b_l2hdr != NULL) {
4579                                 abl2 = ab->b_l2hdr;
4580                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4581                                 bytes_evicted += abl2->b_asize;
4582                                 ab->b_l2hdr = NULL;
4583                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4584                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4585                         }
4586                         list_remove(buflist, ab);
4587 
4588                         /*
4589                          * This may have been left over after a
4590                          * failed write.
4591                          */
4592                         ab->b_flags &= ~ARC_L2_WRITING;
4593                 }
4594                 mutex_exit(hash_lock);
4595         }
4596         mutex_exit(&l2arc_buflist_mtx);
4597 
4598         /*
4599          * Note: l2ad_vdev can only be touched if space_update is set,
4600          * otherwise the vdev might have been removed by an async
4601          * spa_unload.
4602          */
4603         if (space_update) {
4604                 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4605                 dev->l2ad_evict = taddr;
4606         }
4607 }
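
/*
 * To put the target address selection above into numbers (sizes
 * hypothetical): with distance = 8 MB, a write hand sitting 10 MB short
 * of l2ad_end is within 2 * distance of the end, so taddr becomes
 * l2ad_end and the remainder of the device is cleared before the hand
 * wraps back to l2ad_start.  A hand sitting 100 MB short of the end just
 * evicts forward to l2ad_hand + 8 MB.
 */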
4608 
4609 /*
4610  * Asynchronous task for eviction of all the buffers for this L2ARC device.
4611  * The task is dispatched in l2arc_evict().
4612  */
4613 typedef struct {
4614         l2arc_dev_t *dev;
4615 } l2arc_evict_data_t;
4616 
4617 static void
4618 l2arc_evict_task(void *arg)
4619 {
4620         l2arc_evict_data_t *d = (l2arc_evict_data_t *)arg;
4621         ASSERT(d && d->dev);
4622 
4623         /*
4624          * Evict l2arc buffers asynchronously; we need to keep the device
4625          * around until we are sure there aren't any buffers referencing it.
4626          * We do not need to hold any config locks, etc. because at this point,
4627          * we are the only ones who know about this device (the in-core
4628          * structure), so no new buffers can be created (e.g. if the pool is
4629          * re-imported while the asynchronous eviction is in progress) that
4630          * reference this same in-core structure. Also remove the vdev link
4631          * since further use of it as an l2arc device is prohibited.
4632          */
4633         d->dev->l2ad_vdev = NULL;
4634         _l2arc_evict(d->dev, 0LL, B_TRUE, B_FALSE);
4635 
4636         /* Same cleanup as in the synchronous path */
4637         list_destroy(d->dev->l2ad_buflist);
4638         kmem_free(d->dev->l2ad_buflist, sizeof (list_t));
4639         kmem_free(d->dev, sizeof (l2arc_dev_t));
4640         /* Task argument cleanup */
4641         kmem_free(arg, sizeof (l2arc_evict_data_t));
4642 }
4643 
4644 boolean_t zfs_l2arc_async_evict = B_TRUE;
4645 
4646 /*
4647  * Perform l2arc eviction for buffers associated with this device.
4648  * If evicting all buffers (done at pool export time), try to evict
4649  * asynchronously, and fall back to synchronous eviction in case of error.
4650  * Tell the caller whether to clean up the device:
4651  *  - B_TRUE means "asynchronous eviction, do not cleanup"
4652  *  - B_FALSE means "synchronous eviction, done, please cleanup"
4653  */
4654 static boolean_t
4655 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4656 {
4657         /*
4658          *  If we are evicting all the buffers for this device, which happens
4659          *  at pool export time, schedule an asynchronous task.
4660          */
4661         if (all && zfs_l2arc_async_evict) {
4662                 l2arc_evict_data_t *arg =
4663                     kmem_alloc(sizeof (l2arc_evict_data_t), KM_SLEEP);
4664                 arg->dev = dev;
4665 
4666                 dev->l2ad_evict = dev->l2ad_end;
4667 
4668                 if ((taskq_dispatch(arc_flush_taskq, l2arc_evict_task,
4669                     arg, TQ_NOSLEEP) == NULL)) {
4670                         /*
4671                          * Failed to dispatch the asynchronous task;
4672                          * clean up, evict synchronously, and avoid
4673                          * adjusting the vdev space a second time.
4674                          */
4675                         kmem_free(arg, sizeof (l2arc_evict_data_t));
4676                         _l2arc_evict(dev, distance, all, B_FALSE);
4677                 } else {
4678                         /*
4679                          * Successful dispatch, vdev space updated.
4680                          */
4681                         return (B_TRUE);
4682                 }
4683         } else {
4684                 /* Evict synchronously */
4685                 _l2arc_evict(dev, distance, all, B_TRUE);
4686         }
4687 
4688         return (B_FALSE);
4689 }
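
/*
 * Both paths above are exercised by later callers.  The feed thread always
 * clears space ahead of the write hand synchronously:
 *
 *	(void) l2arc_evict(dev, size, B_FALSE);
 *
 * while l2arc_remove_vdev() flushes the entire device and only frees the
 * in-core device structure itself when eviction completed synchronously:
 *
 *	if (l2arc_evict(remdev, 0, B_TRUE) == B_FALSE) { ... }
 */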
4690 
4691 /*
4692  * Find and write ARC buffers to the L2ARC device.
4693  *
4694  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4695  * for reading until they have completed writing.
4696  * The headroom_boost is an in-out parameter used to maintain headroom boost
4697  * state between calls to this function.
4698  *
4699  * Returns the number of bytes actually written (which may be smaller than
4700  * the delta by which the device hand has changed due to alignment).
4701  */
4702 static uint64_t
4703 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4704     boolean_t *headroom_boost)
4705 {
4706         arc_buf_hdr_t *ab, *ab_prev, *head;
4707         list_t *list;
4708         uint64_t write_asize, write_psize, write_sz, headroom,
4709             buf_compress_minsz;
4710         void *buf_data;
4711         kmutex_t *list_lock;
4712         boolean_t full;
4713         l2arc_write_callback_t *cb;
4714         zio_t *pio, *wzio;
4715         uint64_t guid = spa_load_guid(spa);
4716         const boolean_t do_headroom_boost = *headroom_boost;
4717 
4718         ASSERT(dev->l2ad_vdev != NULL);
4719 
4720         /* Lower the flag now, we might want to raise it again later. */
4721         *headroom_boost = B_FALSE;
4722 
4723         pio = NULL;
4724         write_sz = write_asize = write_psize = 0;
4725         full = B_FALSE;
4726         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4727         head->b_flags |= ARC_L2_WRITE_HEAD;
4728 
4729         /*
4730          * We will want to try to compress buffers that are at least 2x the
4731          * device sector size.
4732          */
4733         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4734 
4735         /*
4736          * Copy buffers for L2ARC writing.
4737          */
4738         mutex_enter(&l2arc_buflist_mtx);
4739         for (int try = 0; try <= 3; try++) {
4740                 uint64_t passed_sz = 0;
4741 
4742                 list = l2arc_list_locked(try, &list_lock);
4743 
4744                 /*
4745                  * L2ARC fast warmup.
4746                  *
4747                  * Until the ARC is warm and starts to evict, read from the
4748                  * head of the ARC lists rather than the tail.
4749                  */
4750                 if (arc_warm == B_FALSE)
4751                         ab = list_head(list);
4752                 else
4753                         ab = list_tail(list);
4754 
4755                 headroom = target_sz * l2arc_headroom;
4756                 if (do_headroom_boost)
4757                         headroom = (headroom * l2arc_headroom_boost) / 100;
4758 
4759                 for (; ab; ab = ab_prev) {
4760                         l2arc_buf_hdr_t *l2hdr;
4761                         kmutex_t *hash_lock;
4762                         uint64_t buf_sz;
4763 
4764                         if (arc_warm == B_FALSE)
4765                                 ab_prev = list_next(list, ab);
4766                         else
4767                                 ab_prev = list_prev(list, ab);
4768 
4769                         hash_lock = HDR_LOCK(ab);
4770                         if (!mutex_tryenter(hash_lock)) {
4771                                 /*
4772                                  * Skip this buffer rather than waiting.
4773                                  */
4774                                 continue;
4775                         }
4776 
4777                         passed_sz += ab->b_size;
4778                         if (passed_sz > headroom) {
4779                                 /*
4780                                  * Searched too far.
4781                                  */
4782                                 mutex_exit(hash_lock);
4783                                 break;
4784                         }
4785 
4786                         if (!l2arc_write_eligible(guid, ab)) {
4787                                 mutex_exit(hash_lock);
4788                                 continue;
4789                         }
4790 
4791                         if ((write_sz + ab->b_size) > target_sz) {
4792                                 full = B_TRUE;
4793                                 mutex_exit(hash_lock);
4794                                 break;
4795                         }
4796 
4797                         if (pio == NULL) {
4798                                 /*
4799                                  * Insert a dummy header on the buflist so
4800                                  * l2arc_write_done() can find where the
4801                                  * write buffers begin without searching.
4802                                  */
4803                                 list_insert_head(dev->l2ad_buflist, head);
4804 
4805                                 cb = kmem_alloc(
4806                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4807                                 cb->l2wcb_dev = dev;
4808                                 cb->l2wcb_head = head;
4809                                 pio = zio_root(spa, l2arc_write_done, cb,
4810                                     ZIO_FLAG_CANFAIL);
4811                         }
4812 
4813                         /*
4814                          * Create and add a new L2ARC header.
4815                          */
4816                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4817                         l2hdr->b_dev = dev;
4818                         ab->b_flags |= ARC_L2_WRITING;
4819 
4820                         /*
4821                          * Temporarily stash the data buffer in b_tmp_cdata.
4822                          * The subsequent write step will pick it up from
4823                          * there. This is because we can't access ab->b_buf
4824                          * without holding the hash_lock, which we in turn
4825                          * can't access without holding the ARC list locks
4826                          * (which we want to avoid during compression/writing).
4827                          */
4828                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4829                         l2hdr->b_asize = ab->b_size;
4830                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4831 
4832                         buf_sz = ab->b_size;
4833                         ab->b_l2hdr = l2hdr;
4834 
4835                         list_insert_head(dev->l2ad_buflist, ab);
4836 
4837                         /*
4838                          * Compute and store the buffer cksum before
4839                          * writing.  On debug the cksum is verified first.
4840                          */
4841                         arc_cksum_verify(ab->b_buf);
4842                         arc_cksum_compute(ab->b_buf, B_TRUE);
4843 
4844                         mutex_exit(hash_lock);
4845 
4846                         write_sz += buf_sz;
4847                 }
4848 
4849                 mutex_exit(list_lock);
4850 
4851                 if (full == B_TRUE)
4852                         break;
4853         }
4854 
4855         /* No buffers selected for writing? */
4856         if (pio == NULL) {
4857                 ASSERT0(write_sz);
4858                 mutex_exit(&l2arc_buflist_mtx);
4859                 kmem_cache_free(hdr_cache, head);
4860                 return (0);
4861         }
4862 
4863         /*
4864          * Now start writing the buffers.  We start at the write head
4865          * and work backwards, retracing the course of the buffer selector
4866          * loop above.
4867          */
4868         for (ab = list_prev(dev->l2ad_buflist, head); ab;
4869             ab = list_prev(dev->l2ad_buflist, ab)) {
4870                 l2arc_buf_hdr_t *l2hdr;
4871                 uint64_t buf_sz;
4872 
4873                 /*
4874                  * We shouldn't need to lock the buffer here, since we flagged
4875                  * it as ARC_L2_WRITING in the previous step, but we must take
4876                  * care to only access its L2 cache parameters. In particular,
4877                  * ab->b_buf may be invalid by now due to ARC eviction.
4878                  */
4879                 l2hdr = ab->b_l2hdr;
4880                 l2hdr->b_daddr = dev->l2ad_hand;
4881 
4882                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4883                     l2hdr->b_asize >= buf_compress_minsz) {
4884                         if (l2arc_compress_buf(l2hdr)) {
4885                                 /*
4886                                  * If compression succeeded, enable headroom
4887                                  * boost on the next scan cycle.
4888                                  */
4889                                 *headroom_boost = B_TRUE;
4890                         }
4891                 }
4892 
4893                 /*
4894                  * Pick up the buffer data we had previously stashed away
4895                  * (and now potentially also compressed).
4896                  */
4897                 buf_data = l2hdr->b_tmp_cdata;
4898                 buf_sz = l2hdr->b_asize;
4899 
4900                 /* Compression may have squashed the buffer to zero length. */
4901                 if (buf_sz != 0) {
4902                         uint64_t buf_p_sz;
4903 
4904                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4905                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4906                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4907                             ZIO_FLAG_CANFAIL, B_FALSE);
4908 
4909                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4910                             zio_t *, wzio);
4911                         (void) zio_nowait(wzio);
4912 
4913                         write_asize += buf_sz;
4914                         /*
4915                          * Keep the clock hand suitably device-aligned.
4916                          */
4917                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4918                         write_psize += buf_p_sz;
4919                         dev->l2ad_hand += buf_p_sz;
4920                 }
4921         }
4922 
4923         mutex_exit(&l2arc_buflist_mtx);
4924 
4925         ASSERT3U(write_asize, <=, target_sz);
4926         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4927         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4928         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4929         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4930         vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
4931 
4932         /*
4933          * Bump device hand to the device start if it is approaching the end.
4934          * l2arc_evict() will already have evicted ahead for this case.
4935          */
4936         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4937                 dev->l2ad_hand = dev->l2ad_start;
4938                 dev->l2ad_evict = dev->l2ad_start;
4939                 dev->l2ad_first = B_FALSE;
4940         }
4941 
4942         dev->l2ad_writing = B_TRUE;
4943         (void) zio_wait(pio);
4944         dev->l2ad_writing = B_FALSE;
4945 
4946         return (write_asize);
4947 }
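
/*
 * To put the headroom logic above into numbers, reusing the example sizes
 * from the design comment (illustrative, not necessarily the defaults):
 * with target_sz = 8 MB and l2arc_headroom = 4, each list is scanned until
 * 32 MB worth of buffers has been passed over.  If the previous cycle
 * found compressible buffers and raised *headroom_boost, and assuming
 * l2arc_headroom_boost is at 200 (percent), the scan depth doubles to
 * 64 MB for this cycle so that enough logical data is found to fill the
 * same physical write size after compression.
 */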
4948 
4949 /*
4950  * Compresses an L2ARC buffer.
4951  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4952  * size in l2hdr->b_asize. This routine tries to compress the data and
4953  * depending on the compression result there are three possible outcomes:
4954  * *) The buffer was incompressible. The original l2hdr contents were left
4955  *    untouched and are ready for writing to an L2 device.
4956  * *) The buffer was all-zeros, so there is no need to write it to an L2
4957  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4958  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4959  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4960  *    data buffer which holds the compressed data to be written, and b_asize
4961  *    tells us how much data there is. b_compress is set to the appropriate
4962  *    compression algorithm. Once writing is done, invoke
4963  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4964  *
4965  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4966  * buffer was incompressible).
4967  */
4968 static boolean_t
4969 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4970 {
4971         void *cdata;
4972         size_t csize, len, rounded;
4973 
4974         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4975         ASSERT(l2hdr->b_tmp_cdata != NULL);
4976 
4977         len = l2hdr->b_asize;
4978         cdata = zio_data_buf_alloc(len);
4979         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4980             cdata, l2hdr->b_asize);
4981 
4982         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
4983         if (rounded > csize) {
4984                 bzero((char *)cdata + csize, rounded - csize);
4985                 csize = rounded;
4986         }
4987 
4988         if (csize == 0) {
4989                 /* zero block, indicate that there's nothing to write */
4990                 zio_data_buf_free(cdata, len);
4991                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4992                 l2hdr->b_asize = 0;
4993                 l2hdr->b_tmp_cdata = NULL;
4994                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4995                 return (B_TRUE);
4996         } else if (csize > 0 && csize < len) {
4997                 /*
4998                  * Compression succeeded, we'll keep the cdata around for
4999                  * writing and release it afterwards.
5000                  */
5001                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5002                 l2hdr->b_asize = csize;
5003                 l2hdr->b_tmp_cdata = cdata;
5004                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5005                 return (B_TRUE);
5006         } else {
5007                 /*
5008                  * Compression failed, release the compressed buffer.
5009                  * l2hdr will be left unmodified.
5010                  */
5011                 zio_data_buf_free(cdata, len);
5012                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5013                 return (B_FALSE);
5014         }
5015 }
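
/*
 * A short worked example of the three outcomes above (sizes hypothetical):
 * a 4 KB buffer whose LZ4 output is 1000 bytes is rounded up to the next
 * SPA_MINBLOCKSIZE (512-byte) multiple, 1024 bytes, which is still smaller
 * than 4 KB, so b_asize becomes 1024 and b_compress becomes
 * ZIO_COMPRESS_LZ4.  An output of 0 bytes means the buffer was all zeros
 * and nothing needs to be written (ZIO_COMPRESS_EMPTY).  An output that
 * rounds up to 4 KB or more is discarded and the buffer is left to be
 * written uncompressed.
 */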
5016 
5017 /*
5018  * Decompresses a zio read back from an l2arc device. On success, the
5019  * underlying zio's io_data buffer is overwritten by the uncompressed
5020  * version. On decompression error (corrupt compressed stream), the
5021  * zio->io_error value is set to signal an I/O error.
5022  *
5023  * Please note that the compressed data stream is not checksummed, so
5024  * if the underlying device is experiencing data corruption, we may feed
5025  * corrupt data to the decompressor; the decompressor therefore needs
5026  * to be able to handle this situation (LZ4 does).
5027  */
5028 static void
5029 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5030 {
5031         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5032 
5033         if (zio->io_error != 0) {
5034                 /*
5035                  * An I/O error has occurred; just restore the original I/O
5036                  * size in preparation for a main pool read.
5037                  */
5038                 zio->io_orig_size = zio->io_size = hdr->b_size;
5039                 return;
5040         }
5041 
5042         if (c == ZIO_COMPRESS_EMPTY) {
5043                 /*
5044                  * An empty buffer results in a null zio, which means we
5045                  * need to fill its io_data after we're done restoring the
5046                  * buffer's contents.
5047                  */
5048                 ASSERT(hdr->b_buf != NULL);
5049                 bzero(hdr->b_buf->b_data, hdr->b_size);
5050                 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5051         } else {
5052                 ASSERT(zio->io_data != NULL);
5053                 /*
5054                  * We copy the compressed data from the start of the arc buffer
5055                  * (the zio_read will have pulled in only what we need, the
5056                  * rest is garbage which we will overwrite at decompression)
5057                  * and then decompress back to the ARC data buffer. This way we
5058                  * can minimize copying by simply decompressing back over the
5059                  * original compressed data (rather than decompressing to an
5060                  * aux buffer and then copying back the uncompressed buffer,
5061                  * which is likely to be much larger).
5062                  */
5063                 uint64_t csize;
5064                 void *cdata;
5065 
5066                 csize = zio->io_size;
5067                 cdata = zio_data_buf_alloc(csize);
5068                 bcopy(zio->io_data, cdata, csize);
5069                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5070                     hdr->b_size) != 0)
5071                         zio->io_error = EIO;
5072                 zio_data_buf_free(cdata, csize);
5073         }
5074 
5075         /* Restore the expected uncompressed IO size. */
5076         zio->io_orig_size = zio->io_size = hdr->b_size;
5077 }
5078 
5079 /*
5080  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5081  * This buffer serves as a temporary holder of compressed data while
5082  * the buffer entry is being written to an l2arc device. Once that is
5083  * done, we can dispose of it.
5084  */
5085 static void
5086 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5087 {
5088         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5089 
5090         if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5091                 /*
5092                  * If the data was compressed, then we've allocated a
5093                  * temporary buffer for it, so now we need to release it.
5094                  */
5095                 ASSERT(l2hdr->b_tmp_cdata != NULL);
5096                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5097         }
5098         l2hdr->b_tmp_cdata = NULL;
5099 }
5100 
5101 /*
5102  * This thread feeds the L2ARC at regular intervals.  This is the beating
5103  * heart of the L2ARC.
5104  */
5105 static void
5106 l2arc_feed_thread(void)
5107 {
5108         callb_cpr_t cpr;
5109         l2arc_dev_t *dev;
5110         spa_t *spa;
5111         uint64_t size, wrote;
5112         clock_t begin, next = ddi_get_lbolt();
5113         boolean_t headroom_boost = B_FALSE;
5114 
5115         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5116 
5117         mutex_enter(&l2arc_feed_thr_lock);
5118 
5119         while (l2arc_thread_exit == 0) {
5120                 CALLB_CPR_SAFE_BEGIN(&cpr);
5121                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5122                     next);
5123                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5124                 next = ddi_get_lbolt() + hz;
5125 
5126                 /*
5127                  * Quick check for L2ARC devices.
5128                  */
5129                 mutex_enter(&l2arc_dev_mtx);
5130                 if (l2arc_ndev == 0) {
5131                         mutex_exit(&l2arc_dev_mtx);
5132                         continue;
5133                 }
5134                 mutex_exit(&l2arc_dev_mtx);
5135                 begin = ddi_get_lbolt();
5136 
5137                 /*
5138                  * This selects the next l2arc device to write to, and in
5139                  * doing so the next spa to feed from: dev->l2ad_spa.   This
5140                  * will return NULL if there are now no l2arc devices or if
5141                  * they are all faulted.
5142                  *
5143                  * If a device is returned, its spa's config lock is also
5144                  * held to prevent device removal.  l2arc_dev_get_next()
5145                  * will grab and release l2arc_dev_mtx.
5146                  */
5147                 if ((dev = l2arc_dev_get_next()) == NULL)
5148                         continue;
5149 
5150                 spa = dev->l2ad_spa;
5151                 ASSERT(spa != NULL);
5152 
5153                 /*
5154                  * If the pool is read-only then force the feed thread to
5155                  * sleep a little longer.
5156                  */
5157                 if (!spa_writeable(spa)) {
5158                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5159                         spa_config_exit(spa, SCL_L2ARC, dev);
5160                         continue;
5161                 }
5162 
5163                 /*
5164                  * Avoid contributing to memory pressure.
5165                  */
5166                 if (arc_reclaim_needed()) {
5167                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5168                         spa_config_exit(spa, SCL_L2ARC, dev);
5169                         continue;
5170                 }
5171 
5172                 ARCSTAT_BUMP(arcstat_l2_feeds);
5173 
5174                 size = l2arc_write_size();
5175 
5176                 /*
5177                  * Evict L2ARC buffers that will be overwritten.
5178                  * B_FALSE guarantees synchronous eviction.
5179                  */
5180                 (void) l2arc_evict(dev, size, B_FALSE);
5181 
5182                 /*
5183                  * Write ARC buffers.
5184                  */
5185                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5186 
5187                 /*
5188                  * Calculate interval between writes.
5189                  */
5190                 next = l2arc_write_interval(begin, size, wrote);
5191                 spa_config_exit(spa, SCL_L2ARC, dev);
5192         }
5193 
5194         l2arc_thread_exit = 0;
5195         cv_broadcast(&l2arc_feed_thr_cv);
5196         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
5197         thread_exit();
5198 }
5199 
5200 boolean_t
5201 l2arc_vdev_present(vdev_t *vd)
5202 {
5203         l2arc_dev_t *dev;
5204 
5205         mutex_enter(&l2arc_dev_mtx);
5206         for (dev = list_head(l2arc_dev_list); dev != NULL;
5207             dev = list_next(l2arc_dev_list, dev)) {
5208                 if (dev->l2ad_vdev == vd)
5209                         break;
5210         }
5211         mutex_exit(&l2arc_dev_mtx);
5212 
5213         return (dev != NULL);
5214 }
5215 
5216 /*
5217  * Add a vdev for use by the L2ARC.  By this point the spa has already
5218  * validated the vdev and opened it.
5219  */
5220 void
5221 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5222 {
5223         l2arc_dev_t *adddev;
5224 
5225         ASSERT(!l2arc_vdev_present(vd));
5226 
5227         /*
5228          * Create a new l2arc device entry.
5229          */
5230         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5231         adddev->l2ad_spa = spa;
5232         adddev->l2ad_vdev = vd;
5233         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5234         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5235         adddev->l2ad_hand = adddev->l2ad_start;
5236         adddev->l2ad_evict = adddev->l2ad_start;
5237         adddev->l2ad_first = B_TRUE;
5238         adddev->l2ad_writing = B_FALSE;
5239 
5240         /*
5241          * This is a list of all ARC buffers that are still valid on the
5242          * device.
5243          */
5244         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5245         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5246             offsetof(arc_buf_hdr_t, b_l2node));
5247 
5248         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5249 
5250         /*
5251          * Add device to global list
5252          */
5253         mutex_enter(&l2arc_dev_mtx);
5254         list_insert_head(l2arc_dev_list, adddev);
5255         atomic_inc_64(&l2arc_ndev);
5256         mutex_exit(&l2arc_dev_mtx);
5257 }
5258 
5259 /*
5260  * Remove a vdev from the L2ARC.
5261  */
5262 void
5263 l2arc_remove_vdev(vdev_t *vd)
5264 {
5265         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5266 
5267         /*
5268          * Find the device by vdev
5269          */
5270         mutex_enter(&l2arc_dev_mtx);
5271         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5272                 nextdev = list_next(l2arc_dev_list, dev);
5273                 if (vd == dev->l2ad_vdev) {
5274                         remdev = dev;
5275                         break;
5276                 }
5277         }
5278         ASSERT(remdev != NULL);
5279 
5280         /*
5281          * Remove device from global list
5282          */
5283         list_remove(l2arc_dev_list, remdev);
5284         l2arc_dev_last = NULL;          /* may have been invalidated */
5285         atomic_dec_64(&l2arc_ndev);
5286         mutex_exit(&l2arc_dev_mtx);
5287 
5288         /*
5289          * Clear all buflists and ARC references.  L2ARC device flush.
5290          */
5291         if (l2arc_evict(remdev, 0, B_TRUE) == B_FALSE) {
5292                 /*
5293                  * The eviction was done synchronously; clean up here.
5294                  * Otherwise, the asynchronous task will clean up.
5295                  */
5296                 list_destroy(remdev->l2ad_buflist);
5297                 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5298                 kmem_free(remdev, sizeof (l2arc_dev_t));
5299         }
5300 }
5301 
5302 void
5303 l2arc_init(void)
5304 {
5305         l2arc_thread_exit = 0;
5306         l2arc_ndev = 0;
5307         l2arc_writes_sent = 0;
5308         l2arc_writes_done = 0;
5309 
5310         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5311         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5312         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5313         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5314         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5315 
5316         l2arc_dev_list = &L2ARC_dev_list;
5317         l2arc_free_on_write = &L2ARC_free_on_write;
5318         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5319             offsetof(l2arc_dev_t, l2ad_node));
5320         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5321             offsetof(l2arc_data_free_t, l2df_list_node));
5322 }
5323 
5324 void
5325 l2arc_fini(void)
5326 {
5327         /*
5328          * This is called from dmu_fini(), which is called from spa_fini();
5329          * Because of this, we can assume that all l2arc devices have
5330          * already been removed when the pools themselves were removed.
5331          */
5332 
5333         l2arc_do_free_on_write();
5334 
5335         mutex_destroy(&l2arc_feed_thr_lock);
5336         cv_destroy(&l2arc_feed_thr_cv);
5337         mutex_destroy(&l2arc_dev_mtx);
5338         mutex_destroy(&l2arc_buflist_mtx);
5339         mutex_destroy(&l2arc_free_on_write_mtx);
5340 
5341         list_destroy(l2arc_dev_list);
5342         list_destroy(l2arc_free_on_write);
5343 }
5344 
5345 void
5346 l2arc_start(void)
5347 {
5348         if (!(spa_mode_global & FWRITE))
5349                 return;
5350 
5351         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5352             TS_RUN, minclsyspri);
5353 }
5354 
5355 void
5356 l2arc_stop(void)
5357 {
5358         if (!(spa_mode_global & FWRITE))
5359                 return;
5360 
5361         mutex_enter(&l2arc_feed_thr_lock);
5362         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5363         l2arc_thread_exit = 1;
5364         while (l2arc_thread_exit != 0)
5365                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5366         mutex_exit(&l2arc_feed_thr_lock);
5367 }