illumos-gate.git Old usr/src/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  26  */
  27 
  28 /*
  29  * DVA-based Adjustable Replacement Cache
  30  *
  31  * While much of the theory of operation used here is
  32  * based on the self-tuning, low overhead replacement cache
  33  * presented by Megiddo and Modha at FAST 2003, there are some
  34  * significant differences:
  35  *
  36  * 1. The Megiddo and Modha model assumes any page is evictable.
  37  * Pages in its cache cannot be "locked" into memory.  This makes
  38  * the eviction algorithm simple: evict the last page in the list.
   39  * This also makes the performance characteristics easy to reason
  40  * about.  Our cache is not so simple.  At any given moment, some
  41  * subset of the blocks in the cache are un-evictable because we
  42  * have handed out a reference to them.  Blocks are only evictable
  43  * when there are no external references active.  This makes
  44  * eviction far more problematic:  we choose to evict the evictable
  45  * blocks that are the "lowest" in the list.
  46  *
  47  * There are times when it is not possible to evict the requested
  48  * space.  In these circumstances we are unable to adjust the cache
  49  * size.  To prevent the cache growing unbounded at these times we
  50  * implement a "cache throttle" that slows the flow of new data
  51  * into the cache until we can make space available.
  52  *
  53  * 2. The Megiddo and Modha model assumes a fixed cache size.
  54  * Pages are evicted when the cache is full and there is a cache
  55  * miss.  Our model has a variable sized cache.  It grows with
  56  * high use, but also tries to react to memory pressure from the
  57  * operating system: decreasing its size when system memory is
  58  * tight.
  59  *
  60  * 3. The Megiddo and Modha model assumes a fixed page size. All
  61  * elements of the cache are therefore exactly the same size.  So
   62  * when adjusting the cache size following a cache miss, it's simply
   63  * a matter of choosing a single page to evict.  In our model, we
   64  * have variable sized cache blocks (ranging from 512 bytes to
  65  * 128K bytes).  We therefore choose a set of blocks to evict to make
  66  * space for a cache miss that approximates as closely as possible
  67  * the space used by the new block.
  68  *
  69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  70  * by N. Megiddo & D. Modha, FAST 2003
  71  */
  72 
  73 /*
  74  * The locking model:
  75  *
  76  * A new reference to a cache buffer can be obtained in two
  77  * ways: 1) via a hash table lookup using the DVA as a key,
  78  * or 2) via one of the ARC lists.  The arc_read() interface
  79  * uses method 1, while the internal arc algorithms for
  80  * adjusting the cache use method 2.  We therefore provide two
  81  * types of locks: 1) the hash table lock array, and 2) the
  82  * arc list locks.
  83  *
  84  * Buffers do not have their own mutexes, rather they rely on the
  85  * hash table mutexes for the bulk of their protection (i.e. most
  86  * fields in the arc_buf_hdr_t are protected by these mutexes).
  87  *
  88  * buf_hash_find() returns the appropriate mutex (held) when it
  89  * locates the requested buffer in the hash table.  It returns
  90  * NULL for the mutex if the buffer was not in the table.
  91  *
  92  * buf_hash_remove() expects the appropriate hash mutex to be
  93  * already held before it is invoked.
  94  *
  95  * Each arc state also has a mutex which is used to protect the
  96  * buffer list associated with the state.  When attempting to
  97  * obtain a hash table lock while holding an arc list lock you
  98  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  99  * the active state mutex must be held before the ghost state mutex.
 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_buf_evict()
 108  * and arc_do_user_evicts().
 109  *
 110  * Note that the majority of the performance stats are manipulated
 111  * with atomic operations.
 112  *
 113  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 114  *
 115  *      - L2ARC buflist creation
 116  *      - L2ARC buflist eviction
 117  *      - L2ARC write completion, which walks L2ARC buflists
 118  *      - ARC header destruction, as it removes from L2ARC buflists
 119  *      - ARC header release, as it removes from L2ARC buflists
 120  */
 121 
 122 #include <sys/spa.h>
 123 #include <sys/zio.h>
 124 #include <sys/zio_compress.h>
 125 #include <sys/zfs_context.h>
 126 #include <sys/arc.h>
 127 #include <sys/refcount.h>
 128 #include <sys/vdev.h>
 129 #include <sys/vdev_impl.h>
 130 #ifdef _KERNEL
 131 #include <sys/vmsystm.h>
 132 #include <vm/anon.h>
 133 #include <sys/fs/swapnode.h>
 134 #include <sys/dnlc.h>
 135 #endif
 136 #include <sys/callb.h>
 137 #include <sys/kstat.h>
 138 #include <zfs_fletcher.h>
 139 
 140 #ifndef _KERNEL
 141 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 142 boolean_t arc_watch = B_FALSE;
 143 int arc_procfd;
 144 #endif
 145 
 146 static kmutex_t         arc_reclaim_thr_lock;
 147 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 148 static uint8_t          arc_thread_exit;
 149 
 150 extern int zfs_write_limit_shift;
 151 extern uint64_t zfs_write_limit_max;
 152 extern kmutex_t zfs_write_limit_lock;
 153 
 154 #define ARC_REDUCE_DNLC_PERCENT 3
 155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 156 
 157 typedef enum arc_reclaim_strategy {
 158         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 159         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 160 } arc_reclaim_strategy_t;
 161 
 162 /* number of seconds before growing cache again */
 163 static int              arc_grow_retry = 60;
 164 
 165 /* shift of arc_c for calculating both min and max arc_p */
 166 static int              arc_p_min_shift = 4;
 167 
 168 /* log2(fraction of arc to reclaim) */
 169 static int              arc_shrink_shift = 5;
 170 
 171 /*
 172  * minimum lifespan of a prefetch block in clock ticks
 173  * (initialized in arc_init())
 174  */
 175 static int              arc_min_prefetch_lifespan;
 176 
 177 static int arc_dead;
 178 
 179 /*
 180  * The arc has filled available memory and has now warmed up.
 181  */
 182 static boolean_t arc_warm;
 183 
 184 /*
 185  * These tunables are for performance analysis.
 186  */
 187 uint64_t zfs_arc_max;
 188 uint64_t zfs_arc_min;
 189 uint64_t zfs_arc_meta_limit = 0;
 190 int zfs_arc_grow_retry = 0;
 191 int zfs_arc_shrink_shift = 0;
 192 int zfs_arc_p_min_shift = 0;
 193 int zfs_disable_dup_eviction = 0;
 194 
 195 /*
 196  * Note that buffers can be in one of 6 states:
 197  *      ARC_anon        - anonymous (discussed below)
 198  *      ARC_mru         - recently used, currently cached
  199  *      ARC_mru_ghost   - recently used, no longer in cache
 200  *      ARC_mfu         - frequently used, currently cached
 201  *      ARC_mfu_ghost   - frequently used, no longer in cache
 202  *      ARC_l2c_only    - exists in L2ARC but not other states
  203  * When there are no active references to the buffer, they are
  204  * linked onto a list in one of these arc states.  These are
 205  * the only buffers that can be evicted or deleted.  Within each
 206  * state there are multiple lists, one for meta-data and one for
 207  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 208  * etc.) is tracked separately so that it can be managed more
 209  * explicitly: favored over data, limited explicitly.
 210  *
 211  * Anonymous buffers are buffers that are not associated with
 212  * a DVA.  These are buffers that hold dirty block copies
 213  * before they are written to stable storage.  By definition,
 214  * they are "ref'd" and are considered part of arc_mru
  215  * that cannot be freed.  Generally, they will acquire a DVA
 216  * as they are written and migrate onto the arc_mru list.
 217  *
 218  * The ARC_l2c_only state is for buffers that are in the second
 219  * level ARC but no longer in any of the ARC_m* lists.  The second
 220  * level ARC itself may also contain buffers that are in any of
 221  * the ARC_m* states - meaning that a buffer can exist in two
 222  * places.  The reason for the ARC_l2c_only state is to keep the
 223  * buffer header in the hash table, so that reads that hit the
 224  * second level ARC benefit from these fast lookups.
 225  */
 226 
  227 typedef struct arc_state {
  228         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* evictable buffers, per type */
  229         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* evictable bytes, per type */
  230         uint64_t arcs_size;     /* total amount of data in this state */
  231         kmutex_t arcs_mtx;      /* protects this state's buffer lists */
  232 } arc_state_t;
 233 
 234 /* The 6 states: */
 235 static arc_state_t ARC_anon;
 236 static arc_state_t ARC_mru;
 237 static arc_state_t ARC_mru_ghost;
 238 static arc_state_t ARC_mfu;
 239 static arc_state_t ARC_mfu_ghost;
 240 static arc_state_t ARC_l2c_only;
 241 
  242 typedef struct arc_stats {
  243         kstat_named_t arcstat_hits;
  244         kstat_named_t arcstat_misses;
  245         kstat_named_t arcstat_demand_data_hits;
  246         kstat_named_t arcstat_demand_data_misses;
  247         kstat_named_t arcstat_demand_metadata_hits;
  248         kstat_named_t arcstat_demand_metadata_misses;
  249         kstat_named_t arcstat_prefetch_data_hits;
  250         kstat_named_t arcstat_prefetch_data_misses;
  251         kstat_named_t arcstat_prefetch_metadata_hits;
  252         kstat_named_t arcstat_prefetch_metadata_misses;
  253         kstat_named_t arcstat_mru_hits;
  254         kstat_named_t arcstat_mru_ghost_hits;
  255         kstat_named_t arcstat_mfu_hits;
  256         kstat_named_t arcstat_mfu_ghost_hits;
  257         kstat_named_t arcstat_deleted;
  258         kstat_named_t arcstat_recycle_miss;
  259         /*
  260          * Number of buffers that could not be evicted because the hash lock
  261          * was held by another thread.  The lock may not necessarily be held
  262          * by something using the same buffer, since hash locks are shared
  263          * by multiple buffers.
  264          */
  265         kstat_named_t arcstat_mutex_miss;
  266         /*
  267          * Number of buffers skipped because they have I/O in progress, are
  268          * indirect prefetch buffers that have not lived long enough, or are
  269          * not from the spa we're trying to evict from.
  270          */
  271         kstat_named_t arcstat_evict_skip;
  272         kstat_named_t arcstat_evict_l2_cached;
  273         kstat_named_t arcstat_evict_l2_eligible;
  274         kstat_named_t arcstat_evict_l2_ineligible;
  275         kstat_named_t arcstat_hash_elements;
  276         kstat_named_t arcstat_hash_elements_max;
  277         kstat_named_t arcstat_hash_collisions;
  278         kstat_named_t arcstat_hash_chains;
  279         kstat_named_t arcstat_hash_chain_max;
  280         kstat_named_t arcstat_p;
  281         kstat_named_t arcstat_c;
  282         kstat_named_t arcstat_c_min;
  283         kstat_named_t arcstat_c_max;
  284         kstat_named_t arcstat_size;
  285         kstat_named_t arcstat_hdr_size;
  286         kstat_named_t arcstat_data_size;
  287         kstat_named_t arcstat_other_size;
  288         kstat_named_t arcstat_l2_hits;
  289         kstat_named_t arcstat_l2_misses;
  290         kstat_named_t arcstat_l2_feeds;
  291         kstat_named_t arcstat_l2_rw_clash;
  292         kstat_named_t arcstat_l2_read_bytes;
  293         kstat_named_t arcstat_l2_write_bytes;
  294         kstat_named_t arcstat_l2_writes_sent;
  295         kstat_named_t arcstat_l2_writes_done;
  296         kstat_named_t arcstat_l2_writes_error;
  297         kstat_named_t arcstat_l2_writes_hdr_miss;
  298         kstat_named_t arcstat_l2_evict_lock_retry;
  299         kstat_named_t arcstat_l2_evict_reading;
  300         kstat_named_t arcstat_l2_free_on_write;
  301         kstat_named_t arcstat_l2_abort_lowmem;
  302         kstat_named_t arcstat_l2_cksum_bad;
  303         kstat_named_t arcstat_l2_io_error;
  304         kstat_named_t arcstat_l2_size;
  305         kstat_named_t arcstat_l2_asize;
  306         kstat_named_t arcstat_l2_hdr_size;
  307         kstat_named_t arcstat_l2_compress_successes;
  308         kstat_named_t arcstat_l2_compress_zeros;
  309         kstat_named_t arcstat_l2_compress_failures;
  310         kstat_named_t arcstat_memory_throttle_count;
  311         kstat_named_t arcstat_duplicate_buffers;
  312         kstat_named_t arcstat_duplicate_buffers_size;
  313         kstat_named_t arcstat_duplicate_reads;
  314         kstat_named_t arcstat_meta_used;
  315         kstat_named_t arcstat_meta_limit;
  316         kstat_named_t arcstat_meta_max;
  317 } arc_stats_t;
 318 
  319 static arc_stats_t arc_stats = {        /* positional: arc_stats_t field order */
  320         { "hits",                       KSTAT_DATA_UINT64 },
  321         { "misses",                     KSTAT_DATA_UINT64 },
  322         { "demand_data_hits",           KSTAT_DATA_UINT64 },
  323         { "demand_data_misses",         KSTAT_DATA_UINT64 },
  324         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
  325         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
  326         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
  327         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
  328         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
  329         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
  330         { "mru_hits",                   KSTAT_DATA_UINT64 },
  331         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
  332         { "mfu_hits",                   KSTAT_DATA_UINT64 },
  333         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
  334         { "deleted",                    KSTAT_DATA_UINT64 },
  335         { "recycle_miss",               KSTAT_DATA_UINT64 },
  336         { "mutex_miss",                 KSTAT_DATA_UINT64 },
  337         { "evict_skip",                 KSTAT_DATA_UINT64 },
  338         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
  339         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
  340         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
  341         { "hash_elements",              KSTAT_DATA_UINT64 },
  342         { "hash_elements_max",          KSTAT_DATA_UINT64 },
  343         { "hash_collisions",            KSTAT_DATA_UINT64 },
  344         { "hash_chains",                KSTAT_DATA_UINT64 },
  345         { "hash_chain_max",             KSTAT_DATA_UINT64 },
  346         { "p",                          KSTAT_DATA_UINT64 },
  347         { "c",                          KSTAT_DATA_UINT64 },
  348         { "c_min",                      KSTAT_DATA_UINT64 },
  349         { "c_max",                      KSTAT_DATA_UINT64 },
  350         { "size",                       KSTAT_DATA_UINT64 },
  351         { "hdr_size",                   KSTAT_DATA_UINT64 },
  352         { "data_size",                  KSTAT_DATA_UINT64 },
  353         { "other_size",                 KSTAT_DATA_UINT64 },
  354         { "l2_hits",                    KSTAT_DATA_UINT64 },
  355         { "l2_misses",                  KSTAT_DATA_UINT64 },
  356         { "l2_feeds",                   KSTAT_DATA_UINT64 },
  357         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
  358         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
  359         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
  360         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
  361         { "l2_writes_done",             KSTAT_DATA_UINT64 },
  362         { "l2_writes_error",            KSTAT_DATA_UINT64 },
  363         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
  364         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
  365         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
  366         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
  367         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
  368         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
  369         { "l2_io_error",                KSTAT_DATA_UINT64 },
  370         { "l2_size",                    KSTAT_DATA_UINT64 },
  371         { "l2_asize",                   KSTAT_DATA_UINT64 },
  372         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
  373         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
  374         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
  375         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
  376         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
  377         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
  378         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
  379         { "duplicate_reads",            KSTAT_DATA_UINT64 },
  380         { "arc_meta_used",              KSTAT_DATA_UINT64 },
  381         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
  382         { "arc_meta_max",               KSTAT_DATA_UINT64 }
  383 };
 384 
  385 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64) /* the stat's ui64 lvalue */
  386 /* Atomically add val (may be negative, see BUMPDOWN) to the named stat. */
  387 #define ARCSTAT_INCR(stat, val) \
  388         atomic_add_64(&arc_stats.stat.value.ui64, (val))
  389 /* Atomic increment / decrement of the named stat by one. */
  390 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
  391 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
  392 /* Raise stat to val when val is larger.  NB: val is evaluated repeatedly. */
  393 #define ARCSTAT_MAX(stat, val) {                                        \
  394         uint64_t m;                                                     \
  395         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
  396             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
  397                 continue;       /* CAS did not see m: reload, retry */  \
  398 }
  399 /* Fold the current value of stat into its companion stat##_max. */
  400 #define ARCSTAT_MAXSTAT(stat) \
  401         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 402 
 403 /*
 404  * We define a macro to allow ARC hits/misses to be easily broken down by
 405  * two separate conditions, giving a total of four different subtypes for
 406  * each of hits and misses (so eight statistics total).
 407  */
  408 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
  409         if (cond1) {    /* NB: expands to a bare if/else statement */   \
  410                 if (cond2) {    /* bumps arcstat_<1>_<2>_<stat> */      \
  411                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
  412                 } else {                                                \
  413                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
  414                 }                                                       \
  415         } else {                                                        \
  416                 if (cond2) {                                            \
  417                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
  418                 } else {                                                \
  419                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
  420                 }                                                       \
  421         }
 422 
 423 kstat_t                 *arc_ksp;
 424 static arc_state_t      *arc_anon;
 425 static arc_state_t      *arc_mru;
 426 static arc_state_t      *arc_mru_ghost;
 427 static arc_state_t      *arc_mfu;
 428 static arc_state_t      *arc_mfu_ghost;
 429 static arc_state_t      *arc_l2c_only;
 430 
 431 /*
 432  * There are several ARC variables that are critical to export as kstats --
 433  * but we don't want to have to grovel around in the kstat whenever we wish to
 434  * manipulate them.  For these variables, we therefore define them to be in
 435  * terms of the statistic variable.  This assures that we are not introducing
 436  * the possibility of inconsistency by having shadow copies of the variables,
 437  * while still allowing the code to be readable.
 438  */
 439 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 440 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 441 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 442 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 443 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 444 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 445 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 446 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 447 
 448 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 449         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 450 
 451 static int              arc_no_grow;    /* Don't try to grow cache size */
 452 static uint64_t         arc_tempreserve;
 453 static uint64_t         arc_loaned_bytes;
 454 
 455 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 456 
 457 typedef struct arc_callback arc_callback_t;
 458 
  459 struct arc_callback {
  460         void                    *acb_private;   /* presumably acb_done's arg */
  461         arc_done_func_t         *acb_done;      /* arc_done_func_t callback */
  462         arc_buf_t               *acb_buf;
  463         zio_t                   *acb_zio_dummy;
  464         arc_callback_t          *acb_next;      /* next arc_callback_t in list */
  465 };
 466 
 467 typedef struct arc_write_callback arc_write_callback_t;
 468 
  469 struct arc_write_callback {
  470         void            *awcb_private;  /* presumably the callbacks' argument */
  471         arc_done_func_t *awcb_ready;    /* NOTE(review): "ready" hook; confirm */
  472         arc_done_func_t *awcb_done;     /* NOTE(review): "done" hook; confirm */
  473         arc_buf_t       *awcb_buf;
  474 };
 475 
  476 struct arc_buf_hdr {
  477         /* protected by hash lock */
  478         dva_t                   b_dva;  /* hash key with b_birth and b_spa */
  479         uint64_t                b_birth;
  480         uint64_t                b_cksum0;
  481 
  482         kmutex_t                b_freeze_lock;
  483         zio_cksum_t             *b_freeze_cksum;
  484         void                    *b_thawed;
  485 
  486         arc_buf_hdr_t           *b_hash_next;
  487         arc_buf_t               *b_buf;
  488         uint32_t                b_flags;        /* ARC_* bits; see HDR_*() */
  489         uint32_t                b_datacnt;
  490 
  491         arc_callback_t          *b_acb;
  492         kcondvar_t              b_cv;
  493 
  494         /* immutable */
  495         arc_buf_contents_t      b_type;
  496         uint64_t                b_size;
  497         uint64_t                b_spa;
  498 
  499         /* protected by arc state mutex */
  500         arc_state_t             *b_state;
  501         list_node_t             b_arc_node;
  502 
  503         /* updated atomically */
  504         clock_t                 b_arc_access;
  505 
  506         /* self protecting */
  507         refcount_t              b_refcnt;
  508 
  509         l2arc_buf_hdr_t         *b_l2hdr;       /* may be NULL (HDR_L2_READING) */
  510         list_node_t             b_l2node;
  511 };
 512 
 513 static arc_buf_t *arc_eviction_list;
 514 static kmutex_t arc_eviction_mtx;
 515 static arc_buf_hdr_t arc_eviction_hdr;
 516 static void arc_get_data_buf(arc_buf_t *buf);
 517 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 518 static int arc_evict_needed(arc_buf_contents_t type);
 519 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 520 static void arc_buf_watch(arc_buf_t *buf);
 521 
 522 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 523 
 524 #define GHOST_STATE(state)      \
 525         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 526         (state) == arc_l2c_only)
 527 
 528 /*
 529  * Private ARC flags.  These flags are private ARC only flags that will show up
  530  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 531  * be passed in as arc_flags in things like arc_read.  However, these flags
 532  * should never be passed and should only be set by ARC code.  When adding new
 533  * public flags, make sure not to smash the private ones.
 534  */
 535 
  536 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
  537 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
  538 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
  539 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
  540 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
  541 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
  542 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
  543 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
  544 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
  545 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
  546 /* Flag tests; all but HDR_L2_READING yield the masked bit, not 0/1. */
  547 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
  548 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
  549 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
  550 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
  551 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
  552 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
  553 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
  554 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
  555 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
  556                                     (hdr)->b_l2hdr != NULL)
  557 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
  558 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
  559 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 560 
 561 /*
 562  * Other sizes
 563  */
 564 
 565 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 566 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 567 
 568 /*
 569  * Hash table routines
 570  */
 571 
  572 #define HT_LOCK_PAD     64      /* padded size of one ht_lock, in bytes */
  573 /* One hash-table mutex; kernel builds pad it out to HT_LOCK_PAD bytes. */
  574 struct ht_lock {
  575         kmutex_t        ht_lock;
  576 #ifdef _KERNEL
  577         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; /* needs < PAD */
  578 #endif
  579 };
 580 
  581 #define BUF_LOCKS 256   /* power of two: lock index is masked by BUF_LOCKS-1 */
  582 typedef struct buf_hash_table {
  583         uint64_t ht_mask;               /* ANDed with buf_hash() for the index */
  584         arc_buf_hdr_t **ht_table;       /* presumably bucket heads; confirm */
  585         struct ht_lock ht_locks[BUF_LOCKS];     /* shared across indexes */
  586 } buf_hash_table_t;
 587 
 588 static buf_hash_table_t buf_hash_table;
 589 
 590 #define BUF_HASH_INDEX(spa, dva, birth) \
 591         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 592 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 593 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 594 #define HDR_LOCK(hdr) \
 595         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 596 
 597 uint64_t zfs_crc64_table[256];
 598 
 599 /*
 600  * Level 2 ARC
 601  */
 602 
 603 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 604 #define L2ARC_HEADROOM          2                       /* num of writes */
 605 /*
 606  * If we discover during ARC scan any buffers to be compressed, we boost
 607  * our headroom for the next scanning cycle by this percentage multiple.
 608  */
 609 #define L2ARC_HEADROOM_BOOST    200
 610 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 611 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 612 
 613 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 614 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 615 
 616 /* L2ARC Performance Tunables */
 617 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 618 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 619 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 620 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 621 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 622 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 623 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 624 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 625 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 626 
 627 /*
 628  * L2ARC Internals
 629  */
 630 typedef struct l2arc_dev {
 631         vdev_t                  *l2ad_vdev;     /* vdev */
 632         spa_t                   *l2ad_spa;      /* spa */
 633         uint64_t                l2ad_hand;      /* next write location */
 634         uint64_t                l2ad_start;     /* first addr on device */
 635         uint64_t                l2ad_end;       /* last addr on device */
 636         uint64_t                l2ad_evict;     /* last addr eviction reached */
 637         boolean_t               l2ad_first;     /* first sweep through */
 638         boolean_t               l2ad_writing;   /* currently writing */
 639         list_t                  *l2ad_buflist;  /* buffer list */
 640         list_node_t             l2ad_node;      /* device list node */
 641 } l2arc_dev_t;
 642 
 643 static list_t L2ARC_dev_list;                   /* device list */
 644 static list_t *l2arc_dev_list;                  /* device list pointer */
 645 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 646 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 647 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 648 static list_t L2ARC_free_on_write;              /* free after write buf list */
 649 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 650 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 651 static uint64_t l2arc_ndev;                     /* number of devices */
 652 
/*
 * Context recorded for an L2ARC read: the buffer being read into plus the
 * original request's block pointer, bookmark and flags, and the compression
 * that was applied to the data.
 * NOTE(review): presumably consumed by l2arc_read_done() -- confirm.
 */
typedef struct l2arc_read_callback {
        arc_buf_t               *l2rcb_buf;             /* read buffer */
        spa_t                   *l2rcb_spa;             /* spa */
        blkptr_t                l2rcb_bp;               /* original blkptr */
        zbookmark_t             l2rcb_zb;               /* original bookmark */
        int                     l2rcb_flags;            /* original flags */
        enum zio_compress       l2rcb_compress;         /* applied compress */
} l2arc_read_callback_t;
 661 
/*
 * Context recorded for an L2ARC write: the target device and the header
 * marking the head of the list of buffers being written.
 */
typedef struct l2arc_write_callback {
        l2arc_dev_t     *l2wcb_dev;             /* device info */
        arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
} l2arc_write_callback_t;
 666 
/*
 * L2ARC-specific part of a buffer header, reached through the b_l2hdr
 * pointer of an arc_buf_hdr_t.  Records where on which cache device the
 * buffer lives and how it was compressed there.
 */
struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr  mutex */
        l2arc_dev_t             *b_dev;         /* L2ARC device */
        uint64_t                b_daddr;        /* disk address, offset byte */
        /* compression applied to buffer data */
        enum zio_compress       b_compress;
        /* real alloc'd buffer size depending on b_compress applied */
        int                     b_asize;
        /* temporary buffer holder for in-flight compressed data */
        void                    *b_tmp_cdata;
};
 678 
/*
 * One deferred free request.  arc_buf_data_free() queues one of these on
 * l2arc_free_on_write when the owning header has an L2ARC write in
 * progress, recording the data pointer, its size and the routine that
 * must eventually be called to free it.
 */
typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
        void            (*l2df_func)(void *, size_t);
        list_node_t     l2df_list_node;
} l2arc_data_free_t;
 686 
/*
 * NOTE(review): judging by the names, this lock/cv pair and the exit flag
 * coordinate the L2ARC feed thread -- confirm against its implementation.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

/* Forward declarations for L2ARC routines defined later in this file. */
static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);

static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
    enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 699 
 700 static uint64_t
 701 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 702 {
 703         uint8_t *vdva = (uint8_t *)dva;
 704         uint64_t crc = -1ULL;
 705         int i;
 706 
 707         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 708 
 709         for (i = 0; i < sizeof (dva_t); i++)
 710                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 711 
 712         crc ^= (spa>>8) ^ birth;
 713 
 714         return (crc);
 715 }
 716 
/*
 * True when a header carries no identity, i.e. both DVA words and the
 * birth value are zero (the state buf_discard_identity() leaves behind).
 */
#define BUF_EMPTY(buf)                                          \
        ((buf)->b_dva.dva_word[0] == 0 &&                    \
        (buf)->b_dva.dva_word[1] == 0 &&                     \
        (buf)->b_birth == 0)
 721 
 722 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 723         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 724         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 725         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 726 
 727 static void
 728 buf_discard_identity(arc_buf_hdr_t *hdr)
 729 {
 730         hdr->b_dva.dva_word[0] = 0;
 731         hdr->b_dva.dva_word[1] = 0;
 732         hdr->b_birth = 0;
 733         hdr->b_cksum0 = 0;
 734 }
 735 
 736 static arc_buf_hdr_t *
 737 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 738 {
 739         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 740         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 741         arc_buf_hdr_t *buf;
 742 
 743         mutex_enter(hash_lock);
 744         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 745             buf = buf->b_hash_next) {
 746                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 747                         *lockp = hash_lock;
 748                         return (buf);
 749                 }
 750         }
 751         mutex_exit(hash_lock);
 752         *lockp = NULL;
 753         return (NULL);
 754 }
 755 
 756 /*
 757  * Insert an entry into the hash table.  If there is already an element
 758  * equal to elem in the hash table, then the already existing element
 759  * will be returned and the new element will not be inserted.
 760  * Otherwise returns NULL.
 761  */
 762 static arc_buf_hdr_t *
 763 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 764 {
 765         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 766         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 767         arc_buf_hdr_t *fbuf;
 768         uint32_t i;
 769 
 770         ASSERT(!HDR_IN_HASH_TABLE(buf));
 771         *lockp = hash_lock;
 772         mutex_enter(hash_lock);
 773         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 774             fbuf = fbuf->b_hash_next, i++) {
 775                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 776                         return (fbuf);
 777         }
 778 
 779         buf->b_hash_next = buf_hash_table.ht_table[idx];
 780         buf_hash_table.ht_table[idx] = buf;
 781         buf->b_flags |= ARC_IN_HASH_TABLE;
 782 
 783         /* collect some hash table performance data */
 784         if (i > 0) {
 785                 ARCSTAT_BUMP(arcstat_hash_collisions);
 786                 if (i == 1)
 787                         ARCSTAT_BUMP(arcstat_hash_chains);
 788 
 789                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 790         }
 791 
 792         ARCSTAT_BUMP(arcstat_hash_elements);
 793         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 794 
 795         return (NULL);
 796 }
 797 
 798 static void
 799 buf_hash_remove(arc_buf_hdr_t *buf)
 800 {
 801         arc_buf_hdr_t *fbuf, **bufp;
 802         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 803 
 804         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 805         ASSERT(HDR_IN_HASH_TABLE(buf));
 806 
 807         bufp = &buf_hash_table.ht_table[idx];
 808         while ((fbuf = *bufp) != buf) {
 809                 ASSERT(fbuf != NULL);
 810                 bufp = &fbuf->b_hash_next;
 811         }
 812         *bufp = buf->b_hash_next;
 813         buf->b_hash_next = NULL;
 814         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 815 
 816         /* collect some hash table performance data */
 817         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 818 
 819         if (buf_hash_table.ht_table[idx] &&
 820             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 821                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 822 }
 823 
/*
 * Global data structures and functions for the buf kmem cache.
 *
 * hdr_cache holds arc_buf_hdr_t objects and buf_cache holds arc_buf_t
 * objects; both are created in buf_init() and destroyed in buf_fini().
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;
 829 
 830 static void
 831 buf_fini(void)
 832 {
 833         int i;
 834 
 835         kmem_free(buf_hash_table.ht_table,
 836             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 837         for (i = 0; i < BUF_LOCKS; i++)
 838                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 839         kmem_cache_destroy(hdr_cache);
 840         kmem_cache_destroy(buf_cache);
 841 }
 842 
 843 /*
 844  * Constructor callback - called when the cache is empty
 845  * and a new buf is requested.
 846  */
 847 /* ARGSUSED */
 848 static int
 849 hdr_cons(void *vbuf, void *unused, int kmflag)
 850 {
 851         arc_buf_hdr_t *buf = vbuf;
 852 
 853         bzero(buf, sizeof (arc_buf_hdr_t));
 854         refcount_create(&buf->b_refcnt);
 855         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 856         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 857         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 858 
 859         return (0);
 860 }
 861 
 862 /* ARGSUSED */
 863 static int
 864 buf_cons(void *vbuf, void *unused, int kmflag)
 865 {
 866         arc_buf_t *buf = vbuf;
 867 
 868         bzero(buf, sizeof (arc_buf_t));
 869         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 870         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 871 
 872         return (0);
 873 }
 874 
 875 /*
 876  * Destructor callback - called when a cached buf is
 877  * no longer required.
 878  */
 879 /* ARGSUSED */
 880 static void
 881 hdr_dest(void *vbuf, void *unused)
 882 {
 883         arc_buf_hdr_t *buf = vbuf;
 884 
 885         ASSERT(BUF_EMPTY(buf));
 886         refcount_destroy(&buf->b_refcnt);
 887         cv_destroy(&buf->b_cv);
 888         mutex_destroy(&buf->b_freeze_lock);
 889         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 890 }
 891 
 892 /* ARGSUSED */
 893 static void
 894 buf_dest(void *vbuf, void *unused)
 895 {
 896         arc_buf_t *buf = vbuf;
 897 
 898         mutex_destroy(&buf->b_evict_lock);
 899         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 900 }
 901 
 902 /*
 903  * Reclaim callback -- invoked when memory is low.
 904  */
 905 /* ARGSUSED */
 906 static void
 907 hdr_recl(void *unused)
 908 {
 909         dprintf("hdr_recl called\n");
 910         /*
 911          * umem calls the reclaim func when we destroy the buf cache,
 912          * which is after we do arc_fini().
 913          */
 914         if (!arc_dead)
 915                 cv_signal(&arc_reclaim_thr_cv);
 916 }
 917 
 918 static void
 919 buf_init(void)
 920 {
 921         uint64_t *ct;
 922         uint64_t hsize = 1ULL << 12;
 923         int i, j;
 924 
 925         /*
 926          * The hash table is big enough to fill all of physical memory
 927          * with an average 64K block size.  The table will take up
 928          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 929          */
 930         while (hsize * 65536 < physmem * PAGESIZE)
 931                 hsize <<= 1;
 932 retry:
 933         buf_hash_table.ht_mask = hsize - 1;
 934         buf_hash_table.ht_table =
 935             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 936         if (buf_hash_table.ht_table == NULL) {
 937                 ASSERT(hsize > (1ULL << 8));
 938                 hsize >>= 1;
 939                 goto retry;
 940         }
 941 
 942         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 943             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 944         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 945             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 946 
 947         for (i = 0; i < 256; i++)
 948                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 949                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 950 
 951         for (i = 0; i < BUF_LOCKS; i++) {
 952                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 953                     NULL, MUTEX_DEFAULT, NULL);
 954         }
 955 }
 956 
/* hz/16 ticks; with hz ticks per second this is 1/16 s */
#define ARC_MINTIME     (hz>>4) /* 62 ms */
 958 
 959 static void
 960 arc_cksum_verify(arc_buf_t *buf)
 961 {
 962         zio_cksum_t zc;
 963 
 964         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 965                 return;
 966 
 967         mutex_enter(&buf->b_hdr->b_freeze_lock);
 968         if (buf->b_hdr->b_freeze_cksum == NULL ||
 969             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 970                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 971                 return;
 972         }
 973         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 974         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 975                 panic("buffer modified while frozen!");
 976         mutex_exit(&buf->b_hdr->b_freeze_lock);
 977 }
 978 
 979 static int
 980 arc_cksum_equal(arc_buf_t *buf)
 981 {
 982         zio_cksum_t zc;
 983         int equal;
 984 
 985         mutex_enter(&buf->b_hdr->b_freeze_lock);
 986         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 987         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 988         mutex_exit(&buf->b_hdr->b_freeze_lock);
 989 
 990         return (equal);
 991 }
 992 
 993 static void
 994 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 995 {
 996         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 997                 return;
 998 
 999         mutex_enter(&buf->b_hdr->b_freeze_lock);
1000         if (buf->b_hdr->b_freeze_cksum != NULL) {
1001                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1002                 return;
1003         }
1004         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1005         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1006             buf->b_hdr->b_freeze_cksum);
1007         mutex_exit(&buf->b_hdr->b_freeze_lock);
1008         arc_buf_watch(buf);
1009 }
1010 
#ifndef _KERNEL
/*
 * Control message written to arc_procfd by arc_buf_watch() and
 * arc_buf_unwatch(): a command word followed by the watch description.
 */
typedef struct procctl {
        long cmd;
        prwatch_t prwatch;
} procctl_t;
#endif
1017 
1018 /* ARGSUSED */
1019 static void
1020 arc_buf_unwatch(arc_buf_t *buf)
1021 {
1022 #ifndef _KERNEL
1023         if (arc_watch) {
1024                 int result;
1025                 procctl_t ctl;
1026                 ctl.cmd = PCWATCH;
1027                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1028                 ctl.prwatch.pr_size = 0;
1029                 ctl.prwatch.pr_wflags = 0;
1030                 result = write(arc_procfd, &ctl, sizeof (ctl));
1031                 ASSERT3U(result, ==, sizeof (ctl));
1032         }
1033 #endif
1034 }
1035 
1036 /* ARGSUSED */
1037 static void
1038 arc_buf_watch(arc_buf_t *buf)
1039 {
1040 #ifndef _KERNEL
1041         if (arc_watch) {
1042                 int result;
1043                 procctl_t ctl;
1044                 ctl.cmd = PCWATCH;
1045                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1046                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1047                 ctl.prwatch.pr_wflags = WA_WRITE;
1048                 result = write(arc_procfd, &ctl, sizeof (ctl));
1049                 ASSERT3U(result, ==, sizeof (ctl));
1050         }
1051 #endif
1052 }
1053 
1054 void
1055 arc_buf_thaw(arc_buf_t *buf)
1056 {
1057         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1058                 if (buf->b_hdr->b_state != arc_anon)
1059                         panic("modifying non-anon buffer!");
1060                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1061                         panic("modifying buffer while i/o in progress!");
1062                 arc_cksum_verify(buf);
1063         }
1064 
1065         mutex_enter(&buf->b_hdr->b_freeze_lock);
1066         if (buf->b_hdr->b_freeze_cksum != NULL) {
1067                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1068                 buf->b_hdr->b_freeze_cksum = NULL;
1069         }
1070 
1071         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1072                 if (buf->b_hdr->b_thawed)
1073                         kmem_free(buf->b_hdr->b_thawed, 1);
1074                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1075         }
1076 
1077         mutex_exit(&buf->b_hdr->b_freeze_lock);
1078 
1079         arc_buf_unwatch(buf);
1080 }
1081 
1082 void
1083 arc_buf_freeze(arc_buf_t *buf)
1084 {
1085         kmutex_t *hash_lock;
1086 
1087         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1088                 return;
1089 
1090         hash_lock = HDR_LOCK(buf->b_hdr);
1091         mutex_enter(hash_lock);
1092 
1093         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1094             buf->b_hdr->b_state == arc_anon);
1095         arc_cksum_compute(buf, B_FALSE);
1096         mutex_exit(hash_lock);
1097 
1098 }
1099 
1100 static void
1101 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1102 {
1103         ASSERT(MUTEX_HELD(hash_lock));
1104 
1105         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1106             (ab->b_state != arc_anon)) {
1107                 uint64_t delta = ab->b_size * ab->b_datacnt;
1108                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1109                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1110 
1111                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1112                 mutex_enter(&ab->b_state->arcs_mtx);
1113                 ASSERT(list_link_active(&ab->b_arc_node));
1114                 list_remove(list, ab);
1115                 if (GHOST_STATE(ab->b_state)) {
1116                         ASSERT0(ab->b_datacnt);
1117                         ASSERT3P(ab->b_buf, ==, NULL);
1118                         delta = ab->b_size;
1119                 }
1120                 ASSERT(delta > 0);
1121                 ASSERT3U(*size, >=, delta);
1122                 atomic_add_64(size, -delta);
1123                 mutex_exit(&ab->b_state->arcs_mtx);
1124                 /* remove the prefetch flag if we get a reference */
1125                 if (ab->b_flags & ARC_PREFETCH)
1126                         ab->b_flags &= ~ARC_PREFETCH;
1127         }
1128 }
1129 
1130 static int
1131 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1132 {
1133         int cnt;
1134         arc_state_t *state = ab->b_state;
1135 
1136         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1137         ASSERT(!GHOST_STATE(state));
1138 
1139         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1140             (state != arc_anon)) {
1141                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1142 
1143                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1144                 mutex_enter(&state->arcs_mtx);
1145                 ASSERT(!list_link_active(&ab->b_arc_node));
1146                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1147                 ASSERT(ab->b_datacnt > 0);
1148                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1149                 mutex_exit(&state->arcs_mtx);
1150         }
1151         return (cnt);
1152 }
1153 
/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 *
 * When the header has no references it is also moved between the two
 * states' lists, and each list's size (arcs_lsize) is adjusted.
 * The states' total sizes (arcs_size) are adjusted regardless of the
 * reference count.  A header moving to arc_anon is removed from the hash
 * table, and the l2arc header statistics are updated when entering or
 * leaving arc_l2c_only.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
        arc_state_t *old_state = ab->b_state;
        int64_t refcnt = refcount_count(&ab->b_refcnt);
        uint64_t from_delta, to_delta;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(new_state != old_state);
        ASSERT(refcnt == 0 || ab->b_datacnt > 0);
        ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
        ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);

        /* default accounting: the bytes of data actually attached */
        from_delta = to_delta = ab->b_datacnt * ab->b_size;

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon) {
                        /* take arcs_mtx only if it is not already held */
                        int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
                        uint64_t *size = &old_state->arcs_lsize[ab->b_type];

                        if (use_mutex)
                                mutex_enter(&old_state->arcs_mtx);

                        ASSERT(list_link_active(&ab->b_arc_node));
                        list_remove(&old_state->arcs_list[ab->b_type], ab);

                        /*
                         * If prefetching out of the ghost cache,
                         * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                /* ghost elements have a ghost size */
                                ASSERT(ab->b_buf == NULL);
                                from_delta = ab->b_size;
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);

                        if (use_mutex)
                                mutex_exit(&old_state->arcs_mtx);
                }
                if (new_state != arc_anon) {
                        /* take arcs_mtx only if it is not already held */
                        int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
                        uint64_t *size = &new_state->arcs_lsize[ab->b_type];

                        if (use_mutex)
                                mutex_enter(&new_state->arcs_mtx);

                        list_insert_head(&new_state->arcs_list[ab->b_type], ab);

                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
                                ASSERT(ab->b_datacnt == 0);
                                ASSERT(ab->b_buf == NULL);
                                to_delta = ab->b_size;
                        }
                        atomic_add_64(size, to_delta);

                        if (use_mutex)
                                mutex_exit(&new_state->arcs_mtx);
                }
        }

        ASSERT(!BUF_EMPTY(ab));
        /* anonymous headers are not kept in the hash table */
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                buf_hash_remove(ab);

        /* adjust state sizes */
        if (to_delta)
                atomic_add_64(&new_state->arcs_size, to_delta);
        if (from_delta) {
                ASSERT3U(old_state->arcs_size, >=, from_delta);
                atomic_add_64(&old_state->arcs_size, -from_delta);
        }
        ab->b_state = new_state;

        /* adjust l2arc hdr stats */
        if (new_state == arc_l2c_only)
                l2arc_hdr_stat_add();
        else if (old_state == arc_l2c_only)
                l2arc_hdr_stat_remove();
}
1244 
1245 void
1246 arc_space_consume(uint64_t space, arc_space_type_t type)
1247 {
1248         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1249 
1250         switch (type) {
1251         case ARC_SPACE_DATA:
1252                 ARCSTAT_INCR(arcstat_data_size, space);
1253                 break;
1254         case ARC_SPACE_OTHER:
1255                 ARCSTAT_INCR(arcstat_other_size, space);
1256                 break;
1257         case ARC_SPACE_HDRS:
1258                 ARCSTAT_INCR(arcstat_hdr_size, space);
1259                 break;
1260         case ARC_SPACE_L2HDRS:
1261                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1262                 break;
1263         }
1264 
1265         ARCSTAT_INCR(arcstat_meta_used, space);
1266         atomic_add_64(&arc_size, space);
1267 }
1268 
1269 void
1270 arc_space_return(uint64_t space, arc_space_type_t type)
1271 {
1272         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1273 
1274         switch (type) {
1275         case ARC_SPACE_DATA:
1276                 ARCSTAT_INCR(arcstat_data_size, -space);
1277                 break;
1278         case ARC_SPACE_OTHER:
1279                 ARCSTAT_INCR(arcstat_other_size, -space);
1280                 break;
1281         case ARC_SPACE_HDRS:
1282                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1283                 break;
1284         case ARC_SPACE_L2HDRS:
1285                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1286                 break;
1287         }
1288 
1289         ASSERT(arc_meta_used >= space);
1290         if (arc_meta_max < arc_meta_used)
1291                 arc_meta_max = arc_meta_used;
1292         ARCSTAT_INCR(arcstat_meta_used, -space);
1293         ASSERT(arc_size >= space);
1294         atomic_add_64(&arc_size, -space);
1295 }
1296 
1297 void *
1298 arc_data_buf_alloc(uint64_t size)
1299 {
1300         if (arc_evict_needed(ARC_BUFC_DATA))
1301                 cv_signal(&arc_reclaim_thr_cv);
1302         atomic_add_64(&arc_size, size);
1303         return (zio_data_buf_alloc(size));
1304 }
1305 
/*
 * Free a data buffer of `size` bytes via zio_data_buf_free() and
 * subtract its size from arc_size.
 */
void
arc_data_buf_free(void *buf, uint64_t size)
{
        zio_data_buf_free(buf, size);
        ASSERT(arc_size >= size);
        atomic_add_64(&arc_size, -size);
}
1313 
1314 arc_buf_t *
1315 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1316 {
1317         arc_buf_hdr_t *hdr;
1318         arc_buf_t *buf;
1319 
1320         ASSERT3U(size, >, 0);
1321         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1322         ASSERT(BUF_EMPTY(hdr));
1323         hdr->b_size = size;
1324         hdr->b_type = type;
1325         hdr->b_spa = spa_load_guid(spa);
1326         hdr->b_state = arc_anon;
1327         hdr->b_arc_access = 0;
1328         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1329         buf->b_hdr = hdr;
1330         buf->b_data = NULL;
1331         buf->b_efunc = NULL;
1332         buf->b_private = NULL;
1333         buf->b_next = NULL;
1334         hdr->b_buf = buf;
1335         arc_get_data_buf(buf);
1336         hdr->b_datacnt = 1;
1337         hdr->b_flags = 0;
1338         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1339         (void) refcount_add(&hdr->b_refcnt, tag);
1340 
1341         return (buf);
1342 }
1343 
/* reference tag that holds a buffer while it is on loan */
static char *arc_onloan_tag = "onloan";
1345 
1346 /*
1347  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1348  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1349  * buffers must be returned to the arc before they can be used by the DMU or
1350  * freed.
1351  */
1352 arc_buf_t *
1353 arc_loan_buf(spa_t *spa, int size)
1354 {
1355         arc_buf_t *buf;
1356 
1357         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1358 
1359         atomic_add_64(&arc_loaned_bytes, size);
1360         return (buf);
1361 }
1362 
1363 /*
1364  * Return a loaned arc buffer to the arc.
1365  */
1366 void
1367 arc_return_buf(arc_buf_t *buf, void *tag)
1368 {
1369         arc_buf_hdr_t *hdr = buf->b_hdr;
1370 
1371         ASSERT(buf->b_data != NULL);
1372         (void) refcount_add(&hdr->b_refcnt, tag);
1373         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1374 
1375         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1376 }
1377 
1378 /* Detach an arc_buf from a dbuf (tag) */
1379 void
1380 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1381 {
1382         arc_buf_hdr_t *hdr;
1383 
1384         ASSERT(buf->b_data != NULL);
1385         hdr = buf->b_hdr;
1386         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1387         (void) refcount_remove(&hdr->b_refcnt, tag);
1388         buf->b_efunc = NULL;
1389         buf->b_private = NULL;
1390 
1391         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1392 }
1393 
1394 static arc_buf_t *
1395 arc_buf_clone(arc_buf_t *from)
1396 {
1397         arc_buf_t *buf;
1398         arc_buf_hdr_t *hdr = from->b_hdr;
1399         uint64_t size = hdr->b_size;
1400 
1401         ASSERT(hdr->b_state != arc_anon);
1402 
1403         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1404         buf->b_hdr = hdr;
1405         buf->b_data = NULL;
1406         buf->b_efunc = NULL;
1407         buf->b_private = NULL;
1408         buf->b_next = hdr->b_buf;
1409         hdr->b_buf = buf;
1410         arc_get_data_buf(buf);
1411         bcopy(from->b_data, buf->b_data, size);
1412 
1413         /*
1414          * This buffer already exists in the arc so create a duplicate
1415          * copy for the caller.  If the buffer is associated with user data
1416          * then track the size and number of duplicates.  These stats will be
1417          * updated as duplicate buffers are created and destroyed.
1418          */
1419         if (hdr->b_type == ARC_BUFC_DATA) {
1420                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1421                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1422         }
1423         hdr->b_datacnt += 1;
1424         return (buf);
1425 }
1426 
1427 void
1428 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1429 {
1430         arc_buf_hdr_t *hdr;
1431         kmutex_t *hash_lock;
1432 
1433         /*
1434          * Check to see if this buffer is evicted.  Callers
1435          * must verify b_data != NULL to know if the add_ref
1436          * was successful.
1437          */
1438         mutex_enter(&buf->b_evict_lock);
1439         if (buf->b_data == NULL) {
1440                 mutex_exit(&buf->b_evict_lock);
1441                 return;
1442         }
1443         hash_lock = HDR_LOCK(buf->b_hdr);
1444         mutex_enter(hash_lock);
1445         hdr = buf->b_hdr;
1446         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1447         mutex_exit(&buf->b_evict_lock);
1448 
1449         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1450         add_reference(hdr, hash_lock, tag);
1451         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1452         arc_access(hdr, hash_lock);
1453         mutex_exit(hash_lock);
1454         ARCSTAT_BUMP(arcstat_hits);
1455         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1456             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1457             data, metadata, hits);
1458 }
1459 
1460 /*
1461  * Free the arc data buffer.  If it is an l2arc write in progress,
1462  * the buffer is placed on l2arc_free_on_write to be freed later.
1463  */
1464 static void
1465 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1466 {
1467         arc_buf_hdr_t *hdr = buf->b_hdr;
1468 
1469         if (HDR_L2_WRITING(hdr)) {
1470                 l2arc_data_free_t *df;
1471                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1472                 df->l2df_data = buf->b_data;
1473                 df->l2df_size = hdr->b_size;
1474                 df->l2df_func = free_func;
1475                 mutex_enter(&l2arc_free_on_write_mtx);
1476                 list_insert_head(l2arc_free_on_write, df);
1477                 mutex_exit(&l2arc_free_on_write_mtx);
1478                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1479         } else {
1480                 free_func(buf->b_data, hdr->b_size);
1481         }
1482 }
1483 
/*
 * Release the data attached to a buf and, optionally, the buf itself.
 *
 * recycle: when set, the data is not freed here and arc_size /
 *          data-size accounting is left alone; the state sizes are still
 *          reduced.  NOTE(review): presumably the caller reuses the data
 *          buffer -- confirm against callers.
 * all:     when set, the buf is also unlinked from its header's buf list
 *          and returned to buf_cache; otherwise only the data is dropped.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
        arc_buf_t **bufp;

        /* free up data associated with the buf */
        if (buf->b_data) {
                arc_state_t *state = buf->b_hdr->b_state;
                uint64_t size = buf->b_hdr->b_size;
                arc_buf_contents_t type = buf->b_hdr->b_type;

                arc_cksum_verify(buf);
                arc_buf_unwatch(buf);

                if (!recycle) {
                        if (type == ARC_BUFC_METADATA) {
                                arc_buf_data_free(buf, zio_buf_free);
                                arc_space_return(size, ARC_SPACE_DATA);
                        } else {
                                ASSERT(type == ARC_BUFC_DATA);
                                arc_buf_data_free(buf, zio_data_buf_free);
                                ARCSTAT_INCR(arcstat_data_size, -size);
                                atomic_add_64(&arc_size, -size);
                        }
                }
                /* a header on a state list also counts in arcs_lsize */
                if (list_link_active(&buf->b_hdr->b_arc_node)) {
                        uint64_t *cnt = &state->arcs_lsize[type];

                        ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
                        ASSERT(state != arc_anon);

                        ASSERT3U(*cnt, >=, size);
                        atomic_add_64(cnt, -size);
                }
                ASSERT3U(state->arcs_size, >=, size);
                atomic_add_64(&state->arcs_size, -size);
                buf->b_data = NULL;

                /*
                 * If we're destroying a duplicate buffer make sure
                 * that the appropriate statistics are updated.
                 */
                if (buf->b_hdr->b_datacnt > 1 &&
                    buf->b_hdr->b_type == ARC_BUFC_DATA) {
                        ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
                        ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
                }
                ASSERT(buf->b_hdr->b_datacnt > 0);
                buf->b_hdr->b_datacnt -= 1;
        }

        /* only remove the buf if requested */
        if (!all)
                return;

        /* remove the buf from the hdr list */
        for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
                continue;
        *bufp = buf->b_next;
        buf->b_next = NULL;

        ASSERT(buf->b_efunc == NULL);

        /* clean up the buf */
        buf->b_hdr = NULL;
        kmem_cache_free(buf_cache, buf);
}
1551 
1552 static void
1553 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1554 {
1555         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1556         ASSERT3P(hdr->b_state, ==, arc_anon);
1557         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1558         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1559 
1560         if (l2hdr != NULL) {
1561                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1562                 /*
1563                  * To prevent arc_free() and l2arc_evict() from
1564                  * attempting to free the same buffer at the same time,
1565                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1566                  * give it priority.  l2arc_evict() can't destroy this
1567                  * header while we are waiting on l2arc_buflist_mtx.
1568                  *
1569                  * The hdr may be removed from l2ad_buflist before we
1570                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1571                  */
1572                 if (!buflist_held) {
1573                         mutex_enter(&l2arc_buflist_mtx);
1574                         l2hdr = hdr->b_l2hdr;
1575                 }
1576 
1577                 if (l2hdr != NULL) {
1578                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1579                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1580                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1581                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1582                         if (hdr->b_state == arc_l2c_only)
1583                                 l2arc_hdr_stat_remove();
1584                         hdr->b_l2hdr = NULL;
1585                 }
1586 
1587                 if (!buflist_held)
1588                         mutex_exit(&l2arc_buflist_mtx);
1589         }
1590 
1591         if (!BUF_EMPTY(hdr)) {
1592                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1593                 buf_discard_identity(hdr);
1594         }
1595         while (hdr->b_buf) {
1596                 arc_buf_t *buf = hdr->b_buf;
1597 
1598                 if (buf->b_efunc) {
1599                         mutex_enter(&arc_eviction_mtx);
1600                         mutex_enter(&buf->b_evict_lock);
1601                         ASSERT(buf->b_hdr != NULL);
1602                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1603                         hdr->b_buf = buf->b_next;
1604                         buf->b_hdr = &arc_eviction_hdr;
1605                         buf->b_next = arc_eviction_list;
1606                         arc_eviction_list = buf;
1607                         mutex_exit(&buf->b_evict_lock);
1608                         mutex_exit(&arc_eviction_mtx);
1609                 } else {
1610                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1611                 }
1612         }
1613         if (hdr->b_freeze_cksum != NULL) {
1614                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1615                 hdr->b_freeze_cksum = NULL;
1616         }
1617         if (hdr->b_thawed) {
1618                 kmem_free(hdr->b_thawed, 1);
1619                 hdr->b_thawed = NULL;
1620         }
1621 
1622         ASSERT(!list_link_active(&hdr->b_arc_node));
1623         ASSERT3P(hdr->b_hash_next, ==, NULL);
1624         ASSERT3P(hdr->b_acb, ==, NULL);
1625         kmem_cache_free(hdr_cache, hdr);
1626 }
1627 
1628 void
1629 arc_buf_free(arc_buf_t *buf, void *tag)
1630 {
1631         arc_buf_hdr_t *hdr = buf->b_hdr;
1632         int hashed = hdr->b_state != arc_anon;
1633 
1634         ASSERT(buf->b_efunc == NULL);
1635         ASSERT(buf->b_data != NULL);
1636 
1637         if (hashed) {
1638                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1639 
1640                 mutex_enter(hash_lock);
1641                 hdr = buf->b_hdr;
1642                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1643 
1644                 (void) remove_reference(hdr, hash_lock, tag);
1645                 if (hdr->b_datacnt > 1) {
1646                         arc_buf_destroy(buf, FALSE, TRUE);
1647                 } else {
1648                         ASSERT(buf == hdr->b_buf);
1649                         ASSERT(buf->b_efunc == NULL);
1650                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1651                 }
1652                 mutex_exit(hash_lock);
1653         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1654                 int destroy_hdr;
1655                 /*
1656                  * We are in the middle of an async write.  Don't destroy
1657                  * this buffer unless the write completes before we finish
1658                  * decrementing the reference count.
1659                  */
1660                 mutex_enter(&arc_eviction_mtx);
1661                 (void) remove_reference(hdr, NULL, tag);
1662                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1663                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1664                 mutex_exit(&arc_eviction_mtx);
1665                 if (destroy_hdr)
1666                         arc_hdr_destroy(hdr);
1667         } else {
1668                 if (remove_reference(hdr, NULL, tag) > 0)
1669                         arc_buf_destroy(buf, FALSE, TRUE);
1670                 else
1671                         arc_hdr_destroy(hdr);
1672         }
1673 }
1674 
1675 boolean_t
1676 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1677 {
1678         arc_buf_hdr_t *hdr = buf->b_hdr;
1679         kmutex_t *hash_lock = HDR_LOCK(hdr);
1680         boolean_t no_callback = (buf->b_efunc == NULL);
1681 
1682         if (hdr->b_state == arc_anon) {
1683                 ASSERT(hdr->b_datacnt == 1);
1684                 arc_buf_free(buf, tag);
1685                 return (no_callback);
1686         }
1687 
1688         mutex_enter(hash_lock);
1689         hdr = buf->b_hdr;
1690         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1691         ASSERT(hdr->b_state != arc_anon);
1692         ASSERT(buf->b_data != NULL);
1693 
1694         (void) remove_reference(hdr, hash_lock, tag);
1695         if (hdr->b_datacnt > 1) {
1696                 if (no_callback)
1697                         arc_buf_destroy(buf, FALSE, TRUE);
1698         } else if (no_callback) {
1699                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1700                 ASSERT(buf->b_efunc == NULL);
1701                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1702         }
1703         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1704             refcount_is_zero(&hdr->b_refcnt));
1705         mutex_exit(hash_lock);
1706         return (no_callback);
1707 }
1708 
1709 int
1710 arc_buf_size(arc_buf_t *buf)
1711 {
1712         return (buf->b_hdr->b_size);
1713 }
1714 
1715 /*
1716  * Called from the DMU to determine if the current buffer should be
1717  * evicted. In order to ensure proper locking, the eviction must be initiated
1718  * from the DMU. Return true if the buffer is associated with user data and
1719  * duplicate buffers still exist.
1720  */
1721 boolean_t
1722 arc_buf_eviction_needed(arc_buf_t *buf)
1723 {
1724         arc_buf_hdr_t *hdr;
1725         boolean_t evict_needed = B_FALSE;
1726 
1727         if (zfs_disable_dup_eviction)
1728                 return (B_FALSE);
1729 
1730         mutex_enter(&buf->b_evict_lock);
1731         hdr = buf->b_hdr;
1732         if (hdr == NULL) {
1733                 /*
1734                  * We are in arc_do_user_evicts(); let that function
1735                  * perform the eviction.
1736                  */
1737                 ASSERT(buf->b_data == NULL);
1738                 mutex_exit(&buf->b_evict_lock);
1739                 return (B_FALSE);
1740         } else if (buf->b_data == NULL) {
1741                 /*
1742                  * We have already been added to the arc eviction list;
1743                  * recommend eviction.
1744                  */
1745                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1746                 mutex_exit(&buf->b_evict_lock);
1747                 return (B_TRUE);
1748         }
1749 
1750         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1751                 evict_needed = B_TRUE;
1752 
1753         mutex_exit(&buf->b_evict_lock);
1754         return (evict_needed);
1755 }
1756 
1757 /*
1758  * Evict buffers from list until we've removed the specified number of
1759  * bytes.  Move the removed buffers to the appropriate evict state.
1760  * If the recycle flag is set, then attempt to "recycle" a buffer:
1761  * - look for a buffer to evict that is `bytes' long.
1762  * - return the data block from this buffer rather than freeing it.
1763  * This flag is used by callers that are trying to make space for a
1764  * new buffer in a full arc cache.
1765  *
1766  * This function makes a "best effort".  It skips over any buffers
1767  * it can't get a hash_lock on, and so may not catch all candidates.
1768  * It may also return without evicting as much space as requested.
1769  */
1770 static void *
1771 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1772     arc_buf_contents_t type)
1773 {
1774         arc_state_t *evicted_state;
1775         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1776         arc_buf_hdr_t *ab, *ab_prev = NULL;
1777         list_t *list = &state->arcs_list[type];
1778         kmutex_t *hash_lock;
1779         boolean_t have_lock;
1780         void *stolen = NULL;
1781 
1782         ASSERT(state == arc_mru || state == arc_mfu);
1783 
1784         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1785 
1786         mutex_enter(&state->arcs_mtx);
1787         mutex_enter(&evicted_state->arcs_mtx);
1788 
1789         for (ab = list_tail(list); ab; ab = ab_prev) {
1790                 ab_prev = list_prev(list, ab);
1791                 /* prefetch buffers have a minimum lifespan */
1792                 if (HDR_IO_IN_PROGRESS(ab) ||
1793                     (spa && ab->b_spa != spa) ||
1794                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1795                     ddi_get_lbolt() - ab->b_arc_access <
1796                     arc_min_prefetch_lifespan)) {
1797                         skipped++;
1798                         continue;
1799                 }
1800                 /* "lookahead" for better eviction candidate */
1801                 if (recycle && ab->b_size != bytes &&
1802                     ab_prev && ab_prev->b_size == bytes)
1803                         continue;
1804                 hash_lock = HDR_LOCK(ab);
1805                 have_lock = MUTEX_HELD(hash_lock);
1806                 if (have_lock || mutex_tryenter(hash_lock)) {
1807                         ASSERT0(refcount_count(&ab->b_refcnt));
1808                         ASSERT(ab->b_datacnt > 0);
1809                         while (ab->b_buf) {
1810                                 arc_buf_t *buf = ab->b_buf;
1811                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1812                                         missed += 1;
1813                                         break;
1814                                 }
1815                                 if (buf->b_data) {
1816                                         bytes_evicted += ab->b_size;
1817                                         if (recycle && ab->b_type == type &&
1818                                             ab->b_size == bytes &&
1819                                             !HDR_L2_WRITING(ab)) {
1820                                                 stolen = buf->b_data;
1821                                                 recycle = FALSE;
1822                                         }
1823                                 }
1824                                 if (buf->b_efunc) {
1825                                         mutex_enter(&arc_eviction_mtx);
1826                                         arc_buf_destroy(buf,
1827                                             buf->b_data == stolen, FALSE);
1828                                         ab->b_buf = buf->b_next;
1829                                         buf->b_hdr = &arc_eviction_hdr;
1830                                         buf->b_next = arc_eviction_list;
1831                                         arc_eviction_list = buf;
1832                                         mutex_exit(&arc_eviction_mtx);
1833                                         mutex_exit(&buf->b_evict_lock);
1834                                 } else {
1835                                         mutex_exit(&buf->b_evict_lock);
1836                                         arc_buf_destroy(buf,
1837                                             buf->b_data == stolen, TRUE);
1838                                 }
1839                         }
1840 
1841                         if (ab->b_l2hdr) {
1842                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1843                                     ab->b_size);
1844                         } else {
1845                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1846                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1847                                             ab->b_size);
1848                                 } else {
1849                                         ARCSTAT_INCR(
1850                                             arcstat_evict_l2_ineligible,
1851                                             ab->b_size);
1852                                 }
1853                         }
1854 
1855                         if (ab->b_datacnt == 0) {
1856                                 arc_change_state(evicted_state, ab, hash_lock);
1857                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1858                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1859                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1860                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1861                         }
1862                         if (!have_lock)
1863                                 mutex_exit(hash_lock);
1864                         if (bytes >= 0 && bytes_evicted >= bytes)
1865                                 break;
1866                 } else {
1867                         missed += 1;
1868                 }
1869         }
1870 
1871         mutex_exit(&evicted_state->arcs_mtx);
1872         mutex_exit(&state->arcs_mtx);
1873 
1874         if (bytes_evicted < bytes)
1875                 dprintf("only evicted %lld bytes from %x",
1876                     (longlong_t)bytes_evicted, state);
1877 
1878         if (skipped)
1879                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1880 
1881         if (missed)
1882                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1883 
1884         /*
1885          * We have just evicted some data into the ghost state, make
1886          * sure we also adjust the ghost state size if necessary.
1887          */
1888         if (arc_no_grow &&
1889             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1890                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1891                     arc_mru_ghost->arcs_size - arc_c;
1892 
1893                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1894                         int64_t todelete =
1895                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1896                         arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1897                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1898                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1899                             arc_mru_ghost->arcs_size +
1900                             arc_mfu_ghost->arcs_size - arc_c);
1901                         arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1902                 }
1903         }
1904 
1905         return (stolen);
1906 }
1907 
1908 /*
1909  * Remove buffers from list until we've removed the specified number of
1910  * bytes.  Destroy the buffers that are removed.
1911  */
1912 static void
1913 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1914 {
1915         arc_buf_hdr_t *ab, *ab_prev;
1916         arc_buf_hdr_t marker = { 0 };
1917         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1918         kmutex_t *hash_lock;
1919         uint64_t bytes_deleted = 0;
1920         uint64_t bufs_skipped = 0;
1921 
1922         ASSERT(GHOST_STATE(state));
1923 top:
1924         mutex_enter(&state->arcs_mtx);
1925         for (ab = list_tail(list); ab; ab = ab_prev) {
1926                 ab_prev = list_prev(list, ab);
1927                 if (spa && ab->b_spa != spa)
1928                         continue;
1929 
1930                 /* ignore markers */
1931                 if (ab->b_spa == 0)
1932                         continue;
1933 
1934                 hash_lock = HDR_LOCK(ab);
1935                 /* caller may be trying to modify this buffer, skip it */
1936                 if (MUTEX_HELD(hash_lock))
1937                         continue;
1938                 if (mutex_tryenter(hash_lock)) {
1939                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1940                         ASSERT(ab->b_buf == NULL);
1941                         ARCSTAT_BUMP(arcstat_deleted);
1942                         bytes_deleted += ab->b_size;
1943 
1944                         if (ab->b_l2hdr != NULL) {
1945                                 /*
1946                                  * This buffer is cached on the 2nd Level ARC;
1947                                  * don't destroy the header.
1948                                  */
1949                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1950                                 mutex_exit(hash_lock);
1951                         } else {
1952                                 arc_change_state(arc_anon, ab, hash_lock);
1953                                 mutex_exit(hash_lock);
1954                                 arc_hdr_destroy(ab);
1955                         }
1956 
1957                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1958                         if (bytes >= 0 && bytes_deleted >= bytes)
1959                                 break;
1960                 } else if (bytes < 0) {
1961                         /*
1962                          * Insert a list marker and then wait for the
1963                          * hash lock to become available. Once its
1964                          * available, restart from where we left off.
1965                          */
1966                         list_insert_after(list, ab, &marker);
1967                         mutex_exit(&state->arcs_mtx);
1968                         mutex_enter(hash_lock);
1969                         mutex_exit(hash_lock);
1970                         mutex_enter(&state->arcs_mtx);
1971                         ab_prev = list_prev(list, &marker);
1972                         list_remove(list, &marker);
1973                 } else
1974                         bufs_skipped += 1;
1975         }
1976         mutex_exit(&state->arcs_mtx);
1977 
1978         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1979             (bytes < 0 || bytes_deleted < bytes)) {
1980                 list = &state->arcs_list[ARC_BUFC_METADATA];
1981                 goto top;
1982         }
1983 
1984         if (bufs_skipped) {
1985                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1986                 ASSERT(bytes >= 0);
1987         }
1988 
1989         if (bytes_deleted < bytes)
1990                 dprintf("only deleted %lld bytes from %p",
1991                     (longlong_t)bytes_deleted, state);
1992 }
1993 
1994 static void
1995 arc_adjust(void)
1996 {
1997         int64_t adjustment, delta;
1998 
1999         /*
2000          * Adjust MRU size
2001          */
2002 
2003         adjustment = MIN((int64_t)(arc_size - arc_c),
2004             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2005             arc_p));
2006 
2007         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2008                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2009                 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2010                 adjustment -= delta;
2011         }
2012 
2013         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2014                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2015                 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2016                     ARC_BUFC_METADATA);
2017         }
2018 
2019         /*
2020          * Adjust MFU size
2021          */
2022 
2023         adjustment = arc_size - arc_c;
2024 
2025         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2026                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2027                 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2028                 adjustment -= delta;
2029         }
2030 
2031         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2032                 int64_t delta = MIN(adjustment,
2033                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2034                 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2035                     ARC_BUFC_METADATA);
2036         }
2037 
2038         /*
2039          * Adjust ghost lists
2040          */
2041 
2042         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2043 
2044         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2045                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2046                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2047         }
2048 
2049         adjustment =
2050             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2051 
2052         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2053                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2054                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2055         }
2056 }
2057 
2058 static void
2059 arc_do_user_evicts(void)
2060 {
2061         mutex_enter(&arc_eviction_mtx);
2062         while (arc_eviction_list != NULL) {
2063                 arc_buf_t *buf = arc_eviction_list;
2064                 arc_eviction_list = buf->b_next;
2065                 mutex_enter(&buf->b_evict_lock);
2066                 buf->b_hdr = NULL;
2067                 mutex_exit(&buf->b_evict_lock);
2068                 mutex_exit(&arc_eviction_mtx);
2069 
2070                 if (buf->b_efunc != NULL)
2071                         VERIFY(buf->b_efunc(buf) == 0);
2072 
2073                 buf->b_efunc = NULL;
2074                 buf->b_private = NULL;
2075                 kmem_cache_free(buf_cache, buf);
2076                 mutex_enter(&arc_eviction_mtx);
2077         }
2078         mutex_exit(&arc_eviction_mtx);
2079 }
2080 
2081 /*
2082  * Flush all *evictable* data from the cache for the given spa.
2083  * NOTE: this will not touch "active" (i.e. referenced) data.
2084  */
2085 void
2086 arc_flush(spa_t *spa)
2087 {
2088         uint64_t guid = 0;
2089 
2090         if (spa)
2091                 guid = spa_load_guid(spa);
2092 
2093         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2094                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2095                 if (spa)
2096                         break;
2097         }
2098         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2099                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2100                 if (spa)
2101                         break;
2102         }
2103         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2104                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2105                 if (spa)
2106                         break;
2107         }
2108         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2109                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2110                 if (spa)
2111                         break;
2112         }
2113 
2114         arc_evict_ghost(arc_mru_ghost, guid, -1);
2115         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2116 
2117         mutex_enter(&arc_reclaim_thr_lock);
2118         arc_do_user_evicts();
2119         mutex_exit(&arc_reclaim_thr_lock);
2120         ASSERT(spa || arc_eviction_list == NULL);
2121 }
2122 
2123 void
2124 arc_shrink(void)
2125 {
2126         if (arc_c > arc_c_min) {
2127                 uint64_t to_free;
2128 
2129 #ifdef _KERNEL
2130                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2131 #else
2132                 to_free = arc_c >> arc_shrink_shift;
2133 #endif
2134                 if (arc_c > arc_c_min + to_free)
2135                         atomic_add_64(&arc_c, -to_free);
2136                 else
2137                         arc_c = arc_c_min;
2138 
2139                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2140                 if (arc_c > arc_size)
2141                         arc_c = MAX(arc_size, arc_c_min);
2142                 if (arc_p > arc_c)
2143                         arc_p = (arc_c >> 1);
2144                 ASSERT(arc_c >= arc_c_min);
2145                 ASSERT((int64_t)arc_p >= 0);
2146         }
2147 
2148         if (arc_size > arc_c)
2149                 arc_adjust();
2150 }
2151 
/*
 * Report whether the system is under memory pressure and wants memory
 * back.  Returns 1 when the arc should give memory up and 0 otherwise.
 *
 * Outside the kernel there are no real memory statistics to consult, so
 * the answer is 1 whenever spa_get_random(100) returns 0.
 */
static int
arc_reclaim_needed(void)
{
#ifdef _KERNEL
	/*
	 * Ask for 'desfree' pages of headroom on top of each threshold so
	 * that reclaim starts sooner rather than later.
	 */
	uint64_t headroom = desfree;

	if (needfree != 0)
		return (1);

	/*
	 * Stay out of range of the pageout scanner, which starts to
	 * schedule paging once freemem is less than lotsfree and needfree.
	 * lotsfree is the scanner's high-water mark and needfree is the
	 * number of free pages currently needed.  The headroom keeps the
	 * scanner from starting up while we are still freeing memory.
	 */
	if (freemem < lotsfree + needfree + headroom)
		return (1);

	/*
	 * Leave swapfs enough space for anon reservations to succeed:
	 * anon_resvmem() checks that availrmem is greater than
	 * swapfs_minfree plus the number of reserved swap pages.  The
	 * headroom prevents circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + headroom)
		return (1);

#if defined(__i386)
	/*
	 * On i386 the kernel heap can be exhausted before available
	 * physical memory runs out.  Most checks of the heap_area size
	 * compare against tune.t_minarmem, the minimum available real
	 * memory allowed in the system, but that is generally fixed at 25
	 * pages, which is too low to be useful.  Instead, compute the total
	 * heap size and reclaim once more than 3/4 of it is allocated, i.e.
	 * once less than 1/4 of it is free.
	 */
	if (vmem_size(heap_arena, VMEM_FREE) <
	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
		return (1);
#endif

	/*
	 * When zio data pages come from a separate heap segment, require
	 * the free space in that arena to stay above about 1/16th of what
	 * is allocated from it.
	 *
	 * Note: The 1/16th requirement was put in place to aggressively
	 * evict memory from the arc in order to avoid memory fragmentation
	 * issues.
	 */
	if (zio_arena != NULL &&
	    vmem_size(zio_arena, VMEM_FREE) <
	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
		return (1);
#else
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}
2228 
2229 static void
2230 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2231 {
2232         size_t                  i;
2233         kmem_cache_t            *prev_cache = NULL;
2234         kmem_cache_t            *prev_data_cache = NULL;
2235         extern kmem_cache_t     *zio_buf_cache[];
2236         extern kmem_cache_t     *zio_data_buf_cache[];
2237 
2238 #ifdef _KERNEL
2239         if (arc_meta_used >= arc_meta_limit) {
2240                 /*
2241                  * We are exceeding our meta-data cache limit.
2242                  * Purge some DNLC entries to release holds on meta-data.
2243                  */
2244                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2245         }
2246 #if defined(__i386)
2247         /*
2248          * Reclaim unused memory from all kmem caches.
2249          */
2250         kmem_reap();
2251 #endif
2252 #endif
2253 
2254         /*
2255          * An aggressive reclamation will shrink the cache size as well as
2256          * reap free buffers from the arc kmem caches.
2257          */
2258         if (strat == ARC_RECLAIM_AGGR)
2259                 arc_shrink();
2260 
2261         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2262                 if (zio_buf_cache[i] != prev_cache) {
2263                         prev_cache = zio_buf_cache[i];
2264                         kmem_cache_reap_now(zio_buf_cache[i]);
2265                 }
2266                 if (zio_data_buf_cache[i] != prev_data_cache) {
2267                         prev_data_cache = zio_data_buf_cache[i];
2268                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2269                 }
2270         }
2271         kmem_cache_reap_now(buf_cache);
2272         kmem_cache_reap_now(hdr_cache);
2273 
2274         /*
2275          * Ask the vmem areana to reclaim unused memory from its
2276          * quantum caches.
2277          */
2278         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2279                 vmem_qcache_reap(zio_arena);
2280 }
2281 
2282 static void
2283 arc_reclaim_thread(void)
2284 {
             /*
              * Main loop of the ARC reclaim thread.  Until arc_thread_exit is
              * set, each pass reaps the kmem caches if arc_reclaim_needed(),
              * runs arc_adjust(), processes arc_eviction_list if it is
              * non-empty, and then sleeps on arc_reclaim_thr_cv with a
              * timeout of lbolt + hz.  The loop body runs with
              * arc_reclaim_thr_lock held; the lock is handed to
              * cv_timedwait() for the sleep.
              */
2285         clock_t                 growtime = 0;
2286         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2287         callb_cpr_t             cpr;
2288 
2289         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2290 
2291         mutex_enter(&arc_reclaim_thr_lock);
2292         while (arc_thread_exit == 0) {
2293                 if (arc_reclaim_needed()) {
2294 
                             /*
                              * Choose the strategy for this pass.  If growth is
                              * already inhibited (arc_no_grow), alternate between
                              * the conservative and aggressive strategies on
                              * successive passes; otherwise inhibit growth and
                              * start with an aggressive reclaim.
                              */
2295                         if (arc_no_grow) {
2296                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2297                                         last_reclaim = ARC_RECLAIM_AGGR;
2298                                 } else {
2299                                         last_reclaim = ARC_RECLAIM_CONS;
2300                                 }
2301                         } else {
2302                                 arc_no_grow = TRUE;
2303                                 last_reclaim = ARC_RECLAIM_AGGR;
2304                                 membar_producer();
2305                         }
2306 
2307                         /* reset the growth delay for every reclaim */
2308                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2309 
2310                         arc_kmem_reap_now(last_reclaim);
2311                         arc_warm = B_TRUE;
2312 
2313                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
                             /*
                              * No reclaim is needed and the growth delay set by
                              * the last reclaim has expired: allow growth again.
                              */
2314                         arc_no_grow = FALSE;
2315                 }
2316 
2317                 arc_adjust();
2318 
2319                 if (arc_eviction_list != NULL)
2320                         arc_do_user_evicts();
2321 
2322                 /* block until needed, or one second, whichever is shorter */
2323                 CALLB_CPR_SAFE_BEGIN(&cpr);
2324                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2325                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2326                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2327         }
2328 
             /*
              * Clear the exit request and wake every waiter on
              * arc_reclaim_thr_cv (presumably the thread that set
              * arc_thread_exit -- confirm against the teardown path).
              */
2329         arc_thread_exit = 0;
2330         cv_broadcast(&arc_reclaim_thr_cv);
2331         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2332         thread_exit();
2333 }
2334 
2335 /*
2336  * Adapt arc info given the number of bytes we are trying to add and
2337  * the state that we are coming from.  This function is only called
2338  * when we are adding new content to the cache.
      *
      * bytes: number of bytes being added; asserted to be > 0.
      * state: ARC state the content is coming from.
      *
      * A hit in a ghost state moves the MRU target size arc_p.  Afterwards
      * the overall target size arc_c may be grown by "bytes", unless a
      * reclaim is needed, growth is inhibited (arc_no_grow), or arc_c has
      * already reached arc_c_max.  Nothing is done for arc_l2c_only.
2339  */
2340 static void
2341 arc_adapt(int bytes, arc_state_t *state)
2342 {
2343         int mult;
             /* lower bound for arc_p, and the margin kept between arc_p and arc_c */
2344         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2345 
2346         if (state == arc_l2c_only)
2347                 return;
2348 
2349         ASSERT(bytes > 0);
2350         /*
2351          * Adapt the target size of the MRU list:
2352          *      - if we just hit in the MRU ghost list, then increase
2353          *        the target size of the MRU list.
2354          *      - if we just hit in the MFU ghost list, then increase
2355          *        the target size of the MFU list by decreasing the
2356          *        target size of the MRU list.
              *
              * The step is "bytes" scaled by the ratio of the other ghost
              * list's size to the size of the ghost list that was hit
              * (1 if the hit list is the larger one), capped at 10.
              *
              * NOTE(review): the size ratio is stored in an int before the
              * MIN(mult, 10) clamp is applied; confirm that the ratio of the
              * ghost list sizes can never exceed INT_MAX.
2357          */
2358         if (state == arc_mru_ghost) {
2359                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2360                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2361                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2362 
                     /* grow arc_p, but never beyond arc_c - arc_p_min */
2363                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2364         } else if (state == arc_mfu_ghost) {
2365                 uint64_t delta;
2366 
2367                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2368                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2369                 mult = MIN(mult, 10);
2370 
                     /*
                      * Shrink arc_p, but never below arc_p_min.  delta is
                      * limited to arc_p so the unsigned subtraction cannot
                      * wrap.
                      */
2371                 delta = MIN(bytes * mult, arc_p);
2372                 arc_p = MAX(arc_p_min, arc_p - delta);
2373         }
2374         ASSERT((int64_t)arc_p >= 0);
2375 
             /* under memory pressure: wake the reclaim thread and do not grow */
2376         if (arc_reclaim_needed()) {
2377                 cv_signal(&arc_reclaim_thr_cv);
2378                 return;
2379         }
2380 
2381         if (arc_no_grow)
2382                 return;
2383 
2384         if (arc_c >= arc_c_max)
2385                 return;
2386 
2387         /*
2388          * If we're within (2 * maxblocksize) bytes of the target
2389          * cache size, increment the target cache size
              * by "bytes", clamped to arc_c_max.  When the clamp does not
              * apply and the content is anonymous, arc_p grows by the same
              * amount.  arc_p is then kept from exceeding arc_c.
2390          */
2391         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2392                 atomic_add_64(&arc_c, (int64_t)bytes);
2393                 if (arc_c > arc_c_max)
2394                         arc_c = arc_c_max;
2395                 else if (state == arc_anon)
2396                         atomic_add_64(&arc_p, (int64_t)bytes);
2397                 if (arc_p > arc_c)
2398                         arc_p = arc_c;
2399         }
2400         ASSERT((int64_t)arc_p >= 0);
2401 }
2402 
2403 /*
2404  * Check if the cache has reached its limits and eviction is required
2405  * prior to insert.
      *
      * Returns nonzero if any of the following holds:
      *   - type is ARC_BUFC_METADATA and arc_meta_used >= arc_meta_limit;
      *   - arc_reclaim_needed() reports true;
      *   - arc_size exceeds the target cache size arc_c.
2406  */
2407 static int
2408 arc_evict_needed(arc_buf_contents_t type)
2409 {
2410         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2411                 return (1);
2412 
2413         if (arc_reclaim_needed())
2414                 return (1);
2415 
2416         return (arc_size > arc_c);
2417 }
2418 
2419 /*
2420  * The buffer, supplied as the first argument, needs a data block.
2421  * So, if we are at cache max, determine which cache should be victimized.
2422  * We have the following cases:
2423  *
2424  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2425  * In this situation if we're out of space, but the resident size of the MFU is
2426  * under the limit, victimize the MFU cache to satisfy this insertion request.
2427  *
2428  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2429  * Here, we've used up all of the available space for the MRU, so we need to
2430  * evict from our own cache instead.  Evict from the set of resident MRU
2431  * entries.
2432  *
2433  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2434  * c minus p represents the MFU space in the cache, since p is the size of the
2435  * cache that is dedicated to the MRU.  In this situation there's still space on
2436  * the MFU side, so the MRU side needs to be victimized.
2437  *
2438  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2439  * MFU's resident set is consuming more space than it has been allotted.  In
2440  * this situation, we must victimize our own cache, the MFU, for this insertion.
      *
      * Flow: arc_adapt() is called first.  If arc_evict_needed() is false, a
      * new data block is allocated.  Otherwise a victim state is chosen per
      * the cases above and arc_evict() is asked for a data block of the
      * required size; if it returns NULL, a new block is allocated anyway and
      * arcstat_recycle_miss is bumped.  On return buf->b_data is set and, for
      * non-ghost states, the state's size counters are updated.
2441  */
2442 static void
2443 arc_get_data_buf(arc_buf_t *buf)
2444 {
2445         arc_state_t             *state = buf->b_hdr->b_state;
2446         uint64_t                size = buf->b_hdr->b_size;
2447         arc_buf_contents_t      type = buf->b_hdr->b_type;
2448 
2449         arc_adapt(size, state);
2450 
2451         /*
2452          * We have not yet reached cache maximum size,
2453          * just allocate a new buffer.
              * Metadata comes from zio_buf_alloc() and is accounted through
              * arc_space_consume(); data comes from zio_data_buf_alloc() and
              * is added to arcstat_data_size and arc_size directly.
2454          */
2455         if (!arc_evict_needed(type)) {
2456                 if (type == ARC_BUFC_METADATA) {
2457                         buf->b_data = zio_buf_alloc(size);
2458                         arc_space_consume(size, ARC_SPACE_DATA);
2459                 } else {
2460                         ASSERT(type == ARC_BUFC_DATA);
2461                         buf->b_data = zio_data_buf_alloc(size);
2462                         ARCSTAT_INCR(arcstat_data_size, size);
2463                         atomic_add_64(&arc_size, size);
2464                 }
2465                 goto out;
2466         }
2467 
2468         /*
2469          * If we are prefetching from the mfu ghost list, this buffer
2470          * will end up on the mru list; so steal space from there.
              * More generally, map a ghost state to the resident state the
              * buffer will end up in before choosing a victim.
2471          */
2472         if (state == arc_mfu_ghost)
2473                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2474         else if (state == arc_mru_ghost)
2475                 state = arc_mru;
2476 
2477         if (state == arc_mru || state == arc_anon) {
                     /*
                      * Cases 1 and 2: take from the MFU only if its list holds
                      * at least "size" bytes of this type and anon + MRU is
                      * still below arc_p; otherwise take from the MRU.
                      */
2478                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2479                 state = (arc_mfu->arcs_lsize[type] >= size &&
2480                     arc_p > mru_used) ? arc_mfu : arc_mru;
2481         } else {
2482                 /* MFU cases */
                     /*
                      * Cases 3 and 4: take from the MRU only if its list holds
                      * at least "size" bytes of this type and the MFU is still
                      * below its share (arc_c - arc_p); otherwise take from
                      * the MFU.
                      */
2483                 uint64_t mfu_space = arc_c - arc_p;
2484                 state =  (arc_mru->arcs_lsize[type] >= size &&
2485                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2486         }
             /*
              * Ask arc_evict() for a data block from the chosen state.
              * NOTE(review): the TRUE argument presumably requests recycling
              * of an evicted buffer -- confirm against arc_evict().  A NULL
              * return falls back to a fresh allocation, accounted exactly
              * like the no-eviction path above.
              */
2487         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2488                 if (type == ARC_BUFC_METADATA) {
2489                         buf->b_data = zio_buf_alloc(size);
2490                         arc_space_consume(size, ARC_SPACE_DATA);
2491                 } else {
2492                         ASSERT(type == ARC_BUFC_DATA);
2493                         buf->b_data = zio_data_buf_alloc(size);
2494                         ARCSTAT_INCR(arcstat_data_size, size);
2495                         atomic_add_64(&arc_size, size);
2496                 }
2497                 ARCSTAT_BUMP(arcstat_recycle_miss);
2498         }
2499         ASSERT(buf->b_data != NULL);
2500 out:
2501         /*
2502          * Update the state size.  Note that ghost states have a
2503          * "ghost size" and so don't need to be updated.
2504          */
2505         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2506                 arc_buf_hdr_t *hdr = buf->b_hdr;
2507 
2508                 atomic_add_64(&hdr->b_state->arcs_size, size);
                     /*
                      * arcs_lsize is only bumped for headers that are linked
                      * on the state's list, which implies no references.
                      */
2509                 if (list_link_active(&hdr->b_arc_node)) {
2510                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2511                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2512                 }
2513                 /*
2514                  * If we are growing the cache, and we are adding anonymous
2515                  * data, and we have outgrown arc_p, update arc_p
2516                  */
2517                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2518                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2519                         arc_p = MIN(arc_c, arc_p + size);
2520         }
2521 }
2522 
2523 /*
2524  * This routine is called whenever a buffer is accessed.
2525  * NOTE(review): this comment used to state that the hash lock is dropped in
      * this function, but nothing here releases it, and the visible callers
      * (arc_read_done() and arc_read()) call mutex_exit(hash_lock) themselves
      * after arc_access() returns.  Confirm against arc_change_state() before
      * relying on either reading.
      *
      * buf is the buffer header (arc_buf_hdr_t), not an arc_buf_t.  hash_lock
      * must be held on entry and is passed on to arc_change_state().
      *
      * The header's current state selects the transition:
      *   anon      -> mru
      *   mru       -> mfu once more than ARC_MINTIME has elapsed since the
      *                recorded access time (headers flagged ARC_PREFETCH
      *                stay in mru)
      *   mru_ghost -> mfu, or mru if ARC_PREFETCH is set
      *   mfu       -> stays in mfu
      *   mfu_ghost -> mfu, or mru if ARC_PREFETCH is set
      *   l2c_only  -> mfu
      * b_arc_access is refreshed from ddi_get_lbolt() in every case except a
      * non-prefetch mru hit that is still within ARC_MINTIME.
2526  */
2527 static void
2528 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2529 {
2530         clock_t now;
2531 
2532         ASSERT(MUTEX_HELD(hash_lock));
2533 
2534         if (buf->b_state == arc_anon) {
2535                 /*
2536                  * This buffer is not in the cache, and does not
2537                  * appear in our "ghost" list.  Add the new buffer
2538                  * to the MRU state.
2539                  */
2540 
2541                 ASSERT(buf->b_arc_access == 0);
2542                 buf->b_arc_access = ddi_get_lbolt();
2543                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2544                 arc_change_state(arc_mru, buf, hash_lock);
2545 
2546         } else if (buf->b_state == arc_mru) {
2547                 now = ddi_get_lbolt();
2548 
2549                 /*
2550                  * If this buffer is here because of a prefetch, then either:
2551                  * - clear the flag if this is a "referencing" read
2552                  *   (any subsequent access will bump this into the MFU state).
2553                  * or
2554                  * - move the buffer to the head of the list if this is
2555                  *   another prefetch (to make it less likely to be evicted).
                      *
                      * NOTE(review): for the "another prefetch" case the code
                      * below only asserts that the header is on a list and
                      * refreshes b_arc_access; no list move is visible here.
2556                  */
2557                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2558                         if (refcount_count(&buf->b_refcnt) == 0) {
2559                                 ASSERT(list_link_active(&buf->b_arc_node));
2560                         } else {
2561                                 buf->b_flags &= ~ARC_PREFETCH;
2562                                 ARCSTAT_BUMP(arcstat_mru_hits);
2563                         }
2564                         buf->b_arc_access = now;
2565                         return;
2566                 }
2567 
2568                 /*
2569                  * This buffer has been "accessed" only once so far,
2570                  * but it is still in the cache. Move it to the MFU
2571                  * state.
2572                  */
2573                 if (now > buf->b_arc_access + ARC_MINTIME) {
2574                         /*
2575                          * More than 125ms have passed since we
2576                          * instantiated this buffer.  Move it to the
2577                          * most frequently used state.
2578                          */
2579                         buf->b_arc_access = now;
2580                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2581                         arc_change_state(arc_mfu, buf, hash_lock);
2582                 }
                     /* counted as an MRU hit whether or not the header moved */
2583                 ARCSTAT_BUMP(arcstat_mru_hits);
2584         } else if (buf->b_state == arc_mru_ghost) {
2585                 arc_state_t     *new_state;
2586                 /*
2587                  * This buffer has been "accessed" recently, but
2588                  * was evicted from the cache.  Move it to the
2589                  * MFU state, or back to the MRU state if it is
                      * flagged as a prefetch.  The prefetch flag is cleared
                      * when the header already has references.
2590                  */
2591 
2592                 if (buf->b_flags & ARC_PREFETCH) {
2593                         new_state = arc_mru;
2594                         if (refcount_count(&buf->b_refcnt) > 0)
2595                                 buf->b_flags &= ~ARC_PREFETCH;
2596                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2597                 } else {
2598                         new_state = arc_mfu;
2599                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2600                 }
2601 
2602                 buf->b_arc_access = ddi_get_lbolt();
2603                 arc_change_state(new_state, buf, hash_lock);
2604 
2605                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2606         } else if (buf->b_state == arc_mfu) {
2607                 /*
2608                  * This buffer has been accessed more than once and is
2609                  * still in the cache.  Keep it in the MFU state.
2610                  *
2611                  * NOTE: an add_reference() that occurred when we did
2612                  * the arc_read() will have kicked this off the list.
2613                  * If it was a prefetch, we will explicitly move it to
2614                  * the head of the list now.
                      *
                      * NOTE(review): the prefetch branch below only asserts
                      * (no references, still on a list); no explicit list
                      * move is visible in this function.
2615                  */
2616                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2617                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2618                         ASSERT(list_link_active(&buf->b_arc_node));
2619                 }
2620                 ARCSTAT_BUMP(arcstat_mfu_hits);
2621                 buf->b_arc_access = ddi_get_lbolt();
2622         } else if (buf->b_state == arc_mfu_ghost) {
2623                 arc_state_t     *new_state = arc_mfu;
2624                 /*
2625                  * This buffer has been accessed more than once but has
2626                  * been evicted from the cache.  Move it back to the
2627                  * MFU state.
2628                  */
2629 
2630                 if (buf->b_flags & ARC_PREFETCH) {
2631                         /*
2632                          * This is a prefetch access...
2633                          * move this block back to the MRU state.
2634                          */
2635                         ASSERT0(refcount_count(&buf->b_refcnt));
2636                         new_state = arc_mru;
2637                 }
2638 
2639                 buf->b_arc_access = ddi_get_lbolt();
                     /* the probe is named new_state__mfu even when new_state is arc_mru */
2640                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2641                 arc_change_state(new_state, buf, hash_lock);
2642 
2643                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2644         } else if (buf->b_state == arc_l2c_only) {
2645                 /*
2646                  * This buffer is on the 2nd Level ARC.
                      * Bring it straight into the MFU state.
2647                  */
2648 
2649                 buf->b_arc_access = ddi_get_lbolt();
2650                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2651                 arc_change_state(arc_mfu, buf, hash_lock);
2652         } else {
2653                 ASSERT(!"invalid arc state");
2654         }
2655 }
2656 
2657 /*
      * A generic arc_done_func_t which you can use.  When there was no I/O
      * (zio == NULL) or the I/O completed without error, it copies
      * buf->b_hdr->b_size bytes from buf->b_data to the memory at arg.  In
      * every case it then calls arc_buf_remove_ref(buf, arg) and VERIFYs that
      * the call returned nonzero, so arg serves both as the copy destination
      * and as the tag handed to arc_buf_remove_ref().
      */
2658 /* ARGSUSED */
2659 void
2660 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2661 {
2662         if (zio == NULL || zio->io_error == 0)
2663                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2664         VERIFY(arc_buf_remove_ref(buf, arg));
2665 }
2666 
2667 /*
      * A generic arc_done_func_t that hands the buffer back to the caller.
      * arg is treated as an arc_buf_t **.  If there was an I/O and it failed,
      * arc_buf_remove_ref(buf, arg) is called (VERIFYing a nonzero return)
      * and *arg is set to NULL; otherwise *arg is set to buf, whose b_data
      * is asserted to be non-NULL.
      */
2668 void
2669 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2670 {
2671         arc_buf_t **bufp = arg;
2672         if (zio && zio->io_error) {
2673                 VERIFY(arc_buf_remove_ref(buf, arg));
2674                 *bufp = NULL;
2675         } else {
2676                 *bufp = buf;
2677                 ASSERT(buf->b_data);
2678         }
2679 }
2680 
2681 static void
2682 arc_read_done(zio_t *zio)
2683 {
             /*
              * Completion handler for a read zio whose io_private is the
              * arc_buf_t being filled.  It byteswaps the data if required,
              * computes the checksum, assigns a buffer to every registered
              * callback that has a done function, records an I/O error on
              * the header, wakes waiters on hdr->b_cv, releases the hash
              * lock (if one was taken), runs and frees the callbacks, and
              * finally destroys the header when it is no longer referenced.
              */
2684         arc_buf_hdr_t   *hdr, *found;
2685         arc_buf_t       *buf;
2686         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2687         kmutex_t        *hash_lock;
2688         arc_callback_t  *callback_list, *acb;
2689         int             freeable = FALSE;
2690 
2691         buf = zio->io_private;
2692         hdr = buf->b_hdr;
2693 
2694         /*
2695          * The hdr was inserted into hash-table and removed from lists
2696          * prior to starting I/O.  We should find this header, since
2697          * it's in the hash table, and it should be legit since it's
2698          * not possible to evict it during the I/O.  The only possible
2699          * reason for it not to be found is if we were freed during the
2700          * read.
2701          */
2702         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2703             &hash_lock);
2704 
2705         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2706             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2707             (found == hdr && HDR_L2_READING(hdr)));
2708 
             /*
              * Clear the L2-evicted flag; a prefetched header also loses its
              * ARC_L2CACHE flag when l2arc_noprefetch is set.
              */
2709         hdr->b_flags &= ~ARC_L2_EVICTED;
2710         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2711                 hdr->b_flags &= ~ARC_L2CACHE;
2712 
2713         /* byteswap if necessary */
2714         callback_list = hdr->b_acb;
2715         ASSERT(callback_list != NULL);
2716         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                     /*
                      * Blocks above level 0 are swapped as arrays of uint64_t;
                      * level-0 blocks use the byteswap function registered
                      * for the block's object type.
                      */
2717                 dmu_object_byteswap_t bswap =
2718                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2719                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2720                     byteswap_uint64_array :
2721                     dmu_ot_byteswap[bswap].ob_func;
2722                 func(buf->b_data, hdr->b_size);
2723         }
2724 
2725         arc_cksum_compute(buf, B_FALSE);
2726         arc_buf_watch(buf);
2727 
2728         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2729                 /*
2730                  * Only call arc_access on anonymous buffers.  This is because
2731                  * if we've issued an I/O for an evicted buffer, we've already
2732                  * called arc_access (to prevent any simultaneous readers from
2733                  * getting confused).
2734                  */
2735                 arc_access(hdr, hash_lock);
2736         }
2737 
2738         /* create copies of the data buffer for the callers */
             /*
              * The first callback with a done function receives buf itself;
              * each further one receives a clone.  abuf is non-NULL only
              * while buf has not yet been handed out.
              */
2739         abuf = buf;
2740         for (acb = callback_list; acb; acb = acb->acb_next) {
2741                 if (acb->acb_done) {
2742                         if (abuf == NULL) {
2743                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2744                                 abuf = arc_buf_clone(buf);
2745                         }
2746                         acb->acb_buf = abuf;
2747                         abuf = NULL;
2748                 }
2749         }
2750         hdr->b_acb = NULL;
2751         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2752         ASSERT(!HDR_BUF_AVAILABLE(hdr));
             /* no callback took buf, so flag it as available on the header */
2753         if (abuf == buf) {
2754                 ASSERT(buf->b_efunc == NULL);
2755                 ASSERT(hdr->b_datacnt == 1);
2756                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2757         }
2758 
2759         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2760 
             /*
              * On I/O error, flag the header, move it to the anonymous state
              * and take it out of the hash table; it can be destroyed below
              * if nothing references it.
              */
2761         if (zio->io_error != 0) {
2762                 hdr->b_flags |= ARC_IO_ERROR;
2763                 if (hdr->b_state != arc_anon)
2764                         arc_change_state(arc_anon, hdr, hash_lock);
2765                 if (HDR_IN_HASH_TABLE(hdr))
2766                         buf_hash_remove(hdr);
2767                 freeable = refcount_is_zero(&hdr->b_refcnt);
2768         }
2769 
2770         /*
2771          * Broadcast before we drop the hash_lock to avoid the possibility
2772          * that the hdr (and hence the cv) might be freed before we get to
2773          * the cv_broadcast().
2774          */
2775         cv_broadcast(&hdr->b_cv);
2776 
2777         if (hash_lock) {
2778                 mutex_exit(hash_lock);
2779         } else {
2780                 /*
2781                  * This block was freed while we waited for the read to
2782                  * complete.  It has been removed from the hash table and
2783                  * moved to the anonymous state (so that it won't show up
2784                  * in the cache).
2785                  */
2786                 ASSERT3P(hdr->b_state, ==, arc_anon);
2787                 freeable = refcount_is_zero(&hdr->b_refcnt);
2788         }
2789 
2790         /* execute each callback and free its structure */
             /*
              * This runs after the hash lock has been released.  A dummy zio
              * attached to a callback inherits this zio's error and is
              * dispatched with zio_nowait().
              */
2791         while ((acb = callback_list) != NULL) {
2792                 if (acb->acb_done)
2793                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2794 
2795                 if (acb->acb_zio_dummy != NULL) {
2796                         acb->acb_zio_dummy->io_error = zio->io_error;
2797                         zio_nowait(acb->acb_zio_dummy);
2798                 }
2799 
2800                 callback_list = acb->acb_next;
2801                 kmem_free(acb, sizeof (arc_callback_t));
2802         }
2803 
2804         if (freeable)
2805                 arc_hdr_destroy(hdr);
2806 }
2807 
2808 /*
2809  * "Read" the block at the specified DVA (in bp) via the
2810  * cache.  If the block is found in the cache, invoke the provided
2811  * callback immediately and return.  Note that the `zio' parameter
2812  * in the callback will be NULL in this case, since no IO was
2813  * required.  If the block is not in the cache pass the read request
2814  * on to the spa with a substitute callback function, so that the
2815  * requested block will be added to the cache.
2816  *
2817  * If a read request arrives for a block that has a read in-progress,
2818  * either wait for the in-progress read to complete (and return the
2819  * results); or, if this is a read with a "done" func, add a record
2820  * to the read to invoke the "done" func when the read completes,
2821  * and return; or just return.
2822  *
2823  * arc_read_done() will invoke all the requested "done" functions
2824  * for readers of this block.
2825  */
2826 int
2827 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2828     void *private, int priority, int zio_flags, uint32_t *arc_flags,
2829     const zbookmark_t *zb)
2830 {
2831         arc_buf_hdr_t *hdr;
2832         arc_buf_t *buf = NULL;
2833         kmutex_t *hash_lock;
2834         zio_t *rzio;
2835         uint64_t guid = spa_load_guid(spa);
2836 
2837 top:
2838         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2839             &hash_lock);
2840         if (hdr && hdr->b_datacnt > 0) {
2841 
2842                 *arc_flags |= ARC_CACHED;
2843 
2844                 if (HDR_IO_IN_PROGRESS(hdr)) {
2845 
2846                         if (*arc_flags & ARC_WAIT) {
2847                                 cv_wait(&hdr->b_cv, hash_lock);
2848                                 mutex_exit(hash_lock);
2849                                 goto top;
2850                         }
2851                         ASSERT(*arc_flags & ARC_NOWAIT);
2852 
2853                         if (done) {
2854                                 arc_callback_t  *acb = NULL;
2855 
2856                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2857                                     KM_SLEEP);
2858                                 acb->acb_done = done;
2859                                 acb->acb_private = private;
2860                                 if (pio != NULL)
2861                                         acb->acb_zio_dummy = zio_null(pio,
2862                                             spa, NULL, NULL, NULL, zio_flags);
2863 
2864                                 ASSERT(acb->acb_done != NULL);
2865                                 acb->acb_next = hdr->b_acb;
2866                                 hdr->b_acb = acb;
2867                                 add_reference(hdr, hash_lock, private);
2868                                 mutex_exit(hash_lock);
2869                                 return (0);
2870                         }
2871                         mutex_exit(hash_lock);
2872                         return (0);
2873                 }
2874 
2875                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2876 
2877                 if (done) {
2878                         add_reference(hdr, hash_lock, private);
2879                         /*
2880                          * If this block is already in use, create a new
2881                          * copy of the data so that we will be guaranteed
2882                          * that arc_release() will always succeed.
2883                          */
2884                         buf = hdr->b_buf;
2885                         ASSERT(buf);
2886                         ASSERT(buf->b_data);
2887                         if (HDR_BUF_AVAILABLE(hdr)) {
2888                                 ASSERT(buf->b_efunc == NULL);
2889                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2890                         } else {
2891                                 buf = arc_buf_clone(buf);
2892                         }
2893 
2894                 } else if (*arc_flags & ARC_PREFETCH &&
2895                     refcount_count(&hdr->b_refcnt) == 0) {
2896                         hdr->b_flags |= ARC_PREFETCH;
2897                 }
2898                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2899                 arc_access(hdr, hash_lock);
2900                 if (*arc_flags & ARC_L2CACHE)
2901                         hdr->b_flags |= ARC_L2CACHE;
2902                 if (*arc_flags & ARC_L2COMPRESS)
2903                         hdr->b_flags |= ARC_L2COMPRESS;
2904                 mutex_exit(hash_lock);
2905                 ARCSTAT_BUMP(arcstat_hits);
2906                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2907                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2908                     data, metadata, hits);
2909 
2910                 if (done)
2911                         done(NULL, buf, private);
2912         } else {
2913                 uint64_t size = BP_GET_LSIZE(bp);
2914                 arc_callback_t  *acb;
2915                 vdev_t *vd = NULL;
2916                 uint64_t addr = 0;
2917                 boolean_t devw = B_FALSE;
2918 
2919                 if (hdr == NULL) {
2920                         /* this block is not in the cache */
2921                         arc_buf_hdr_t   *exists;
2922                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2923                         buf = arc_buf_alloc(spa, size, private, type);
2924                         hdr = buf->b_hdr;
2925                         hdr->b_dva = *BP_IDENTITY(bp);
2926                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2927                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2928                         exists = buf_hash_insert(hdr, &hash_lock);
2929                         if (exists) {
2930                                 /* somebody beat us to the hash insert */
2931                                 mutex_exit(hash_lock);
2932                                 buf_discard_identity(hdr);
2933                                 (void) arc_buf_remove_ref(buf, private);
2934                                 goto top; /* restart the IO request */
2935                         }
2936                         /* if this is a prefetch, we don't have a reference */
2937                         if (*arc_flags & ARC_PREFETCH) {
2938                                 (void) remove_reference(hdr, hash_lock,
2939                                     private);
2940                                 hdr->b_flags |= ARC_PREFETCH;
2941                         }
2942                         if (*arc_flags & ARC_L2CACHE)
2943                                 hdr->b_flags |= ARC_L2CACHE;
2944                         if (*arc_flags & ARC_L2COMPRESS)
2945                                 hdr->b_flags |= ARC_L2COMPRESS;
2946                         if (BP_GET_LEVEL(bp) > 0)
2947                                 hdr->b_flags |= ARC_INDIRECT;
2948                 } else {
2949                         /* this block is in the ghost cache */
2950                         ASSERT(GHOST_STATE(hdr->b_state));
2951                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2952                         ASSERT0(refcount_count(&hdr->b_refcnt));
2953                         ASSERT(hdr->b_buf == NULL);
2954 
2955                         /* if this is a prefetch, we don't have a reference */
2956                         if (*arc_flags & ARC_PREFETCH)
2957                                 hdr->b_flags |= ARC_PREFETCH;
2958                         else
2959                                 add_reference(hdr, hash_lock, private);
2960                         if (*arc_flags & ARC_L2CACHE)
2961                                 hdr->b_flags |= ARC_L2CACHE;
2962                         if (*arc_flags & ARC_L2COMPRESS)
2963                                 hdr->b_flags |= ARC_L2COMPRESS;
2964                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2965                         buf->b_hdr = hdr;
2966                         buf->b_data = NULL;
2967                         buf->b_efunc = NULL;
2968                         buf->b_private = NULL;
2969                         buf->b_next = NULL;
2970                         hdr->b_buf = buf;
2971                         ASSERT(hdr->b_datacnt == 0);
2972                         hdr->b_datacnt = 1;
2973                         arc_get_data_buf(buf);
2974                         arc_access(hdr, hash_lock);
2975                 }
2976 
2977                 ASSERT(!GHOST_STATE(hdr->b_state));
2978 
2979                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2980                 acb->acb_done = done;
2981                 acb->acb_private = private;
2982 
2983                 ASSERT(hdr->b_acb == NULL);
2984                 hdr->b_acb = acb;
2985                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2986 
2987                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2988                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2989                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2990                         addr = hdr->b_l2hdr->b_daddr;
2991                         /*
2992                          * Lock out device removal.
2993                          */
2994                         if (vdev_is_dead(vd) ||
2995                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2996                                 vd = NULL;
2997                 }
2998 
2999                 mutex_exit(hash_lock);
3000 
3001                 /*
3002                  * At this point, we have a level 1 cache miss.  Try again in
3003                  * L2ARC if possible.
3004                  */
3005                 ASSERT3U(hdr->b_size, ==, size);
3006                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3007                     uint64_t, size, zbookmark_t *, zb);
3008                 ARCSTAT_BUMP(arcstat_misses);
3009                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3010                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3011                     data, metadata, misses);
3012 
3013                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3014                         /*
3015                          * Read from the L2ARC if the following are true:
3016                          * 1. The L2ARC vdev was previously cached.
3017                          * 2. This buffer still has L2ARC metadata.
3018                          * 3. This buffer isn't currently writing to the L2ARC.
3019                          * 4. The L2ARC entry wasn't evicted, which may
3020                          *    also have invalidated the vdev.
3021                          * 5. This isn't prefetch and l2arc_noprefetch is set.
3022                          */
3023                         if (hdr->b_l2hdr != NULL &&
3024                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3025                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3026                                 l2arc_read_callback_t *cb;
3027 
3028                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3029                                 ARCSTAT_BUMP(arcstat_l2_hits);
3030 
3031                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3032                                     KM_SLEEP);
3033                                 cb->l2rcb_buf = buf;
3034                                 cb->l2rcb_spa = spa;
3035                                 cb->l2rcb_bp = *bp;
3036                                 cb->l2rcb_zb = *zb;
3037                                 cb->l2rcb_flags = zio_flags;
3038                                 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3039 
3040                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3041                                     addr + size < vd->vdev_psize -
3042                                     VDEV_LABEL_END_SIZE);
3043 
3044                                 /*
3045                                  * l2arc read.  The SCL_L2ARC lock will be
3046                                  * released by l2arc_read_done().
3047                                  * Issue a null zio if the underlying buffer
3048                                  * was squashed to zero size by compression.
3049                                  */
3050                                 if (hdr->b_l2hdr->b_compress ==
3051                                     ZIO_COMPRESS_EMPTY) {
3052                                         rzio = zio_null(pio, spa, vd,
3053                                             l2arc_read_done, cb,
3054                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3055                                             ZIO_FLAG_CANFAIL |
3056                                             ZIO_FLAG_DONT_PROPAGATE |
3057                                             ZIO_FLAG_DONT_RETRY);
3058                                 } else {
3059                                         rzio = zio_read_phys(pio, vd, addr,
3060                                             hdr->b_l2hdr->b_asize,
3061                                             buf->b_data, ZIO_CHECKSUM_OFF,
3062                                             l2arc_read_done, cb, priority,
3063                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3064                                             ZIO_FLAG_CANFAIL |
3065                                             ZIO_FLAG_DONT_PROPAGATE |
3066                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
3067                                 }
3068                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3069                                     zio_t *, rzio);
3070                                 ARCSTAT_INCR(arcstat_l2_read_bytes,
3071                                     hdr->b_l2hdr->b_asize);
3072 
3073                                 if (*arc_flags & ARC_NOWAIT) {
3074                                         zio_nowait(rzio);
3075                                         return (0);
3076                                 }
3077 
3078                                 ASSERT(*arc_flags & ARC_WAIT);
3079                                 if (zio_wait(rzio) == 0)
3080                                         return (0);
3081 
3082                                 /* l2arc read error; goto zio_read() */
3083                         } else {
3084                                 DTRACE_PROBE1(l2arc__miss,
3085                                     arc_buf_hdr_t *, hdr);
3086                                 ARCSTAT_BUMP(arcstat_l2_misses);
3087                                 if (HDR_L2_WRITING(hdr))
3088                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3089                                 spa_config_exit(spa, SCL_L2ARC, vd);
3090                         }
3091                 } else {
3092                         if (vd != NULL)
3093                                 spa_config_exit(spa, SCL_L2ARC, vd);
3094                         if (l2arc_ndev != 0) {
3095                                 DTRACE_PROBE1(l2arc__miss,
3096                                     arc_buf_hdr_t *, hdr);
3097                                 ARCSTAT_BUMP(arcstat_l2_misses);
3098                         }
3099                 }
3100 
3101                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3102                     arc_read_done, buf, priority, zio_flags, zb);
3103 
3104                 if (*arc_flags & ARC_WAIT)
3105                         return (zio_wait(rzio));
3106 
3107                 ASSERT(*arc_flags & ARC_NOWAIT);
3108                 zio_nowait(rzio);
3109         }
3110         return (0);
3111 }
3112 
3113 void
3114 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3115 {
3116         ASSERT(buf->b_hdr != NULL);
3117         ASSERT(buf->b_hdr->b_state != arc_anon);
3118         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3119         ASSERT(buf->b_efunc == NULL);
3120         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3121 
3122         buf->b_efunc = func;
3123         buf->b_private = private;
3124 }
3125 
3126 /*
3127  * Notify the arc that a block was freed, and thus will never be used again.
3128  */
3129 void
3130 arc_freed(spa_t *spa, const blkptr_t *bp)
3131 {
3132         arc_buf_hdr_t *hdr;
3133         kmutex_t *hash_lock;
3134         uint64_t guid = spa_load_guid(spa);
3135 
3136         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3137             &hash_lock);
3138         if (hdr == NULL)
3139                 return;
3140         if (HDR_BUF_AVAILABLE(hdr)) {
3141                 arc_buf_t *buf = hdr->b_buf;
3142                 add_reference(hdr, hash_lock, FTAG);
3143                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3144                 mutex_exit(hash_lock);
3145 
3146                 arc_release(buf, FTAG);
3147                 (void) arc_buf_remove_ref(buf, FTAG);
3148         } else {
3149                 mutex_exit(hash_lock);
3150         }
3151 
3152 }
3153 
/*
 * This is used by the DMU to let the ARC know that a buffer is
 * being evicted, so the ARC should clean up.  If this arc buf
 * is not yet in the evicted state, it will be put there.
 *
 * Returns 0 when the buf no longer has a hdr (nothing is done), and 1
 * in every other case, after the buf's b_efunc callback has been invoked.
 */
int
arc_buf_evict(arc_buf_t *buf)
{
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
        arc_buf_t **bufp;

        mutex_enter(&buf->b_evict_lock);
        hdr = buf->b_hdr;
        if (hdr == NULL) {
                /*
                 * We are in arc_do_user_evicts().
                 */
                ASSERT(buf->b_data == NULL);
                mutex_exit(&buf->b_evict_lock);
                return (0);
        } else if (buf->b_data == NULL) {
                arc_buf_t copy = *buf; /* structure assignment */
                /*
                 * We are on the eviction list; process this buffer now
                 * but let arc_do_user_evicts() do the reaping.
                 */
                /*
                 * The callback is invoked on the stack copy, after the
                 * evict lock is dropped; the real buf has its b_efunc
                 * cleared first.
                 */
                buf->b_efunc = NULL;
                mutex_exit(&buf->b_evict_lock);
                VERIFY(copy.b_efunc(&copy) == 0);
                return (1);
        }
        /*
         * The buf still has data.  Take the hdr's hash lock (nested inside
         * the evict lock) and re-read b_hdr under it; the assertion checks
         * that the hdr still maps to the lock we took.
         */
        hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
        hdr = buf->b_hdr;
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

        ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
        ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);

        /*
         * Pull this buffer off of the hdr
         */
        /* Walk the singly linked b_next chain to the slot pointing at buf. */
        bufp = &hdr->b_buf;
        while (*bufp != buf)
                bufp = &(*bufp)->b_next;
        *bufp = buf->b_next;

        ASSERT(buf->b_data != NULL);
        arc_buf_destroy(buf, FALSE, FALSE);

        /*
         * If no data buffers remain on the hdr, move it to the ghost
         * state that corresponds to the state it is leaving.
         */
        if (hdr->b_datacnt == 0) {
                arc_state_t *old_state = hdr->b_state;
                arc_state_t *evicted_state;

                ASSERT(hdr->b_buf == NULL);
                ASSERT(refcount_is_zero(&hdr->b_refcnt));

                evicted_state =
                    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

                /* Both state mutexes are held across the state change. */
                mutex_enter(&old_state->arcs_mtx);
                mutex_enter(&evicted_state->arcs_mtx);

                arc_change_state(evicted_state, hdr, hash_lock);
                ASSERT(HDR_IN_HASH_TABLE(hdr));
                /*
                 * NOTE(review): in DEBUG builds the ASSERT above already
                 * guarantees this flag is set, so the OR looks redundant.
                 */
                hdr->b_flags |= ARC_IN_HASH_TABLE;
                hdr->b_flags &= ~ARC_BUF_AVAILABLE;

                mutex_exit(&evicted_state->arcs_mtx);
                mutex_exit(&old_state->arcs_mtx);
        }
        mutex_exit(hash_lock);
        mutex_exit(&buf->b_evict_lock);

        /*
         * The callback runs with both locks dropped.  Afterwards the buf
         * is scrubbed and handed back to buf_cache.
         */
        VERIFY(buf->b_efunc(buf) == 0);
        buf->b_efunc = NULL;
        buf->b_private = NULL;
        buf->b_hdr = NULL;
        buf->b_next = NULL;
        kmem_cache_free(buf_cache, buf);
        return (1);
}
3237 
/*
 * Release this buffer from the cache, making it an anonymous buffer.  This
 * must be done after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 *
 * buf is the buffer to release; tag is the reference tag that is removed
 * from the old hdr and added to the new one when a new hdr is created.
 * Any L2ARC header attached to the old hdr is detached and freed, and the
 * buf's eviction callback and private pointer are cleared.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock = NULL;
        l2arc_buf_hdr_t *l2hdr;
        uint64_t buf_size;

        /*
         * It would be nice to assert that if it's DMU metadata (level >
         * 0 || it's the dnode file), then it must be syncing context.
         * But we don't know that information at this level.
         */

        mutex_enter(&buf->b_evict_lock);
        hdr = buf->b_hdr;

        /* this buffer is not on any list */
        ASSERT(refcount_count(&hdr->b_refcnt) > 0);

        if (hdr->b_state == arc_anon) {
                /* this buffer is already released */
                /* No hash lock is taken here, so hash_lock stays NULL. */
                ASSERT(buf->b_efunc == NULL);
        } else {
                /* Take the hash lock and re-read b_hdr under it. */
                hash_lock = HDR_LOCK(hdr);
                mutex_enter(hash_lock);
                hdr = buf->b_hdr;
                ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        }

        /*
         * Detach any L2ARC header now.  l2arc_buflist_mtx is taken here
         * and stays held until the l2hdr is freed at the bottom of this
         * function.
         */
        l2hdr = hdr->b_l2hdr;
        if (l2hdr) {
                mutex_enter(&l2arc_buflist_mtx);
                hdr->b_l2hdr = NULL;
        }
        /* Saved for the arcstat_l2_size adjustment at the end. */
        buf_size = hdr->b_size;

        /*
         * Do we have more than one buf?
         */
        if (hdr->b_datacnt > 1) {
                arc_buf_hdr_t *nhdr;
                arc_buf_t **bufp;
                /*
                 * Snapshot the fields the new hdr needs; they are used
                 * after the hash lock has been dropped.
                 */
                uint64_t blksz = hdr->b_size;
                uint64_t spa = hdr->b_spa;
                arc_buf_contents_t type = hdr->b_type;
                uint32_t flags = hdr->b_flags;

                ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
                /*
                 * Pull the data off of this hdr and attach it to
                 * a new anonymous hdr.
                 */
                (void) remove_reference(hdr, hash_lock, tag);
                /* Unlink buf from the hdr's b_next chain. */
                bufp = &hdr->b_buf;
                while (*bufp != buf)
                        bufp = &(*bufp)->b_next;
                *bufp = buf->b_next;
                buf->b_next = NULL;

                /*
                 * Subtract this buf's size from the old state's total,
                 * and from its per-type lsize if the hdr is now
                 * unreferenced.
                 */
                ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
                atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
                if (refcount_is_zero(&hdr->b_refcnt)) {
                        uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
                        ASSERT3U(*size, >=, hdr->b_size);
                        atomic_add_64(size, -hdr->b_size);
                }

                /*
                 * We're releasing a duplicate user data buffer, update
                 * our statistics accordingly.
                 */
                if (hdr->b_type == ARC_BUFC_DATA) {
                        ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
                        ARCSTAT_INCR(arcstat_duplicate_buffers_size,
                            -hdr->b_size);
                }
                hdr->b_datacnt -= 1;
                arc_cksum_verify(buf);
                arc_buf_unwatch(buf);

                mutex_exit(hash_lock);

                /*
                 * Build the new anonymous hdr that owns only this buf.
                 * Of the old flags, only ARC_L2_WRITING is carried over.
                 */
                nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
                nhdr->b_size = blksz;
                nhdr->b_spa = spa;
                nhdr->b_type = type;
                nhdr->b_buf = buf;
                nhdr->b_state = arc_anon;
                nhdr->b_arc_access = 0;
                nhdr->b_flags = flags & ARC_L2_WRITING;
                nhdr->b_l2hdr = NULL;
                nhdr->b_datacnt = 1;
                nhdr->b_freeze_cksum = NULL;
                (void) refcount_add(&nhdr->b_refcnt, tag);
                buf->b_hdr = nhdr;
                mutex_exit(&buf->b_evict_lock);
                atomic_add_64(&arc_anon->arcs_size, blksz);
        } else {
                /*
                 * Single buf: keep the hdr, make it anonymous if it is
                 * not already, and drop its identity.
                 */
                mutex_exit(&buf->b_evict_lock);
                ASSERT(refcount_count(&hdr->b_refcnt) == 1);
                ASSERT(!list_link_active(&hdr->b_arc_node));
                ASSERT(!HDR_IO_IN_PROGRESS(hdr));
                if (hdr->b_state != arc_anon)
                        arc_change_state(arc_anon, hdr, hash_lock);
                hdr->b_arc_access = 0;
                /* hash_lock is NULL when the hdr was already anonymous. */
                if (hash_lock)
                        mutex_exit(hash_lock);

                buf_discard_identity(hdr);
                arc_buf_thaw(buf);
        }
        buf->b_efunc = NULL;
        buf->b_private = NULL;

        /*
         * Finish the L2ARC teardown started above: fix up the stats,
         * remove the original hdr from the device's buflist, free the
         * l2hdr and drop l2arc_buflist_mtx.
         */
        if (l2hdr) {
                ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
                list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
                ARCSTAT_INCR(arcstat_l2_size, -buf_size);
                mutex_exit(&l2arc_buflist_mtx);
        }
}
3367 
3368 int
3369 arc_released(arc_buf_t *buf)
3370 {
3371         int released;
3372 
3373         mutex_enter(&buf->b_evict_lock);
3374         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3375         mutex_exit(&buf->b_evict_lock);
3376         return (released);
3377 }
3378 
3379 int
3380 arc_has_callback(arc_buf_t *buf)
3381 {
3382         int callback;
3383 
3384         mutex_enter(&buf->b_evict_lock);
3385         callback = (buf->b_efunc != NULL);
3386         mutex_exit(&buf->b_evict_lock);
3387         return (callback);
3388 }
3389 
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the current reference count of buf's hdr
 * (converted to int), sampled under the buf's evict lock.  A non-zero
 * result means the hdr is referenced.
 */
int
arc_referenced(arc_buf_t *buf)
{
        int holds;

        mutex_enter(&buf->b_evict_lock);
        holds = refcount_count(&buf->b_hdr->b_refcnt);
        mutex_exit(&buf->b_evict_lock);

        return (holds);
}
#endif
3402 
3403 static void
3404 arc_write_ready(zio_t *zio)
3405 {
3406         arc_write_callback_t *callback = zio->io_private;
3407         arc_buf_t *buf = callback->awcb_buf;
3408         arc_buf_hdr_t *hdr = buf->b_hdr;
3409 
3410         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3411         callback->awcb_ready(zio, buf, callback->awcb_private);
3412 
3413         /*
3414          * If the IO is already in progress, then this is a re-write
3415          * attempt, so we need to thaw and re-compute the cksum.
3416          * It is the responsibility of the callback to handle the
3417          * accounting for any re-write attempt.
3418          */
3419         if (HDR_IO_IN_PROGRESS(hdr)) {
3420                 mutex_enter(&hdr->b_freeze_lock);
3421                 if (hdr->b_freeze_cksum != NULL) {
3422                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3423                         hdr->b_freeze_cksum = NULL;
3424                 }
3425                 mutex_exit(&hdr->b_freeze_lock);
3426         }
3427         arc_cksum_compute(buf, B_FALSE);
3428         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3429 }
3430 
/*
 * "Done" stage callback for zios issued by arc_write().
 *
 * On success the hdr takes its identity (DVA, birth, first checksum word)
 * from the zio's block pointer and is inserted into the hash table.  The
 * in-progress flag is cleared, the caller's done callback is invoked and
 * the arc_write_callback_t allocated by arc_write() is freed.
 */
static void
arc_write_done(zio_t *zio)
{
        arc_write_callback_t *callback = zio->io_private;
        arc_buf_t *buf = callback->awcb_buf;
        arc_buf_hdr_t *hdr = buf->b_hdr;

        ASSERT(hdr->b_acb == NULL);

        if (zio->io_error == 0) {
                /* Adopt the identity of the block that was written. */
                hdr->b_dva = *BP_IDENTITY(zio->io_bp);
                hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
                hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
        } else {
                ASSERT(BUF_EMPTY(hdr));
        }

        /*
         * If the block to be written was all-zero, we may have
         * compressed it away.  In this case no write was performed
         * so there will be no dva/birth/checksum.  The buffer must
         * therefore remain anonymous (and uncached).
         */
        if (!BUF_EMPTY(hdr)) {
                arc_buf_hdr_t *exists;
                kmutex_t *hash_lock;

                ASSERT(zio->io_error == 0);

                arc_cksum_verify(buf);

                /*
                 * hash_lock is filled in by buf_hash_insert() and is
                 * exited below on every path through this branch.
                 */
                exists = buf_hash_insert(hdr, &hash_lock);
                if (exists) {
                        /*
                         * This can only happen if we overwrite for
                         * sync-to-convergence, because we remove
                         * buffers from the hash table when we arc_free().
                         */
                        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
                                if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
                                        panic("bad overwrite, hdr=%p exists=%p",
                                            (void *)hdr, (void *)exists);
                                /*
                                 * Evict the stale hdr with the same
                                 * identity, then retry the insert, which
                                 * is now expected to find no conflict.
                                 */
                                ASSERT(refcount_is_zero(&exists->b_refcnt));
                                arc_change_state(arc_anon, exists, hash_lock);
                                mutex_exit(hash_lock);
                                arc_hdr_destroy(exists);
                                exists = buf_hash_insert(hdr, &hash_lock);
                                ASSERT3P(exists, ==, NULL);
                        } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
                                /* nopwrite */
                                ASSERT(zio->io_prop.zp_nopwrite);
                                if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
                                        panic("bad nopwrite, hdr=%p exists=%p",
                                            (void *)hdr, (void *)exists);
                        } else {
                                /* Dedup */
                                ASSERT(hdr->b_datacnt == 1);
                                ASSERT(hdr->b_state == arc_anon);
                                ASSERT(BP_GET_DEDUP(zio->io_bp));
                                ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
                        }
                }
                hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
                /* if it's not anon, we are doing a scrub */
                /*
                 * arc_access() is only called when our hdr was the one
                 * inserted (no pre-existing entry) and it is still anon.
                 */
                if (!exists && hdr->b_state == arc_anon)
                        arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
        } else {
                hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
        }

        ASSERT(!refcount_is_zero(&hdr->b_refcnt));
        callback->awcb_done(zio, buf, callback->awcb_private);

        /* Pairs with the kmem_zalloc() of the same size in arc_write(). */
        kmem_free(callback, sizeof (arc_write_callback_t));
}
3507 
3508 zio_t *
3509 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3510     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3511     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3512     void *private, int priority, int zio_flags, const zbookmark_t *zb)
3513 {
3514         arc_buf_hdr_t *hdr = buf->b_hdr;
3515         arc_write_callback_t *callback;
3516         zio_t *zio;
3517 
3518         ASSERT(ready != NULL);
3519         ASSERT(done != NULL);
3520         ASSERT(!HDR_IO_ERROR(hdr));
3521         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3522         ASSERT(hdr->b_acb == NULL);
3523         if (l2arc)
3524                 hdr->b_flags |= ARC_L2CACHE;
3525         if (l2arc_compress)
3526                 hdr->b_flags |= ARC_L2COMPRESS;
3527         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3528         callback->awcb_ready = ready;
3529         callback->awcb_done = done;
3530         callback->awcb_private = private;
3531         callback->awcb_buf = buf;
3532 
3533         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3534             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3535 
3536         return (zio);
3537 }
3538 
/*
 * Decide whether a write reservation should be throttled because free
 * memory is tight.
 *
 * Returns 0 to let the reservation proceed, ERESTART when the caller
 * should back off and retry, or EAGAIN when memory is low and a reclaim
 * is needed.  Outside the kernel (_KERNEL not defined) this always
 * returns 0.
 *
 * page_load and last_txg are function-local statics; page_load is reset
 * whenever a newer txg is seen.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
        static uint64_t page_load = 0;
        static uint64_t last_txg = 0;
        uint64_t available_memory = ptob(freemem);

#if defined(__i386)
        available_memory =
            MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
        /* Plenty of free memory: no throttling at all. */
        if (available_memory >= zfs_write_limit_max)
                return (0);

        if (txg > last_txg) {
                last_txg = txg;
                page_load = 0;
        }

        /*
         * The pageout process only runs when memory is already tight and
         * the ARC is about to evict, so let its page writes through as
         * fast as possible, up to a quarter of the larger of minfree and
         * the available memory.
         */
        if (curproc == proc_pageout) {
                if (page_load > MAX(ptob(minfree), available_memory) / 4)
                        return (SET_ERROR(ERESTART));
                /* Note: reserve is inflated, so we deflate */
                page_load += reserve / 8;
                return (0);
        }

        if (page_load > 0 && arc_reclaim_needed()) {
                /* memory is low, delay before restarting */
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                return (SET_ERROR(EAGAIN));
        }
        page_load = 0;

        /* Count what the ARC could evict above its minimum as available. */
        if (arc_size > arc_c_min) {
                uint64_t evictable_memory =
                    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
                    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
                    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
                    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];

                available_memory += MIN(evictable_memory, arc_size - arc_c_min);
        }

        if (inflight_data > available_memory / 4) {
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                return (SET_ERROR(ERESTART));
        }
#endif
        return (0);
}
3592 
/*
 * Give back a temporary reservation by atomically subtracting reserve
 * from the global arc_tempreserve counter (the counterpart of the
 * atomic add performed by arc_tempreserve_space()).
 */
void
arc_tempreserve_clear(uint64_t reserve)
{
        /* Negating the unsigned value yields the wrap-around delta to add. */
        atomic_add_64(&arc_tempreserve, -reserve);
        /* Viewed as a signed quantity, the counter must never go negative. */
        ASSERT((int64_t)arc_tempreserve >= 0);
}
3599 
3600 int
3601 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3602 {
3603         int error;
3604         uint64_t anon_size;
3605 
3606 #ifdef ZFS_DEBUG
3607         /*
3608          * Once in a while, fail for no reason.  Everything should cope.
3609          */
3610         if (spa_get_random(10000) == 0) {
3611                 dprintf("forcing random failure\n");
3612                 return (SET_ERROR(ERESTART));
3613         }
3614 #endif
3615         if (reserve > arc_c/4 && !arc_no_grow)
3616                 arc_c = MIN(arc_c_max, reserve * 4);
3617         if (reserve > arc_c)
3618                 return (SET_ERROR(ENOMEM));
3619 
3620         /*
3621          * Don't count loaned bufs as in flight dirty data to prevent long
3622          * network delays from blocking transactions that are ready to be
3623          * assigned to a txg.
3624          */
3625         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3626 
3627         /*
3628          * Writes will, almost always, require additional memory allocations
3629          * in order to compress/encrypt/etc the data.  We therefore need to
3630          * make sure that there is sufficient available memory for this.
3631          */
3632         if (error = arc_memory_throttle(reserve, anon_size, txg))
3633                 return (error);
3634 
3635         /*
3636          * Throttle writes when the amount of dirty data in the cache
3637          * gets too large.  We try to keep the cache less than half full
3638          * of dirty blocks so that our sync times don't grow too large.
3639          * Note: if two requests come in concurrently, we might let them
3640          * both succeed, when one of them should fail.  Not a huge deal.
3641          */
3642 
3643         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3644             anon_size > arc_c / 4) {
3645                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3646                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3647                     arc_tempreserve>>10,
3648                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3649                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3650                     reserve>>10, arc_c>>10);
3651                 return (SET_ERROR(ERESTART));
3652         }
3653         atomic_add_64(&arc_tempreserve, reserve);
3654         return (0);
3655 }
3656 
3657 void
3658 arc_init(void)
3659 {
3660         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3661         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3662 
3663         /* Convert seconds to clock ticks */
3664         arc_min_prefetch_lifespan = 1 * hz;
3665 
3666         /* Start out with 1/8 of all memory */
3667         arc_c = physmem * PAGESIZE / 8;
3668 
3669 #ifdef _KERNEL
3670         /*
3671          * On architectures where the physical memory can be larger
3672          * than the addressable space (intel in 32-bit mode), we may
3673          * need to limit the cache to 1/8 of VM size.
3674          */
3675         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3676 #endif
3677 
3678         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3679         arc_c_min = MAX(arc_c / 4, 64<<20);
3680         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3681         if (arc_c * 8 >= 1<<30)
3682                 arc_c_max = (arc_c * 8) - (1<<30);
3683         else
3684                 arc_c_max = arc_c_min;
3685         arc_c_max = MAX(arc_c * 6, arc_c_max);
3686 
3687         /*
3688          * Allow the tunables to override our calculations if they are
3689          * reasonable (ie. over 64MB)
3690          */
3691         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3692                 arc_c_max = zfs_arc_max;
3693         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3694                 arc_c_min = zfs_arc_min;
3695 
3696         arc_c = arc_c_max;
3697         arc_p = (arc_c >> 1);
3698 
3699         /* limit meta-data to 1/4 of the arc capacity */
3700         arc_meta_limit = arc_c_max / 4;
3701 
3702         /* Allow the tunable to override if it is reasonable */
3703         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3704                 arc_meta_limit = zfs_arc_meta_limit;
3705 
3706         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3707                 arc_c_min = arc_meta_limit / 2;
3708 
3709         if (zfs_arc_grow_retry > 0)
3710                 arc_grow_retry = zfs_arc_grow_retry;
3711 
3712         if (zfs_arc_shrink_shift > 0)
3713                 arc_shrink_shift = zfs_arc_shrink_shift;
3714 
3715         if (zfs_arc_p_min_shift > 0)
3716                 arc_p_min_shift = zfs_arc_p_min_shift;
3717 
3718         /* if kmem_flags are set, lets try to use less memory */
3719         if (kmem_debugging())
3720                 arc_c = arc_c / 2;
3721         if (arc_c < arc_c_min)
3722                 arc_c = arc_c_min;
3723 
3724         arc_anon = &ARC_anon;
3725         arc_mru = &ARC_mru;
3726         arc_mru_ghost = &ARC_mru_ghost;
3727         arc_mfu = &ARC_mfu;
3728         arc_mfu_ghost = &ARC_mfu_ghost;
3729         arc_l2c_only = &ARC_l2c_only;
3730         arc_size = 0;
3731 
3732         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3733         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3734         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3735         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3736         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3737         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3738 
3739         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3740             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3741         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3742             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3743         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3744             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3745         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3746             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3747         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3748             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3749         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3750             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3751         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3752             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3753         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3754             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3755         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3756             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3757         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3758             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3759 
3760         buf_init();
3761 
3762         arc_thread_exit = 0;
3763         arc_eviction_list = NULL;
3764         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3765         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3766 
3767         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3768             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3769 
3770         if (arc_ksp != NULL) {
3771                 arc_ksp->ks_data = &arc_stats;
3772                 kstat_install(arc_ksp);
3773         }
3774 
3775         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776             TS_RUN, minclsyspri);
3777 
3778         arc_dead = FALSE;
3779         arc_warm = B_FALSE;
3780 
3781         if (zfs_write_limit_max == 0)
3782                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3783         else
3784                 zfs_write_limit_shift = 0;
3785         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3786 }
3787 
3788 void
3789 arc_fini(void)
3790 {
3791         mutex_enter(&arc_reclaim_thr_lock);
3792         arc_thread_exit = 1;
3793         while (arc_thread_exit != 0)
3794                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3795         mutex_exit(&arc_reclaim_thr_lock);
3796 
3797         arc_flush(NULL);
3798 
3799         arc_dead = TRUE;
3800 
3801         if (arc_ksp != NULL) {
3802                 kstat_delete(arc_ksp);
3803                 arc_ksp = NULL;
3804         }
3805 
3806         mutex_destroy(&arc_eviction_mtx);
3807         mutex_destroy(&arc_reclaim_thr_lock);
3808         cv_destroy(&arc_reclaim_thr_cv);
3809 
3810         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3811         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3812         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3813         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3814         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3815         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3816         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3817         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3818 
3819         mutex_destroy(&arc_anon->arcs_mtx);
3820         mutex_destroy(&arc_mru->arcs_mtx);
3821         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3822         mutex_destroy(&arc_mfu->arcs_mtx);
3823         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3824         mutex_destroy(&arc_l2c_only->arcs_mtx);
3825 
3826         mutex_destroy(&zfs_write_limit_lock);
3827 
3828         buf_fini();
3829 
3830         ASSERT(arc_loaned_bytes == 0);
3831 }
3832 
3833 /*
3834  * Level 2 ARC
3835  *
3836  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3837  * It uses dedicated storage devices to hold cached data, which are populated
3838  * using large infrequent writes.  The main role of this cache is to boost
3839  * the performance of random read workloads.  The intended L2ARC devices
3840  * include short-stroked disks, solid state disks, and other media with
3841  * substantially faster read latency than disk.
3842  *
3843  *                 +-----------------------+
3844  *                 |         ARC           |
3845  *                 +-----------------------+
3846  *                    |         ^     ^
3847  *                    |         |     |
3848  *      l2arc_feed_thread()    arc_read()
3849  *                    |         |     |
3850  *                    |  l2arc read   |
3851  *                    V         |     |
3852  *               +---------------+    |
3853  *               |     L2ARC     |    |
3854  *               +---------------+    |
3855  *                   |    ^           |
3856  *          l2arc_write() |           |
3857  *                   |    |           |
3858  *                   V    |           |
3859  *                 +-------+      +-------+
3860  *                 | vdev  |      | vdev  |
3861  *                 | cache |      | cache |
3862  *                 +-------+      +-------+
3863  *                 +=========+     .-----.
3864  *                 :  L2ARC  :    |-_____-|
3865  *                 : devices :    | Disks |
3866  *                 +=========+    `-_____-'
3867  *
3868  * Read requests are satisfied from the following sources, in order:
3869  *
3870  *      1) ARC
3871  *      2) vdev cache of L2ARC devices
3872  *      3) L2ARC devices
3873  *      4) vdev cache of disks
3874  *      5) disks
3875  *
3876  * Some L2ARC device types exhibit extremely slow write performance.
3877  * To accommodate this, there are some significant differences between
3878  * the L2ARC and traditional cache design:
3879  *
3880  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3881  * the ARC behave as usual, freeing buffers and placing headers on ghost
3882  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3883  * this would add inflated write latencies for all ARC memory pressure.
3884  *
3885  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3886  * It does this by periodically scanning buffers from the eviction-end of
3887  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3888  * not already there. It scans until a headroom of buffers is satisfied,
3889  * which itself is a buffer for ARC eviction. If a compressible buffer is
3890  * found during scanning and selected for writing to an L2ARC device, we
3891  * temporarily boost scanning headroom during the next scan cycle to make
3892  * sure we adapt to compression effects (which might significantly reduce
3893  * the data volume we write to L2ARC). The thread that does this is
3894  * l2arc_feed_thread(), illustrated below; example sizes are included to
3895  * provide a better sense of ratio than this diagram:
3896  *
3897  *             head -->                        tail
3898  *              +---------------------+----------+
3899  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3900  *              +---------------------+----------+   |   o L2ARC eligible
3901  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3902  *              +---------------------+----------+   |
3903  *                   15.9 Gbytes      ^ 32 Mbytes    |
3904  *                                 headroom          |
3905  *                                            l2arc_feed_thread()
3906  *                                                   |
3907  *                       l2arc write hand <--[oooo]--'
3908  *                               |           8 Mbyte
3909  *                               |          write max
3910  *                               V
3911  *                +==============================+
3912  *      L2ARC dev |####|#|###|###|    |####| ... |
3913  *                +==============================+
3914  *                           32 Gbytes
3915  *
3916  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3917  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3918  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3919  * safe to say that this is an uncommon case, since buffers at the end of
3920  * the ARC lists have moved there due to inactivity.
3921  *
3922  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3923  * then the L2ARC simply misses copying some buffers.  This serves as a
3924  * pressure valve to prevent heavy read workloads from both stalling the ARC
3925  * with waits and clogging the L2ARC with writes.  This also helps prevent
3926  * the potential for the L2ARC to churn if it attempts to cache content too
3927  * quickly, such as during backups of the entire pool.
3928  *
3929  * 5. After system boot and before the ARC has filled main memory, there are
3930  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3931  * lists can remain mostly static.  Instead of searching from the tail of these
3932  * lists as pictured, the l2arc_feed_thread() will search from the list heads
3933  * for eligible buffers, greatly increasing its chance of finding them.
3934  *
3935  * The L2ARC device write speed is also boosted during this time so that
3936  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3937  * there are no L2ARC reads, and no fear of degrading read performance
3938  * through increased writes.
3939  *
3940  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3941  * the vdev queue can aggregate them into larger and fewer writes.  Each
3942  * device is written to in a rotor fashion, sweeping writes through
3943  * available space then repeating.
3944  *
3945  * 7. The L2ARC does not store dirty content.  It never needs to flush
3946  * write buffers back to disk based storage.
3947  *
3948  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3949  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3950  *
3951  * The performance of the L2ARC can be tweaked by a number of tunables, which
3952  * may be necessary for different workloads:
3953  *
3954  *      l2arc_write_max         max write bytes per interval
3955  *      l2arc_write_boost       extra write bytes during device warmup
3956  *      l2arc_noprefetch        skip caching prefetched buffers
3957  *      l2arc_headroom          number of max device writes to precache
3958  *      l2arc_headroom_boost    when we find compressed buffers during ARC
3959  *                              scanning, we multiply headroom by this
3960  *                              percentage factor for the next scan cycle,
3961  *                              since more compressed buffers are likely to
3962  *                              be present
3963  *      l2arc_feed_secs         seconds between L2ARC writing
3964  *
3965  * Tunables may be removed or added as future performance improvements are
3966  * integrated, and also may become zpool properties.
3967  *
3968  * There are three key functions that control how the L2ARC warms up:
3969  *
3970  *      l2arc_write_eligible()  check if a buffer is eligible to cache
3971  *      l2arc_write_size()      calculate how much to write
3972  *      l2arc_write_interval()  calculate sleep delay between writes
3973  *
3974  * These three functions determine what to write, how much, and how quickly
3975  * to send writes.
3976  */
3977 
3978 static boolean_t
3979 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3980 {
3981         /*
3982          * A buffer is *not* eligible for the L2ARC if it:
3983          * 1. belongs to a different spa.
3984          * 2. is already cached on the L2ARC.
3985          * 3. has an I/O in progress (it may be an incomplete read).
3986          * 4. is flagged not eligible (zfs property).
3987          */
3988         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3989             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3990                 return (B_FALSE);
3991 
3992         return (B_TRUE);
3993 }
3994 
3995 static uint64_t
3996 l2arc_write_size(void)
3997 {
3998         uint64_t size;
3999 
4000         /*
4001          * Make sure our globals have meaningful values in case the user
4002          * altered them.
4003          */
4004         size = l2arc_write_max;
4005         if (size == 0) {
4006                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4007                     "be greater than zero, resetting it to the default (%d)",
4008                     L2ARC_WRITE_SIZE);
4009                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4010         }
4011 
4012         if (arc_warm == B_FALSE)
4013                 size += l2arc_write_boost;
4014 
4015         return (size);
4016 
4017 }
4018 
4019 static clock_t
4020 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4021 {
4022         clock_t interval, next, now;
4023 
4024         /*
4025          * If the ARC lists are busy, increase our write rate; if the
4026          * lists are stale, idle back.  This is achieved by checking
4027          * how much we previously wrote - if it was more than half of
4028          * what we wanted, schedule the next write much sooner.
4029          */
4030         if (l2arc_feed_again && wrote > (wanted / 2))
4031                 interval = (hz * l2arc_feed_min_ms) / 1000;
4032         else
4033                 interval = hz * l2arc_feed_secs;
4034 
4035         now = ddi_get_lbolt();
4036         next = MAX(now, MIN(now + interval, began + interval));
4037 
4038         return (next);
4039 }
4040 
4041 static void
4042 l2arc_hdr_stat_add(void)
4043 {
4044         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4045         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4046 }
4047 
4048 static void
4049 l2arc_hdr_stat_remove(void)
4050 {
4051         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4052         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4053 }
4054 
4055 /*
4056  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4057  * If a device is returned, this also returns holding the spa config lock.
4058  */
4059 static l2arc_dev_t *
4060 l2arc_dev_get_next(void)
4061 {
4062         l2arc_dev_t *first, *next = NULL;
4063 
4064         /*
4065          * Lock out the removal of spas (spa_namespace_lock), then removal
4066          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4067          * both locks will be dropped and a spa config lock held instead.
4068          */
4069         mutex_enter(&spa_namespace_lock);
4070         mutex_enter(&l2arc_dev_mtx);
4071 
4072         /* if there are no vdevs, there is nothing to do */
4073         if (l2arc_ndev == 0)
4074                 goto out;
4075 
4076         first = NULL;
4077         next = l2arc_dev_last;
4078         do {
4079                 /* loop around the list looking for a non-faulted vdev */
4080                 if (next == NULL) {
4081                         next = list_head(l2arc_dev_list);
4082                 } else {
4083                         next = list_next(l2arc_dev_list, next);
4084                         if (next == NULL)
4085                                 next = list_head(l2arc_dev_list);
4086                 }
4087 
4088                 /* if we have come back to the start, bail out */
4089                 if (first == NULL)
4090                         first = next;
4091                 else if (next == first)
4092                         break;
4093 
4094         } while (vdev_is_dead(next->l2ad_vdev));
4095 
4096         /* if we were unable to find any usable vdevs, return NULL */
4097         if (vdev_is_dead(next->l2ad_vdev))
4098                 next = NULL;
4099 
4100         l2arc_dev_last = next;
4101 
4102 out:
4103         mutex_exit(&l2arc_dev_mtx);
4104 
4105         /*
4106          * Grab the config lock to prevent the 'next' device from being
4107          * removed while we are writing to it.
4108          */
4109         if (next != NULL)
4110                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111         mutex_exit(&spa_namespace_lock);
4112 
4113         return (next);
4114 }
4115 
4116 /*
4117  * Free buffers that were tagged for destruction.
4118  */
4119 static void
4120 l2arc_do_free_on_write()
4121 {
4122         list_t *buflist;
4123         l2arc_data_free_t *df, *df_prev;
4124 
4125         mutex_enter(&l2arc_free_on_write_mtx);
4126         buflist = l2arc_free_on_write;
4127 
4128         for (df = list_tail(buflist); df; df = df_prev) {
4129                 df_prev = list_prev(buflist, df);
4130                 ASSERT(df->l2df_data != NULL);
4131                 ASSERT(df->l2df_func != NULL);
4132                 df->l2df_func(df->l2df_data, df->l2df_size);
4133                 list_remove(buflist, df);
4134                 kmem_free(df, sizeof (l2arc_data_free_t));
4135         }
4136 
4137         mutex_exit(&l2arc_free_on_write_mtx);
4138 }
4139 
4140 /*
4141  * A write to a cache device has completed.  Update all headers to allow
4142  * reads from these buffers to begin.
4143  */
4144 static void
4145 l2arc_write_done(zio_t *zio)
4146 {
4147         l2arc_write_callback_t *cb;
4148         l2arc_dev_t *dev;
4149         list_t *buflist;
4150         arc_buf_hdr_t *head, *ab, *ab_prev;
4151         l2arc_buf_hdr_t *abl2;
4152         kmutex_t *hash_lock;
4153 
4154         cb = zio->io_private;
4155         ASSERT(cb != NULL);
4156         dev = cb->l2wcb_dev;
4157         ASSERT(dev != NULL);
4158         head = cb->l2wcb_head;
4159         ASSERT(head != NULL);
4160         buflist = dev->l2ad_buflist;
4161         ASSERT(buflist != NULL);
4162         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163             l2arc_write_callback_t *, cb);
4164 
4165         if (zio->io_error != 0)
4166                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 
4168         mutex_enter(&l2arc_buflist_mtx);
4169 
4170         /*
4171          * All writes completed, or an error was hit.
4172          */
4173         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174                 ab_prev = list_prev(buflist, ab);
4175 
4176                 hash_lock = HDR_LOCK(ab);
4177                 if (!mutex_tryenter(hash_lock)) {
4178                         /*
4179                          * This buffer misses out.  It may be in a stage
4180                          * of eviction.  Its ARC_L2_WRITING flag will be
4181                          * left set, denying reads to this buffer.
4182                          */
4183                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184                         continue;
4185                 }
4186 
4187                 abl2 = ab->b_l2hdr;
4188 
4189                 /*
4190                  * Release the temporary compressed buffer as soon as possible.
4191                  */
4192                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193                         l2arc_release_cdata_buf(ab);
4194 
4195                 if (zio->io_error != 0) {
4196                         /*
4197                          * Error - drop L2ARC entry.
4198                          */
4199                         list_remove(buflist, ab);
4200                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201                         ab->b_l2hdr = NULL;
4202                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204                 }
4205 
4206                 /*
4207                  * Allow ARC to begin reads to this L2ARC entry.
4208                  */
4209                 ab->b_flags &= ~ARC_L2_WRITING;
4210 
4211                 mutex_exit(hash_lock);
4212         }
4213 
4214         atomic_inc_64(&l2arc_writes_done);
4215         list_remove(buflist, head);
4216         kmem_cache_free(hdr_cache, head);
4217         mutex_exit(&l2arc_buflist_mtx);
4218 
4219         l2arc_do_free_on_write();
4220 
4221         kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 }
4223 
4224 /*
4225  * A read to a cache device completed.  Validate buffer contents before
4226  * handing over to the regular ARC routines.
4227  */
4228 static void
4229 l2arc_read_done(zio_t *zio)
4230 {
4231         l2arc_read_callback_t *cb;
4232         arc_buf_hdr_t *hdr;
4233         arc_buf_t *buf;
4234         kmutex_t *hash_lock;
4235         int equal;
4236 
4237         ASSERT(zio->io_vd != NULL);
4238         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4239 
4240         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4241 
4242         cb = zio->io_private;
4243         ASSERT(cb != NULL);
4244         buf = cb->l2rcb_buf;
4245         ASSERT(buf != NULL);
4246 
4247         hash_lock = HDR_LOCK(buf->b_hdr);
4248         mutex_enter(hash_lock);
4249         hdr = buf->b_hdr;
4250         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4251 
4252         /*
4253          * If the buffer was compressed, decompress it first.
4254          */
4255         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4256                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4257         ASSERT(zio->io_data != NULL);
4258 
4259         /*
4260          * Check this survived the L2ARC journey.
4261          */
4262         equal = arc_cksum_equal(buf);
4263         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4264                 mutex_exit(hash_lock);
4265                 zio->io_private = buf;
4266                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4267                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4268                 arc_read_done(zio);
4269         } else {
4270                 mutex_exit(hash_lock);
4271                 /*
4272                  * Buffer didn't survive caching.  Increment stats and
4273                  * reissue to the original storage device.
4274                  */
4275                 if (zio->io_error != 0) {
4276                         ARCSTAT_BUMP(arcstat_l2_io_error);
4277                 } else {
4278                         zio->io_error = SET_ERROR(EIO);
4279                 }
4280                 if (!equal)
4281                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4282 
4283                 /*
4284                  * If there's no waiter, issue an async i/o to the primary
4285                  * storage now.  If there *is* a waiter, the caller must
4286                  * issue the i/o in a context where it's OK to block.
4287                  */
4288                 if (zio->io_waiter == NULL) {
4289                         zio_t *pio = zio_unique_parent(zio);
4290 
4291                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4292 
4293                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4294                             buf->b_data, zio->io_size, arc_read_done, buf,
4295                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4296                 }
4297         }
4298 
4299         kmem_free(cb, sizeof (l2arc_read_callback_t));
4300 }
4301 
4302 /*
4303  * This is the list priority from which the L2ARC will search for pages to
4304  * cache.  This is used within loops (0..3) to cycle through lists in the
4305  * desired order.  This order can have a significant effect on cache
4306  * performance.
4307  *
4308  * Currently the metadata lists are hit first, MFU then MRU, followed by
4309  * the data lists.  This function returns a locked list, and also returns
4310  * the lock pointer.
4311  */
4312 static list_t *
4313 l2arc_list_locked(int list_num, kmutex_t **lock)
4314 {
4315         list_t *list = NULL;
4316 
4317         ASSERT(list_num >= 0 && list_num <= 3);
4318 
4319         switch (list_num) {
4320         case 0:
4321                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4322                 *lock = &arc_mfu->arcs_mtx;
4323                 break;
4324         case 1:
4325                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4326                 *lock = &arc_mru->arcs_mtx;
4327                 break;
4328         case 2:
4329                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4330                 *lock = &arc_mfu->arcs_mtx;
4331                 break;
4332         case 3:
4333                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4334                 *lock = &arc_mru->arcs_mtx;
4335                 break;
4336         }
4337 
4338         ASSERT(!(MUTEX_HELD(*lock)));
4339         mutex_enter(*lock);
4340         return (list);
4341 }
4342 
4343 /*
4344  * Evict buffers from the device write hand to the distance specified in
4345  * bytes.  This distance may span populated buffers, it may span nothing.
4346  * This is clearing a region on the L2ARC device ready for writing.
4347  * If the 'all' boolean is set, every buffer is evicted.
4348  */
4349 static void
4350 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4351 {
4352         list_t *buflist;
4353         l2arc_buf_hdr_t *abl2;
4354         arc_buf_hdr_t *ab, *ab_prev;
4355         kmutex_t *hash_lock;
4356         uint64_t taddr;
4357 
4358         buflist = dev->l2ad_buflist;
4359 
4360         if (buflist == NULL)
4361                 return;
4362 
4363         if (!all && dev->l2ad_first) {
4364                 /*
4365                  * This is the first sweep through the device.  There is
4366                  * nothing to evict.
4367                  */
4368                 return;
4369         }
4370 
4371         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4372                 /*
4373                  * When nearing the end of the device, evict to the end
4374                  * before the device write hand jumps to the start.
4375                  */
4376                 taddr = dev->l2ad_end;
4377         } else {
4378                 taddr = dev->l2ad_hand + distance;
4379         }
4380         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4381             uint64_t, taddr, boolean_t, all);
4382 
4383 top:
4384         mutex_enter(&l2arc_buflist_mtx);
4385         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4386                 ab_prev = list_prev(buflist, ab);
4387 
4388                 hash_lock = HDR_LOCK(ab);
4389                 if (!mutex_tryenter(hash_lock)) {
4390                         /*
4391                          * Missed the hash lock.  Retry.
4392                          */
4393                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4394                         mutex_exit(&l2arc_buflist_mtx);
4395                         mutex_enter(hash_lock);
4396                         mutex_exit(hash_lock);
4397                         goto top;
4398                 }
4399 
4400                 if (HDR_L2_WRITE_HEAD(ab)) {
4401                         /*
4402                          * We hit a write head node.  Leave it for
4403                          * l2arc_write_done().
4404                          */
4405                         list_remove(buflist, ab);
4406                         mutex_exit(hash_lock);
4407                         continue;
4408                 }
4409 
4410                 if (!all && ab->b_l2hdr != NULL &&
4411                     (ab->b_l2hdr->b_daddr > taddr ||
4412                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4413                         /*
4414                          * We've evicted to the target address,
4415                          * or the end of the device.
4416                          */
4417                         mutex_exit(hash_lock);
4418                         break;
4419                 }
4420 
4421                 if (HDR_FREE_IN_PROGRESS(ab)) {
4422                         /*
4423                          * Already on the path to destruction.
4424                          */
4425                         mutex_exit(hash_lock);
4426                         continue;
4427                 }
4428 
4429                 if (ab->b_state == arc_l2c_only) {
4430                         ASSERT(!HDR_L2_READING(ab));
4431                         /*
4432                          * This doesn't exist in the ARC.  Destroy.
4433                          * arc_hdr_destroy() will call list_remove()
4434                          * and decrement arcstat_l2_size.
4435                          */
4436                         arc_change_state(arc_anon, ab, hash_lock);
4437                         arc_hdr_destroy(ab);
4438                 } else {
4439                         /*
4440                          * Invalidate issued or about to be issued
4441                          * reads, since we may be about to write
4442                          * over this location.
4443                          */
4444                         if (HDR_L2_READING(ab)) {
4445                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4446                                 ab->b_flags |= ARC_L2_EVICTED;
4447                         }
4448 
4449                         /*
4450                          * Tell ARC this no longer exists in L2ARC.
4451                          */
4452                         if (ab->b_l2hdr != NULL) {
4453                                 abl2 = ab->b_l2hdr;
4454                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4455                                 ab->b_l2hdr = NULL;
4456                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4457                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4458                         }
4459                         list_remove(buflist, ab);
4460 
4461                         /*
4462                          * This may have been leftover after a
4463                          * failed write.
4464                          */
4465                         ab->b_flags &= ~ARC_L2_WRITING;
4466                 }
4467                 mutex_exit(hash_lock);
4468         }
4469         mutex_exit(&l2arc_buflist_mtx);
4470 
4471         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4472         dev->l2ad_evict = taddr;
4473 }
4474 
4475 /*
4476  * Find and write ARC buffers to the L2ARC device.
4477  *
4478  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4479  * for reading until they have completed writing.
4480  * The headroom_boost is an in-out parameter used to maintain headroom boost
4481  * state between calls to this function.
4482  *
4483  * Returns the number of bytes actually written (which may be smaller than
4484  * the delta by which the device hand has changed due to alignment).
4485  */
4486 static uint64_t
4487 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488     boolean_t *headroom_boost)
4489 {
4490         arc_buf_hdr_t *ab, *ab_prev, *head;
4491         list_t *list;
4492         uint64_t write_asize, write_psize, write_sz, headroom,
4493             buf_compress_minsz;
4494         void *buf_data;
4495         kmutex_t *list_lock;
4496         boolean_t full;
4497         l2arc_write_callback_t *cb;
4498         zio_t *pio, *wzio;
4499         uint64_t guid = spa_load_guid(spa);
4500         const boolean_t do_headroom_boost = *headroom_boost;
4501 
4502         ASSERT(dev->l2ad_vdev != NULL);
4503 
4504         /* Lower the flag now, we might want to raise it again later. */
4505         *headroom_boost = B_FALSE;
4506 
4507         pio = NULL;
4508         write_sz = write_asize = write_psize = 0;
4509         full = B_FALSE;
4510         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511         head->b_flags |= ARC_L2_WRITE_HEAD;
4512 
4513         /*
4514          * We will want to try to compress buffers that are at least 2x the
4515          * device sector size.
4516          */
4517         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518 
4519         /*
4520          * Copy buffers for L2ARC writing.
4521          */
4522         mutex_enter(&l2arc_buflist_mtx);
4523         for (int try = 0; try <= 3; try++) {
4524                 uint64_t passed_sz = 0;
4525 
4526                 list = l2arc_list_locked(try, &list_lock);
4527 
4528                 /*
4529                  * L2ARC fast warmup.
4530                  *
4531                  * Until the ARC is warm and starts to evict, read from the
4532                  * head of the ARC lists rather than the tail.
4533                  */
4534                 if (arc_warm == B_FALSE)
4535                         ab = list_head(list);
4536                 else
4537                         ab = list_tail(list);
4538 
4539                 headroom = target_sz * l2arc_headroom;
4540                 if (do_headroom_boost)
4541                         headroom = (headroom * l2arc_headroom_boost) / 100;
4542 
4543                 for (; ab; ab = ab_prev) {
4544                         l2arc_buf_hdr_t *l2hdr;
4545                         kmutex_t *hash_lock;
4546                         uint64_t buf_sz;
4547 
4548                         if (arc_warm == B_FALSE)
4549                                 ab_prev = list_next(list, ab);
4550                         else
4551                                 ab_prev = list_prev(list, ab);
4552 
4553                         hash_lock = HDR_LOCK(ab);
4554                         if (!mutex_tryenter(hash_lock)) {
4555                                 /*
4556                                  * Skip this buffer rather than waiting.
4557                                  */
4558                                 continue;
4559                         }
4560 
4561                         passed_sz += ab->b_size;
4562                         if (passed_sz > headroom) {
4563                                 /*
4564                                  * Searched too far.
4565                                  */
4566                                 mutex_exit(hash_lock);
4567                                 break;
4568                         }
4569 
4570                         if (!l2arc_write_eligible(guid, ab)) {
4571                                 mutex_exit(hash_lock);
4572                                 continue;
4573                         }
4574 
4575                         if ((write_sz + ab->b_size) > target_sz) {
4576                                 full = B_TRUE;
4577                                 mutex_exit(hash_lock);
4578                                 break;
4579                         }
4580 
4581                         if (pio == NULL) {
4582                                 /*
4583                                  * Insert a dummy header on the buflist so
4584                                  * l2arc_write_done() can find where the
4585                                  * write buffers begin without searching.
4586                                  */
4587                                 list_insert_head(dev->l2ad_buflist, head);
4588 
4589                                 cb = kmem_alloc(
4590                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4591                                 cb->l2wcb_dev = dev;
4592                                 cb->l2wcb_head = head;
4593                                 pio = zio_root(spa, l2arc_write_done, cb,
4594                                     ZIO_FLAG_CANFAIL);
4595                         }
4596 
4597                         /*
4598                          * Create and add a new L2ARC header.
4599                          */
4600                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601                         l2hdr->b_dev = dev;
4602                         ab->b_flags |= ARC_L2_WRITING;
4603 
4604                         /*
4605                          * Temporarily stash the data buffer in b_tmp_cdata.
4606                          * The subsequent write step will pick it up from
4607                          * there. This is because can't access ab->b_buf
4608                          * without holding the hash_lock, which we in turn
4609                          * can't access without holding the ARC list locks
4610                          * (which we want to avoid during compression/writing).
4611                          */
4612                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613                         l2hdr->b_asize = ab->b_size;
4614                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4615 
4616                         buf_sz = ab->b_size;
4617                         ab->b_l2hdr = l2hdr;
4618 
4619                         list_insert_head(dev->l2ad_buflist, ab);
4620 
4621                         /*
4622                          * Compute and store the buffer cksum before
4623                          * writing.  On debug the cksum is verified first.
4624                          */
4625                         arc_cksum_verify(ab->b_buf);
4626                         arc_cksum_compute(ab->b_buf, B_TRUE);
4627 
4628                         mutex_exit(hash_lock);
4629 
4630                         write_sz += buf_sz;
4631                 }
4632 
4633                 mutex_exit(list_lock);
4634 
4635                 if (full == B_TRUE)
4636                         break;
4637         }
4638 
4639         /* No buffers selected for writing? */
4640         if (pio == NULL) {
4641                 ASSERT0(write_sz);
4642                 mutex_exit(&l2arc_buflist_mtx);
4643                 kmem_cache_free(hdr_cache, head);
4644                 return (0);
4645         }
4646 
4647         /*
4648          * Now start writing the buffers. We're starting at the write head
4649          * and work backwards, retracing the course of the buffer selector
4650          * loop above.
4651          */
4652         for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653             ab = list_prev(dev->l2ad_buflist, ab)) {
4654                 l2arc_buf_hdr_t *l2hdr;
4655                 uint64_t buf_sz;
4656 
4657                 /*
4658                  * We shouldn't need to lock the buffer here, since we flagged
4659                  * it as ARC_L2_WRITING in the previous step, but we must take
4660                  * care to only access its L2 cache parameters. In particular,
4661                  * ab->b_buf may be invalid by now due to ARC eviction.
4662                  */
4663                 l2hdr = ab->b_l2hdr;
4664                 l2hdr->b_daddr = dev->l2ad_hand;
4665 
4666                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667                     l2hdr->b_asize >= buf_compress_minsz) {
4668                         if (l2arc_compress_buf(l2hdr)) {
4669                                 /*
4670                                  * If compression succeeded, enable headroom
4671                                  * boost on the next scan cycle.
4672                                  */
4673                                 *headroom_boost = B_TRUE;
4674                         }
4675                 }
4676 
4677                 /*
4678                  * Pick up the buffer data we had previously stashed away
4679                  * (and now potentially also compressed).
4680                  */
4681                 buf_data = l2hdr->b_tmp_cdata;
4682                 buf_sz = l2hdr->b_asize;
4683 
4684                 /* Compression may have squashed the buffer to zero length. */
4685                 if (buf_sz != 0) {
4686                         uint64_t buf_p_sz;
4687 
4688                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691                             ZIO_FLAG_CANFAIL, B_FALSE);
4692 
4693                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694                             zio_t *, wzio);
4695                         (void) zio_nowait(wzio);
4696 
4697                         write_asize += buf_sz;
4698                         /*
4699                          * Keep the clock hand suitably device-aligned.
4700                          */
4701                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702                         write_psize += buf_p_sz;
4703                         dev->l2ad_hand += buf_p_sz;
4704                 }
4705         }
4706 
4707         mutex_exit(&l2arc_buflist_mtx);
4708 
4709         ASSERT3U(write_asize, <=, target_sz);
4710         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715 
4716         /*
4717          * Bump device hand to the device start if it is approaching the end.
4718          * l2arc_evict() will already have evicted ahead for this case.
4719          */
4720         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721                 vdev_space_update(dev->l2ad_vdev,
4722                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723                 dev->l2ad_hand = dev->l2ad_start;
4724                 dev->l2ad_evict = dev->l2ad_start;
4725                 dev->l2ad_first = B_FALSE;
4726         }
4727 
4728         dev->l2ad_writing = B_TRUE;
4729         (void) zio_wait(pio);
4730         dev->l2ad_writing = B_FALSE;
4731 
4732         return (write_asize);
4733 }
4734 
4735 /*
4736  * Compresses an L2ARC buffer.
4737  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4738  * size in l2hdr->b_asize. This routine tries to compress the data and
4739  * depending on the compression result there are three possible outcomes:
4740  * *) The buffer was incompressible. The original l2hdr contents were left
4741  *    untouched and are ready for writing to an L2 device.
4742  * *) The buffer was all-zeros, so there is no need to write it to an L2
4743  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4744  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4745  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4746  *    data buffer which holds the compressed data to be written, and b_asize
4747  *    tells us how much data there is. b_compress is set to the appropriate
4748  *    compression algorithm. Once writing is done, invoke
4749  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4750  *
4751  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4752  * buffer was incompressible).
4753  */
4754 static boolean_t
4755 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4756 {
4757         void *cdata;
4758         size_t csize, len;
4759 
4760         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4761         ASSERT(l2hdr->b_tmp_cdata != NULL);
4762 
4763         len = l2hdr->b_asize;
4764         cdata = zio_data_buf_alloc(len);
4765         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4766             cdata, l2hdr->b_asize);
4767 
4768         if (csize == 0) {
4769                 /* zero block, indicate that there's nothing to write */
4770                 zio_data_buf_free(cdata, len);
4771                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4772                 l2hdr->b_asize = 0;
4773                 l2hdr->b_tmp_cdata = NULL;
4774                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4775                 return (B_TRUE);
4776         } else if (csize > 0 && csize < len) {
4777                 /*
4778                  * Compression succeeded, we'll keep the cdata around for
4779                  * writing and release it afterwards.
4780                  */
4781                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4782                 l2hdr->b_asize = csize;
4783                 l2hdr->b_tmp_cdata = cdata;
4784                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4785                 return (B_TRUE);
4786         } else {
4787                 /*
4788                  * Compression failed, release the compressed buffer.
4789                  * l2hdr will be left unmodified.
4790                  */
4791                 zio_data_buf_free(cdata, len);
4792                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4793                 return (B_FALSE);
4794         }
4795 }
4796 
4797 /*
4798  * Decompresses a zio read back from an l2arc device. On success, the
4799  * underlying zio's io_data buffer is overwritten by the uncompressed
4800  * version. On decompression error (corrupt compressed stream), the
4801  * zio->io_error value is set to signal an I/O error.
4802  *
4803  * Please note that the compressed data stream is not checksummed, so
4804  * if the underlying device is experiencing data corruption, we may feed
4805  * corrupt data to the decompressor, so the decompressor needs to be
4806  * able to handle this situation (LZ4 does).
4807  */
4808 static void
4809 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4810 {
4811         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4812 
4813         if (zio->io_error != 0) {
4814                 /*
4815                  * An io error has occured, just restore the original io
4816                  * size in preparation for a main pool read.
4817                  */
4818                 zio->io_orig_size = zio->io_size = hdr->b_size;
4819                 return;
4820         }
4821 
4822         if (c == ZIO_COMPRESS_EMPTY) {
4823                 /*
4824                  * An empty buffer results in a null zio, which means we
4825                  * need to fill its io_data after we're done restoring the
4826                  * buffer's contents.
4827                  */
4828                 ASSERT(hdr->b_buf != NULL);
4829                 bzero(hdr->b_buf->b_data, hdr->b_size);
4830                 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4831         } else {
4832                 ASSERT(zio->io_data != NULL);
4833                 /*
4834                  * We copy the compressed data from the start of the arc buffer
4835                  * (the zio_read will have pulled in only what we need, the
4836                  * rest is garbage which we will overwrite at decompression)
4837                  * and then decompress back to the ARC data buffer. This way we
4838                  * can minimize copying by simply decompressing back over the
4839                  * original compressed data (rather than decompressing to an
4840                  * aux buffer and then copying back the uncompressed buffer,
4841                  * which is likely to be much larger).
4842                  */
4843                 uint64_t csize;
4844                 void *cdata;
4845 
4846                 csize = zio->io_size;
4847                 cdata = zio_data_buf_alloc(csize);
4848                 bcopy(zio->io_data, cdata, csize);
4849                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4850                     hdr->b_size) != 0)
4851                         zio->io_error = EIO;
4852                 zio_data_buf_free(cdata, csize);
4853         }
4854 
4855         /* Restore the expected uncompressed IO size. */
4856         zio->io_orig_size = zio->io_size = hdr->b_size;
4857 }
4858 
4859 /*
4860  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4861  * This buffer serves as a temporary holder of compressed data while
4862  * the buffer entry is being written to an l2arc device. Once that is
4863  * done, we can dispose of it.
4864  */
4865 static void
4866 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4867 {
4868         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4869 
4870         if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4871                 /*
4872                  * If the data was compressed, then we've allocated a
4873                  * temporary buffer for it, so now we need to release it.
4874                  */
4875                 ASSERT(l2hdr->b_tmp_cdata != NULL);
4876                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4877         }
4878         l2hdr->b_tmp_cdata = NULL;
4879 }
4880 
4881 /*
4882  * This thread feeds the L2ARC at regular intervals.  This is the beating
4883  * heart of the L2ARC.
4884  */
4885 static void
4886 l2arc_feed_thread(void)
4887 {
4888         callb_cpr_t cpr;
4889         l2arc_dev_t *dev;
4890         spa_t *spa;
4891         uint64_t size, wrote;
4892         clock_t begin, next = ddi_get_lbolt();
4893         boolean_t headroom_boost = B_FALSE;
4894 
4895         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4896 
4897         mutex_enter(&l2arc_feed_thr_lock);
4898 
4899         while (l2arc_thread_exit == 0) {
4900                 CALLB_CPR_SAFE_BEGIN(&cpr);
4901                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4902                     next);
4903                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4904                 next = ddi_get_lbolt() + hz;
4905 
4906                 /*
4907                  * Quick check for L2ARC devices.
4908                  */
4909                 mutex_enter(&l2arc_dev_mtx);
4910                 if (l2arc_ndev == 0) {
4911                         mutex_exit(&l2arc_dev_mtx);
4912                         continue;
4913                 }
4914                 mutex_exit(&l2arc_dev_mtx);
4915                 begin = ddi_get_lbolt();
4916 
4917                 /*
4918                  * This selects the next l2arc device to write to, and in
4919                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4920                  * will return NULL if there are now no l2arc devices or if
4921                  * they are all faulted.
4922                  *
4923                  * If a device is returned, its spa's config lock is also
4924                  * held to prevent device removal.  l2arc_dev_get_next()
4925                  * will grab and release l2arc_dev_mtx.
4926                  */
4927                 if ((dev = l2arc_dev_get_next()) == NULL)
4928                         continue;
4929 
4930                 spa = dev->l2ad_spa;
4931                 ASSERT(spa != NULL);
4932 
4933                 /*
4934                  * If the pool is read-only then force the feed thread to
4935                  * sleep a little longer.
4936                  */
4937                 if (!spa_writeable(spa)) {
4938                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4939                         spa_config_exit(spa, SCL_L2ARC, dev);
4940                         continue;
4941                 }
4942 
4943                 /*
4944                  * Avoid contributing to memory pressure.
4945                  */
4946                 if (arc_reclaim_needed()) {
4947                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4948                         spa_config_exit(spa, SCL_L2ARC, dev);
4949                         continue;
4950                 }
4951 
4952                 ARCSTAT_BUMP(arcstat_l2_feeds);
4953 
4954                 size = l2arc_write_size();
4955 
4956                 /*
4957                  * Evict L2ARC buffers that will be overwritten.
4958                  */
4959                 l2arc_evict(dev, size, B_FALSE);
4960 
4961                 /*
4962                  * Write ARC buffers.
4963                  */
4964                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
4965 
4966                 /*
4967                  * Calculate interval between writes.
4968                  */
4969                 next = l2arc_write_interval(begin, size, wrote);
4970                 spa_config_exit(spa, SCL_L2ARC, dev);
4971         }
4972 
4973         l2arc_thread_exit = 0;
4974         cv_broadcast(&l2arc_feed_thr_cv);
4975         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
4976         thread_exit();
4977 }
4978 
4979 boolean_t
4980 l2arc_vdev_present(vdev_t *vd)
4981 {
4982         l2arc_dev_t *dev;
4983 
4984         mutex_enter(&l2arc_dev_mtx);
4985         for (dev = list_head(l2arc_dev_list); dev != NULL;
4986             dev = list_next(l2arc_dev_list, dev)) {
4987                 if (dev->l2ad_vdev == vd)
4988                         break;
4989         }
4990         mutex_exit(&l2arc_dev_mtx);
4991 
4992         return (dev != NULL);
4993 }
4994 
4995 /*
4996  * Add a vdev for use by the L2ARC.  By this point the spa has already
4997  * validated the vdev and opened it.
4998  */
4999 void
5000 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5001 {
5002         l2arc_dev_t *adddev;
5003 
5004         ASSERT(!l2arc_vdev_present(vd));
5005 
5006         /*
5007          * Create a new l2arc device entry.
5008          */
5009         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5010         adddev->l2ad_spa = spa;
5011         adddev->l2ad_vdev = vd;
5012         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5013         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5014         adddev->l2ad_hand = adddev->l2ad_start;
5015         adddev->l2ad_evict = adddev->l2ad_start;
5016         adddev->l2ad_first = B_TRUE;
5017         adddev->l2ad_writing = B_FALSE;
5018 
5019         /*
5020          * This is a list of all ARC buffers that are still valid on the
5021          * device.
5022          */
5023         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5024         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5025             offsetof(arc_buf_hdr_t, b_l2node));
5026 
5027         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5028 
5029         /*
5030          * Add device to global list
5031          */
5032         mutex_enter(&l2arc_dev_mtx);
5033         list_insert_head(l2arc_dev_list, adddev);
5034         atomic_inc_64(&l2arc_ndev);
5035         mutex_exit(&l2arc_dev_mtx);
5036 }
5037 
5038 /*
5039  * Remove a vdev from the L2ARC.
5040  */
5041 void
5042 l2arc_remove_vdev(vdev_t *vd)
5043 {
5044         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5045 
5046         /*
5047          * Find the device by vdev
5048          */
5049         mutex_enter(&l2arc_dev_mtx);
5050         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5051                 nextdev = list_next(l2arc_dev_list, dev);
5052                 if (vd == dev->l2ad_vdev) {
5053                         remdev = dev;
5054                         break;
5055                 }
5056         }
5057         ASSERT(remdev != NULL);
5058 
5059         /*
5060          * Remove device from global list
5061          */
5062         list_remove(l2arc_dev_list, remdev);
5063         l2arc_dev_last = NULL;          /* may have been invalidated */
5064         atomic_dec_64(&l2arc_ndev);
5065         mutex_exit(&l2arc_dev_mtx);
5066 
5067         /*
5068          * Clear all buflists and ARC references.  L2ARC device flush.
5069          */
5070         l2arc_evict(remdev, 0, B_TRUE);
5071         list_destroy(remdev->l2ad_buflist);
5072         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5073         kmem_free(remdev, sizeof (l2arc_dev_t));
5074 }
5075 
5076 void
5077 l2arc_init(void)
5078 {
5079         l2arc_thread_exit = 0;
5080         l2arc_ndev = 0;
5081         l2arc_writes_sent = 0;
5082         l2arc_writes_done = 0;
5083 
5084         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5085         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5086         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5087         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5088         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5089 
5090         l2arc_dev_list = &L2ARC_dev_list;
5091         l2arc_free_on_write = &L2ARC_free_on_write;
5092         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5093             offsetof(l2arc_dev_t, l2ad_node));
5094         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5095             offsetof(l2arc_data_free_t, l2df_list_node));
5096 }
5097 
5098 void
5099 l2arc_fini(void)
5100 {
5101         /*
5102          * This is called from dmu_fini(), which is called from spa_fini();
5103          * Because of this, we can assume that all l2arc devices have
5104          * already been removed when the pools themselves were removed.
5105          */
5106 
5107         l2arc_do_free_on_write();
5108 
5109         mutex_destroy(&l2arc_feed_thr_lock);
5110         cv_destroy(&l2arc_feed_thr_cv);
5111         mutex_destroy(&l2arc_dev_mtx);
5112         mutex_destroy(&l2arc_buflist_mtx);
5113         mutex_destroy(&l2arc_free_on_write_mtx);
5114 
5115         list_destroy(l2arc_dev_list);
5116         list_destroy(l2arc_free_on_write);
5117 }
5118 
5119 void
5120 l2arc_start(void)
5121 {
5122         if (!(spa_mode_global & FWRITE))
5123                 return;
5124 
5125         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5126             TS_RUN, minclsyspri);
5127 }
5128 
5129 void
5130 l2arc_stop(void)
5131 {
5132         if (!(spa_mode_global & FWRITE))
5133                 return;
5134 
5135         mutex_enter(&l2arc_feed_thr_lock);
5136         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5137         l2arc_thread_exit = 1;
5138         while (l2arc_thread_exit != 0)
5139                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5140         mutex_exit(&l2arc_feed_thr_lock);
5141 }