1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
   40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable
  44  * when there are no external references active.  This makes
  45  * eviction far more problematic:  we choose to evict the evictable
  46  * blocks that are the "lowest" in the list.
  47  *
  48  * There are times when it is not possible to evict the requested
  49  * space.  In these circumstances we are unable to adjust the cache
   50  * size.  To prevent the cache from growing unbounded at these times, we
  51  * implement a "cache throttle" that slows the flow of new data
  52  * into the cache until we can make space available.
  53  *
  54  * 2. The Megiddo and Modha model assumes a fixed cache size.
  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
   63  * when adjusting the cache size following a cache miss, it's simply
   64  * a matter of choosing a single page to evict.  In our model, we
   65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
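
      /*
       * As a simplified sketch of the idea (illustrative names only; the real
       * arc_evict() also honors buffer type, per-spa eviction and prefetch
       * lifespans), freeing "bytes" of space amounts to walking the tail of an
       * evictable list and summing variable-sized, unreferenced buffers until
       * the request is covered:
       *
       *        uint64_t freed = 0;
       *        for (ab = list_tail(list); ab != NULL && freed < bytes;
       *            ab = ab_prev) {
       *                ab_prev = list_prev(list, ab);
       *                if (refcount_count(&ab->b_refcnt) == 0) {
       *                        freed += ab->b_size;
       *                        ... evict ab ...
       *                }
       *        }
       */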
  73 
  74 /*
  75  * The locking model:
  76  *
  77  * A new reference to a cache buffer can be obtained in two
  78  * ways: 1) via a hash table lookup using the DVA as a key,
  79  * or 2) via one of the ARC lists.  The arc_read() interface
  80  * uses method 1, while the internal arc algorithms for
  81  * adjusting the cache use method 2.  We therefore provide two
  82  * types of locks: 1) the hash table lock array, and 2) the
  83  * arc list locks.
  84  *
  85  * Buffers do not have their own mutexes, rather they rely on the
  86  * hash table mutexes for the bulk of their protection (i.e. most
  87  * fields in the arc_buf_hdr_t are protected by these mutexes).
  88  *
  89  * buf_hash_find() returns the appropriate mutex (held) when it
  90  * locates the requested buffer in the hash table.  It returns
  91  * NULL for the mutex if the buffer was not in the table.
  92  *
  93  * buf_hash_remove() expects the appropriate hash mutex to be
  94  * already held before it is invoked.
  95  *
  96  * Each arc state also has a mutex which is used to protect the
  97  * buffer list associated with the state.  When attempting to
  98  * obtain a hash table lock while holding an arc list lock you
   99  * must use mutex_tryenter() to avoid deadlock.  Also note that
 100  * the active state mutex must be held before the ghost state mutex.
 101  *
 102  * Arc buffers may have an associated eviction callback function.
 103  * This function will be invoked prior to removing the buffer (e.g.
 104  * in arc_do_user_evicts()).  Note however that the data associated
 105  * with the buffer may be evicted prior to the callback.  The callback
 106  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 107  * the users of callbacks must ensure that their private data is
 108  * protected from simultaneous callbacks from arc_buf_evict()
 109  * and arc_do_user_evicts().
 110  *
 111  * Note that the majority of the performance stats are manipulated
 112  * with atomic operations.
 113  *
 114  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 115  *
 116  *      - L2ARC buflist creation
 117  *      - L2ARC buflist eviction
 118  *      - L2ARC write completion, which walks L2ARC buflists
 119  *      - ARC header destruction, as it removes from L2ARC buflists
 120  *      - ARC header release, as it removes from L2ARC buflists
 121  */
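
      /*
       * A minimal sketch of the hash-table lookup pattern described above
       * (illustrative only; the real arc_read() path does considerably more
       * work, and "guid" here stands for the spa's load guid):
       *
       *        kmutex_t *hash_lock;
       *        arc_buf_hdr_t *hdr = buf_hash_find(guid, BP_IDENTITY(bp),
       *            BP_PHYSICAL_BIRTH(bp), &hash_lock);
       *        if (hdr != NULL) {
       *                ... use hdr while hash_lock is held ...
       *                mutex_exit(hash_lock);
       *        }
       */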
 122 
 123 #include <sys/spa.h>
 124 #include <sys/zio.h>
 125 #include <sys/zio_compress.h>
 126 #include <sys/zfs_context.h>
 127 #include <sys/arc.h>
 128 #include <sys/refcount.h>
 129 #include <sys/vdev.h>
 130 #include <sys/vdev_impl.h>
 131 #include <sys/dsl_pool.h>
 132 #ifdef _KERNEL
 133 #include <sys/vmsystm.h>
 134 #include <vm/anon.h>
 135 #include <sys/fs/swapnode.h>
 136 #include <sys/dnlc.h>
 137 #endif
 138 #include <sys/callb.h>
 139 #include <sys/kstat.h>
 140 #include <zfs_fletcher.h>
 141 #include <sys/byteorder.h>
 142 #include <sys/spa_impl.h>
 143 
 144 #ifndef _KERNEL
 145 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 146 boolean_t arc_watch = B_FALSE;
 147 int arc_procfd;
 148 #endif
 149 
 150 static kmutex_t         arc_reclaim_thr_lock;
 151 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 152 static uint8_t          arc_thread_exit;
 153 
 154 #define ARC_REDUCE_DNLC_PERCENT 3
 155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 156 
 157 typedef enum arc_reclaim_strategy {
 158         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 159         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 160 } arc_reclaim_strategy_t;
 161 
 162 /*
 163  * The number of iterations through arc_evict_*() before we
 164  * drop & reacquire the lock.
 165  */
 166 int arc_evict_iterations = 100;
 167 
 168 /* number of seconds before growing cache again */
 169 static int              arc_grow_retry = 60;
 170 
 171 /* shift of arc_c for calculating both min and max arc_p */
 172 static int              arc_p_min_shift = 4;
 173 
 174 /* log2(fraction of arc to reclaim) */
 175 static int              arc_shrink_shift = 5;
 176 
 177 /*
 178  * minimum lifespan of a prefetch block in clock ticks
 179  * (initialized in arc_init())
 180  */
 181 static int              arc_min_prefetch_lifespan;
 182 
 183 /*
 184  * If this percent of memory is free, don't throttle.
 185  */
 186 int arc_lotsfree_percent = 10;
 187 
 188 static int arc_dead;
 189 
 190 /*
 191  * The arc has filled available memory and has now warmed up.
 192  */
 193 static boolean_t arc_warm;
 194 
 195 /*
 196  * These tunables are for performance analysis.
 197  */
 198 uint64_t zfs_arc_max;
 199 uint64_t zfs_arc_min;
 200 uint64_t zfs_arc_meta_limit = 0;
 201 int zfs_arc_grow_retry = 0;
 202 int zfs_arc_shrink_shift = 0;
 203 int zfs_arc_p_min_shift = 0;
 204 int zfs_disable_dup_eviction = 0;
 205 
 206 /*
 207  * Note that buffers can be in one of 6 states:
 208  *      ARC_anon        - anonymous (discussed below)
 209  *      ARC_mru         - recently used, currently cached
  210  * ARC_mru_ghost   - recently used, no longer in cache
 211  *      ARC_mfu         - frequently used, currently cached
 212  *      ARC_mfu_ghost   - frequently used, no longer in cache
 213  *      ARC_l2c_only    - exists in L2ARC but not other states
  214  * When there are no active references to the buffer, it is
  215  * linked onto a list in one of these arc states.  These are
 216  * the only buffers that can be evicted or deleted.  Within each
 217  * state there are multiple lists, one for meta-data and one for
 218  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 219  * etc.) is tracked separately so that it can be managed more
 220  * explicitly: favored over data, limited explicitly.
 221  *
 222  * Anonymous buffers are buffers that are not associated with
 223  * a DVA.  These are buffers that hold dirty block copies
 224  * before they are written to stable storage.  By definition,
 225  * they are "ref'd" and are considered part of arc_mru
  226  * that cannot be freed.  Generally, they will acquire a DVA
 227  * as they are written and migrate onto the arc_mru list.
 228  *
 229  * The ARC_l2c_only state is for buffers that are in the second
 230  * level ARC but no longer in any of the ARC_m* lists.  The second
 231  * level ARC itself may also contain buffers that are in any of
 232  * the ARC_m* states - meaning that a buffer can exist in two
 233  * places.  The reason for the ARC_l2c_only state is to keep the
 234  * buffer header in the hash table, so that reads that hit the
 235  * second level ARC benefit from these fast lookups.
 236  */
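
      /*
       * A rough, informal summary of the common state transitions (hits move
       * buffers toward the MFU side, evictions move them into the ghost
       * lists); see arc_access() for the authoritative logic:
       *
       *        anon ---(write assigns a DVA)--> mru
       *        mru ----(accessed again)-------> mfu
       *        mru ----(evicted)--------------> mru_ghost ---(hit)--> mfu
       *        mfu ----(evicted)--------------> mfu_ghost ---(hit)--> mfu
       *        l2c_only: header kept only so L2ARC reads still hit in the
       *                  hash table
       */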
 237 
 238 typedef struct arc_state {
 239         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 240         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 241         uint64_t arcs_size;     /* total amount of data in this state */
 242         kmutex_t arcs_mtx;
 243 } arc_state_t;
 244 
 245 /* The 6 states: */
 246 static arc_state_t ARC_anon;
 247 static arc_state_t ARC_mru;
 248 static arc_state_t ARC_mru_ghost;
 249 static arc_state_t ARC_mfu;
 250 static arc_state_t ARC_mfu_ghost;
 251 static arc_state_t ARC_l2c_only;
 252 
 253 typedef struct arc_stats {
 254         kstat_named_t arcstat_hits;
 255         kstat_named_t arcstat_misses;
 256         kstat_named_t arcstat_demand_data_hits;
 257         kstat_named_t arcstat_demand_data_misses;
 258         kstat_named_t arcstat_demand_metadata_hits;
 259         kstat_named_t arcstat_demand_metadata_misses;
 260         kstat_named_t arcstat_prefetch_data_hits;
 261         kstat_named_t arcstat_prefetch_data_misses;
 262         kstat_named_t arcstat_prefetch_metadata_hits;
 263         kstat_named_t arcstat_prefetch_metadata_misses;
 264         kstat_named_t arcstat_mru_hits;
 265         kstat_named_t arcstat_mru_ghost_hits;
 266         kstat_named_t arcstat_mfu_hits;
 267         kstat_named_t arcstat_mfu_ghost_hits;
 268         kstat_named_t arcstat_deleted;
 269         kstat_named_t arcstat_recycle_miss;
 270         /*
 271          * Number of buffers that could not be evicted because the hash lock
 272          * was held by another thread.  The lock may not necessarily be held
 273          * by something using the same buffer, since hash locks are shared
 274          * by multiple buffers.
 275          */
 276         kstat_named_t arcstat_mutex_miss;
 277         /*
 278          * Number of buffers skipped because they have I/O in progress, are
  279          * indirect prefetch buffers that have not lived long enough, or are
 280          * not from the spa we're trying to evict from.
 281          */
 282         kstat_named_t arcstat_evict_skip;
 283         kstat_named_t arcstat_evict_l2_cached;
 284         kstat_named_t arcstat_evict_l2_eligible;
 285         kstat_named_t arcstat_evict_l2_ineligible;
 286         kstat_named_t arcstat_hash_elements;
 287         kstat_named_t arcstat_hash_elements_max;
 288         kstat_named_t arcstat_hash_collisions;
 289         kstat_named_t arcstat_hash_chains;
 290         kstat_named_t arcstat_hash_chain_max;
 291         kstat_named_t arcstat_p;
 292         kstat_named_t arcstat_c;
 293         kstat_named_t arcstat_c_min;
 294         kstat_named_t arcstat_c_max;
 295         kstat_named_t arcstat_size;
 296         kstat_named_t arcstat_hdr_size;
 297         kstat_named_t arcstat_data_size;
 298         kstat_named_t arcstat_other_size;
 299         kstat_named_t arcstat_l2_hits;
 300         kstat_named_t arcstat_l2_misses;
 301         kstat_named_t arcstat_l2_feeds;
 302         kstat_named_t arcstat_l2_rw_clash;
 303         kstat_named_t arcstat_l2_read_bytes;
 304         kstat_named_t arcstat_l2_write_bytes;
 305         kstat_named_t arcstat_l2_writes_sent;
 306         kstat_named_t arcstat_l2_writes_done;
 307         kstat_named_t arcstat_l2_writes_error;
 308         kstat_named_t arcstat_l2_writes_hdr_miss;
 309         kstat_named_t arcstat_l2_evict_lock_retry;
 310         kstat_named_t arcstat_l2_evict_reading;
 311         kstat_named_t arcstat_l2_free_on_write;
 312         kstat_named_t arcstat_l2_abort_lowmem;
 313         kstat_named_t arcstat_l2_cksum_bad;
 314         kstat_named_t arcstat_l2_io_error;
 315         kstat_named_t arcstat_l2_size;
 316         kstat_named_t arcstat_l2_asize;
 317         kstat_named_t arcstat_l2_hdr_size;
 318         kstat_named_t arcstat_l2_compress_successes;
 319         kstat_named_t arcstat_l2_compress_zeros;
 320         kstat_named_t arcstat_l2_compress_failures;
 321         kstat_named_t arcstat_l2_log_blk_writes;
 322         kstat_named_t arcstat_l2_log_blk_avg_size;
 323         kstat_named_t arcstat_l2_data_to_meta_ratio;
 324         kstat_named_t arcstat_l2_rebuild_successes;
 325         kstat_named_t arcstat_l2_rebuild_abort_unsupported;
 326         kstat_named_t arcstat_l2_rebuild_abort_timeout;
 327         kstat_named_t arcstat_l2_rebuild_abort_io_errors;
 328         kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
 329         kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
 330         kstat_named_t arcstat_l2_rebuild_abort_lowmem;
 331         kstat_named_t arcstat_l2_rebuild_size;
 332         kstat_named_t arcstat_l2_rebuild_bufs;
 333         kstat_named_t arcstat_l2_rebuild_bufs_precached;
 334         kstat_named_t arcstat_l2_rebuild_psize;
 335         kstat_named_t arcstat_l2_rebuild_log_blks;
 336         kstat_named_t arcstat_memory_throttle_count;
 337         kstat_named_t arcstat_duplicate_buffers;
 338         kstat_named_t arcstat_duplicate_buffers_size;
 339         kstat_named_t arcstat_duplicate_reads;
 340         kstat_named_t arcstat_meta_used;
 341         kstat_named_t arcstat_meta_limit;
 342         kstat_named_t arcstat_meta_max;
 343 } arc_stats_t;
 344 
 345 static arc_stats_t arc_stats = {
 346         { "hits",                       KSTAT_DATA_UINT64 },
 347         { "misses",                     KSTAT_DATA_UINT64 },
 348         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 349         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 350         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 351         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 352         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 353         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 354         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 355         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 356         { "mru_hits",                   KSTAT_DATA_UINT64 },
 357         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 358         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 359         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 360         { "deleted",                    KSTAT_DATA_UINT64 },
 361         { "recycle_miss",               KSTAT_DATA_UINT64 },
 362         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 363         { "evict_skip",                 KSTAT_DATA_UINT64 },
 364         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 365         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 366         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 367         { "hash_elements",              KSTAT_DATA_UINT64 },
 368         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 369         { "hash_collisions",            KSTAT_DATA_UINT64 },
 370         { "hash_chains",                KSTAT_DATA_UINT64 },
 371         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 372         { "p",                          KSTAT_DATA_UINT64 },
 373         { "c",                          KSTAT_DATA_UINT64 },
 374         { "c_min",                      KSTAT_DATA_UINT64 },
 375         { "c_max",                      KSTAT_DATA_UINT64 },
 376         { "size",                       KSTAT_DATA_UINT64 },
 377         { "hdr_size",                   KSTAT_DATA_UINT64 },
 378         { "data_size",                  KSTAT_DATA_UINT64 },
 379         { "other_size",                 KSTAT_DATA_UINT64 },
 380         { "l2_hits",                    KSTAT_DATA_UINT64 },
 381         { "l2_misses",                  KSTAT_DATA_UINT64 },
 382         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 383         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 384         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 385         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 386         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 387         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 388         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 389         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 390         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 391         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 392         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 393         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 394         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 395         { "l2_io_error",                KSTAT_DATA_UINT64 },
 396         { "l2_size",                    KSTAT_DATA_UINT64 },
 397         { "l2_asize",                   KSTAT_DATA_UINT64 },
 398         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 399         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 400         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 401         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 402         { "l2_log_blk_writes",          KSTAT_DATA_UINT64 },
 403         { "l2_log_blk_avg_size",        KSTAT_DATA_UINT64 },
 404         { "l2_data_to_meta_ratio",      KSTAT_DATA_UINT64 },
 405         { "l2_rebuild_successes",       KSTAT_DATA_UINT64 },
 406         { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
 407         { "l2_rebuild_timeout",         KSTAT_DATA_UINT64 },
 408         { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
 409         { "l2_rebuild_cksum_errors",    KSTAT_DATA_UINT64 },
 410         { "l2_rebuild_loop_errors",     KSTAT_DATA_UINT64 },
 411         { "l2_rebuild_lowmem",          KSTAT_DATA_UINT64 },
 412         { "l2_rebuild_psize",           KSTAT_DATA_UINT64 },
 413         { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
 414         { "l2_rebuild_bufs_precached",  KSTAT_DATA_UINT64 },
 415         { "l2_rebuild_size",            KSTAT_DATA_UINT64 },
 416         { "l2_rebuild_log_blks",        KSTAT_DATA_UINT64 },
 417         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 418         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 419         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 420         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 421         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 422         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 423         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 424 };
 425 
 426 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 427 
 428 #define ARCSTAT_INCR(stat, val) \
 429         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 430 
 431 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 432 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 433 
 434 #define ARCSTAT_MAX(stat, val) {                                        \
 435         uint64_t m;                                                     \
 436         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 437             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 438                 continue;                                               \
 439 }
 440 
 441 #define ARCSTAT_MAXSTAT(stat) \
 442         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
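
      /*
       * Example usage (taken from buf_hash_insert() below): record a new
       * longest hash chain and track the high-water mark of hash elements:
       *
       *        ARCSTAT_MAX(arcstat_hash_chain_max, i);
       *        ARCSTAT_MAXSTAT(arcstat_hash_elements);
       */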
 443 
 444 /*
 445  * We define a macro to allow ARC hits/misses to be easily broken down by
 446  * two separate conditions, giving a total of four different subtypes for
 447  * each of hits and misses (so eight statistics total).
 448  */
 449 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 450         if (cond1) {                                                    \
 451                 if (cond2) {                                            \
 452                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 453                 } else {                                                \
 454                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 455                 }                                                       \
 456         } else {                                                        \
 457                 if (cond2) {                                            \
 458                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 459                 } else {                                                \
 460                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 461                 }                                                       \
 462         }
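
      /*
       * For example, a call site classifying a hit might look like the
       * following sketch (the actual callers pass their own conditions):
       *
       *        ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
       *            demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
       *            data, metadata, hits);
       *
       * which bumps exactly one of the four arcstat_*_hits counters above.
       */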
 463 
 464 /*
 465  * This macro allows us to use kstats as floating averages. Each time we
 466  * update this kstat, we first factor it and the update value by
 467  * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
 468  * average. This macro assumes that integer loads and stores are atomic, but
 469  * is not safe for multiple writers updating the kstat in parallel (only the
 470  * last writer's update will remain).
 471  */
 472 #define ARCSTAT_F_AVG_FACTOR    3
 473 #define ARCSTAT_F_AVG(stat, value) \
 474         do { \
 475                 uint64_t x = ARCSTAT(stat); \
 476                 x = x - x / ARCSTAT_F_AVG_FACTOR + \
 477                     (value) / ARCSTAT_F_AVG_FACTOR; \
 478                 ARCSTAT(stat) = x; \
 479                 _NOTE(NOTREACHED) \
 480                 _NOTE(CONSTCOND) \
 481         } while (0)
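
      /*
       * Worked example with ARCSTAT_F_AVG_FACTOR == 3: if the kstat currently
       * holds 300 and the new sample is 600, the stored value becomes
       * 300 - 300/3 + 600/3 = 300 - 100 + 200 = 400, i.e. each update weights
       * the new sample at one third.
       */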
 482 
 483 kstat_t                 *arc_ksp;
 484 static arc_state_t      *arc_anon;
 485 static arc_state_t      *arc_mru;
 486 static arc_state_t      *arc_mru_ghost;
 487 static arc_state_t      *arc_mfu;
 488 static arc_state_t      *arc_mfu_ghost;
 489 static arc_state_t      *arc_l2c_only;
 490 
 491 /*
 492  * There are several ARC variables that are critical to export as kstats --
 493  * but we don't want to have to grovel around in the kstat whenever we wish to
 494  * manipulate them.  For these variables, we therefore define them to be in
 495  * terms of the statistic variable.  This assures that we are not introducing
 496  * the possibility of inconsistency by having shadow copies of the variables,
 497  * while still allowing the code to be readable.
 498  */
 499 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 500 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 501 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 502 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 503 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 504 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 505 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 506 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 507 
 508 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 509         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 510 
 511 static int              arc_no_grow;    /* Don't try to grow cache size */
 512 static uint64_t         arc_tempreserve;
 513 static uint64_t         arc_loaned_bytes;
 514 
 515 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 516 
 517 typedef struct arc_callback arc_callback_t;
 518 
 519 struct arc_callback {
 520         void                    *acb_private;
 521         arc_done_func_t         *acb_done;
 522         arc_buf_t               *acb_buf;
 523         zio_t                   *acb_zio_dummy;
 524         arc_callback_t          *acb_next;
 525 };
 526 
 527 typedef struct arc_write_callback arc_write_callback_t;
 528 
 529 struct arc_write_callback {
 530         void            *awcb_private;
 531         arc_done_func_t *awcb_ready;
 532         arc_done_func_t *awcb_physdone;
 533         arc_done_func_t *awcb_done;
 534         arc_buf_t       *awcb_buf;
 535 };
 536 
 537 struct arc_buf_hdr {
 538         /* protected by hash lock */
 539         dva_t                   b_dva;
 540         uint64_t                b_birth;
 541         uint64_t                b_cksum0;
 542 
 543         kmutex_t                b_freeze_lock;
 544         zio_cksum_t             *b_freeze_cksum;
 545         void                    *b_thawed;
 546 
 547         arc_buf_hdr_t           *b_hash_next;
 548         arc_buf_t               *b_buf;
 549         uint32_t                b_flags;
 550         uint32_t                b_datacnt;
 551 
 552         arc_callback_t          *b_acb;
 553         kcondvar_t              b_cv;
 554 
 555         /* immutable */
 556         arc_buf_contents_t      b_type;
 557         uint64_t                b_size;
 558         uint64_t                b_spa;
 559 
 560         /* protected by arc state mutex */
 561         arc_state_t             *b_state;
 562         list_node_t             b_arc_node;
 563 
 564         /* updated atomically */
 565         clock_t                 b_arc_access;
 566 
 567         /* self protecting */
 568         refcount_t              b_refcnt;
 569 
 570         l2arc_buf_hdr_t         *b_l2hdr;
 571         list_node_t             b_l2node;
 572 };
 573 
 574 static arc_buf_t *arc_eviction_list;
 575 static kmutex_t arc_eviction_mtx;
 576 static arc_buf_hdr_t arc_eviction_hdr;
 577 static void arc_get_data_buf(arc_buf_t *buf);
 578 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 579 static int arc_evict_needed(arc_buf_contents_t type);
 580 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 581 static void arc_buf_watch(arc_buf_t *buf);
 582 
 583 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 584 
 585 #define GHOST_STATE(state)      \
 586         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 587         (state) == arc_l2c_only)
 588 
 589 /*
 590  * Private ARC flags.  These flags are private ARC only flags that will show up
  591  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 592  * be passed in as arc_flags in things like arc_read.  However, these flags
 593  * should never be passed and should only be set by ARC code.  When adding new
 594  * public flags, make sure not to smash the private ones.
 595  */
 596 
 597 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
 598 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
 599 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
 600 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
 601 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
 602 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
 603 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
 604 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
 605 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
 606 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
 607 
 608 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 609 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 610 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 611 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 612 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 613 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 614 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 615 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 616 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
 617                                     (hdr)->b_l2hdr != NULL)
 618 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 619 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 620 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 621 
 622 /*
 623  * Other sizes
 624  */
 625 
 626 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 627 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 628 
 629 /*
 630  * Hash table routines
 631  */
 632 
 633 #define HT_LOCK_PAD     64
 634 
 635 struct ht_lock {
 636         kmutex_t        ht_lock;
 637 #ifdef _KERNEL
 638         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 639 #endif
 640 };
 641 
 642 #define BUF_LOCKS 256
 643 typedef struct buf_hash_table {
 644         uint64_t ht_mask;
 645         arc_buf_hdr_t **ht_table;
 646         struct ht_lock ht_locks[BUF_LOCKS];
 647 } buf_hash_table_t;
 648 
 649 static buf_hash_table_t buf_hash_table;
 650 
 651 #define BUF_HASH_INDEX(spa, dva, birth) \
 652         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 653 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 654 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 655 #define HDR_LOCK(hdr) \
 656         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
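
      /*
       * When a header is already in hand, its hash lock can be looked up and
       * taken directly; a minimal sketch of that pattern:
       *
       *        kmutex_t *hash_lock = HDR_LOCK(hdr);
       *        mutex_enter(hash_lock);
       *        ... hdr identity and most hdr fields are stable here ...
       *        mutex_exit(hash_lock);
       */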
 657 
 658 uint64_t zfs_crc64_table[256];
 659 
 660 /*
 661  * Level 2 ARC
 662  */
 663 
 664 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 665 #define L2ARC_HEADROOM          2                       /* num of writes */
 666 /*
 667  * If we discover during ARC scan any buffers to be compressed, we boost
 668  * our headroom for the next scanning cycle by this percentage multiple.
 669  */
 670 #define L2ARC_HEADROOM_BOOST    200
 671 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 672 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 673 
 674 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 675 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 676 
 677 /* L2ARC Performance Tunables */
 678 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 679 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 680 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 681 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 682 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 683 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 684 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 685 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 686 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
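
      /*
       * As a rough illustration of how the defaults above interact (not an
       * exact accounting): each feed cycle may write up to l2arc_write_max
       * (8 MB, or 16 MB with l2arc_write_boost while the ARC is still warming
       * up), and scans roughly l2arc_write_max * l2arc_headroom = 16 MB of
       * list tails, doubled again (l2arc_headroom_boost / 100 = 2x) when the
       * previous scan found compressible buffers.
       */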
 687 
 688 /*
 689  * L2ARC Internals
 690  */
 691 typedef struct l2arc_dev l2arc_dev_t;
 692 static list_t L2ARC_dev_list;                   /* device list */
 693 static list_t *l2arc_dev_list;                  /* device list pointer */
 694 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 695 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 696 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 697 static list_t L2ARC_free_on_write;              /* free after write buf list */
 698 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 699 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 700 static uint64_t l2arc_ndev;                     /* number of devices */
 701 
 702 typedef struct l2arc_read_callback {
 703         arc_buf_t               *l2rcb_buf;             /* read buffer */
 704         spa_t                   *l2rcb_spa;             /* spa */
 705         blkptr_t                l2rcb_bp;               /* original blkptr */
 706         zbookmark_t             l2rcb_zb;               /* original bookmark */
 707         int                     l2rcb_flags;            /* original flags */
 708         enum zio_compress       l2rcb_compress;         /* applied compress */
 709 } l2arc_read_callback_t;
 710 
 711 typedef struct l2arc_write_callback {
 712         l2arc_dev_t     *l2wcb_dev;             /* device info */
 713         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 714         /* list of in-flight l2arc_log_blk_buf_t's */
 715         list_t          l2wcb_log_blk_buf_list;
 716 } l2arc_write_callback_t;
 717 
 718 struct l2arc_buf_hdr {
 719         /* protected by arc_buf_hdr  mutex */
 720         l2arc_dev_t             *b_dev;         /* L2ARC device */
 721         uint64_t                b_daddr;        /* disk address, offset byte */
 722         /* compression applied to buffer data */
 723         enum zio_compress       b_compress;
 724         /* real alloc'd buffer size depending on b_compress applied */
 725         int                     b_asize;
 726         /* temporary buffer holder for in-flight compressed data */
 727         void                    *b_tmp_cdata;
 728 };
 729 
 730 typedef struct l2arc_data_free {
 731         /* protected by l2arc_free_on_write_mtx */
 732         void            *l2df_data;
 733         size_t          l2df_size;
 734         void            (*l2df_func)(void *, size_t);
 735         list_node_t     l2df_list_node;
 736 } l2arc_data_free_t;
 737 
 738 static kmutex_t l2arc_feed_thr_lock;
 739 static kcondvar_t l2arc_feed_thr_cv;
 740 static uint8_t l2arc_thread_exit;
 741 
 742 static void l2arc_read_done(zio_t *zio);
 743 static void l2arc_hdr_stat_add(boolean_t from_arc);
 744 static void l2arc_hdr_stat_remove(void);
 745 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
 746 
 747 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 748 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 749     enum zio_compress c);
 750 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 751 
 752 enum {
 753         L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)      /* mirror of l2ad_first */
 754 };
 755 
 756 /*
 757  * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
 758  */
 759 typedef struct l2arc_log_blk_ptr {
 760         uint64_t        l2lbp_daddr;    /* device address of log */
 761         /*
 762          * l2lbp_prop is the same format as the blk_prop in blkptr_t:
 763          *      * logical size (in sectors)
 764          *      * physical (compressed) size (in sectors)
 765          *      * compression algorithm (we always LZ4-compress l2arc logs)
 766          *      * checksum algorithm (used for l2lbp_cksum)
 767          *      * object type & level (unused for now)
 768          */
 769         uint64_t        l2lbp_prop;
 770         zio_cksum_t     l2lbp_cksum;    /* fletcher4 of log */
 771 } l2arc_log_blk_ptr_t;
 772 
 773 /*
 774  * The persistent L2ARC device header.
 775  */
 776 typedef struct l2arc_dev_hdr_phys {
 777         uint64_t        l2dh_magic;
 778         zio_cksum_t     l2dh_self_cksum;        /* fletcher4 of fields below */
 779 
 780         /*
 781          * Global L2ARC device state and metadata.
 782          */
 783         uint64_t        l2dh_spa_guid;
 784         uint64_t        l2dh_evict_tail;        /* current evict pointer */
 785         uint64_t        l2dh_alloc_space;       /* vdev space alloc status */
 786         uint64_t        l2dh_flags;             /* l2arc_dev_hdr_flags_t */
 787 
 788         /*
 789          * Start of log block chain. [0] -> newest log, [1] -> one older (used
 790          * for initiating prefetch).
 791          */
 792         l2arc_log_blk_ptr_t     l2dh_start_lbps[2];
 793 
 794         const uint64_t  l2dh_pad[43];           /* pad to 512 bytes */
 795 } l2arc_dev_hdr_phys_t;
 796 CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
 797 
 798 /*
 799  * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
 800  */
 801 typedef struct l2arc_log_ent_phys {
 802         dva_t                   l2le_dva;       /* dva of buffer */
 803         uint64_t                l2le_birth;     /* birth txg of buffer */
 804         uint64_t                l2le_cksum0;
 805         zio_cksum_t             l2le_freeze_cksum;
 806         /*
 807          * l2le_prop is the same format as the blk_prop in blkptr_t:
 808          *      * logical size (in sectors)
 809          *      * physical (compressed) size (in sectors)
 810          *      * compression algorithm
 811          *      * checksum algorithm (used for cksum0)
 812          *      * object type & level (used to restore arc_buf_contents_t)
 813          */
 814         uint64_t                l2le_prop;
 815         uint64_t                l2le_daddr;     /* buf location on l2dev */
 816         const uint64_t          l2le_pad[6];    /* resv'd for future use */
 817 } l2arc_log_ent_phys_t;
 818 
 819 /*
 820  * These design limits give us the following overhead (before compression):
 821  *      avg_blk_sz      overhead
 822  *      1k              12.51 %
 823  *      2k               6.26 %
 824  *      4k               3.13 %
 825  *      8k               1.56 %
 826  *      16k              0.78 %
 827  *      32k              0.39 %
 828  *      64k              0.20 %
 829  *      128k             0.10 %
  830  * Compression should be able to squeeze these down by about a factor of two.
 831  */
 832 #define L2ARC_LOG_BLK_SIZE                      (128 * 1024)    /* 128k */
 833 #define L2ARC_LOG_BLK_HEADER_LEN                (128)
 834 #define L2ARC_LOG_BLK_ENTRIES                   /* 1023 entries */      \
 835         ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) /              \
 836         sizeof (l2arc_log_ent_phys_t))
 837 /*
 838  * Maximum amount of data in an l2arc log block (used to terminate rebuilding
 839  * before we hit the write head and restore potentially corrupted blocks).
 840  */
 841 #define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE  \
 842         (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
 843 /*
 844  * For the persistency and rebuild algorithms to operate reliably we need
 845  * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
 846  * excessive log block looping might confuse the log chain end detection).
 847  * Under normal circumstances this is not a problem, since this is somewhere
 848  * around only 400 MB.
 849  */
 850 #define L2ARC_PERSIST_MIN_SIZE  (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
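
      /*
       * Worked example behind the overhead table above: each
       * l2arc_log_ent_phys_t is 128 bytes and a 128 KB log block holds 1023 of
       * them plus a 128-byte header, i.e. roughly 131072 / 1023 ~ 128.1 bytes
       * of metadata per cached buffer.  For 1 KB buffers that is about 12.5%
       * overhead; for 128 KB buffers about 0.1%.  Likewise
       * L2ARC_PERSIST_MIN_SIZE works out to 3 * 1023 * 128 KB ~ 384 MB, the
       * "around 400 MB" mentioned above.
       */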
 851 
 852 /*
 853  * A log block of up to 1023 ARC buffer log entries, chained into the
 854  * persistent L2ARC metadata linked list.
 855  */
 856 typedef struct l2arc_log_blk_phys {
 857         /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
 858         uint64_t                l2lb_magic;
 859         l2arc_log_blk_ptr_t     l2lb_back2_lbp; /* back 2 steps in chain */
 860         uint64_t                l2lb_pad[9];    /* resv'd for future use */
 861         /* Payload */
 862         l2arc_log_ent_phys_t    l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
 863 } l2arc_log_blk_phys_t;
 864 
 865 CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
 866 CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
 867     offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
 868 
 869 /*
 870  * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
 871  * written to the L2ARC device. They may be compressed, hence the uint8_t[].
 872  */
 873 typedef struct l2arc_log_blk_buf {
 874         uint8_t         l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
 875         list_node_t     l2lbb_node;
 876 } l2arc_log_blk_buf_t;
 877 
  878 /* Macros for manipulating fields in the blk_prop format of blkptr_t */
 879 #define BLKPROP_GET_LSIZE(_obj, _field)         \
 880         BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
 881 #define BLKPROP_SET_LSIZE(_obj, _field, x)      \
 882         BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
 883 #define BLKPROP_GET_PSIZE(_obj, _field)         \
 884         BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
 885 #define BLKPROP_SET_PSIZE(_obj, _field, x)      \
 886         BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
 887 #define BLKPROP_GET_COMPRESS(_obj, _field)      \
 888         BF64_GET((_obj)->_field, 32, 8)
 889 #define BLKPROP_SET_COMPRESS(_obj, _field, x)   \
 890         BF64_SET((_obj)->_field, 32, 8, x)
 891 #define BLKPROP_GET_CHECKSUM(_obj, _field)      \
 892         BF64_GET((_obj)->_field, 40, 8)
 893 #define BLKPROP_SET_CHECKSUM(_obj, _field, x)   \
 894         BF64_SET((_obj)->_field, 40, 8, x)
 895 #define BLKPROP_GET_TYPE(_obj, _field)          \
 896         BF64_GET((_obj)->_field, 48, 8)
 897 #define BLKPROP_SET_TYPE(_obj, _field, x)       \
 898         BF64_SET((_obj)->_field, 48, 8, x)
 899 
 900 /* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
 901 #define LBP_GET_LSIZE(_add)             BLKPROP_GET_LSIZE(_add, l2lbp_prop)
 902 #define LBP_SET_LSIZE(_add, x)          BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
 903 #define LBP_GET_PSIZE(_add)             BLKPROP_GET_PSIZE(_add, l2lbp_prop)
 904 #define LBP_SET_PSIZE(_add, x)          BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
 905 #define LBP_GET_COMPRESS(_add)          BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
 906 #define LBP_SET_COMPRESS(_add, x)       BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
 907     x)
 908 #define LBP_GET_CHECKSUM(_add)          BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
 909 #define LBP_SET_CHECKSUM(_add, x)       BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
 910     x)
 911 #define LBP_GET_TYPE(_add)              BLKPROP_GET_TYPE(_add, l2lbp_prop)
 912 #define LBP_SET_TYPE(_add, x)           BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
 913 
 914 /* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
 915 #define LE_GET_LSIZE(_le)       BLKPROP_GET_LSIZE(_le, l2le_prop)
 916 #define LE_SET_LSIZE(_le, x)    BLKPROP_SET_LSIZE(_le, l2le_prop, x)
 917 #define LE_GET_PSIZE(_le)       BLKPROP_GET_PSIZE(_le, l2le_prop)
 918 #define LE_SET_PSIZE(_le, x)    BLKPROP_SET_PSIZE(_le, l2le_prop, x)
 919 #define LE_GET_COMPRESS(_le)    BLKPROP_GET_COMPRESS(_le, l2le_prop)
 920 #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
 921 #define LE_GET_CHECKSUM(_le)    BLKPROP_GET_CHECKSUM(_le, l2le_prop)
 922 #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
 923 #define LE_GET_TYPE(_le)        BLKPROP_GET_TYPE(_le, l2le_prop)
 924 #define LE_SET_TYPE(_le, x)     BLKPROP_SET_TYPE(_le, l2le_prop, x)
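
      /*
       * Sketch of how these accessors are typically used when describing a
       * log block that has just been written (illustrative; the real commit
       * path also records the checksum of the block itself):
       *
       *        l2arc_log_blk_ptr_t lbp = { 0 };
       *        lbp.l2lbp_daddr = dev->l2ad_hand;
       *        LBP_SET_LSIZE(&lbp, sizeof (l2arc_log_blk_phys_t));
       *        LBP_SET_PSIZE(&lbp, asize);
       *        LBP_SET_COMPRESS(&lbp, ZIO_COMPRESS_LZ4);
       *        LBP_SET_CHECKSUM(&lbp, ZIO_CHECKSUM_FLETCHER_4);
       */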
 925 
 926 #define PTR_SWAP(x, y)          \
 927         do {                    \
 928                 void *tmp = (x);\
 929                 x = y;          \
 930                 y = tmp;        \
 931                 _NOTE(CONSTCOND)\
 932         } while (0)
 933 
 934 #define L2ARC_DEV_HDR_MAGIC     0x12bab10c00000001LLU
 935 #define L2ARC_LOG_BLK_MAGIC     0x120103b10c000001LLU
 936 #define L2ARC_REBUILD_TIMEOUT   300     /* a rebuild may take at most 300s */
 937 
 938 struct l2arc_dev {
 939         vdev_t                  *l2ad_vdev;     /* vdev */
 940         spa_t                   *l2ad_spa;      /* spa */
 941         uint64_t                l2ad_hand;      /* next write location */
 942         uint64_t                l2ad_start;     /* first addr on device */
 943         uint64_t                l2ad_end;       /* last addr on device */
 944         uint64_t                l2ad_evict;     /* last addr eviction reached */
 945         boolean_t               l2ad_first;     /* first sweep through */
 946         boolean_t               l2ad_writing;   /* currently writing */
 947         list_t                  *l2ad_buflist;  /* buffer list */
 948         list_node_t             l2ad_node;      /* device list node */
 949         l2arc_dev_hdr_phys_t    l2ad_dev_hdr;   /* persistent device header */
 950         l2arc_log_blk_phys_t    l2ad_log_blk;   /* currently open log block */
 951         int                     l2ad_log_ent_idx; /* index into cur log blk */
 952         /* number of bytes in current log block's payload */
 953         uint64_t                l2ad_log_blk_payload_asize;
 954         /* flag indicating whether a rebuild is scheduled or is going on */
 955         boolean_t               l2ad_rebuild;
 956 };
 957 
 958 /*
 959  * Performance tuning of L2ARC persistency:
 960  *
 961  * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
 962  *              pool import or when adding one manually later) will attempt
 963  *              to rebuild L2ARC buffer contents. In special circumstances,
 964  *              the administrator may want to set this to B_FALSE, if they
 965  *              are having trouble importing a pool or attaching an L2ARC
 966  *              device (e.g. the L2ARC device is slow to read in stored log
 967  *              metadata, or the metadata has become somehow
 968  *              fragmented/unusable).
 969  * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
 970  *              avoid a slow L2ARC device from preventing pool import. If we
 971  *              are not done rebuilding an L2ARC device by this time, we
 972  *              stop the rebuild and return immediately.
 973  */
 974 boolean_t l2arc_rebuild_enabled = B_TRUE;
 975 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
 976 
 977 /*
 978  * L2ARC persistency rebuild routines.
 979  */
 980 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
 981 static int l2arc_rebuild(l2arc_dev_t *dev);
 982 static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
 983     l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
 984 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
 985     l2arc_dev_t *dev, uint64_t guid);
 986 
 987 /*
 988  * L2ARC persistency read I/O routines.
 989  */
 990 static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
 991 static int l2arc_log_blk_read(l2arc_dev_t *dev,
 992     const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
 993     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
 994     uint8_t *this_lb_buf, uint8_t *next_lb_buf,
 995     zio_t *this_io, zio_t **next_io);
 996 static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
 997     const l2arc_log_blk_ptr_t *lp);
 998 static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
 999     const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
1000 static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1001 
1002 /*
1003  * L2ARC persistency write I/O routines.
1004  */
1005 static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1006 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1007     l2arc_write_callback_t *cb);
1008 
1009 /*
 1010  * L2ARC persistency auxiliary routines.
1011  */
1012 static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1013     zio_cksum_t *cksum);
1014 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1015     const arc_buf_hdr_t *ab);
1016 static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1017     uint64_t top, uint64_t check);
1018 static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
1019 
1020 static inline uint64_t
1021 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1022 {
1023         uint8_t *vdva = (uint8_t *)dva;
1024         uint64_t crc = -1ULL;
1025         int i;
1026 
1027         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1028 
1029         for (i = 0; i < sizeof (dva_t); i++)
1030                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1031 
1032         crc ^= (spa>>8) ^ birth;
1033 
1034         return (crc);
1035 }
1036 
1037 #define BUF_EMPTY(buf)                                          \
1038         ((buf)->b_dva.dva_word[0] == 0 &&                    \
1039         (buf)->b_dva.dva_word[1] == 0 &&                     \
1040         (buf)->b_birth == 0)
1041 
1042 #define BUF_EQUAL(spa, dva, birth, buf)                         \
1043         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
1044         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
1045         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1046 
1047 static void
1048 buf_discard_identity(arc_buf_hdr_t *hdr)
1049 {
1050         hdr->b_dva.dva_word[0] = 0;
1051         hdr->b_dva.dva_word[1] = 0;
1052         hdr->b_birth = 0;
1053         hdr->b_cksum0 = 0;
1054 }
1055 
1056 static arc_buf_hdr_t *
1057 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
1058 {
1059         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1060         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1061         arc_buf_hdr_t *buf;
1062 
1063         mutex_enter(hash_lock);
1064         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
1065             buf = buf->b_hash_next) {
1066                 if (BUF_EQUAL(spa, dva, birth, buf)) {
1067                         *lockp = hash_lock;
1068                         return (buf);
1069                 }
1070         }
1071         mutex_exit(hash_lock);
1072         *lockp = NULL;
1073         return (NULL);
1074 }
1075 
1076 /*
1077  * Insert an entry into the hash table.  If there is already an element
 1078  * equal to the new element in the hash table, then the existing element
1079  * will be returned and the new element will not be inserted.
1080  * Otherwise returns NULL.
1081  */
1082 static arc_buf_hdr_t *
1083 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
1084 {
1085         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1086         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1087         arc_buf_hdr_t *fbuf;
1088         uint32_t i;
1089 
1090         ASSERT(!HDR_IN_HASH_TABLE(buf));
1091         *lockp = hash_lock;
1092         mutex_enter(hash_lock);
1093         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
1094             fbuf = fbuf->b_hash_next, i++) {
1095                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
1096                         return (fbuf);
1097         }
1098 
1099         buf->b_hash_next = buf_hash_table.ht_table[idx];
1100         buf_hash_table.ht_table[idx] = buf;
1101         buf->b_flags |= ARC_IN_HASH_TABLE;
1102 
1103         /* collect some hash table performance data */
1104         if (i > 0) {
1105                 ARCSTAT_BUMP(arcstat_hash_collisions);
1106                 if (i == 1)
1107                         ARCSTAT_BUMP(arcstat_hash_chains);
1108 
1109                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1110         }
1111 
1112         ARCSTAT_BUMP(arcstat_hash_elements);
1113         ARCSTAT_MAXSTAT(arcstat_hash_elements);
1114 
1115         return (NULL);
1116 }
1117 
1118 static void
1119 buf_hash_remove(arc_buf_hdr_t *buf)
1120 {
1121         arc_buf_hdr_t *fbuf, **bufp;
1122         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1123 
1124         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1125         ASSERT(HDR_IN_HASH_TABLE(buf));
1126 
1127         bufp = &buf_hash_table.ht_table[idx];
1128         while ((fbuf = *bufp) != buf) {
1129                 ASSERT(fbuf != NULL);
1130                 bufp = &fbuf->b_hash_next;
1131         }
1132         *bufp = buf->b_hash_next;
1133         buf->b_hash_next = NULL;
1134         buf->b_flags &= ~ARC_IN_HASH_TABLE;
1135 
1136         /* collect some hash table performance data */
1137         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1138 
1139         if (buf_hash_table.ht_table[idx] &&
1140             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1141                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1142 }
1143 
1144 /*
1145  * Global data structures and functions for the buf kmem cache.
1146  */
1147 static kmem_cache_t *hdr_cache;
1148 static kmem_cache_t *buf_cache;
1149 
1150 static void
1151 buf_fini(void)
1152 {
1153         int i;
1154 
1155         kmem_free(buf_hash_table.ht_table,
1156             (buf_hash_table.ht_mask + 1) * sizeof (void *));
1157         for (i = 0; i < BUF_LOCKS; i++)
1158                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1159         kmem_cache_destroy(hdr_cache);
1160         kmem_cache_destroy(buf_cache);
1161 }
1162 
1163 /*
1164  * Constructor callback - called when the cache is empty
1165  * and a new buf is requested.
1166  */
1167 /* ARGSUSED */
1168 static int
1169 hdr_cons(void *vbuf, void *unused, int kmflag)
1170 {
1171         arc_buf_hdr_t *buf = vbuf;
1172 
1173         bzero(buf, sizeof (arc_buf_hdr_t));
1174         refcount_create(&buf->b_refcnt);
1175         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1176         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1177         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1178 
1179         return (0);
1180 }
1181 
1182 /* ARGSUSED */
1183 static int
1184 buf_cons(void *vbuf, void *unused, int kmflag)
1185 {
1186         arc_buf_t *buf = vbuf;
1187 
1188         bzero(buf, sizeof (arc_buf_t));
1189         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1190         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1191 
1192         return (0);
1193 }
1194 
1195 /*
1196  * Destructor callback - called when a cached buf is
1197  * no longer required.
1198  */
1199 /* ARGSUSED */
1200 static void
1201 hdr_dest(void *vbuf, void *unused)
1202 {
1203         arc_buf_hdr_t *buf = vbuf;
1204 
1205         ASSERT(BUF_EMPTY(buf));
1206         refcount_destroy(&buf->b_refcnt);
1207         cv_destroy(&buf->b_cv);
1208         mutex_destroy(&buf->b_freeze_lock);
1209         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1210 }
1211 
1212 /* ARGSUSED */
1213 static void
1214 buf_dest(void *vbuf, void *unused)
1215 {
1216         arc_buf_t *buf = vbuf;
1217 
1218         mutex_destroy(&buf->b_evict_lock);
1219         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1220 }
1221 
1222 /*
1223  * Reclaim callback -- invoked when memory is low.
1224  */
1225 /* ARGSUSED */
1226 static void
1227 hdr_recl(void *unused)
1228 {
1229         dprintf("hdr_recl called\n");
1230         /*
1231          * umem calls the reclaim func when we destroy the buf cache,
1232          * which is after we do arc_fini().
1233          */
1234         if (!arc_dead)
1235                 cv_signal(&arc_reclaim_thr_cv);
1236 }
1237 
1238 static void
1239 buf_init(void)
1240 {
1241         uint64_t *ct;
1242         uint64_t hsize = 1ULL << 12;
1243         int i, j;
1244 
1245         /*
         * The hash table is sized with one slot for every average-sized
         * (64K) block that could fit in physical memory.  The table will
         * take up totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte
         * pointers).
1249          */
1250         while (hsize * 65536 < physmem * PAGESIZE)
1251                 hsize <<= 1;
1252 retry:
1253         buf_hash_table.ht_mask = hsize - 1;
1254         buf_hash_table.ht_table =
1255             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1256         if (buf_hash_table.ht_table == NULL) {
1257                 ASSERT(hsize > (1ULL << 8));
1258                 hsize >>= 1;
1259                 goto retry;
1260         }
1261 
1262         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1263             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1264         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1265             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1266 
1267         for (i = 0; i < 256; i++)
1268                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1269                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1270 
1271         for (i = 0; i < BUF_LOCKS; i++) {
1272                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1273                     NULL, MUTEX_DEFAULT, NULL);
1274         }
1275 }
1276 
1277 #define ARC_MINTIME     (hz>>4) /* 62 ms */
1278 
1279 static void
1280 arc_cksum_verify(arc_buf_t *buf)
1281 {
1282         zio_cksum_t zc;
1283 
1284         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1285                 return;
1286 
1287         mutex_enter(&buf->b_hdr->b_freeze_lock);
1288         if (buf->b_hdr->b_freeze_cksum == NULL ||
1289             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1290                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1291                 return;
1292         }
1293         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1294         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1295                 panic("buffer modified while frozen!");
1296         mutex_exit(&buf->b_hdr->b_freeze_lock);
1297 }
1298 
1299 static int
1300 arc_cksum_equal(arc_buf_t *buf)
1301 {
1302         zio_cksum_t zc;
1303         int equal;
1304 
1305         mutex_enter(&buf->b_hdr->b_freeze_lock);
1306         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1307         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1308         mutex_exit(&buf->b_hdr->b_freeze_lock);
1309 
1310         return (equal);
1311 }
1312 
1313 static void
1314 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1315 {
1316         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1317                 return;
1318 
1319         mutex_enter(&buf->b_hdr->b_freeze_lock);
1320         if (buf->b_hdr->b_freeze_cksum != NULL) {
1321                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1322                 return;
1323         }
1324         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1325         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1326             buf->b_hdr->b_freeze_cksum);
1327         mutex_exit(&buf->b_hdr->b_freeze_lock);
1328         arc_buf_watch(buf);
1329 }
1330 
1331 #ifndef _KERNEL
1332 typedef struct procctl {
1333         long cmd;
1334         prwatch_t prwatch;
1335 } procctl_t;
1336 #endif
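
/*
 * In userland builds with arc_watch enabled, arc_buf_watch() and
 * arc_buf_unwatch() write PCWATCH commands to arc_procfd (assumed to be
 * the process's /proc control file descriptor) to set or clear a write
 * watchpoint over a buffer's data, so that any modification of a frozen
 * buffer faults immediately.
 */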
1337 
1338 /* ARGSUSED */
1339 static void
1340 arc_buf_unwatch(arc_buf_t *buf)
1341 {
1342 #ifndef _KERNEL
1343         if (arc_watch) {
1344                 int result;
1345                 procctl_t ctl;
1346                 ctl.cmd = PCWATCH;
1347                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1348                 ctl.prwatch.pr_size = 0;
1349                 ctl.prwatch.pr_wflags = 0;
1350                 result = write(arc_procfd, &ctl, sizeof (ctl));
1351                 ASSERT3U(result, ==, sizeof (ctl));
1352         }
1353 #endif
1354 }
1355 
1356 /* ARGSUSED */
1357 static void
1358 arc_buf_watch(arc_buf_t *buf)
1359 {
1360 #ifndef _KERNEL
1361         if (arc_watch) {
1362                 int result;
1363                 procctl_t ctl;
1364                 ctl.cmd = PCWATCH;
1365                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1366                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1367                 ctl.prwatch.pr_wflags = WA_WRITE;
1368                 result = write(arc_procfd, &ctl, sizeof (ctl));
1369                 ASSERT3U(result, ==, sizeof (ctl));
1370         }
1371 #endif
1372 }
1373 
1374 void
1375 arc_buf_thaw(arc_buf_t *buf)
1376 {
1377         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1378                 if (buf->b_hdr->b_state != arc_anon)
1379                         panic("modifying non-anon buffer!");
1380                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1381                         panic("modifying buffer while i/o in progress!");
1382                 arc_cksum_verify(buf);
1383         }
1384 
1385         mutex_enter(&buf->b_hdr->b_freeze_lock);
1386         if (buf->b_hdr->b_freeze_cksum != NULL) {
1387                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1388                 buf->b_hdr->b_freeze_cksum = NULL;
1389         }
1390 
1391         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1392                 if (buf->b_hdr->b_thawed)
1393                         kmem_free(buf->b_hdr->b_thawed, 1);
1394                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1395         }
1396 
1397         mutex_exit(&buf->b_hdr->b_freeze_lock);
1398 
1399         arc_buf_unwatch(buf);
1400 }
1401 
1402 void
1403 arc_buf_freeze(arc_buf_t *buf)
1404 {
1405         kmutex_t *hash_lock;
1406 
1407         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1408                 return;
1409 
1410         hash_lock = HDR_LOCK(buf->b_hdr);
1411         mutex_enter(hash_lock);
1412 
1413         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1414             buf->b_hdr->b_state == arc_anon);
1415         arc_cksum_compute(buf, B_FALSE);
        mutex_exit(hash_lock);
}
1419 
1420 static void
1421 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1422 {
1423         ASSERT(MUTEX_HELD(hash_lock));
1424 
1425         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1426             (ab->b_state != arc_anon)) {
1427                 uint64_t delta = ab->b_size * ab->b_datacnt;
1428                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1429                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1430 
1431                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1432                 mutex_enter(&ab->b_state->arcs_mtx);
1433                 ASSERT(list_link_active(&ab->b_arc_node));
1434                 list_remove(list, ab);
1435                 if (GHOST_STATE(ab->b_state)) {
1436                         ASSERT0(ab->b_datacnt);
1437                         ASSERT3P(ab->b_buf, ==, NULL);
1438                         delta = ab->b_size;
1439                 }
1440                 ASSERT(delta > 0);
1441                 ASSERT3U(*size, >=, delta);
1442                 atomic_add_64(size, -delta);
1443                 mutex_exit(&ab->b_state->arcs_mtx);
1444                 /* remove the prefetch flag if we get a reference */
1445                 if (ab->b_flags & ARC_PREFETCH)
1446                         ab->b_flags &= ~ARC_PREFETCH;
1447         }
1448 }
1449 
1450 static int
1451 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1452 {
1453         int cnt;
1454         arc_state_t *state = ab->b_state;
1455 
1456         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1457         ASSERT(!GHOST_STATE(state));
1458 
1459         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1460             (state != arc_anon)) {
1461                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1462 
1463                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1464                 mutex_enter(&state->arcs_mtx);
1465                 ASSERT(!list_link_active(&ab->b_arc_node));
1466                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1467                 ASSERT(ab->b_datacnt > 0);
1468                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1469                 mutex_exit(&state->arcs_mtx);
1470         }
1471         return (cnt);
1472 }
1473 
1474 /*
1475  * Move the supplied buffer to the indicated state.  The mutex
1476  * for the buffer must be held by the caller.
1477  */
1478 static void
1479 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1480 {
1481         arc_state_t *old_state = ab->b_state;
1482         int64_t refcnt = refcount_count(&ab->b_refcnt);
1483         uint64_t from_delta, to_delta;
1484 
1485         ASSERT(MUTEX_HELD(hash_lock));
1486         ASSERT3P(new_state, !=, old_state);
1487         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1488         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1489         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1490 
1491         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1492 
1493         /*
1494          * If this buffer is evictable, transfer it from the
1495          * old state list to the new state list.
1496          */
1497         if (refcnt == 0) {
1498                 if (old_state != arc_anon) {
1499                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1500                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1501 
1502                         if (use_mutex)
1503                                 mutex_enter(&old_state->arcs_mtx);
1504 
1505                         ASSERT(list_link_active(&ab->b_arc_node));
1506                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1507 
1508                         /*
1509                          * If prefetching out of the ghost cache,
1510                          * we will have a non-zero datacnt.
1511                          */
1512                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1513                                 /* ghost elements have a ghost size */
1514                                 ASSERT(ab->b_buf == NULL);
1515                                 from_delta = ab->b_size;
1516                         }
1517                         ASSERT3U(*size, >=, from_delta);
1518                         atomic_add_64(size, -from_delta);
1519 
1520                         if (use_mutex)
1521                                 mutex_exit(&old_state->arcs_mtx);
1522                 }
1523                 if (new_state != arc_anon) {
1524                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1525                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1526 
1527                         if (use_mutex)
1528                                 mutex_enter(&new_state->arcs_mtx);
1529 
1530                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1531 
1532                         /* ghost elements have a ghost size */
1533                         if (GHOST_STATE(new_state)) {
1534                                 ASSERT(ab->b_datacnt == 0);
1535                                 ASSERT(ab->b_buf == NULL);
1536                                 to_delta = ab->b_size;
1537                         }
1538                         atomic_add_64(size, to_delta);
1539 
1540                         if (use_mutex)
1541                                 mutex_exit(&new_state->arcs_mtx);
1542                 }
1543         }
1544 
1545         ASSERT(!BUF_EMPTY(ab));
1546         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1547                 buf_hash_remove(ab);
1548 
1549         /* adjust state sizes */
1550         if (to_delta)
1551                 atomic_add_64(&new_state->arcs_size, to_delta);
1552         if (from_delta) {
1553                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1554                 atomic_add_64(&old_state->arcs_size, -from_delta);
1555         }
1556         ab->b_state = new_state;
1557 
1558         /* adjust l2arc hdr stats */
1559         if (new_state == arc_l2c_only)
1560                 l2arc_hdr_stat_add(old_state != arc_anon);
1561         else if (old_state == arc_l2c_only)
1562                 l2arc_hdr_stat_remove();
1563 }
1564 
1565 void
1566 arc_space_consume(uint64_t space, arc_space_type_t type)
1567 {
1568         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1569 
1570         switch (type) {
1571         case ARC_SPACE_DATA:
1572                 ARCSTAT_INCR(arcstat_data_size, space);
1573                 break;
1574         case ARC_SPACE_OTHER:
1575                 ARCSTAT_INCR(arcstat_other_size, space);
1576                 break;
1577         case ARC_SPACE_HDRS:
1578                 ARCSTAT_INCR(arcstat_hdr_size, space);
1579                 break;
1580         case ARC_SPACE_L2HDRS:
1581                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1582                 break;
1583         }
1584 
1585         ARCSTAT_INCR(arcstat_meta_used, space);
1586         atomic_add_64(&arc_size, space);
1587 }
1588 
1589 void
1590 arc_space_return(uint64_t space, arc_space_type_t type)
1591 {
1592         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1593 
1594         switch (type) {
1595         case ARC_SPACE_DATA:
1596                 ARCSTAT_INCR(arcstat_data_size, -space);
1597                 break;
1598         case ARC_SPACE_OTHER:
1599                 ARCSTAT_INCR(arcstat_other_size, -space);
1600                 break;
1601         case ARC_SPACE_HDRS:
1602                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1603                 break;
1604         case ARC_SPACE_L2HDRS:
1605                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1606                 break;
1607         }
1608 
1609         ASSERT(arc_meta_used >= space);
1610         if (arc_meta_max < arc_meta_used)
1611                 arc_meta_max = arc_meta_used;
1612         ARCSTAT_INCR(arcstat_meta_used, -space);
1613         ASSERT(arc_size >= space);
1614         atomic_add_64(&arc_size, -space);
1615 }
1616 
1617 void *
1618 arc_data_buf_alloc(uint64_t size)
1619 {
1620         if (arc_evict_needed(ARC_BUFC_DATA))
1621                 cv_signal(&arc_reclaim_thr_cv);
1622         atomic_add_64(&arc_size, size);
1623         return (zio_data_buf_alloc(size));
1624 }
1625 
1626 void
1627 arc_data_buf_free(void *buf, uint64_t size)
1628 {
1629         zio_data_buf_free(buf, size);
1630         ASSERT(arc_size >= size);
1631         atomic_add_64(&arc_size, -size);
1632 }
1633 
1634 arc_buf_t *
1635 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1636 {
1637         arc_buf_hdr_t *hdr;
1638         arc_buf_t *buf;
1639 
1640         ASSERT3U(size, >, 0);
1641         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1642         ASSERT(BUF_EMPTY(hdr));
1643         hdr->b_size = size;
1644         hdr->b_type = type;
1645         hdr->b_spa = spa_load_guid(spa);
1646         hdr->b_state = arc_anon;
1647         hdr->b_arc_access = 0;
1648         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1649         buf->b_hdr = hdr;
1650         buf->b_data = NULL;
1651         buf->b_efunc = NULL;
1652         buf->b_private = NULL;
1653         buf->b_next = NULL;
1654         hdr->b_buf = buf;
1655         arc_get_data_buf(buf);
1656         hdr->b_datacnt = 1;
1657         hdr->b_flags = 0;
1658         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1659         (void) refcount_add(&hdr->b_refcnt, tag);
1660 
1661         return (buf);
1662 }
1663 
1664 /*
1665  * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1666  * This is used during l2arc reconstruction to make empty ARC buffers
1667  * which circumvent the regular disk->arc->l2arc path and instead come
1668  * into being in the reverse order, i.e. l2arc->arc->(disk).
1669  */
1670 arc_buf_hdr_t *
1671 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1672 {
1673         arc_buf_hdr_t *hdr;
1674 
1675         ASSERT3U(size, >, 0);
1676         hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1677         ASSERT(BUF_EMPTY(hdr));
1678         hdr->b_size = size;
1679         hdr->b_type = type;
1680         hdr->b_spa = guid;
1681         hdr->b_state = arc_anon;
1682         hdr->b_arc_access = 0;
1683         hdr->b_buf = NULL;
1684         hdr->b_datacnt = 0;
1685         hdr->b_flags = 0;
1686         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1687 
1688         return (hdr);
1689 }
1690 
1691 static char *arc_onloan_tag = "onloan";
1692 
1693 /*
1694  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1695  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1696  * buffers must be returned to the arc before they can be used by the DMU or
1697  * freed.
1698  */
1699 arc_buf_t *
1700 arc_loan_buf(spa_t *spa, int size)
1701 {
1702         arc_buf_t *buf;
1703 
1704         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1705 
1706         atomic_add_64(&arc_loaned_bytes, size);
1707         return (buf);
1708 }
1709 
1710 /*
1711  * Return a loaned arc buffer to the arc.
1712  */
1713 void
1714 arc_return_buf(arc_buf_t *buf, void *tag)
1715 {
1716         arc_buf_hdr_t *hdr = buf->b_hdr;
1717 
1718         ASSERT(buf->b_data != NULL);
1719         (void) refcount_add(&hdr->b_refcnt, tag);
1720         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1721 
1722         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1723 }
1724 
1725 /* Detach an arc_buf from a dbuf (tag) */
1726 void
1727 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1728 {
1729         arc_buf_hdr_t *hdr;
1730 
1731         ASSERT(buf->b_data != NULL);
1732         hdr = buf->b_hdr;
1733         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1734         (void) refcount_remove(&hdr->b_refcnt, tag);
1735         buf->b_efunc = NULL;
1736         buf->b_private = NULL;
1737 
1738         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1739 }
1740 
1741 static arc_buf_t *
1742 arc_buf_clone(arc_buf_t *from)
1743 {
1744         arc_buf_t *buf;
1745         arc_buf_hdr_t *hdr = from->b_hdr;
1746         uint64_t size = hdr->b_size;
1747 
1748         ASSERT(hdr->b_state != arc_anon);
1749 
1750         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1751         buf->b_hdr = hdr;
1752         buf->b_data = NULL;
1753         buf->b_efunc = NULL;
1754         buf->b_private = NULL;
1755         buf->b_next = hdr->b_buf;
1756         hdr->b_buf = buf;
1757         arc_get_data_buf(buf);
1758         bcopy(from->b_data, buf->b_data, size);
1759 
1760         /*
1761          * This buffer already exists in the arc so create a duplicate
1762          * copy for the caller.  If the buffer is associated with user data
1763          * then track the size and number of duplicates.  These stats will be
1764          * updated as duplicate buffers are created and destroyed.
1765          */
1766         if (hdr->b_type == ARC_BUFC_DATA) {
1767                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1768                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1769         }
1770         hdr->b_datacnt += 1;
1771         return (buf);
1772 }
1773 
1774 void
1775 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1776 {
1777         arc_buf_hdr_t *hdr;
1778         kmutex_t *hash_lock;
1779 
1780         /*
1781          * Check to see if this buffer is evicted.  Callers
1782          * must verify b_data != NULL to know if the add_ref
1783          * was successful.
1784          */
1785         mutex_enter(&buf->b_evict_lock);
1786         if (buf->b_data == NULL) {
1787                 mutex_exit(&buf->b_evict_lock);
1788                 return;
1789         }
1790         hash_lock = HDR_LOCK(buf->b_hdr);
1791         mutex_enter(hash_lock);
1792         hdr = buf->b_hdr;
1793         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1794         mutex_exit(&buf->b_evict_lock);
1795 
1796         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1797         add_reference(hdr, hash_lock, tag);
1798         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1799         arc_access(hdr, hash_lock);
1800         mutex_exit(hash_lock);
1801         ARCSTAT_BUMP(arcstat_hits);
1802         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1803             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1804             data, metadata, hits);
1805 }
1806 
1807 /*
1808  * Free the arc data buffer.  If it is an l2arc write in progress,
1809  * the buffer is placed on l2arc_free_on_write to be freed later.
1810  */
1811 static void
1812 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1813 {
1814         arc_buf_hdr_t *hdr = buf->b_hdr;
1815 
1816         if (HDR_L2_WRITING(hdr)) {
1817                 l2arc_data_free_t *df;
1818                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1819                 df->l2df_data = buf->b_data;
1820                 df->l2df_size = hdr->b_size;
1821                 df->l2df_func = free_func;
1822                 mutex_enter(&l2arc_free_on_write_mtx);
1823                 list_insert_head(l2arc_free_on_write, df);
1824                 mutex_exit(&l2arc_free_on_write_mtx);
1825                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1826         } else {
1827                 free_func(buf->b_data, hdr->b_size);
1828         }
1829 }
1830 
1831 static void
1832 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1833 {
1834         arc_buf_t **bufp;
1835 
1836         /* free up data associated with the buf */
1837         if (buf->b_data) {
1838                 arc_state_t *state = buf->b_hdr->b_state;
1839                 uint64_t size = buf->b_hdr->b_size;
1840                 arc_buf_contents_t type = buf->b_hdr->b_type;
1841 
1842                 arc_cksum_verify(buf);
1843                 arc_buf_unwatch(buf);
1844 
1845                 if (!recycle) {
1846                         if (type == ARC_BUFC_METADATA) {
1847                                 arc_buf_data_free(buf, zio_buf_free);
1848                                 arc_space_return(size, ARC_SPACE_DATA);
1849                         } else {
1850                                 ASSERT(type == ARC_BUFC_DATA);
1851                                 arc_buf_data_free(buf, zio_data_buf_free);
1852                                 ARCSTAT_INCR(arcstat_data_size, -size);
1853                                 atomic_add_64(&arc_size, -size);
1854                         }
1855                 }
1856                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1857                         uint64_t *cnt = &state->arcs_lsize[type];
1858 
1859                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1860                         ASSERT(state != arc_anon);
1861 
1862                         ASSERT3U(*cnt, >=, size);
1863                         atomic_add_64(cnt, -size);
1864                 }
1865                 ASSERT3U(state->arcs_size, >=, size);
1866                 atomic_add_64(&state->arcs_size, -size);
1867                 buf->b_data = NULL;
1868 
1869                 /*
1870                  * If we're destroying a duplicate buffer make sure
1871                  * that the appropriate statistics are updated.
1872                  */
1873                 if (buf->b_hdr->b_datacnt > 1 &&
1874                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1875                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1876                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1877                 }
1878                 ASSERT(buf->b_hdr->b_datacnt > 0);
1879                 buf->b_hdr->b_datacnt -= 1;
1880         }
1881 
1882         /* only remove the buf if requested */
1883         if (!all)
1884                 return;
1885 
1886         /* remove the buf from the hdr list */
1887         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1888                 continue;
1889         *bufp = buf->b_next;
1890         buf->b_next = NULL;
1891 
1892         ASSERT(buf->b_efunc == NULL);
1893 
1894         /* clean up the buf */
1895         buf->b_hdr = NULL;
1896         kmem_cache_free(buf_cache, buf);
1897 }
1898 
1899 static void
1900 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1901 {
1902         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1903         ASSERT3P(hdr->b_state, ==, arc_anon);
1904         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1905         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1906 
1907         if (l2hdr != NULL) {
1908                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1909                 /*
1910                  * To prevent arc_free() and l2arc_evict() from
1911                  * attempting to free the same buffer at the same time,
1912                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1913                  * give it priority.  l2arc_evict() can't destroy this
1914                  * header while we are waiting on l2arc_buflist_mtx.
1915                  *
1916                  * The hdr may be removed from l2ad_buflist before we
1917                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1918                  */
1919                 if (!buflist_held) {
1920                         mutex_enter(&l2arc_buflist_mtx);
1921                         l2hdr = hdr->b_l2hdr;
1922                 }
1923 
1924                 if (l2hdr != NULL) {
1925                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1926                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1927                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1928                         kmem_free(l2hdr, sizeof (*l2hdr));
1929                         if (hdr->b_state == arc_l2c_only)
1930                                 l2arc_hdr_stat_remove();
1931                         hdr->b_l2hdr = NULL;
1932                 }
1933 
1934                 if (!buflist_held)
1935                         mutex_exit(&l2arc_buflist_mtx);
1936         }
1937 
1938         if (!BUF_EMPTY(hdr)) {
1939                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1940                 buf_discard_identity(hdr);
1941         }
1942         while (hdr->b_buf) {
1943                 arc_buf_t *buf = hdr->b_buf;
1944 
1945                 if (buf->b_efunc) {
1946                         mutex_enter(&arc_eviction_mtx);
1947                         mutex_enter(&buf->b_evict_lock);
1948                         ASSERT(buf->b_hdr != NULL);
1949                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1950                         hdr->b_buf = buf->b_next;
1951                         buf->b_hdr = &arc_eviction_hdr;
1952                         buf->b_next = arc_eviction_list;
1953                         arc_eviction_list = buf;
1954                         mutex_exit(&buf->b_evict_lock);
1955                         mutex_exit(&arc_eviction_mtx);
1956                 } else {
1957                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1958                 }
1959         }
1960         if (hdr->b_freeze_cksum != NULL) {
1961                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1962                 hdr->b_freeze_cksum = NULL;
1963         }
1964         if (hdr->b_thawed) {
1965                 kmem_free(hdr->b_thawed, 1);
1966                 hdr->b_thawed = NULL;
1967         }
1968 
1969         ASSERT(!list_link_active(&hdr->b_arc_node));
1970         ASSERT3P(hdr->b_hash_next, ==, NULL);
1971         ASSERT3P(hdr->b_acb, ==, NULL);
1972         kmem_cache_free(hdr_cache, hdr);
1973 }
1974 
1975 void
1976 arc_buf_free(arc_buf_t *buf, void *tag)
1977 {
1978         arc_buf_hdr_t *hdr = buf->b_hdr;
1979         int hashed = hdr->b_state != arc_anon;
1980 
1981         ASSERT(buf->b_efunc == NULL);
1982         ASSERT(buf->b_data != NULL);
1983 
1984         if (hashed) {
1985                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1986 
1987                 mutex_enter(hash_lock);
1988                 hdr = buf->b_hdr;
1989                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1990 
1991                 (void) remove_reference(hdr, hash_lock, tag);
1992                 if (hdr->b_datacnt > 1) {
1993                         arc_buf_destroy(buf, FALSE, TRUE);
1994                 } else {
1995                         ASSERT(buf == hdr->b_buf);
1996                         ASSERT(buf->b_efunc == NULL);
1997                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1998                 }
1999                 mutex_exit(hash_lock);
2000         } else if (HDR_IO_IN_PROGRESS(hdr)) {
2001                 int destroy_hdr;
2002                 /*
2003                  * We are in the middle of an async write.  Don't destroy
2004                  * this buffer unless the write completes before we finish
2005                  * decrementing the reference count.
2006                  */
2007                 mutex_enter(&arc_eviction_mtx);
2008                 (void) remove_reference(hdr, NULL, tag);
2009                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2010                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2011                 mutex_exit(&arc_eviction_mtx);
2012                 if (destroy_hdr)
2013                         arc_hdr_destroy(hdr);
2014         } else {
2015                 if (remove_reference(hdr, NULL, tag) > 0)
2016                         arc_buf_destroy(buf, FALSE, TRUE);
2017                 else
2018                         arc_hdr_destroy(hdr);
2019         }
2020 }
2021 
2022 boolean_t
2023 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2024 {
2025         arc_buf_hdr_t *hdr = buf->b_hdr;
2026         kmutex_t *hash_lock = HDR_LOCK(hdr);
2027         boolean_t no_callback = (buf->b_efunc == NULL);
2028 
2029         if (hdr->b_state == arc_anon) {
2030                 ASSERT(hdr->b_datacnt == 1);
2031                 arc_buf_free(buf, tag);
2032                 return (no_callback);
2033         }
2034 
2035         mutex_enter(hash_lock);
2036         hdr = buf->b_hdr;
2037         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2038         ASSERT(hdr->b_state != arc_anon);
2039         ASSERT(buf->b_data != NULL);
2040 
2041         (void) remove_reference(hdr, hash_lock, tag);
2042         if (hdr->b_datacnt > 1) {
2043                 if (no_callback)
2044                         arc_buf_destroy(buf, FALSE, TRUE);
2045         } else if (no_callback) {
2046                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
2047                 ASSERT(buf->b_efunc == NULL);
2048                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2049         }
2050         ASSERT(no_callback || hdr->b_datacnt > 1 ||
2051             refcount_is_zero(&hdr->b_refcnt));
2052         mutex_exit(hash_lock);
2053         return (no_callback);
2054 }
2055 
2056 int
2057 arc_buf_size(arc_buf_t *buf)
2058 {
2059         return (buf->b_hdr->b_size);
2060 }
2061 
2062 /*
2063  * Called from the DMU to determine if the current buffer should be
2064  * evicted. In order to ensure proper locking, the eviction must be initiated
2065  * from the DMU. Return true if the buffer is associated with user data and
2066  * duplicate buffers still exist.
2067  */
2068 boolean_t
2069 arc_buf_eviction_needed(arc_buf_t *buf)
2070 {
2071         arc_buf_hdr_t *hdr;
2072         boolean_t evict_needed = B_FALSE;
2073 
2074         if (zfs_disable_dup_eviction)
2075                 return (B_FALSE);
2076 
2077         mutex_enter(&buf->b_evict_lock);
2078         hdr = buf->b_hdr;
2079         if (hdr == NULL) {
2080                 /*
2081                  * We are in arc_do_user_evicts(); let that function
2082                  * perform the eviction.
2083                  */
2084                 ASSERT(buf->b_data == NULL);
2085                 mutex_exit(&buf->b_evict_lock);
2086                 return (B_FALSE);
2087         } else if (buf->b_data == NULL) {
2088                 /*
2089                  * We have already been added to the arc eviction list;
2090                  * recommend eviction.
2091                  */
2092                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2093                 mutex_exit(&buf->b_evict_lock);
2094                 return (B_TRUE);
2095         }
2096 
2097         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2098                 evict_needed = B_TRUE;
2099 
2100         mutex_exit(&buf->b_evict_lock);
2101         return (evict_needed);
2102 }
2103 
2104 /*
2105  * Evict buffers from list until we've removed the specified number of
2106  * bytes.  Move the removed buffers to the appropriate evict state.
2107  * If the recycle flag is set, then attempt to "recycle" a buffer:
2108  * - look for a buffer to evict that is `bytes' long.
2109  * - return the data block from this buffer rather than freeing it.
2110  * This flag is used by callers that are trying to make space for a
2111  * new buffer in a full arc cache.
2112  *
2113  * This function makes a "best effort".  It skips over any buffers
2114  * it can't get a hash_lock on, and so may not catch all candidates.
2115  * It may also return without evicting as much space as requested.
2116  */
2117 static void *
2118 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2119     arc_buf_contents_t type)
2120 {
2121         arc_state_t *evicted_state;
2122         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2123         arc_buf_hdr_t *ab, *ab_prev = NULL;
2124         list_t *list = &state->arcs_list[type];
2125         kmutex_t *hash_lock;
2126         boolean_t have_lock;
2127         void *stolen = NULL;
2128         arc_buf_hdr_t marker = { 0 };
2129         int count = 0;
2130 
2131         ASSERT(state == arc_mru || state == arc_mfu);
2132 
2133         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2134 
2135         mutex_enter(&state->arcs_mtx);
2136         mutex_enter(&evicted_state->arcs_mtx);
2137 
2138         for (ab = list_tail(list); ab; ab = ab_prev) {
2139                 ab_prev = list_prev(list, ab);
2140                 /* prefetch buffers have a minimum lifespan */
2141                 if (HDR_IO_IN_PROGRESS(ab) ||
2142                     (spa && ab->b_spa != spa) ||
2143                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2144                     ddi_get_lbolt() - ab->b_arc_access <
2145                     arc_min_prefetch_lifespan)) {
2146                         skipped++;
2147                         continue;
2148                 }
2149                 /* "lookahead" for better eviction candidate */
2150                 if (recycle && ab->b_size != bytes &&
2151                     ab_prev && ab_prev->b_size == bytes)
2152                         continue;
2153 
2154                 /* ignore markers */
2155                 if (ab->b_spa == 0)
2156                         continue;
2157 
2158                 /*
2159                  * It may take a long time to evict all the bufs requested.
2160                  * To avoid blocking all arc activity, periodically drop
2161                  * the arcs_mtx and give other threads a chance to run
2162                  * before reacquiring the lock.
2163                  *
2164                  * If we are looking for a buffer to recycle, we are in
2165                  * the hot code path, so don't sleep.
2166                  */
2167                 if (!recycle && count++ > arc_evict_iterations) {
2168                         list_insert_after(list, ab, &marker);
2169                         mutex_exit(&evicted_state->arcs_mtx);
2170                         mutex_exit(&state->arcs_mtx);
2171                         kpreempt(KPREEMPT_SYNC);
2172                         mutex_enter(&state->arcs_mtx);
2173                         mutex_enter(&evicted_state->arcs_mtx);
2174                         ab_prev = list_prev(list, &marker);
2175                         list_remove(list, &marker);
2176                         count = 0;
2177                         continue;
2178                 }
2179 
2180                 hash_lock = HDR_LOCK(ab);
2181                 have_lock = MUTEX_HELD(hash_lock);
2182                 if (have_lock || mutex_tryenter(hash_lock)) {
2183                         ASSERT0(refcount_count(&ab->b_refcnt));
2184                         ASSERT(ab->b_datacnt > 0);
2185                         while (ab->b_buf) {
2186                                 arc_buf_t *buf = ab->b_buf;
2187                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
2188                                         missed += 1;
2189                                         break;
2190                                 }
2191                                 if (buf->b_data) {
2192                                         bytes_evicted += ab->b_size;
2193                                         if (recycle && ab->b_type == type &&
2194                                             ab->b_size == bytes &&
2195                                             !HDR_L2_WRITING(ab)) {
2196                                                 stolen = buf->b_data;
2197                                                 recycle = FALSE;
2198                                         }
2199                                 }
2200                                 if (buf->b_efunc) {
2201                                         mutex_enter(&arc_eviction_mtx);
2202                                         arc_buf_destroy(buf,
2203                                             buf->b_data == stolen, FALSE);
2204                                         ab->b_buf = buf->b_next;
2205                                         buf->b_hdr = &arc_eviction_hdr;
2206                                         buf->b_next = arc_eviction_list;
2207                                         arc_eviction_list = buf;
2208                                         mutex_exit(&arc_eviction_mtx);
2209                                         mutex_exit(&buf->b_evict_lock);
2210                                 } else {
2211                                         mutex_exit(&buf->b_evict_lock);
2212                                         arc_buf_destroy(buf,
2213                                             buf->b_data == stolen, TRUE);
2214                                 }
2215                         }
2216 
2217                         if (ab->b_l2hdr) {
2218                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
2219                                     ab->b_size);
2220                         } else {
2221                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
2222                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
2223                                             ab->b_size);
2224                                 } else {
2225                                         ARCSTAT_INCR(
2226                                             arcstat_evict_l2_ineligible,
2227                                             ab->b_size);
2228                                 }
2229                         }
2230 
2231                         if (ab->b_datacnt == 0) {
2232                                 arc_change_state(evicted_state, ab, hash_lock);
2233                                 ASSERT(HDR_IN_HASH_TABLE(ab));
2234                                 ab->b_flags |= ARC_IN_HASH_TABLE;
2235                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
2236                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2237                         }
2238                         if (!have_lock)
2239                                 mutex_exit(hash_lock);
2240                         if (bytes >= 0 && bytes_evicted >= bytes)
2241                                 break;
2242                 } else {
2243                         missed += 1;
2244                 }
2245         }
2246 
2247         mutex_exit(&evicted_state->arcs_mtx);
2248         mutex_exit(&state->arcs_mtx);
2249 
2250         if (bytes_evicted < bytes)
2251                 dprintf("only evicted %lld bytes from %x",
2252                     (longlong_t)bytes_evicted, state);
2253 
2254         if (skipped)
2255                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
2256 
2257         if (missed)
2258                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
2259 
2260         /*
2261          * Note: we have just evicted some data into the ghost state,
2262          * potentially putting the ghost size over the desired size.  Rather
         * than evicting from the ghost list in this hot code path, leave
2264          * this chore to the arc_reclaim_thread().
2265          */
2266 
2267         return (stolen);
2268 }
2269 
2270 /*
2271  * Remove buffers from list until we've removed the specified number of
2272  * bytes.  Destroy the buffers that are removed.
2273  */
2274 static void
2275 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2276 {
2277         arc_buf_hdr_t *ab, *ab_prev;
2278         arc_buf_hdr_t marker = { 0 };
2279         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
2280         kmutex_t *hash_lock;
2281         uint64_t bytes_deleted = 0;
2282         uint64_t bufs_skipped = 0;
2283         int count = 0;
2284 
2285         ASSERT(GHOST_STATE(state));
2286 top:
2287         mutex_enter(&state->arcs_mtx);
2288         for (ab = list_tail(list); ab; ab = ab_prev) {
2289                 ab_prev = list_prev(list, ab);
2290                 if (ab->b_type > ARC_BUFC_NUMTYPES)
2291                         panic("invalid ab=%p", (void *)ab);
2292                 if (spa && ab->b_spa != spa)
2293                         continue;
2294 
2295                 /* ignore markers */
2296                 if (ab->b_spa == 0)
2297                         continue;
2298 
2299                 hash_lock = HDR_LOCK(ab);
2300                 /* caller may be trying to modify this buffer, skip it */
2301                 if (MUTEX_HELD(hash_lock))
2302                         continue;
2303 
2304                 /*
2305                  * It may take a long time to evict all the bufs requested.
2306                  * To avoid blocking all arc activity, periodically drop
2307                  * the arcs_mtx and give other threads a chance to run
2308                  * before reacquiring the lock.
2309                  */
2310                 if (count++ > arc_evict_iterations) {
2311                         list_insert_after(list, ab, &marker);
2312                         mutex_exit(&state->arcs_mtx);
2313                         kpreempt(KPREEMPT_SYNC);
2314                         mutex_enter(&state->arcs_mtx);
2315                         ab_prev = list_prev(list, &marker);
2316                         list_remove(list, &marker);
2317                         count = 0;
2318                         continue;
2319                 }
2320                 if (mutex_tryenter(hash_lock)) {
2321                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
2322                         ASSERT(ab->b_buf == NULL);
2323                         ARCSTAT_BUMP(arcstat_deleted);
2324                         bytes_deleted += ab->b_size;
2325 
2326                         if (ab->b_l2hdr != NULL) {
2327                                 /*
2328                                  * This buffer is cached on the 2nd Level ARC;
2329                                  * don't destroy the header.
2330                                  */
2331                                 arc_change_state(arc_l2c_only, ab, hash_lock);
2332                                 mutex_exit(hash_lock);
2333                         } else {
2334                                 arc_change_state(arc_anon, ab, hash_lock);
2335                                 mutex_exit(hash_lock);
2336                                 arc_hdr_destroy(ab);
2337                         }
2338 
2339                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2340                         if (bytes >= 0 && bytes_deleted >= bytes)
2341                                 break;
2342                 } else if (bytes < 0) {
2343                         /*
2344                          * Insert a list marker and then wait for the
                         * hash lock to become available. Once it's
2346                          * available, restart from where we left off.
2347                          */
2348                         list_insert_after(list, ab, &marker);
2349                         mutex_exit(&state->arcs_mtx);
2350                         mutex_enter(hash_lock);
2351                         mutex_exit(hash_lock);
2352                         mutex_enter(&state->arcs_mtx);
2353                         ab_prev = list_prev(list, &marker);
2354                         list_remove(list, &marker);
2355                 } else {
2356                         bufs_skipped += 1;
2357                 }
2358 
2359         }
2360         mutex_exit(&state->arcs_mtx);
2361 
2362         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2363             (bytes < 0 || bytes_deleted < bytes)) {
2364                 list = &state->arcs_list[ARC_BUFC_METADATA];
2365                 goto top;
2366         }
2367 
2368         if (bufs_skipped) {
2369                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2370                 ASSERT(bytes >= 0);
2371         }
2372 
2373         if (bytes_deleted < bytes)
2374                 dprintf("only deleted %lld bytes from %p",
2375                     (longlong_t)bytes_deleted, state);
2376 }
2377 
2378 static void
2379 arc_adjust(void)
2380 {
2381         int64_t adjustment, delta;
2382 
2383         /*
2384          * Adjust MRU size
2385          */
2386 
2387         adjustment = MIN((int64_t)(arc_size - arc_c),
2388             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2389             arc_p));
2390 
2391         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2392                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2393                 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2394                 adjustment -= delta;
2395         }
2396 
2397         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2398                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2399                 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2400                     ARC_BUFC_METADATA);
2401         }
2402 
2403         /*
2404          * Adjust MFU size
2405          */
2406 
2407         adjustment = arc_size - arc_c;
2408 
2409         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2410                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2411                 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2412                 adjustment -= delta;
2413         }
2414 
2415         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2416                 int64_t delta = MIN(adjustment,
2417                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2418                 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2419                     ARC_BUFC_METADATA);
2420         }
2421 
2422         /*
2423          * Adjust ghost lists
2424          */
2425 
2426         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2427 
2428         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2429                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2430                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2431         }
2432 
2433         adjustment =
2434             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2435 
2436         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2437                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2438                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2439         }
2440 }
2441 
2442 static void
2443 arc_do_user_evicts(void)
2444 {
2445         mutex_enter(&arc_eviction_mtx);
2446         while (arc_eviction_list != NULL) {
2447                 arc_buf_t *buf = arc_eviction_list;
2448                 arc_eviction_list = buf->b_next;
2449                 mutex_enter(&buf->b_evict_lock);
2450                 buf->b_hdr = NULL;
2451                 mutex_exit(&buf->b_evict_lock);
2452                 mutex_exit(&arc_eviction_mtx);
2453 
2454                 if (buf->b_efunc != NULL)
2455                         VERIFY(buf->b_efunc(buf) == 0);
2456 
2457                 buf->b_efunc = NULL;
2458                 buf->b_private = NULL;
2459                 kmem_cache_free(buf_cache, buf);
2460                 mutex_enter(&arc_eviction_mtx);
2461         }
2462         mutex_exit(&arc_eviction_mtx);
2463 }
2464 
2465 /*
2466  * Flush all *evictable* data from the cache for the given spa.
2467  * NOTE: this will not touch "active" (i.e. referenced) data.
2468  */
2469 void
2470 arc_flush(spa_t *spa)
2471 {
2472         uint64_t guid = 0;
2473 
2474         if (spa)
2475                 guid = spa_load_guid(spa);
2476 
2477         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2478                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2479                 if (spa)
2480                         break;
2481         }
2482         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2483                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2484                 if (spa)
2485                         break;
2486         }
2487         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2488                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2489                 if (spa)
2490                         break;
2491         }
2492         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2493                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2494                 if (spa)
2495                         break;
2496         }
2497 
2498         arc_evict_ghost(arc_mru_ghost, guid, -1);
2499         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2500 
2501         mutex_enter(&arc_reclaim_thr_lock);
2502         arc_do_user_evicts();
2503         mutex_exit(&arc_reclaim_thr_lock);
2504         ASSERT(spa || arc_eviction_list == NULL);
2505 }
2506 
2507 void
2508 arc_shrink(void)
2509 {
2510         if (arc_c > arc_c_min) {
2511                 uint64_t to_free;
2512 
2513 #ifdef _KERNEL
2514                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2515 #else
2516                 to_free = arc_c >> arc_shrink_shift;
2517 #endif
2518                 if (arc_c > arc_c_min + to_free)
2519                         atomic_add_64(&arc_c, -to_free);
2520                 else
2521                         arc_c = arc_c_min;
2522 
2523                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2524                 if (arc_c > arc_size)
2525                         arc_c = MAX(arc_size, arc_c_min);
2526                 if (arc_p > arc_c)
2527                         arc_p = (arc_c >> 1);
2528                 ASSERT(arc_c >= arc_c_min);
2529                 ASSERT((int64_t)arc_p >= 0);
2530         }
2531 
2532         if (arc_size > arc_c)
2533                 arc_adjust();
2534 }
2535 
2536 /*
2537  * Determine if the system is under memory pressure and is asking
2538  * to reclaim memory. A return value of 1 indicates that the system
2539  * is under memory pressure and that the arc should adjust accordingly.
2540  */
2541 static int
2542 arc_reclaim_needed(void)
2543 {
2544         uint64_t extra;
2545 
2546 #ifdef _KERNEL
2547 
2548         if (needfree)
2549                 return (1);
2550 
2551         /*
2552          * take 'desfree' extra pages, so we reclaim sooner, rather than later
2553          */
2554         extra = desfree;
2555 
2556         /*
2557          * check that we're out of range of the pageout scanner.  It starts to
         * schedule paging if freemem is less than lotsfree plus needfree.
2559          * lotsfree is the high-water mark for pageout, and needfree is the
2560          * number of needed free pages.  We add extra pages here to make sure
2561          * the scanner doesn't start up while we're freeing memory.
2562          */
2563         if (freemem < lotsfree + needfree + extra)
2564                 return (1);
2565 
2566         /*
2567          * check to make sure that swapfs has enough space so that anon
2568          * reservations can still succeed. anon_resvmem() checks that the
2569          * availrmem is greater than swapfs_minfree, and the number of reserved
2570          * swap pages.  We also add a bit of extra here just to prevent
2571          * circumstances from getting really dire.
2572          */
2573         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2574                 return (1);
2575 
2576         /*
2577          * Check that we have enough availrmem that memory locking (e.g., via
2578          * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2579          * stores the number of pages that cannot be locked; when availrmem
2580          * drops below pages_pp_maximum, page locking mechanisms such as
2581          * page_pp_lock() will fail.)
2582          */
2583         if (availrmem <= pages_pp_maximum)
2584                 return (1);
2585 
2586 #if defined(__i386)
2587         /*
2588          * If we're on an i386 platform, it's possible that we'll exhaust the
2589          * kernel heap space before we ever run out of available physical
2590          * memory.  Most checks of the size of the heap_area compare against
2591          * tune.t_minarmem, which is the minimum available real memory that we
2592          * can have in the system.  However, this is generally fixed at 25 pages
2593          * which is so low that it's useless.  In this comparison, we seek to
2594          * calculate the total heap-size, and reclaim if more than 3/4ths of the
2595          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2596          * free)
2597          */
2598         if (vmem_size(heap_arena, VMEM_FREE) <
2599             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2600                 return (1);
2601 #endif
2602 
2603         /*
2604          * If zio data pages are being allocated out of a separate heap segment,
         * then require that the amount of free vmem in that arena remains
         * above about 1/16th of the amount allocated.
2607          *
2608          * Note: The 1/16th arena free requirement was put in place
2609          * to aggressively evict memory from the arc in order to avoid
2610          * memory fragmentation issues.
2611          */
2612         if (zio_arena != NULL &&
2613             vmem_size(zio_arena, VMEM_FREE) <
2614             (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2615                 return (1);
2616 #else
2617         if (spa_get_random(100) == 0)
2618                 return (1);
2619 #endif
2620         return (0);
2621 }
2622 
2623 static void
2624 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2625 {
2626         size_t                  i;
2627         kmem_cache_t            *prev_cache = NULL;
2628         kmem_cache_t            *prev_data_cache = NULL;
2629         extern kmem_cache_t     *zio_buf_cache[];
2630         extern kmem_cache_t     *zio_data_buf_cache[];
2631 
2632 #ifdef _KERNEL
2633         if (arc_meta_used >= arc_meta_limit) {
2634                 /*
2635                  * We are exceeding our meta-data cache limit.
2636                  * Purge some DNLC entries to release holds on meta-data.
2637                  */
2638                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2639         }
2640 #if defined(__i386)
2641         /*
2642          * Reclaim unused memory from all kmem caches.
2643          */
2644         kmem_reap();
2645 #endif
2646 #endif
2647 
2648         /*
2649          * An aggressive reclamation will shrink the cache size as well as
2650          * reap free buffers from the arc kmem caches.
2651          */
2652         if (strat == ARC_RECLAIM_AGGR)
2653                 arc_shrink();
2654 
2655         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2656                 if (zio_buf_cache[i] != prev_cache) {
2657                         prev_cache = zio_buf_cache[i];
2658                         kmem_cache_reap_now(zio_buf_cache[i]);
2659                 }
2660                 if (zio_data_buf_cache[i] != prev_data_cache) {
2661                         prev_data_cache = zio_data_buf_cache[i];
2662                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2663                 }
2664         }
2665         kmem_cache_reap_now(buf_cache);
2666         kmem_cache_reap_now(hdr_cache);
2667 
2668         /*
2669          * Ask the vmem arena to reclaim unused memory from its
2670          * quantum caches.
2671          */
2672         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2673                 vmem_qcache_reap(zio_arena);
2674 }
2675 
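/*
 * Reclaim thread: wakes up at least once a second (or sooner when signalled
 * via arc_reclaim_thr_cv), reaps the kmem caches whenever
 * arc_reclaim_needed() reports memory pressure, and re-enables cache growth
 * once arc_grow_retry seconds have passed without a reclaim.  Each pass
 * also runs arc_adjust() and processes any pending user evictions.
 */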
2676 static void
2677 arc_reclaim_thread(void)
2678 {
2679         clock_t                 growtime = 0;
2680         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2681         callb_cpr_t             cpr;
2682 
2683         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2684 
2685         mutex_enter(&arc_reclaim_thr_lock);
2686         while (arc_thread_exit == 0) {
2687                 if (arc_reclaim_needed()) {
2688 
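                        /*
                         * If growth is already disabled (a previous pass
                         * set arc_no_grow), alternate between conservative
                         * and aggressive reclaims; otherwise disable growth
                         * and start with an aggressive reclaim.
                         */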
2689                         if (arc_no_grow) {
2690                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2691                                         last_reclaim = ARC_RECLAIM_AGGR;
2692                                 } else {
2693                                         last_reclaim = ARC_RECLAIM_CONS;
2694                                 }
2695                         } else {
2696                                 arc_no_grow = TRUE;
2697                                 last_reclaim = ARC_RECLAIM_AGGR;
2698                                 membar_producer();
2699                         }
2700 
2701                         /* reset the growth delay for every reclaim */
2702                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2703 
2704                         arc_kmem_reap_now(last_reclaim);
2705                         arc_warm = B_TRUE;
2706 
2707                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2708                         arc_no_grow = FALSE;
2709                 }
2710 
2711                 arc_adjust();
2712 
2713                 if (arc_eviction_list != NULL)
2714                         arc_do_user_evicts();
2715 
2716                 /* block until needed, or one second, whichever is shorter */
2717                 CALLB_CPR_SAFE_BEGIN(&cpr);
2718                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2719                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2720                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2721         }
2722 
2723         arc_thread_exit = 0;
2724         cv_broadcast(&arc_reclaim_thr_cv);
2725         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2726         thread_exit();
2727 }
2728 
2729 /*
2730  * Adapt arc info given the number of bytes we are trying to add and
2731  * the state that we are coming from.  This function is only called
2732  * when we are adding new content to the cache.
2733  */
2734 static void
2735 arc_adapt(int bytes, arc_state_t *state)
2736 {
2737         int mult;
2738         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2739 
2740         if (state == arc_l2c_only)
2741                 return;
2742 
2743         ASSERT(bytes > 0);
2744         /*
2745          * Adapt the target size of the MRU list:
2746          *      - if we just hit in the MRU ghost list, then increase
2747          *        the target size of the MRU list.
2748          *      - if we just hit in the MFU ghost list, then increase
2749          *        the target size of the MFU list by decreasing the
2750          *        target size of the MRU list.
2751          */
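        /*
         * For example (a sketch): with 1GB on the MRU ghost list and 3GB on
         * the MFU ghost list, a hit in the MRU ghost list yields mult = 3,
         * so arc_p grows by 3 * bytes (bounded by arc_c - arc_p_min); the
         * multiplier is always clamped to 10 to avoid wild swings.
         */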
2752         if (state == arc_mru_ghost) {
2753                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2754                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2755                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2756 
2757                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2758         } else if (state == arc_mfu_ghost) {
2759                 uint64_t delta;
2760 
2761                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2762                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2763                 mult = MIN(mult, 10);
2764 
2765                 delta = MIN(bytes * mult, arc_p);
2766                 arc_p = MAX(arc_p_min, arc_p - delta);
2767         }
2768         ASSERT((int64_t)arc_p >= 0);
2769 
2770         if (arc_reclaim_needed()) {
2771                 cv_signal(&arc_reclaim_thr_cv);
2772                 return;
2773         }
2774 
2775         if (arc_no_grow)
2776                 return;
2777 
2778         if (arc_c >= arc_c_max)
2779                 return;
2780 
2781         /*
2782          * If we're within (2 * maxblocksize) bytes of the target
2783          * cache size, increment the target cache size.
2784          */
2785         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2786                 atomic_add_64(&arc_c, (int64_t)bytes);
2787                 if (arc_c > arc_c_max)
2788                         arc_c = arc_c_max;
2789                 else if (state == arc_anon)
2790                         atomic_add_64(&arc_p, (int64_t)bytes);
2791                 if (arc_p > arc_c)
2792                         arc_p = arc_c;
2793         }
2794         ASSERT((int64_t)arc_p >= 0);
2795 }
2796 
2797 /*
2798  * Check if the cache has reached its limits and eviction is required
2799  * prior to insert.
2800  */
2801 static int
2802 arc_evict_needed(arc_buf_contents_t type)
2803 {
2804         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2805                 return (1);
2806 
2807         if (arc_reclaim_needed())
2808                 return (1);
2809 
2810         return (arc_size > arc_c);
2811 }
2812 
2813 /*
2814  * The buffer, supplied as the first argument, needs a data block.
2815  * So, if we are at cache max, determine which cache should be victimized.
2816  * We have the following cases:
2817  *
2818  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2819  * In this situation, if we're out of space but the resident size of the MFU is
2820  * under the limit, victimize the MFU cache to satisfy this insertion request.
2821  *
2822  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2823  * Here, we've used up all of the available space for the MRU, so we need to
2824  * evict from our own cache instead.  Evict from the set of resident MRU
2825  * entries.
2826  *
2827  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2828  * c minus p represents the MFU space in the cache, since p is the size of the
2829  * cache that is dedicated to the MRU.  In this situation there's still space on
2830  * the MFU side, so the MRU side needs to be victimized.
2831  *
2832  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2833  * MFU's resident set is consuming more space than it has been allotted.  In
2834  * this situation, we must victimize our own cache, the MFU, for this insertion.
2835  */
2836 static void
2837 arc_get_data_buf(arc_buf_t *buf)
2838 {
2839         arc_state_t             *state = buf->b_hdr->b_state;
2840         uint64_t                size = buf->b_hdr->b_size;
2841         arc_buf_contents_t      type = buf->b_hdr->b_type;
2842 
2843         arc_adapt(size, state);
2844 
2845         /*
2846          * We have not yet reached the cache's maximum size,
2847          * so just allocate a new buffer.
2848          */
2849         if (!arc_evict_needed(type)) {
2850                 if (type == ARC_BUFC_METADATA) {
2851                         buf->b_data = zio_buf_alloc(size);
2852                         arc_space_consume(size, ARC_SPACE_DATA);
2853                 } else {
2854                         ASSERT(type == ARC_BUFC_DATA);
2855                         buf->b_data = zio_data_buf_alloc(size);
2856                         ARCSTAT_INCR(arcstat_data_size, size);
2857                         atomic_add_64(&arc_size, size);
2858                 }
2859                 goto out;
2860         }
2861 
2862         /*
2863          * If we are prefetching from the mfu ghost list, this buffer
2864          * will end up on the mru list, so steal space from there.
2865          */
2866         if (state == arc_mfu_ghost)
2867                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2868         else if (state == arc_mru_ghost)
2869                 state = arc_mru;
2870 
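        /*
         * Pick the list to evict from per the four cases described above:
         * the first branch handles MRU/anonymous inserts (cases 1 and 2),
         * the second handles MFU inserts (cases 3 and 4).
         */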
2871         if (state == arc_mru || state == arc_anon) {
2872                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2873                 state = (arc_mfu->arcs_lsize[type] >= size &&
2874                     arc_p > mru_used) ? arc_mfu : arc_mru;
2875         } else {
2876                 /* MFU cases */
2877                 uint64_t mfu_space = arc_c - arc_p;
2878                 state =  (arc_mru->arcs_lsize[type] >= size &&
2879                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2880         }
2881         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2882                 if (type == ARC_BUFC_METADATA) {
2883                         buf->b_data = zio_buf_alloc(size);
2884                         arc_space_consume(size, ARC_SPACE_DATA);
2885                 } else {
2886                         ASSERT(type == ARC_BUFC_DATA);
2887                         buf->b_data = zio_data_buf_alloc(size);
2888                         ARCSTAT_INCR(arcstat_data_size, size);
2889                         atomic_add_64(&arc_size, size);
2890                 }
2891                 ARCSTAT_BUMP(arcstat_recycle_miss);
2892         }
2893         ASSERT(buf->b_data != NULL);
2894 out:
2895         /*
2896          * Update the state size.  Note that ghost states have a
2897          * "ghost size" and so don't need to be updated.
2898          */
2899         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2900                 arc_buf_hdr_t *hdr = buf->b_hdr;
2901 
2902                 atomic_add_64(&hdr->b_state->arcs_size, size);
2903                 if (list_link_active(&hdr->b_arc_node)) {
2904                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2905                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2906                 }
2907                 /*
2908                  * If we are growing the cache, and we are adding anonymous
2909                  * data, and we have outgrown arc_p, update arc_p
2910                  */
2911                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2912                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2913                         arc_p = MIN(arc_c, arc_p + size);
2914         }
2915 }
2916 
2917 /*
2918  * This routine is called whenever a buffer is accessed.
2919  * NOTE: the hash lock must be held by the caller; it is not dropped here.
2920  */
2921 static void
2922 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2923 {
2924         clock_t now;
2925 
2926         ASSERT(MUTEX_HELD(hash_lock));
2927 
2928         if (buf->b_state == arc_anon) {
2929                 /*
2930                  * This buffer is not in the cache, and does not
2931                  * appear in our "ghost" list.  Add the new buffer
2932                  * to the MRU state.
2933                  */
2934 
2935                 ASSERT(buf->b_arc_access == 0);
2936                 buf->b_arc_access = ddi_get_lbolt();
2937                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2938                 arc_change_state(arc_mru, buf, hash_lock);
2939 
2940         } else if (buf->b_state == arc_mru) {
2941                 now = ddi_get_lbolt();
2942 
2943                 /*
2944                  * If this buffer is here because of a prefetch, then either:
2945                  * - clear the flag if this is a "referencing" read
2946                  *   (any subsequent access will bump this into the MFU state).
2947                  * or
2948                  * - move the buffer to the head of the list if this is
2949                  *   another prefetch (to make it less likely to be evicted).
2950                  */
2951                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2952                         if (refcount_count(&buf->b_refcnt) == 0) {
2953                                 ASSERT(list_link_active(&buf->b_arc_node));
2954                         } else {
2955                                 buf->b_flags &= ~ARC_PREFETCH;
2956                                 ARCSTAT_BUMP(arcstat_mru_hits);
2957                         }
2958                         buf->b_arc_access = now;
2959                         return;
2960                 }
2961 
2962                 /*
2963                  * This buffer has been "accessed" only once so far,
2964                  * but it is still in the cache.  If enough time has
2965                  * passed since it was cached, move it to the MFU state.
2966                  */
2967                 if (now > buf->b_arc_access + ARC_MINTIME) {
2968                         /*
2969                          * At least ARC_MINTIME has passed since we
2970                          * instantiated this buffer.  Move it to the
2971                          * most frequently used state.
2972                          */
2973                         buf->b_arc_access = now;
2974                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2975                         arc_change_state(arc_mfu, buf, hash_lock);
2976                 }
2977                 ARCSTAT_BUMP(arcstat_mru_hits);
2978         } else if (buf->b_state == arc_mru_ghost) {
2979                 arc_state_t     *new_state;
2980                 /*
2981                  * This buffer has been "accessed" recently, but
2982                  * was evicted from the cache.  Move it to the
2983                  * MFU state.
2984                  */
2985 
2986                 if (buf->b_flags & ARC_PREFETCH) {
2987                         new_state = arc_mru;
2988                         if (refcount_count(&buf->b_refcnt) > 0)
2989                                 buf->b_flags &= ~ARC_PREFETCH;
2990                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2991                 } else {
2992                         new_state = arc_mfu;
2993                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2994                 }
2995 
2996                 buf->b_arc_access = ddi_get_lbolt();
2997                 arc_change_state(new_state, buf, hash_lock);
2998 
2999                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3000         } else if (buf->b_state == arc_mfu) {
3001                 /*
3002                  * This buffer has been accessed more than once and is
3003                  * still in the cache.  Keep it in the MFU state.
3004                  *
3005                  * NOTE: an add_reference() that occurred when we did
3006                  * the arc_read() will have kicked this off the list.
3007                  * If it was a prefetch, we will explicitly move it to
3008                  * the head of the list now.
3009                  */
3010                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
3011                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
3012                         ASSERT(list_link_active(&buf->b_arc_node));
3013                 }
3014                 ARCSTAT_BUMP(arcstat_mfu_hits);
3015                 buf->b_arc_access = ddi_get_lbolt();
3016         } else if (buf->b_state == arc_mfu_ghost) {
3017                 arc_state_t     *new_state = arc_mfu;
3018                 /*
3019                  * This buffer has been accessed more than once but has
3020                  * been evicted from the cache.  Move it back to the
3021                  * MFU state.
3022                  */
3023 
3024                 if (buf->b_flags & ARC_PREFETCH) {
3025                         /*
3026                          * This is a prefetch access...
3027                          * move this block back to the MRU state.
3028                          */
3029                         ASSERT0(refcount_count(&buf->b_refcnt));
3030                         new_state = arc_mru;
3031                 }
3032 
3033                 buf->b_arc_access = ddi_get_lbolt();
3034                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3035                 arc_change_state(new_state, buf, hash_lock);
3036 
3037                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3038         } else if (buf->b_state == arc_l2c_only) {
3039                 /*
3040                  * This buffer is on the 2nd Level ARC.
3041                  */
3042 
3043                 buf->b_arc_access = ddi_get_lbolt();
3044                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3045                 arc_change_state(arc_mfu, buf, hash_lock);
3046         } else {
3047                 ASSERT(!"invalid arc state");
3048         }
3049 }
3050 
3051 /* a generic arc_done_func_t which you can use */
3052 /* ARGSUSED */
3053 void
3054 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3055 {
3056         if (zio == NULL || zio->io_error == 0)
3057                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3058         VERIFY(arc_buf_remove_ref(buf, arg));
3059 }
3060 
3061 /* a generic arc_done_func_t */
3062 void
3063 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3064 {
3065         arc_buf_t **bufp = arg;
3066         if (zio && zio->io_error) {
3067                 VERIFY(arc_buf_remove_ref(buf, arg));
3068                 *bufp = NULL;
3069         } else {
3070                 *bufp = buf;
3071                 ASSERT(buf->b_data);
3072         }
3073 }
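/*
 * Illustrative only (a sketch, not taken from a caller in this file): a
 * typical synchronous read pairs arc_getbuf_func with ARC_WAIT, e.g.
 *
 *        arc_buf_t *abuf = NULL;
 *        uint32_t aflags = ARC_WAIT;
 *        (void) arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *            ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * where spa, bp and zb come from the caller's context; on success *abuf
 * holds a referenced buffer that the caller later releases via
 * arc_buf_remove_ref(abuf, &abuf).
 */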
3074 
3075 static void
3076 arc_read_done(zio_t *zio)
3077 {
3078         arc_buf_hdr_t   *hdr, *found;
3079         arc_buf_t       *buf;
3080         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
3081         kmutex_t        *hash_lock;
3082         arc_callback_t  *callback_list, *acb;
3083         int             freeable = FALSE;
3084 
3085         buf = zio->io_private;
3086         hdr = buf->b_hdr;
3087 
3088         /*
3089          * The hdr was inserted into the hash table and removed from lists
3090          * prior to starting I/O.  We should find this header, since
3091          * it's in the hash table, and it should be legit since it's
3092          * not possible to evict it during the I/O.  The only possible
3093          * reason for it not to be found is that the buffer was freed
3094          * during the read.
3095          */
3096         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
3097             &hash_lock);
3098 
3099         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
3100             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3101             (found == hdr && HDR_L2_READING(hdr)));
3102 
3103         hdr->b_flags &= ~ARC_L2_EVICTED;
3104         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3105                 hdr->b_flags &= ~ARC_L2CACHE;
3106 
3107         /* byteswap if necessary */
3108         callback_list = hdr->b_acb;
3109         ASSERT(callback_list != NULL);
3110         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3111                 dmu_object_byteswap_t bswap =
3112                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3113                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3114                     byteswap_uint64_array :
3115                     dmu_ot_byteswap[bswap].ob_func;
3116                 func(buf->b_data, hdr->b_size);
3117         }
3118 
3119         arc_cksum_compute(buf, B_FALSE);
3120         arc_buf_watch(buf);
3121 
3122         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3123                 /*
3124                  * Only call arc_access on anonymous buffers.  This is because
3125                  * if we've issued an I/O for an evicted buffer, we've already
3126                  * called arc_access (to prevent any simultaneous readers from
3127                  * getting confused).
3128                  */
3129                 arc_access(hdr, hash_lock);
3130         }
3131 
3132         /* create copies of the data buffer for the callers */
3133         abuf = buf;
3134         for (acb = callback_list; acb; acb = acb->acb_next) {
3135                 if (acb->acb_done) {
3136                         if (abuf == NULL) {
3137                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
3138                                 abuf = arc_buf_clone(buf);
3139                         }
3140                         acb->acb_buf = abuf;
3141                         abuf = NULL;
3142                 }
3143         }
3144         hdr->b_acb = NULL;
3145         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3146         ASSERT(!HDR_BUF_AVAILABLE(hdr));
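        /*
         * If no done callback consumed the original buffer (abuf is still
         * buf), mark the buffer as available so a future arc_read() hit
         * can hand it out directly instead of cloning.
         */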
3147         if (abuf == buf) {
3148                 ASSERT(buf->b_efunc == NULL);
3149                 ASSERT(hdr->b_datacnt == 1);
3150                 hdr->b_flags |= ARC_BUF_AVAILABLE;
3151         }
3152 
3153         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3154 
3155         if (zio->io_error != 0) {
3156                 hdr->b_flags |= ARC_IO_ERROR;
3157                 if (hdr->b_state != arc_anon)
3158                         arc_change_state(arc_anon, hdr, hash_lock);
3159                 if (HDR_IN_HASH_TABLE(hdr))
3160                         buf_hash_remove(hdr);
3161                 freeable = refcount_is_zero(&hdr->b_refcnt);
3162         }
3163 
3164         /*
3165          * Broadcast before we drop the hash_lock to avoid the possibility
3166          * that the hdr (and hence the cv) might be freed before we get to
3167          * the cv_broadcast().
3168          */
3169         cv_broadcast(&hdr->b_cv);
3170 
3171         if (hash_lock) {
3172                 mutex_exit(hash_lock);
3173         } else {
3174                 /*
3175                  * This block was freed while we waited for the read to
3176                  * complete.  It has been removed from the hash table and
3177                  * moved to the anonymous state (so that it won't show up
3178                  * in the cache).
3179                  */
3180                 ASSERT3P(hdr->b_state, ==, arc_anon);
3181                 freeable = refcount_is_zero(&hdr->b_refcnt);
3182         }
3183 
3184         /* execute each callback and free its structure */
3185         while ((acb = callback_list) != NULL) {
3186                 if (acb->acb_done)
3187                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3188 
3189                 if (acb->acb_zio_dummy != NULL) {
3190                         acb->acb_zio_dummy->io_error = zio->io_error;
3191                         zio_nowait(acb->acb_zio_dummy);
3192                 }
3193 
3194                 callback_list = acb->acb_next;
3195                 kmem_free(acb, sizeof (arc_callback_t));
3196         }
3197 
3198         if (freeable)
3199                 arc_hdr_destroy(hdr);
3200 }
3201 
3202 /*
3203  * "Read" the block at the specified DVA (in bp) via the
3204  * cache.  If the block is found in the cache, invoke the provided
3205  * callback immediately and return.  Note that the `zio' parameter
3206  * in the callback will be NULL in this case, since no IO was
3207  * required.  If the block is not in the cache, pass the read request
3208  * on to the spa with a substitute callback function, so that the
3209  * requested block will be added to the cache.
3210  *
3211  * If a read request arrives for a block that has a read in-progress,
3212  * either wait for the in-progress read to complete (and return the
3213  * results); or, if this is a read with a "done" func, add a record
3214  * to the read to invoke the "done" func when the read completes,
3215  * and return; or just return.
3216  *
3217  * arc_read_done() will invoke all the requested "done" functions
3218  * for readers of this block.
3219  */
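/*
 * The arc_flags argument is both an input and an output: callers pass
 * ARC_WAIT or ARC_NOWAIT (optionally combined with ARC_PREFETCH,
 * ARC_L2CACHE and ARC_L2COMPRESS), and ARC_CACHED is ORed in on return
 * when the block was found in the cache.
 */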
3220 int
3221 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3222     void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3223     const zbookmark_t *zb)
3224 {
3225         arc_buf_hdr_t *hdr;
3226         arc_buf_t *buf = NULL;
3227         kmutex_t *hash_lock;
3228         zio_t *rzio;
3229         uint64_t guid = spa_load_guid(spa);
3230 
3231 top:
3232         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3233             &hash_lock);
3234         if (hdr && hdr->b_datacnt > 0) {
3235 
3236                 *arc_flags |= ARC_CACHED;
3237 
3238                 if (HDR_IO_IN_PROGRESS(hdr)) {
3239 
3240                         if (*arc_flags & ARC_WAIT) {
3241                                 cv_wait(&hdr->b_cv, hash_lock);
3242                                 mutex_exit(hash_lock);
3243                                 goto top;
3244                         }
3245                         ASSERT(*arc_flags & ARC_NOWAIT);
3246 
3247                         if (done) {
3248                                 arc_callback_t  *acb = NULL;
3249 
3250                                 acb = kmem_zalloc(sizeof (arc_callback_t),
3251                                     KM_SLEEP);
3252                                 acb->acb_done = done;
3253                                 acb->acb_private = private;
3254                                 if (pio != NULL)
3255                                         acb->acb_zio_dummy = zio_null(pio,
3256                                             spa, NULL, NULL, NULL, zio_flags);
3257 
3258                                 ASSERT(acb->acb_done != NULL);
3259                                 acb->acb_next = hdr->b_acb;
3260                                 hdr->b_acb = acb;
3261                                 add_reference(hdr, hash_lock, private);
3262                                 mutex_exit(hash_lock);
3263                                 return (0);
3264                         }
3265                         mutex_exit(hash_lock);
3266                         return (0);
3267                 }
3268 
3269                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3270 
3271                 if (done) {
3272                         add_reference(hdr, hash_lock, private);
3273                         /*
3274                          * If this block is already in use, create a new
3275                          * copy of the data so that we will be guaranteed
3276                          * that arc_release() will always succeed.
3277                          */
3278                         buf = hdr->b_buf;
3279                         ASSERT(buf);
3280                         ASSERT(buf->b_data);
3281                         if (HDR_BUF_AVAILABLE(hdr)) {
3282                                 ASSERT(buf->b_efunc == NULL);
3283                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3284                         } else {
3285                                 buf = arc_buf_clone(buf);
3286                         }
3287 
3288                 } else if (*arc_flags & ARC_PREFETCH &&
3289                     refcount_count(&hdr->b_refcnt) == 0) {
3290                         hdr->b_flags |= ARC_PREFETCH;
3291                 }
3292                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3293                 arc_access(hdr, hash_lock);
3294                 if (*arc_flags & ARC_L2CACHE)
3295                         hdr->b_flags |= ARC_L2CACHE;
3296                 if (*arc_flags & ARC_L2COMPRESS)
3297                         hdr->b_flags |= ARC_L2COMPRESS;
3298                 mutex_exit(hash_lock);
3299                 ARCSTAT_BUMP(arcstat_hits);
3300                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3301                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3302                     data, metadata, hits);
3303 
3304                 if (done)
3305                         done(NULL, buf, private);
3306         } else {
3307                 uint64_t size = BP_GET_LSIZE(bp);
3308                 arc_callback_t  *acb;
3309                 vdev_t *vd = NULL;
3310                 uint64_t addr = 0;
3311                 boolean_t devw = B_FALSE;
3312                 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3313                 uint64_t b_asize = 0;
3314 
3315                 if (hdr == NULL) {
3316                         /* this block is not in the cache */
3317                         arc_buf_hdr_t   *exists;
3318                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3319                         buf = arc_buf_alloc(spa, size, private, type);
3320                         hdr = buf->b_hdr;
3321                         hdr->b_dva = *BP_IDENTITY(bp);
3322                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3323                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3324                         exists = buf_hash_insert(hdr, &hash_lock);
3325                         if (exists) {
3326                                 /* somebody beat us to the hash insert */
3327                                 mutex_exit(hash_lock);
3328                                 buf_discard_identity(hdr);
3329                                 (void) arc_buf_remove_ref(buf, private);
3330                                 goto top; /* restart the IO request */
3331                         }
3332                         /* if this is a prefetch, we don't have a reference */
3333                         if (*arc_flags & ARC_PREFETCH) {
3334                                 (void) remove_reference(hdr, hash_lock,
3335                                     private);
3336                                 hdr->b_flags |= ARC_PREFETCH;
3337                         }
3338                         if (*arc_flags & ARC_L2CACHE)
3339                                 hdr->b_flags |= ARC_L2CACHE;
3340                         if (*arc_flags & ARC_L2COMPRESS)
3341                                 hdr->b_flags |= ARC_L2COMPRESS;
3342                         if (BP_GET_LEVEL(bp) > 0)
3343                                 hdr->b_flags |= ARC_INDIRECT;
3344                 } else {
3345                         /* this block is in the ghost cache */
3346                         ASSERT(GHOST_STATE(hdr->b_state));
3347                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3348                         ASSERT0(refcount_count(&hdr->b_refcnt));
3349                         ASSERT(hdr->b_buf == NULL);
3350 
3351                         /* if this is a prefetch, we don't have a reference */
3352                         if (*arc_flags & ARC_PREFETCH)
3353                                 hdr->b_flags |= ARC_PREFETCH;
3354                         else
3355                                 add_reference(hdr, hash_lock, private);
3356                         if (*arc_flags & ARC_L2CACHE)
3357                                 hdr->b_flags |= ARC_L2CACHE;
3358                         if (*arc_flags & ARC_L2COMPRESS)
3359                                 hdr->b_flags |= ARC_L2COMPRESS;
3360                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3361                         buf->b_hdr = hdr;
3362                         buf->b_data = NULL;
3363                         buf->b_efunc = NULL;
3364                         buf->b_private = NULL;
3365                         buf->b_next = NULL;
3366                         hdr->b_buf = buf;
3367                         ASSERT(hdr->b_datacnt == 0);
3368                         hdr->b_datacnt = 1;
3369                         arc_get_data_buf(buf);
3370                         arc_access(hdr, hash_lock);
3371                 }
3372 
3373                 ASSERT(!GHOST_STATE(hdr->b_state));
3374 
3375                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3376                 acb->acb_done = done;
3377                 acb->acb_private = private;
3378 
3379                 ASSERT(hdr->b_acb == NULL);
3380                 hdr->b_acb = acb;
3381                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3382 
3383                 if (hdr->b_l2hdr != NULL &&
3384                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3385                         /*
3386                          * Need to stash these before letting go of hash_lock
3387                          */
3388                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3389                         addr = hdr->b_l2hdr->b_daddr;
3390                         b_compress = hdr->b_l2hdr->b_compress;
3391                         b_asize = hdr->b_l2hdr->b_asize;
3392                         /*
3393                          * Lock out device removal.
3394                          */
3395                         if (vdev_is_dead(vd) ||
3396                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3397                                 vd = NULL;
3398                 }
3399 
3400                 mutex_exit(hash_lock);
3401 
3402                 /*
3403                  * At this point, we have a level 1 cache miss.  Try again in
3404                  * L2ARC if possible.
3405                  */
3406                 ASSERT3U(hdr->b_size, ==, size);
3407                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3408                     uint64_t, size, zbookmark_t *, zb);
3409                 ARCSTAT_BUMP(arcstat_misses);
3410                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3411                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3412                     data, metadata, misses);
3413 
3414                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3415                         /*
3416                          * Read from the L2ARC if the following are true:
3417                          * 1. The L2ARC vdev was previously cached.
3418                          * 2. This buffer still has L2ARC metadata.
3419                          * 3. This buffer isn't currently writing to the L2ARC.
3420                          * 4. The L2ARC entry wasn't evicted, which may
3421                          *    also have invalidated the vdev.
3422                          * 5. This isn't a prefetch with l2arc_noprefetch set.
3423                          */
3424                         if (hdr->b_l2hdr != NULL &&
3425                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3426                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3427                                 l2arc_read_callback_t *cb;
3428 
3429                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3430                                 ARCSTAT_BUMP(arcstat_l2_hits);
3431 
3432                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3433                                     KM_SLEEP);
3434                                 cb->l2rcb_buf = buf;
3435                                 cb->l2rcb_spa = spa;
3436                                 cb->l2rcb_bp = *bp;
3437                                 cb->l2rcb_zb = *zb;
3438                                 cb->l2rcb_flags = zio_flags;
3439                                 cb->l2rcb_compress = b_compress;
3440 
3441                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3442                                     addr + size < vd->vdev_psize -
3443                                     VDEV_LABEL_END_SIZE);
3444 
3445                                 /*
3446                                  * l2arc read.  The SCL_L2ARC lock will be
3447                                  * released by l2arc_read_done().
3448                                  * Issue a null zio if the underlying buffer
3449                                  * was squashed to zero size by compression.
3450                                  */
3451                                 if (b_compress == ZIO_COMPRESS_EMPTY) {
3452                                         rzio = zio_null(pio, spa, vd,
3453                                             l2arc_read_done, cb,
3454                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3455                                             ZIO_FLAG_CANFAIL |
3456                                             ZIO_FLAG_DONT_PROPAGATE |
3457                                             ZIO_FLAG_DONT_RETRY);
3458                                 } else {
3459                                         rzio = zio_read_phys(pio, vd, addr,
3460                                             b_asize, buf->b_data,
3461                                             ZIO_CHECKSUM_OFF,
3462                                             l2arc_read_done, cb, priority,
3463                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3464                                             ZIO_FLAG_CANFAIL |
3465                                             ZIO_FLAG_DONT_PROPAGATE |
3466                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
3467                                 }
3468                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3469                                     zio_t *, rzio);
3470                                 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3471 
3472                                 if (*arc_flags & ARC_NOWAIT) {
3473                                         zio_nowait(rzio);
3474                                         return (0);
3475                                 }
3476 
3477                                 ASSERT(*arc_flags & ARC_WAIT);
3478                                 if (zio_wait(rzio) == 0)
3479                                         return (0);
3480 
3481                                 /* l2arc read error; goto zio_read() */
3482                         } else {
3483                                 DTRACE_PROBE1(l2arc__miss,
3484                                     arc_buf_hdr_t *, hdr);
3485                                 ARCSTAT_BUMP(arcstat_l2_misses);
3486                                 if (HDR_L2_WRITING(hdr))
3487                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3488                                 spa_config_exit(spa, SCL_L2ARC, vd);
3489                         }
3490                 } else {
3491                         if (vd != NULL)
3492                                 spa_config_exit(spa, SCL_L2ARC, vd);
3493                         if (l2arc_ndev != 0) {
3494                                 DTRACE_PROBE1(l2arc__miss,
3495                                     arc_buf_hdr_t *, hdr);
3496                                 ARCSTAT_BUMP(arcstat_l2_misses);
3497                         }
3498                 }
3499 
3500                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3501                     arc_read_done, buf, priority, zio_flags, zb);
3502 
3503                 if (*arc_flags & ARC_WAIT)
3504                         return (zio_wait(rzio));
3505 
3506                 ASSERT(*arc_flags & ARC_NOWAIT);
3507                 zio_nowait(rzio);
3508         }
3509         return (0);
3510 }
3511 
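/*
 * Register an eviction callback for this buffer: when the ARC later evicts
 * the buffer, b_efunc is invoked with b_private (see arc_buf_evict() and
 * arc_do_user_evicts()).
 */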
3512 void
3513 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3514 {
3515         ASSERT(buf->b_hdr != NULL);
3516         ASSERT(buf->b_hdr->b_state != arc_anon);
3517         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3518         ASSERT(buf->b_efunc == NULL);
3519         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3520 
3521         buf->b_efunc = func;
3522         buf->b_private = private;
3523 }
3524 
3525 /*
3526  * Notify the arc that a block was freed, and thus will never be used again.
3527  */
3528 void
3529 arc_freed(spa_t *spa, const blkptr_t *bp)
3530 {
3531         arc_buf_hdr_t *hdr;
3532         kmutex_t *hash_lock;
3533         uint64_t guid = spa_load_guid(spa);
3534 
3535         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3536             &hash_lock);
3537         if (hdr == NULL)
3538                 return;
3539         if (HDR_BUF_AVAILABLE(hdr)) {
3540                 arc_buf_t *buf = hdr->b_buf;
3541                 add_reference(hdr, hash_lock, FTAG);
3542                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3543                 mutex_exit(hash_lock);
3544 
3545                 arc_release(buf, FTAG);
3546                 (void) arc_buf_remove_ref(buf, FTAG);
3547         } else {
3548                 mutex_exit(hash_lock);
3549         }
3550 
3551 }
3552 
3553 /*
3554  * This is used by the DMU to let the ARC know that a buffer is
3555  * being evicted, so the ARC should clean up.  If this arc buf
3556  * is not yet in the evicted state, it will be put there.
3557  */
3558 int
3559 arc_buf_evict(arc_buf_t *buf)
3560 {
3561         arc_buf_hdr_t *hdr;
3562         kmutex_t *hash_lock;
3563         arc_buf_t **bufp;
3564 
3565         mutex_enter(&buf->b_evict_lock);
3566         hdr = buf->b_hdr;
3567         if (hdr == NULL) {
3568                 /*
3569                  * We are in arc_do_user_evicts().
3570                  */
3571                 ASSERT(buf->b_data == NULL);
3572                 mutex_exit(&buf->b_evict_lock);
3573                 return (0);
3574         } else if (buf->b_data == NULL) {
3575                 arc_buf_t copy = *buf; /* structure assignment */
3576                 /*
3577                  * We are on the eviction list; process this buffer now
3578                  * but let arc_do_user_evicts() do the reaping.
3579                  */
3580                 buf->b_efunc = NULL;
3581                 mutex_exit(&buf->b_evict_lock);
3582                 VERIFY(copy.b_efunc(&copy) == 0);
3583                 return (1);
3584         }
3585         hash_lock = HDR_LOCK(hdr);
3586         mutex_enter(hash_lock);
3587         hdr = buf->b_hdr;
3588         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3589 
3590         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3591         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3592 
3593         /*
3594          * Pull this buffer off of the hdr
3595          */
3596         bufp = &hdr->b_buf;
3597         while (*bufp != buf)
3598                 bufp = &(*bufp)->b_next;
3599         *bufp = buf->b_next;
3600 
3601         ASSERT(buf->b_data != NULL);
3602         arc_buf_destroy(buf, FALSE, FALSE);
3603 
3604         if (hdr->b_datacnt == 0) {
3605                 arc_state_t *old_state = hdr->b_state;
3606                 arc_state_t *evicted_state;
3607 
3608                 ASSERT(hdr->b_buf == NULL);
3609                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3610 
3611                 evicted_state =
3612                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3613 
3614                 mutex_enter(&old_state->arcs_mtx);
3615                 mutex_enter(&evicted_state->arcs_mtx);
3616 
3617                 arc_change_state(evicted_state, hdr, hash_lock);
3618                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3619                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3620                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3621 
3622                 mutex_exit(&evicted_state->arcs_mtx);
3623                 mutex_exit(&old_state->arcs_mtx);
3624         }
3625         mutex_exit(hash_lock);
3626         mutex_exit(&buf->b_evict_lock);
3627 
3628         VERIFY(buf->b_efunc(buf) == 0);
3629         buf->b_efunc = NULL;
3630         buf->b_private = NULL;
3631         buf->b_hdr = NULL;
3632         buf->b_next = NULL;
3633         kmem_cache_free(buf_cache, buf);
3634         return (1);
3635 }
3636 
3637 /*
3638  * Release this buffer from the cache, making it an anonymous buffer.  This
3639  * must be done after a read and prior to modifying the buffer contents.
3640  * If the buffer has more than one reference, we must make
3641  * a new hdr for the buffer.
3642  */
3643 void
3644 arc_release(arc_buf_t *buf, void *tag)
3645 {
3646         arc_buf_hdr_t *hdr;
3647         kmutex_t *hash_lock = NULL;
3648         l2arc_buf_hdr_t *l2hdr;
3649         uint64_t buf_size;
3650 
3651         /*
3652          * It would be nice to assert that if it's DMU metadata (level >
3653          * 0 || it's the dnode file), then it must be syncing context.
3654          * But we don't know that information at this level.
3655          */
3656 
3657         mutex_enter(&buf->b_evict_lock);
3658         hdr = buf->b_hdr;
3659 
3660         /* this buffer is not on any list */
3661         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3662 
3663         if (hdr->b_state == arc_anon) {
3664                 /* this buffer is already released */
3665                 ASSERT(buf->b_efunc == NULL);
3666         } else {
3667                 hash_lock = HDR_LOCK(hdr);
3668                 mutex_enter(hash_lock);
3669                 hdr = buf->b_hdr;
3670                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3671         }
3672 
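        /*
         * If this header has an L2ARC copy, detach it now while holding
         * l2arc_buflist_mtx; the l2hdr itself and its space accounting are
         * released at the bottom of this function.
         */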
3673         l2hdr = hdr->b_l2hdr;
3674         if (l2hdr) {
3675                 mutex_enter(&l2arc_buflist_mtx);
3676                 hdr->b_l2hdr = NULL;
3677                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3678         }
3679         buf_size = hdr->b_size;
3680 
3681         /*
3682          * Do we have more than one buf?
3683          */
3684         if (hdr->b_datacnt > 1) {
3685                 arc_buf_hdr_t *nhdr;
3686                 arc_buf_t **bufp;
3687                 uint64_t blksz = hdr->b_size;
3688                 uint64_t spa = hdr->b_spa;
3689                 arc_buf_contents_t type = hdr->b_type;
3690                 uint32_t flags = hdr->b_flags;
3691 
3692                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3693                 /*
3694                  * Pull the data off of this hdr and attach it to
3695                  * a new anonymous hdr.
3696                  */
3697                 (void) remove_reference(hdr, hash_lock, tag);
3698                 bufp = &hdr->b_buf;
3699                 while (*bufp != buf)
3700                         bufp = &(*bufp)->b_next;
3701                 *bufp = buf->b_next;
3702                 buf->b_next = NULL;
3703 
3704                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3705                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3706                 if (refcount_is_zero(&hdr->b_refcnt)) {
3707                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3708                         ASSERT3U(*size, >=, hdr->b_size);
3709                         atomic_add_64(size, -hdr->b_size);
3710                 }
3711 
3712                 /*
3713                  * We're releasing a duplicate user data buffer, so update
3714                  * our statistics accordingly.
3715                  */
3716                 if (hdr->b_type == ARC_BUFC_DATA) {
3717                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3718                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3719                             -hdr->b_size);
3720                 }
3721                 hdr->b_datacnt -= 1;
3722                 arc_cksum_verify(buf);
3723                 arc_buf_unwatch(buf);
3724 
3725                 mutex_exit(hash_lock);
3726 
3727                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3728                 nhdr->b_size = blksz;
3729                 nhdr->b_spa = spa;
3730                 nhdr->b_type = type;
3731                 nhdr->b_buf = buf;
3732                 nhdr->b_state = arc_anon;
3733                 nhdr->b_arc_access = 0;
3734                 nhdr->b_flags = flags & ARC_L2_WRITING;
3735                 nhdr->b_l2hdr = NULL;
3736                 nhdr->b_datacnt = 1;
3737                 nhdr->b_freeze_cksum = NULL;
3738                 (void) refcount_add(&nhdr->b_refcnt, tag);
3739                 buf->b_hdr = nhdr;
3740                 mutex_exit(&buf->b_evict_lock);
3741                 atomic_add_64(&arc_anon->arcs_size, blksz);
3742         } else {
3743                 mutex_exit(&buf->b_evict_lock);
3744                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3745                 ASSERT(!list_link_active(&hdr->b_arc_node));
3746                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3747                 if (hdr->b_state != arc_anon)
3748                         arc_change_state(arc_anon, hdr, hash_lock);
3749                 hdr->b_arc_access = 0;
3750                 if (hash_lock)
3751                         mutex_exit(hash_lock);
3752 
3753                 buf_discard_identity(hdr);
3754                 arc_buf_thaw(buf);
3755         }
3756         buf->b_efunc = NULL;
3757         buf->b_private = NULL;
3758 
3759         if (l2hdr) {
3760                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3761                 kmem_free(l2hdr, sizeof (*l2hdr));
3762                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3763                 mutex_exit(&l2arc_buflist_mtx);
3764         }
3765 }
3766 
3767 int
3768 arc_released(arc_buf_t *buf)
3769 {
3770         int released;
3771 
3772         mutex_enter(&buf->b_evict_lock);
3773         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3774         mutex_exit(&buf->b_evict_lock);
3775         return (released);
3776 }
3777 
3778 int
3779 arc_has_callback(arc_buf_t *buf)
3780 {
3781         int callback;
3782 
3783         mutex_enter(&buf->b_evict_lock);
3784         callback = (buf->b_efunc != NULL);
3785         mutex_exit(&buf->b_evict_lock);
3786         return (callback);
3787 }
3788 
3789 #ifdef ZFS_DEBUG
3790 int
3791 arc_referenced(arc_buf_t *buf)
3792 {
3793         int referenced;
3794 
3795         mutex_enter(&buf->b_evict_lock);
3796         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3797         mutex_exit(&buf->b_evict_lock);
3798         return (referenced);
3799 }
3800 #endif
3801 
3802 static void
3803 arc_write_ready(zio_t *zio)
3804 {
3805         arc_write_callback_t *callback = zio->io_private;
3806         arc_buf_t *buf = callback->awcb_buf;
3807         arc_buf_hdr_t *hdr = buf->b_hdr;
3808 
3809         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3810         callback->awcb_ready(zio, buf, callback->awcb_private);
3811 
3812         /*
3813          * If the IO is already in progress, then this is a re-write
3814          * attempt, so we need to thaw and re-compute the cksum.
3815          * It is the responsibility of the callback to handle the
3816          * accounting for any re-write attempt.
3817          */
3818         if (HDR_IO_IN_PROGRESS(hdr)) {
3819                 mutex_enter(&hdr->b_freeze_lock);
3820                 if (hdr->b_freeze_cksum != NULL) {
3821                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3822                         hdr->b_freeze_cksum = NULL;
3823                 }
3824                 mutex_exit(&hdr->b_freeze_lock);
3825         }
3826         arc_cksum_compute(buf, B_FALSE);
3827         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3828 }
3829 
3830 /*
3831  * The SPA calls this callback for each physical write that happens on behalf
3832  * of a logical write.  See the comment in dbuf_write_physdone() for details.
3833  */
3834 static void
3835 arc_write_physdone(zio_t *zio)
3836 {
3837         arc_write_callback_t *cb = zio->io_private;
3838         if (cb->awcb_physdone != NULL)
3839                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3840 }
3841 
3842 static void
3843 arc_write_done(zio_t *zio)
3844 {
3845         arc_write_callback_t *callback = zio->io_private;
3846         arc_buf_t *buf = callback->awcb_buf;
3847         arc_buf_hdr_t *hdr = buf->b_hdr;
3848 
3849         ASSERT(hdr->b_acb == NULL);
3850 
3851         if (zio->io_error == 0) {
3852                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3853                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3854                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3855         } else {
3856                 ASSERT(BUF_EMPTY(hdr));
3857         }
3858 
3859         /*
3860          * If the block to be written was all-zero, we may have
3861          * compressed it away.  In this case no write was performed
3862          * so there will be no dva/birth/checksum.  The buffer must
3863          * therefore remain anonymous (and uncached).
3864          */
3865         if (!BUF_EMPTY(hdr)) {
3866                 arc_buf_hdr_t *exists;
3867                 kmutex_t *hash_lock;
3868 
3869                 ASSERT(zio->io_error == 0);
3870 
3871                 arc_cksum_verify(buf);
3872 
3873                 exists = buf_hash_insert(hdr, &hash_lock);
3874                 if (exists) {
3875                         /*
3876                          * This can only happen if we overwrite for
3877                          * sync-to-convergence, because we remove
3878                          * buffers from the hash table when we arc_free().
3879                          */
3880                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3881                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3882                                         panic("bad overwrite, hdr=%p exists=%p",
3883                                             (void *)hdr, (void *)exists);
3884                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3885                                 arc_change_state(arc_anon, exists, hash_lock);
3886                                 mutex_exit(hash_lock);
3887                                 arc_hdr_destroy(exists);
3888                                 exists = buf_hash_insert(hdr, &hash_lock);
3889                                 ASSERT3P(exists, ==, NULL);
3890                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3891                                 /* nopwrite */
3892                                 ASSERT(zio->io_prop.zp_nopwrite);
3893                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3894                                         panic("bad nopwrite, hdr=%p exists=%p",
3895                                             (void *)hdr, (void *)exists);
3896                         } else {
3897                                 /* Dedup */
3898                                 ASSERT(hdr->b_datacnt == 1);
3899                                 ASSERT(hdr->b_state == arc_anon);
3900                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3901                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3902                         }
3903                 }
3904                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3905                 /* if it's not anon, we are doing a scrub */
3906                 if (!exists && hdr->b_state == arc_anon)
3907                         arc_access(hdr, hash_lock);
3908                 mutex_exit(hash_lock);
3909         } else {
3910                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3911         }
3912 
3913         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3914         callback->awcb_done(zio, buf, callback->awcb_private);
3915 
3916         kmem_free(callback, sizeof (arc_write_callback_t));
3917 }
3918 
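/*
 * Create (but do not issue) a write zio for an anonymous ARC buffer.  The
 * caller's ready/physdone/done callbacks are wrapped so that the ARC can
 * compute checksums and re-insert the header into the hash table when the
 * write completes (see arc_write_ready() and arc_write_done()).
 */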
3919 zio_t *
3920 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3921     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3922     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3923     arc_done_func_t *done, void *private, zio_priority_t priority,
3924     int zio_flags, const zbookmark_t *zb)
3925 {
3926         arc_buf_hdr_t *hdr = buf->b_hdr;
3927         arc_write_callback_t *callback;
3928         zio_t *zio;
3929 
3930         ASSERT(ready != NULL);
3931         ASSERT(done != NULL);
3932         ASSERT(!HDR_IO_ERROR(hdr));
3933         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3934         ASSERT(hdr->b_acb == NULL);
3935         if (l2arc)
3936                 hdr->b_flags |= ARC_L2CACHE;
3937         if (l2arc_compress)
3938                 hdr->b_flags |= ARC_L2COMPRESS;
3939         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3940         callback->awcb_ready = ready;
3941         callback->awcb_physdone = physdone;
3942         callback->awcb_done = done;
3943         callback->awcb_private = private;
3944         callback->awcb_buf = buf;
3945 
3946         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3947             arc_write_ready, arc_write_physdone, arc_write_done, callback,
3948             priority, zio_flags, zb);
3949 
3950         return (zio);
3951 }
3952 
3953 static int
3954 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3955 {
3956 #ifdef _KERNEL
3957         uint64_t available_memory = ptob(freemem);
3958         static uint64_t page_load = 0;
3959         static uint64_t last_txg = 0;
3960 
3961 #if defined(__i386)
3962         available_memory =
3963             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3964 #endif
3965 
3966         if (freemem > physmem * arc_lotsfree_percent / 100)
3967                 return (0);
3968 
3969         if (txg > last_txg) {
3970                 last_txg = txg;
3971                 page_load = 0;
3972         }
3973         /*
3974          * If we are in pageout, we know that memory is already tight,
3975          * the arc is already going to be evicting, so we just want to
3976          * continue to let page writes occur as quickly as possible.
3977          */
3978         if (curproc == proc_pageout) {
3979                 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3980                         return (SET_ERROR(ERESTART));
3981                 /* Note: reserve is inflated, so we deflate */
3982                 page_load += reserve / 8;
3983                 return (0);
3984         } else if (page_load > 0 && arc_reclaim_needed()) {
3985                 /* memory is low, delay before restarting */
3986                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3987                 return (SET_ERROR(EAGAIN));
3988         }
3989         page_load = 0;
3990 #endif
3991         return (0);
3992 }
3993 
3994 void
3995 arc_tempreserve_clear(uint64_t reserve)
3996 {
3997         atomic_add_64(&arc_tempreserve, -reserve);
3998         ASSERT((int64_t)arc_tempreserve >= 0);
3999 }
4000 
4001 int
4002 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4003 {
4004         int error;
4005         uint64_t anon_size;
4006 
4007         if (reserve > arc_c/4 && !arc_no_grow)
4008                 arc_c = MIN(arc_c_max, reserve * 4);
4009         if (reserve > arc_c)
4010                 return (SET_ERROR(ENOMEM));
4011 
4012         /*
4013          * Don't count loaned bufs as in flight dirty data to prevent long
4014          * network delays from blocking transactions that are ready to be
4015          * assigned to a txg.
4016          */
4017         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4018 
4019         /*
4020          * Writes will, almost always, require additional memory allocations
4021          * in order to compress/encrypt/etc the data.  We therefore need to
4022          * make sure that there is sufficient available memory for this.
4023          */
4024         error = arc_memory_throttle(reserve, txg);
4025         if (error != 0)
4026                 return (error);
4027 
4028         /*
4029          * Throttle writes when the amount of dirty data in the cache
4030          * gets too large.  We try to keep the cache less than half full
4031          * of dirty blocks so that our sync times don't grow too large.
4032          * Note: if two requests come in concurrently, we might let them
4033          * both succeed, when one of them should fail.  Not a huge deal.
4034          */
4035 
4036         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4037             anon_size > arc_c / 4) {
4038                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4039                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4040                     arc_tempreserve>>10,
4041                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4042                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4043                     reserve>>10, arc_c>>10);
4044                 return (SET_ERROR(ERESTART));
4045         }
4046         atomic_add_64(&arc_tempreserve, reserve);
4047         return (0);
4048 }
4049 
4050 void
4051 arc_init(void)
4052 {
4053         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4054         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4055 
4056         /* Convert seconds to clock ticks */
4057         arc_min_prefetch_lifespan = 1 * hz;
4058 
4059         /* Start out with 1/8 of all memory */
4060         arc_c = physmem * PAGESIZE / 8;
4061 
4062 #ifdef _KERNEL
4063         /*
4064          * On architectures where the physical memory can be larger
4065          * than the addressable space (intel in 32-bit mode), we may
4066          * need to limit the cache to 1/8 of VM size.
4067          */
4068         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4069 #endif
4070 
4071         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
4072         arc_c_min = MAX(arc_c / 4, 64<<20);
4073         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
4074         if (arc_c * 8 >= 1<<30)
4075                 arc_c_max = (arc_c * 8) - (1<<30);
4076         else
4077                 arc_c_max = arc_c_min;
4078         arc_c_max = MAX(arc_c * 6, arc_c_max);
4079 
4080         /*
4081          * Allow the tunables to override our calculations if they are
         * reasonable (i.e. over 64MB)
4083          */
4084         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
4085                 arc_c_max = zfs_arc_max;
4086         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
4087                 arc_c_min = zfs_arc_min;
4088 
4089         arc_c = arc_c_max;
4090         arc_p = (arc_c >> 1);
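        /*
         * Worked example (ignoring tunable overrides and the 32-bit VM
         * clamp above): with 16GB of physical memory, arc_c starts at
         * 2GB, so arc_c_min = MAX(2GB / 4, 64MB) = 512MB and
         * arc_c_max = MAX(2GB * 6, 16GB - 1GB) = 15GB; we then start
         * out with arc_c = 15GB and arc_p = 7.5GB.
         */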
4091 
4092         /* limit meta-data to 1/4 of the arc capacity */
4093         arc_meta_limit = arc_c_max / 4;
4094 
4095         /* Allow the tunable to override if it is reasonable */
4096         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4097                 arc_meta_limit = zfs_arc_meta_limit;
4098 
4099         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4100                 arc_c_min = arc_meta_limit / 2;
4101 
4102         if (zfs_arc_grow_retry > 0)
4103                 arc_grow_retry = zfs_arc_grow_retry;
4104 
4105         if (zfs_arc_shrink_shift > 0)
4106                 arc_shrink_shift = zfs_arc_shrink_shift;
4107 
4108         if (zfs_arc_p_min_shift > 0)
4109                 arc_p_min_shift = zfs_arc_p_min_shift;
4110 
        /* if kmem_flags are set, let's try to use less memory */
4112         if (kmem_debugging())
4113                 arc_c = arc_c / 2;
4114         if (arc_c < arc_c_min)
4115                 arc_c = arc_c_min;
4116 
4117         arc_anon = &ARC_anon;
4118         arc_mru = &ARC_mru;
4119         arc_mru_ghost = &ARC_mru_ghost;
4120         arc_mfu = &ARC_mfu;
4121         arc_mfu_ghost = &ARC_mfu_ghost;
4122         arc_l2c_only = &ARC_l2c_only;
4123         arc_size = 0;
4124 
4125         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4126         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4127         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4128         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4129         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4130         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4131 
4132         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4133             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4134         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4135             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4136         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4137             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4138         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4139             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4140         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4141             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4142         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4143             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4144         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4145             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4146         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4147             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4148         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4149             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4150         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4151             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4152 
4153         buf_init();
4154 
4155         arc_thread_exit = 0;
4156         arc_eviction_list = NULL;
4157         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4158         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4159 
4160         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4161             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4162 
4163         if (arc_ksp != NULL) {
4164                 arc_ksp->ks_data = &arc_stats;
4165                 kstat_install(arc_ksp);
4166         }
4167 
4168         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4169             TS_RUN, minclsyspri);
4170 
4171         arc_dead = FALSE;
4172         arc_warm = B_FALSE;
4173 
4174         /*
4175          * Calculate maximum amount of dirty data per pool.
4176          *
4177          * If it has been set by /etc/system, take that.
4178          * Otherwise, use a percentage of physical memory defined by
4179          * zfs_dirty_data_max_percent (default 10%) with a cap at
4180          * zfs_dirty_data_max_max (default 4GB).
4181          */
4182         if (zfs_dirty_data_max == 0) {
4183                 zfs_dirty_data_max = physmem * PAGESIZE *
4184                     zfs_dirty_data_max_percent / 100;
4185                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4186                     zfs_dirty_data_max_max);
4187         }
4188 }
4189 
4190 void
4191 arc_fini(void)
4192 {
4193         mutex_enter(&arc_reclaim_thr_lock);
4194         arc_thread_exit = 1;
4195         while (arc_thread_exit != 0)
4196                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4197         mutex_exit(&arc_reclaim_thr_lock);
4198 
4199         arc_flush(NULL);
4200 
4201         arc_dead = TRUE;
4202 
4203         if (arc_ksp != NULL) {
4204                 kstat_delete(arc_ksp);
4205                 arc_ksp = NULL;
4206         }
4207 
4208         mutex_destroy(&arc_eviction_mtx);
4209         mutex_destroy(&arc_reclaim_thr_lock);
4210         cv_destroy(&arc_reclaim_thr_cv);
4211 
4212         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4213         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4214         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4215         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4216         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4217         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4218         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
        list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
        list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
        list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
4220 
4221         mutex_destroy(&arc_anon->arcs_mtx);
4222         mutex_destroy(&arc_mru->arcs_mtx);
4223         mutex_destroy(&arc_mru_ghost->arcs_mtx);
4224         mutex_destroy(&arc_mfu->arcs_mtx);
4225         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4226         mutex_destroy(&arc_l2c_only->arcs_mtx);
4227 
4228         buf_fini();
4229 
4230         ASSERT(arc_loaned_bytes == 0);
4231 }
4232 
4233 /*
4234  * Level 2 ARC
4235  *
4236  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4237  * It uses dedicated storage devices to hold cached data, which are populated
4238  * using large infrequent writes.  The main role of this cache is to boost
4239  * the performance of random read workloads.  The intended L2ARC devices
4240  * include short-stroked disks, solid state disks, and other media with
4241  * substantially faster read latency than disk.
4242  *
4243  *                 +-----------------------+
4244  *                 |         ARC           |
4245  *                 +-----------------------+
4246  *                    |         ^     ^
4247  *                    |         |     |
4248  *      l2arc_feed_thread()    arc_read()
4249  *                    |         |     |
4250  *                    |  l2arc read   |
4251  *                    V         |     |
4252  *               +---------------+    |
4253  *               |     L2ARC     |    |
4254  *               +---------------+    |
4255  *                   |    ^           |
4256  *          l2arc_write() |           |
4257  *                   |    |           |
4258  *                   V    |           |
4259  *                 +-------+      +-------+
4260  *                 | vdev  |      | vdev  |
4261  *                 | cache |      | cache |
4262  *                 +-------+      +-------+
4263  *                 +=========+     .-----.
4264  *                 :  L2ARC  :    |-_____-|
4265  *                 : devices :    | Disks |
4266  *                 +=========+    `-_____-'
4267  *
4268  * Read requests are satisfied from the following sources, in order:
4269  *
4270  *      1) ARC
4271  *      2) vdev cache of L2ARC devices
4272  *      3) L2ARC devices
4273  *      4) vdev cache of disks
4274  *      5) disks
4275  *
4276  * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
4278  * the L2ARC and traditional cache design:
4279  *
4280  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4281  * the ARC behave as usual, freeing buffers and placing headers on ghost
4282  * lists.  The ARC does not send buffers to the L2ARC during eviction as
4283  * this would add inflated write latencies for all ARC memory pressure.
4284  *
4285  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4286  * It does this by periodically scanning buffers from the eviction-end of
4287  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4288  * not already there. It scans until a headroom of buffers is satisfied,
 * which itself cushions against ARC eviction. If a compressible buffer is
4290  * found during scanning and selected for writing to an L2ARC device, we
4291  * temporarily boost scanning headroom during the next scan cycle to make
4292  * sure we adapt to compression effects (which might significantly reduce
4293  * the data volume we write to L2ARC). The thread that does this is
4294  * l2arc_feed_thread(), illustrated below; example sizes are included to
4295  * provide a better sense of ratio than this diagram:
4296  *
4297  *             head -->                        tail
4298  *              +---------------------+----------+
4299  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4300  *              +---------------------+----------+   |   o L2ARC eligible
4301  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4302  *              +---------------------+----------+   |
4303  *                   15.9 Gbytes      ^ 32 Mbytes    |
4304  *                                 headroom          |
4305  *                                            l2arc_feed_thread()
4306  *                                                   |
4307  *                       l2arc write hand <--[oooo]--'
4308  *                               |           8 Mbyte
4309  *                               |          write max
4310  *                               V
4311  *                +==============================+
4312  *      L2ARC dev |####|#|###|###|    |####| ... |
4313  *                +==============================+
4314  *                           32 Gbytes
4315  *
4316  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4317  * evicted, then the L2ARC has cached a buffer much sooner than it probably
4318  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4319  * safe to say that this is an uncommon case, since buffers at the end of
4320  * the ARC lists have moved there due to inactivity.
4321  *
4322  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4323  * then the L2ARC simply misses copying some buffers.  This serves as a
4324  * pressure valve to prevent heavy read workloads from both stalling the ARC
4325  * with waits and clogging the L2ARC with writes.  This also helps prevent
4326  * the potential for the L2ARC to churn if it attempts to cache content too
4327  * quickly, such as during backups of the entire pool.
4328  *
4329  * 5. After system boot and before the ARC has filled main memory, there are
4330  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of these
4332  * lists as pictured, the l2arc_feed_thread() will search from the list heads
4333  * for eligible buffers, greatly increasing its chance of finding them.
4334  *
4335  * The L2ARC device write speed is also boosted during this time so that
4336  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4337  * there are no L2ARC reads, and no fear of degrading read performance
4338  * through increased writes.
4339  *
4340  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4341  * the vdev queue can aggregate them into larger and fewer writes.  Each
4342  * device is written to in a rotor fashion, sweeping writes through
4343  * available space then repeating.
4344  *
4345  * 7. The L2ARC does not store dirty content.  It never needs to flush
4346  * write buffers back to disk based storage.
4347  *
4348  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4349  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4350  *
4351  * The performance of the L2ARC can be tweaked by a number of tunables, which
4352  * may be necessary for different workloads:
4353  *
4354  *      l2arc_write_max         max write bytes per interval
4355  *      l2arc_write_boost       extra write bytes during device warmup
4356  *      l2arc_noprefetch        skip caching prefetched buffers
4357  *      l2arc_headroom          number of max device writes to precache
4358  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4359  *                              scanning, we multiply headroom by this
4360  *                              percentage factor for the next scan cycle,
4361  *                              since more compressed buffers are likely to
4362  *                              be present
4363  *      l2arc_feed_secs         seconds between L2ARC writing
4364  *
4365  * Tunables may be removed or added as future performance improvements are
4366  * integrated, and also may become zpool properties.
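 *
 * On illumos, for example, these can typically be set from /etc/system;
 * the values below are purely illustrative and the set of tunables may
 * vary by platform and release:
 *
 *      set zfs:l2arc_write_max = 0x4000000
 *      set zfs:l2arc_noprefetch = 0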
4367  *
4368  * There are three key functions that control how the L2ARC warms up:
4369  *
4370  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4371  *      l2arc_write_size()      calculate how much to write
4372  *      l2arc_write_interval()  calculate sleep delay between writes
4373  *
4374  * These three functions determine what to write, how much, and how quickly
4375  * to send writes.
4376  *
 * L2ARC persistence:
4378  *
4379  * When writing buffers to L2ARC, we periodically add some metadata to
4380  * make sure we can pick them up after reboot, thus dramatically reducing
4381  * the impact that any downtime has on the performance of storage systems
4382  * with large caches.
4383  *
 * The implementation is fairly simple and consists of the following two
 * modifications:
4386  *
4387  * *) Every now and then we mix in a piece of metadata (called a log block)
4388  *    into the L2ARC write. This allows us to understand what's been written,
4389  *    so that we can rebuild the arc_buf_hdr_t structures of the main ARC
4390  *    buffers. The log block also includes a "back-reference" pointer to the
4391  *    previous block, forming a back-linked list of blocks on the L2ARC device.
4392  *
4393  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
4394  *    for our header bookkeeping purposes. This contains a device header, which
4395  *    contains our top-level reference structures. We update it each time we
4396  *    write a new log block, so that we're able to locate it in the L2ARC
4397  *    device. If this write results in an inconsistent device header (e.g. due
4398  *    to power failure), we detect this by verifying the header's checksum
4399  *    and simply drop the entries from L2ARC.
4400  *
4401  * Implementation diagram:
4402  *
4403  * +=== L2ARC device (not to scale) ======================================+
4404  * |       __________newest log block pointers_________                   |
4405  * |      /                                  \1 back   \latest            |
4406  * |     /                                    V         V                 |
4407  * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
4408  * |                       ^       / ^       / ^       /                  |
4409  * |                       `-prev-'  `-prev-'  `-prev-'                   |
4410  * |                         lb        lb        lb                       |
4411  * +======================================================================+
4412  *
4413  * On-device data structures:
4414  *
4415  * L2ARC device header: l2arc_dev_hdr_phys_t
4416  * L2ARC log block:     l2arc_log_blk_phys_t
4417  *
4418  * L2ARC reconstruction:
4419  *
4420  * When writing data, we simply write in the standard rotary fashion,
 * evicting buffers as we go and writing new data over them (writing
4422  * a new log block every now and then). This obviously means that once we
4423  * loop around the end of the device, we will start cutting into an already
4424  * committed log block (and its referenced data buffers), like so:
4425  *
4426  *    current write head__       __old tail
4427  *                        \     /
4428  *                        V    V
4429  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
4430  *                         ^    ^^^^^^^^^___________________________________
4431  *                         |                                                \
4432  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
4433  *
4434  * When importing the pool, we detect this situation and use it to stop
4435  * our scanning process (see l2arc_rebuild).
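 *
 * As a rough sketch of that rebuild walk (conceptual only; the actual
 * helpers and field names used by l2arc_rebuild may differ):
 *
 *      read and checksum-verify the device header at the device start;
 *      ptr = newest log block pointer recorded in the device header;
 *      while (ptr is valid and has not been overwritten by the write hand) {
 *              read and validate the log block at ptr;
 *              recreate an arc_buf_hdr_t for each entry in the log block;
 *              ptr = the log block's back-reference to its predecessor;
 *      }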
4436  *
4437  * There is one significant caveat to consider when rebuilding ARC contents
4438  * from an L2ARC device: what about invalidated buffers? Given the above
4439  * construction, we cannot update blocks which we've already written to amend
4440  * them to remove buffers which were invalidated. Thus, during reconstruction,
4441  * we might be populating the cache with buffers for data that's not on the
4442  * main pool anymore, or may have been overwritten!
4443  *
4444  * As it turns out, this isn't a problem. Every arc_read request includes
4445  * both the DVA and, crucially, the birth TXG of the BP the caller is
4446  * looking for. So even if the cache were populated by completely rotten
4447  * blocks for data that had been long deleted and/or overwritten, we'll
 * never actually return bad data from the cache, since the DVA together
 * with the birth TXG uniquely identifies a block in space and time - once
 * created, a block is immutable on disk. The worst we can do is waste
 * some time and memory at l2arc rebuild reconstructing outdated ARC
 * entries that will get dropped from the l2arc as it is being updated
 * with new blocks.
4454  */
4455 
4456 static boolean_t
4457 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4458 {
4459         /*
4460          * A buffer is *not* eligible for the L2ARC if it:
4461          * 1. belongs to a different spa.
4462          * 2. is already cached on the L2ARC.
4463          * 3. has an I/O in progress (it may be an incomplete read).
4464          * 4. is flagged not eligible (zfs property).
4465          */
4466         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4467             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4468                 return (B_FALSE);
4469 
4470         return (B_TRUE);
4471 }
4472 
4473 static uint64_t
4474 l2arc_write_size(void)
4475 {
4476         uint64_t size;
4477 
4478         /*
4479          * Make sure our globals have meaningful values in case the user
4480          * altered them.
4481          */
4482         size = l2arc_write_max;
4483         if (size == 0) {
4484                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4485                     "be greater than zero, resetting it to the default (%d)",
4486                     L2ARC_WRITE_SIZE);
4487                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4488         }
4489 
4490         if (arc_warm == B_FALSE)
4491                 size += l2arc_write_boost;
4492 
        return (size);
}
4496 
4497 static clock_t
4498 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4499 {
4500         clock_t interval, next, now;
4501 
4502         /*
4503          * If the ARC lists are busy, increase our write rate; if the
4504          * lists are stale, idle back.  This is achieved by checking
4505          * how much we previously wrote - if it was more than half of
4506          * what we wanted, schedule the next write much sooner.
4507          */
4508         if (l2arc_feed_again && wrote > (wanted / 2))
4509                 interval = (hz * l2arc_feed_min_ms) / 1000;
4510         else
4511                 interval = hz * l2arc_feed_secs;
4512 
4513         now = ddi_get_lbolt();
4514         next = MAX(now, MIN(now + interval, began + interval));
4515 
4516         return (next);
4517 }
4518 
4519 static void
4520 l2arc_hdr_stat_add(boolean_t from_arc)
4521 {
4522         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4523         if (from_arc)
4524                 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4525 }
4526 
4527 static void
4528 l2arc_hdr_stat_remove(void)
4529 {
4530         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4531         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4532 }
4533 
4534 /*
4535  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4536  * If a device is returned, this also returns holding the spa config lock.
4537  */
4538 static l2arc_dev_t *
4539 l2arc_dev_get_next(void)
4540 {
4541         l2arc_dev_t *first, *next = NULL;
4542 
4543         /*
4544          * Lock out the removal of spas (spa_namespace_lock), then removal
4545          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4546          * both locks will be dropped and a spa config lock held instead.
4547          */
4548         mutex_enter(&spa_namespace_lock);
4549         mutex_enter(&l2arc_dev_mtx);
4550 
4551         /* if there are no vdevs, there is nothing to do */
4552         if (l2arc_ndev == 0)
4553                 goto out;
4554 
4555         first = NULL;
4556         next = l2arc_dev_last;
4557         do {
4558                 /*
4559                  * Loop around the list looking for a non-faulted vdev
4560                  * and one that isn't currently doing an L2ARC rebuild.
4561                  */
4562                 if (next == NULL) {
4563                         next = list_head(l2arc_dev_list);
4564                 } else {
4565                         next = list_next(l2arc_dev_list, next);
4566                         if (next == NULL)
4567                                 next = list_head(l2arc_dev_list);
4568                 }
4569 
4570                 /* if we have come back to the start, bail out */
4571                 if (first == NULL)
4572                         first = next;
4573                 else if (next == first)
4574                         break;
4575 
4576         } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
4577 
4578         /* if we were unable to find any usable vdevs, return NULL */
4579         if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
4580                 next = NULL;
4581 
4582         l2arc_dev_last = next;
4583 
4584 out:
4585         mutex_exit(&l2arc_dev_mtx);
4586 
4587         /*
4588          * Grab the config lock to prevent the 'next' device from being
4589          * removed while we are writing to it.
4590          */
4591         if (next != NULL)
4592                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4593         mutex_exit(&spa_namespace_lock);
4594 
4595         return (next);
4596 }
4597 
4598 /*
4599  * Free buffers that were tagged for destruction.
4600  */
4601 static void
l2arc_do_free_on_write(void)
4603 {
4604         list_t *buflist;
4605         l2arc_data_free_t *df, *df_prev;
4606 
4607         mutex_enter(&l2arc_free_on_write_mtx);
4608         buflist = l2arc_free_on_write;
4609 
4610         for (df = list_tail(buflist); df; df = df_prev) {
4611                 df_prev = list_prev(buflist, df);
4612                 ASSERT(df->l2df_data != NULL);
4613                 ASSERT(df->l2df_func != NULL);
4614                 df->l2df_func(df->l2df_data, df->l2df_size);
4615                 list_remove(buflist, df);
4616                 kmem_free(df, sizeof (l2arc_data_free_t));
4617         }
4618 
4619         mutex_exit(&l2arc_free_on_write_mtx);
4620 }
4621 
4622 /*
4623  * A write to a cache device has completed.  Update all headers to allow
4624  * reads from these buffers to begin.
4625  */
4626 static void
4627 l2arc_write_done(zio_t *zio)
4628 {
4629         l2arc_write_callback_t *cb;
4630         l2arc_dev_t *dev;
4631         list_t *buflist;
4632         arc_buf_hdr_t *head, *ab, *ab_prev;
4633         l2arc_buf_hdr_t *l2hdr;
4634         kmutex_t *hash_lock;
4635         l2arc_log_blk_buf_t *lb_buf;
4636 
4637         cb = zio->io_private;
4638         ASSERT(cb != NULL);
4639         dev = cb->l2wcb_dev;
4640         ASSERT(dev != NULL);
4641         head = cb->l2wcb_head;
4642         ASSERT(head != NULL);
4643         buflist = dev->l2ad_buflist;
4644         ASSERT(buflist != NULL);
4645         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4646             l2arc_write_callback_t *, cb);
4647 
4648         if (zio->io_error != 0)
4649                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4650 
4651         mutex_enter(&l2arc_buflist_mtx);
4652 
4653         /*
4654          * All writes completed, or an error was hit.
4655          */
4656         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4657                 ab_prev = list_prev(buflist, ab);
4658                 l2hdr = ab->b_l2hdr;
4659 
4660                 /*
4661                  * Release the temporary compressed buffer as soon as possible.
4662                  */
4663                 if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
4664                         l2arc_release_cdata_buf(ab);
4665 
4666                 hash_lock = HDR_LOCK(ab);
4667                 if (!mutex_tryenter(hash_lock)) {
4668                         /*
                         * This buffer misses out.  It may be in the process
                         * of being evicted.  Its ARC_L2_WRITING flag will be
                         * left set, denying reads to this buffer.
4672                          */
4673                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4674                         continue;
4675                 }
4676 
4677                 if (zio->io_error != 0) {
4678                         /*
4679                          * Error - drop L2ARC entry.
4680                          */
4681                         list_remove(buflist, ab);
4682                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4683                         ab->b_l2hdr = NULL;
4684                         kmem_free(l2hdr, sizeof (*l2hdr));
4685                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4686                 }
4687 
4688                 /*
4689                  * Allow ARC to begin reads to this L2ARC entry.
4690                  */
4691                 ab->b_flags &= ~ARC_L2_WRITING;
4692 
4693                 mutex_exit(hash_lock);
4694         }
4695 
4696         atomic_inc_64(&l2arc_writes_done);
4697         list_remove(buflist, head);
4698         kmem_cache_free(hdr_cache, head);
4699         mutex_exit(&l2arc_buflist_mtx);
4700 
4701         l2arc_do_free_on_write();
4702 
4703         for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
4704             lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
4705                 (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
4706                 kmem_free(lb_buf, sizeof (*lb_buf));
4707         }
4708         list_destroy(&cb->l2wcb_log_blk_buf_list);
4709         kmem_free(cb, sizeof (l2arc_write_callback_t));
4710 }
4711 
4712 /*
 * A read to a cache device has completed.  Validate buffer contents before
4714  * handing over to the regular ARC routines.
4715  */
4716 static void
4717 l2arc_read_done(zio_t *zio)
4718 {
4719         l2arc_read_callback_t *cb;
4720         arc_buf_hdr_t *hdr;
4721         arc_buf_t *buf;
4722         kmutex_t *hash_lock;
4723         int equal;
4724 
4725         ASSERT(zio->io_vd != NULL);
4726         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4727 
4728         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4729 
4730         cb = zio->io_private;
4731         ASSERT(cb != NULL);
4732         buf = cb->l2rcb_buf;
4733         ASSERT(buf != NULL);
4734 
4735         hash_lock = HDR_LOCK(buf->b_hdr);
4736         mutex_enter(hash_lock);
4737         hdr = buf->b_hdr;
4738         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4739 
4740         /*
4741          * If the buffer was compressed, decompress it first.
4742          */
4743         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4744                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4745         ASSERT(zio->io_data != NULL);
4746 
4747         /*
         * Check that this buffer survived the L2ARC journey.
4749          */
4750         equal = arc_cksum_equal(buf);
4751         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4752                 mutex_exit(hash_lock);
4753                 zio->io_private = buf;
4754                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4755                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4756                 arc_read_done(zio);
4757         } else {
4758                 mutex_exit(hash_lock);
4759                 /*
4760                  * Buffer didn't survive caching.  Increment stats and
4761                  * reissue to the original storage device.
4762                  */
4763                 if (zio->io_error != 0) {
4764                         ARCSTAT_BUMP(arcstat_l2_io_error);
4765                 } else {
4766                         zio->io_error = SET_ERROR(EIO);
4767                 }
4768                 if (!equal)
4769                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4770 
4771                 /*
4772                  * If there's no waiter, issue an async i/o to the primary
4773                  * storage now.  If there *is* a waiter, the caller must
4774                  * issue the i/o in a context where it's OK to block.
4775                  */
4776                 if (zio->io_waiter == NULL) {
4777                         zio_t *pio = zio_unique_parent(zio);
4778 
4779                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4780 
4781                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4782                             buf->b_data, zio->io_size, arc_read_done, buf,
4783                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4784                 }
4785         }
4786 
4787         kmem_free(cb, sizeof (l2arc_read_callback_t));
4788 }
4789 
4790 /*
4791  * This is the list priority from which the L2ARC will search for pages to
4792  * cache.  This is used within loops (0..3) to cycle through lists in the
4793  * desired order.  This order can have a significant effect on cache
4794  * performance.
4795  *
4796  * Currently the metadata lists are hit first, MFU then MRU, followed by
4797  * the data lists.  This function returns a locked list, and also returns
4798  * the lock pointer.
4799  */
4800 static list_t *
4801 l2arc_list_locked(int list_num, kmutex_t **lock)
4802 {
4803         list_t *list = NULL;
4804 
4805         ASSERT(list_num >= 0 && list_num <= 3);
4806 
4807         switch (list_num) {
4808         case 0:
4809                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4810                 *lock = &arc_mfu->arcs_mtx;
4811                 break;
4812         case 1:
4813                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4814                 *lock = &arc_mru->arcs_mtx;
4815                 break;
4816         case 2:
4817                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4818                 *lock = &arc_mfu->arcs_mtx;
4819                 break;
4820         case 3:
4821                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4822                 *lock = &arc_mru->arcs_mtx;
4823                 break;
4824         }
4825 
4826         ASSERT(!(MUTEX_HELD(*lock)));
4827         mutex_enter(*lock);
4828         return (list);
4829 }
4830 
4831 /*
4832  * Calculates the maximum overhead of L2ARC metadata log blocks for a given
4833  * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
4834  * overhead in processing to make sure there is enough headroom available
4835  * when writing buffers.
4836  */
4837 static inline uint64_t
4838 l2arc_log_blk_overhead(uint64_t write_sz)
4839 {
4840         return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
4841             L2ARC_LOG_BLK_SIZE;
4842 }
4843 
4844 /*
4845  * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, or it may span nothing.
4847  * This is clearing a region on the L2ARC device ready for writing.
4848  * If the 'all' boolean is set, every buffer is evicted.
4849  */
4850 static void
4851 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4852 {
4853         list_t *buflist;
4854         l2arc_buf_hdr_t *l2hdr;
4855         arc_buf_hdr_t *ab, *ab_prev;
4856         kmutex_t *hash_lock;
4857         uint64_t taddr;
4858 
4859         buflist = dev->l2ad_buflist;
4860 
4861         if (buflist == NULL)
4862                 return;
4863 
4864         if (!all && dev->l2ad_first) {
4865                 /*
4866                  * This is the first sweep through the device.  There is
4867                  * nothing to evict.
4868                  */
4869                 return;
4870         }
4871 
4872         /*
4873          * We need to add in the worst case scenario of log block overhead.
4874          */
4875         distance += l2arc_log_blk_overhead(distance);
4876         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4877                 /*
4878                  * When nearing the end of the device, evict to the end
4879                  * before the device write hand jumps to the start.
4880                  */
4881                 taddr = dev->l2ad_end;
4882         } else {
4883                 taddr = dev->l2ad_hand + distance;
4884         }
4885         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4886             uint64_t, taddr, boolean_t, all);
4887 
4888 top:
4889         mutex_enter(&l2arc_buflist_mtx);
4890         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4891                 ab_prev = list_prev(buflist, ab);
4892 
4893                 hash_lock = HDR_LOCK(ab);
4894                 if (!mutex_tryenter(hash_lock)) {
4895                         /*
4896                          * Missed the hash lock.  Retry.
4897                          */
4898                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4899                         mutex_exit(&l2arc_buflist_mtx);
4900                         mutex_enter(hash_lock);
4901                         mutex_exit(hash_lock);
4902                         goto top;
4903                 }
4904 
4905                 if (HDR_L2_WRITE_HEAD(ab)) {
4906                         /*
4907                          * We hit a write head node.  Leave it for
4908                          * l2arc_write_done().
4909                          */
4910                         list_remove(buflist, ab);
4911                         mutex_exit(hash_lock);
4912                         continue;
4913                 }
4914 
4915                 if (!all && ab->b_l2hdr != NULL &&
4916                     (ab->b_l2hdr->b_daddr > taddr ||
4917                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4918                         /*
4919                          * We've evicted to the target address,
4920                          * or the end of the device.
4921                          */
4922                         mutex_exit(hash_lock);
4923                         break;
4924                 }
4925 
4926                 if (HDR_FREE_IN_PROGRESS(ab)) {
4927                         /*
4928                          * Already on the path to destruction.
4929                          */
4930                         mutex_exit(hash_lock);
4931                         continue;
4932                 }
4933 
4934                 if (ab->b_state == arc_l2c_only) {
4935                         ASSERT(!HDR_L2_READING(ab));
4936                         /*
4937                          * This doesn't exist in the ARC.  Destroy.
4938                          * arc_hdr_destroy() will call list_remove()
4939                          * and decrement arcstat_l2_size.
4940                          */
4941                         arc_change_state(arc_anon, ab, hash_lock);
4942                         arc_hdr_destroy(ab);
4943                 } else {
4944                         /*
4945                          * Invalidate issued or about to be issued
4946                          * reads, since we may be about to write
4947                          * over this location.
4948                          */
4949                         if (HDR_L2_READING(ab)) {
4950                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4951                                 ab->b_flags |= ARC_L2_EVICTED;
4952                         }
4953 
4954                         /*
4955                          * Tell ARC this no longer exists in L2ARC.
4956                          */
4957                         if (ab->b_l2hdr != NULL) {
4958                                 l2hdr = ab->b_l2hdr;
4959                                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4960                                 ab->b_l2hdr = NULL;
4961                                 kmem_free(l2hdr, sizeof (*l2hdr));
4962                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4963                         }
4964                         list_remove(buflist, ab);
4965 
4966                         /*
4967                          * This may have been leftover after a
4968                          * failed write.
4969                          */
4970                         ab->b_flags &= ~ARC_L2_WRITING;
4971                 }
4972                 mutex_exit(hash_lock);
4973         }
4974         mutex_exit(&l2arc_buflist_mtx);
4975 
4976         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4977         dev->l2ad_evict = taddr;
4978 }
4979 
4980 /*
4981  * Find and write ARC buffers to the L2ARC device.
4982  *
4983  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4984  * for reading until they have completed writing.
4985  * The headroom_boost is an in-out parameter used to maintain headroom boost
4986  * state between calls to this function.
4987  *
4988  * Returns the number of bytes actually written (which may be smaller than
4989  * the delta by which the device hand has changed due to alignment).
4990  */
4991 static uint64_t
4992 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4993     boolean_t *headroom_boost)
4994 {
4995         arc_buf_hdr_t *ab, *ab_prev, *head;
4996         list_t *list;
4997         /*
4998          * These variables mean:
4999          * - write_size: in-memory size of ARC buffers we've written (before
5000          *      compression).
5001          * - write_asize: actual on-disk size of ARC buffers we've written
5002          *      (after compression).
5003          * - write_aligned_asize: actual sum of space taken by ARC buffers
5004          *      on the device (after compression and alignment, so that
5005          *      every buffer starts on a multiple of the device block size).
5006          * - headroom: L2ARC scanning headroom (we won't scan beyond this
5007          *      distance from the list tail).
5008          * - buf_compress_minsz: minimum in-memory ARC buffer size for us
5009          *      to try compressing it.
5010          */
5011         uint64_t write_size, write_asize, write_aligned_asize, headroom,
5012             buf_compress_minsz;
5013         void *buf_data;
5014         kmutex_t *list_lock;
5015         boolean_t full;
5016         l2arc_write_callback_t *cb;
5017         zio_t *pio, *wzio;
5018         uint64_t guid = spa_load_guid(spa);
5019         const boolean_t do_headroom_boost = *headroom_boost;
5020         boolean_t dev_hdr_update = B_FALSE;
5021 
5022         ASSERT(dev->l2ad_vdev != NULL);
5023 
5024         /* Lower the flag now, we might want to raise it again later. */
5025         *headroom_boost = B_FALSE;
5026 
5027         pio = NULL;
5028         cb = NULL;
5029         write_size = write_asize = write_aligned_asize = 0;
5030         full = B_FALSE;
5031         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5032         head->b_flags |= ARC_L2_WRITE_HEAD;
5033 
5034         /*
5035          * We will want to try to compress buffers that are at least 2x the
5036          * device sector size.
5037          */
5038         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5039 
5040         /*
5041          * Copy buffers for L2ARC writing.
5042          */
5043         mutex_enter(&l2arc_buflist_mtx);
5044         for (int try = 0; try <= 3; try++) {
5045                 uint64_t passed_sz = 0;
5046 
5047                 list = l2arc_list_locked(try, &list_lock);
5048 
5049                 /*
5050                  * L2ARC fast warmup.
5051                  *
5052                  * Until the ARC is warm and starts to evict, read from the
5053                  * head of the ARC lists rather than the tail.
5054                  */
5055                 if (arc_warm == B_FALSE)
5056                         ab = list_head(list);
5057                 else
5058                         ab = list_tail(list);
5059 
5060                 headroom = target_sz * l2arc_headroom;
5061                 if (do_headroom_boost)
5062                         headroom = (headroom * l2arc_headroom_boost) / 100;
5063 
5064                 for (; ab; ab = ab_prev) {
5065                         l2arc_buf_hdr_t *l2hdr;
5066                         kmutex_t *hash_lock;
5067                         uint64_t buf_aligned_size;
5068 
5069                         if (arc_warm == B_FALSE)
5070                                 ab_prev = list_next(list, ab);
5071                         else
5072                                 ab_prev = list_prev(list, ab);
5073 
5074                         hash_lock = HDR_LOCK(ab);
5075                         if (!mutex_tryenter(hash_lock)) {
5076                                 /*
5077                                  * Skip this buffer rather than waiting.
5078                                  */
5079                                 continue;
5080                         }
5081 
5082                         /*
5083                          * When examining whether we've met our write target,
5084                          * we must always use the aligned size of the buffer,
5085                          * since that's the maximum amount of space a buffer
5086                          * can take up on the L2ARC device.
5087                          */
5088                         buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
5089                             ab->b_size);
5090                         passed_sz += buf_aligned_size;
5091                         if (passed_sz > headroom) {
5092                                 /*
5093                                  * Searched too far.
5094                                  */
5095                                 mutex_exit(hash_lock);
5096                                 break;
5097                         }
5098 
5099                         if (!l2arc_write_eligible(guid, ab)) {
5100                                 mutex_exit(hash_lock);
5101                                 continue;
5102                         }
5103 
5104                         if ((write_size + buf_aligned_size) > target_sz) {
5105                                 full = B_TRUE;
5106                                 mutex_exit(hash_lock);
5107                                 break;
5108                         }
5109 
5110                         if (pio == NULL) {
5111                                 /*
5112                                  * Insert a dummy header on the buflist so
5113                                  * l2arc_write_done() can find where the
5114                                  * write buffers begin without searching.
5115                                  */
5116                                 list_insert_head(dev->l2ad_buflist, head);
5117 
5118                                 cb = kmem_zalloc(
5119                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5120                                 cb->l2wcb_dev = dev;
5121                                 cb->l2wcb_head = head;
5122                                 list_create(&cb->l2wcb_log_blk_buf_list,
5123                                     sizeof (l2arc_log_blk_buf_t),
5124                                     offsetof(l2arc_log_blk_buf_t, l2lbb_node));
5125                                 pio = zio_root(spa, l2arc_write_done, cb,
5126                                     ZIO_FLAG_CANFAIL);
5127                         }
5128 
5129                         /*
5130                          * Create and add a new L2ARC header.
5131                          */
5132                         l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5133                         l2hdr->b_dev = dev;
5134                         ab->b_flags |= ARC_L2_WRITING;
5135 
5136                         /*
5137                          * Temporarily stash the data buffer in b_tmp_cdata.
5138                          * The subsequent write step will pick it up from
                         * there. This is because we can't access ab->b_buf
5140                          * without holding the hash_lock, which we in turn
5141                          * can't access without holding the ARC list locks
5142                          * (which we want to avoid during compression/writing).
5143                          */
5144                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
5145                         l2hdr->b_asize = ab->b_size;
5146                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5147 
5148                         ab->b_l2hdr = l2hdr;
5149 
5150                         list_insert_head(dev->l2ad_buflist, ab);
5151 
5152                         /*
5153                          * Compute and store the buffer cksum before
5154                          * writing.  On debug the cksum is verified first.
5155                          */
5156                         arc_cksum_verify(ab->b_buf);
5157                         arc_cksum_compute(ab->b_buf, B_TRUE);
5158 
5159                         mutex_exit(hash_lock);
5160 
5161                         write_size += buf_aligned_size;
5162                 }
5163 
5164                 mutex_exit(list_lock);
5165 
5166                 if (full == B_TRUE)
5167                         break;
5168         }
5169 
5170         /* No buffers selected for writing? */
5171         if (pio == NULL) {
5172                 ASSERT0(write_size);
5173                 mutex_exit(&l2arc_buflist_mtx);
5174                 kmem_cache_free(hdr_cache, head);
5175                 return (0);
5176         }
5177 
5178         /*
5179          * Now start writing the buffers. We're starting at the write head
5180          * and work backwards, retracing the course of the buffer selector
5181          * loop above.
5182          */
5183         for (ab = list_prev(dev->l2ad_buflist, head); ab;
5184             ab = list_prev(dev->l2ad_buflist, ab)) {
5185                 l2arc_buf_hdr_t *l2hdr;
5186                 uint64_t buf_sz;
5187 
5188                 /*
5189                  * We shouldn't need to lock the buffer here, since we flagged
5190                  * it as ARC_L2_WRITING in the previous step, but we must take
5191                  * care to only access its L2 cache parameters. In particular,
5192                  * ab->b_buf may be invalid by now due to ARC eviction.
5193                  */
5194                 l2hdr = ab->b_l2hdr;
5195                 l2hdr->b_daddr = dev->l2ad_hand;
5196 
5197                 if ((ab->b_flags & ARC_L2COMPRESS) &&
5198                     l2hdr->b_asize >= buf_compress_minsz) {
5199                         if (l2arc_compress_buf(l2hdr)) {
5200                                 /*
5201                                  * If compression succeeded, enable headroom
5202                                  * boost on the next scan cycle.
5203                                  */
5204                                 *headroom_boost = B_TRUE;
5205                         }
5206                 }
5207 
5208                 /*
5209                  * Pick up the buffer data we had previously stashed away
5210                  * (and now potentially also compressed).
5211                  */
5212                 buf_data = l2hdr->b_tmp_cdata;
5213                 buf_sz = l2hdr->b_asize;
5214 
5215                 /* Compression may have squashed the buffer to zero length. */
5216                 if (buf_sz != 0) {
5217                         uint64_t buf_aligned_asize;
5218 
5219                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
5220                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5221                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5222                             ZIO_FLAG_CANFAIL, B_FALSE);
5223 
5224                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5225                             zio_t *, wzio);
5226                         (void) zio_nowait(wzio);
5227 
5228                         write_asize += buf_sz;
5229                         /*
5230                          * Keep the clock hand suitably device-aligned.
5231                          */
5232                         buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
5233                             buf_sz);
5234                         write_aligned_asize += buf_aligned_asize;
5235                         dev->l2ad_hand += buf_aligned_asize;
5236                         ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
5237                             dev->l2ad_first);
5238                 }
5239 
5240                 if (l2arc_log_blk_insert(dev, ab)) {
5241                         l2arc_log_blk_commit(dev, pio, cb);
5242                         dev_hdr_update = B_TRUE;
5243                 }
5244         }
5245         mutex_exit(&l2arc_buflist_mtx);
5246 
5247         if (dev_hdr_update)
5248                 l2arc_dev_hdr_update(dev, pio);
5249 
5250         VERIFY3U(write_aligned_asize, <=, target_sz);
5251         ARCSTAT_BUMP(arcstat_l2_writes_sent);
5252         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5253         ARCSTAT_INCR(arcstat_l2_size, write_size);
5254         ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
5255         vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
5256 
5257         /*
5258          * Bump device hand to the device start if it is approaching the end.
5259          * l2arc_evict() will already have evicted ahead for this case.
5260          */
5261         if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
5262             dev->l2ad_end) {
5263                 vdev_space_update(dev->l2ad_vdev,
5264                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
5265                 dev->l2ad_hand = dev->l2ad_start;
5266                 dev->l2ad_evict = dev->l2ad_start;
5267                 dev->l2ad_first = B_FALSE;
5268         }
5269 
5270         dev->l2ad_writing = B_TRUE;
5271         (void) zio_wait(pio);
5272         dev->l2ad_writing = B_FALSE;
5273 
5274         return (write_asize);
5275 }
5276 
5277 /*
5278  * Compresses an L2ARC buffer.
5279  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5280  * size in l2hdr->b_asize. This routine tries to compress the data and
5281  * depending on the compression result there are three possible outcomes:
5282  * *) The buffer was incompressible. The original l2hdr contents were left
5283  *    untouched and are ready for writing to an L2 device.
5284  * *) The buffer was all-zeros, so there is no need to write it to an L2
5285  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5286  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5287  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5288  *    data buffer which holds the compressed data to be written, and b_asize
5289  *    tells us how much data there is. b_compress is set to the appropriate
5290  *    compression algorithm. Once writing is done, invoke
5291  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5292  *
5293  * Returns B_TRUE if compression succeeded (including the all-zeros case),
5294  * or B_FALSE if it didn't (the buffer was incompressible).
5295  */
5296 static boolean_t
5297 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5298 {
5299         void *cdata;
5300         size_t csize, len;
5301 
5302         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5303         ASSERT(l2hdr->b_tmp_cdata != NULL);
5304 
5305         len = l2hdr->b_asize;
5306         cdata = zio_data_buf_alloc(len);
5307         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5308             cdata, l2hdr->b_asize);
5309 
5310         if (csize == 0) {
5311                 /* zero block, indicate that there's nothing to write */
5312                 zio_data_buf_free(cdata, len);
5313                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5314                 l2hdr->b_asize = 0;
5315                 l2hdr->b_tmp_cdata = NULL;
5316                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5317                 return (B_TRUE);
5318         } else if (csize > 0 && csize < len) {
5319                 /*
5320                  * Compression succeeded, we'll keep the cdata around for
5321                  * writing and release it afterwards.
5322                  */
5323                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5324                 l2hdr->b_asize = csize;
5325                 l2hdr->b_tmp_cdata = cdata;
5326                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5327                 return (B_TRUE);
5328         } else {
5329                 /*
5330                  * Compression failed, release the compressed buffer.
5331                  * l2hdr will be left unmodified.
5332                  */
5333                 zio_data_buf_free(cdata, len);
5334                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5335                 return (B_FALSE);
5336         }
5337 }
5338 
5339 /*
5340  * Decompresses a zio read back from an l2arc device. On success, the
5341  * underlying zio's io_data buffer is overwritten by the uncompressed
5342  * version. On decompression error (corrupt compressed stream), the
5343  * zio->io_error value is set to signal an I/O error.
5344  *
5345  * Please note that the compressed data stream is not checksummed, so
5346  * if the underlying device is experiencing data corruption, we may feed
5347  * corrupt data to the decompressor, so the decompressor needs to be
5348  * able to handle this situation (LZ4 does).
5349  */
5350 static void
5351 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5352 {
5353         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5354 
5355         if (zio->io_error != 0) {
5356                 /*
5357                  * An io error has occurred, just restore the original io
5358                  * size in preparation for a main pool read.
5359                  */
5360                 zio->io_orig_size = zio->io_size = hdr->b_size;
5361                 return;
5362         }
5363 
5364         if (c == ZIO_COMPRESS_EMPTY) {
5365                 /*
5366                  * An empty buffer results in a null zio, which means we
5367                  * need to fill its io_data after we're done restoring the
5368                  * buffer's contents.
5369                  */
5370                 ASSERT(hdr->b_buf != NULL);
5371                 bzero(hdr->b_buf->b_data, hdr->b_size);
5372                 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5373         } else {
5374                 ASSERT(zio->io_data != NULL);
5375                 /*
5376                  * We copy the compressed data from the start of the arc buffer
5377                  * (the zio_read will have pulled in only what we need, the
5378                  * rest is garbage which we will overwrite at decompression)
5379                  * and then decompress back to the ARC data buffer. This way we
5380                  * can minimize copying by simply decompressing back over the
5381                  * original compressed data (rather than decompressing to an
5382                  * aux buffer and then copying back the uncompressed buffer,
5383                  * which is likely to be much larger).
5384                  */
5385                 uint64_t csize;
5386                 void *cdata;
5387 
5388                 csize = zio->io_size;
5389                 cdata = zio_data_buf_alloc(csize);
5390                 bcopy(zio->io_data, cdata, csize);
5391                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5392                     hdr->b_size) != 0)
5393                         zio->io_error = EIO;
5394                 zio_data_buf_free(cdata, csize);
5395         }
5396 
5397         /* Restore the expected uncompressed IO size. */
5398         zio->io_orig_size = zio->io_size = hdr->b_size;
5399 }
5400 
5401 /*
5402  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5403  * This buffer serves as a temporary holder of compressed data while
5404  * the buffer entry is being written to an l2arc device. Once that is
5405  * done, we can dispose of it.
5406  */
5407 static void
5408 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5409 {
5410         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5411 
5412         if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5413                 /*
5414                  * If the data was compressed, then we've allocated a
5415                  * temporary buffer for it, so now we need to release it.
5416                  */
5417                 ASSERT(l2hdr->b_tmp_cdata != NULL);
5418                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5419         }
5420         l2hdr->b_tmp_cdata = NULL;
5421 }
5422 
5423 /*
5424  * This thread feeds the L2ARC at regular intervals.  This is the beating
5425  * heart of the L2ARC.
5426  */
5427 static void
5428 l2arc_feed_thread(void)
5429 {
5430         callb_cpr_t cpr;
5431         l2arc_dev_t *dev;
5432         spa_t *spa;
5433         uint64_t size, wrote;
5434         clock_t begin, next = ddi_get_lbolt();
5435         boolean_t headroom_boost = B_FALSE;
5436 
5437         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5438 
5439         mutex_enter(&l2arc_feed_thr_lock);
5440 
5441         while (l2arc_thread_exit == 0) {
5442                 CALLB_CPR_SAFE_BEGIN(&cpr);
5443                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5444                     next);
5445                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
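                     /*
                      * Default to waking up again in a second; this is
                      * recomputed below once we know how much was written.
                      */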
5446                 next = ddi_get_lbolt() + hz;
5447 
5448                 /*
5449                  * Quick check for L2ARC devices.
5450                  */
5451                 mutex_enter(&l2arc_dev_mtx);
5452                 if (l2arc_ndev == 0) {
5453                         mutex_exit(&l2arc_dev_mtx);
5454                         continue;
5455                 }
5456                 mutex_exit(&l2arc_dev_mtx);
5457                 begin = ddi_get_lbolt();
5458 
5459                 /*
5460                  * This selects the next l2arc device to write to, and in
5461                  * doing so the next spa to feed from: dev->l2ad_spa.   This
5462                  * will return NULL if there are now no l2arc devices or if
5463                  * they are all faulted.
5464                  *
5465                  * If a device is returned, its spa's config lock is also
5466                  * held to prevent device removal.  l2arc_dev_get_next()
5467                  * will grab and release l2arc_dev_mtx.
5468                  */
5469                 if ((dev = l2arc_dev_get_next()) == NULL)
5470                         continue;
5471 
5472                 spa = dev->l2ad_spa;
5473                 ASSERT(spa != NULL);
5474 
5475                 /*
5476                  * If the pool is read-only then force the feed thread to
5477                  * sleep a little longer.
5478                  */
5479                 if (!spa_writeable(spa)) {
5480                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5481                         spa_config_exit(spa, SCL_L2ARC, dev);
5482                         continue;
5483                 }
5484 
5485                 /*
5486                  * Avoid contributing to memory pressure.
5487                  */
5488                 if (arc_reclaim_needed()) {
5489                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5490                         spa_config_exit(spa, SCL_L2ARC, dev);
5491                         continue;
5492                 }
5493 
5494                 ARCSTAT_BUMP(arcstat_l2_feeds);
5495 
5496                 size = l2arc_write_size();
5497 
5498                 /*
5499                  * Evict L2ARC buffers that will be overwritten.
5500                  */
5501                 l2arc_evict(dev, size, B_FALSE);
5502 
5503                 /*
5504                  * Write ARC buffers.
5505                  */
5506                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5507 
5508                 /*
5509                  * Calculate interval between writes.
5510                  */
5511                 next = l2arc_write_interval(begin, size, wrote);
5512                 spa_config_exit(spa, SCL_L2ARC, dev);
5513         }
5514 
5515         l2arc_thread_exit = 0;
5516         cv_broadcast(&l2arc_feed_thr_cv);
5517         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
5518         thread_exit();
5519 }
5520 
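     /*
      * Returns B_TRUE if the given vdev is currently registered with the
      * L2ARC as a cache device.
      */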
5521 boolean_t
5522 l2arc_vdev_present(vdev_t *vd)
5523 {
5524         return (l2arc_vdev_get(vd) != NULL);
5525 }
5526 
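     /*
      * Locates the l2arc_dev_t associated with `vd', or NULL if the vdev is
      * not an L2ARC device. Grabs l2arc_dev_mtx for the list walk unless the
      * caller already holds it.
      */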
5527 static l2arc_dev_t *
5528 l2arc_vdev_get(vdev_t *vd)
5529 {
5530         l2arc_dev_t     *dev;
5531         boolean_t       held = MUTEX_HELD(&l2arc_dev_mtx);
5532 
5533         if (!held)
5534                 mutex_enter(&l2arc_dev_mtx);
5535         for (dev = list_head(l2arc_dev_list); dev != NULL;
5536             dev = list_next(l2arc_dev_list, dev)) {
5537                 if (dev->l2ad_vdev == vd)
5538                         break;
5539         }
5540         if (!held)
5541                 mutex_exit(&l2arc_dev_mtx);
5542 
5543         return (dev);
5544 }
5545 
5546 /*
5547  * Add a vdev for use by the L2ARC.  By this point the spa has already
5548  * validated the vdev and opened it. The `rebuild' flag indicates whether
5549  * we should attempt an L2ARC persistency rebuild.
5550  */
5551 void
5552 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5553 {
5554         l2arc_dev_t *adddev;
5555 
5556         ASSERT(!l2arc_vdev_present(vd));
5557 
5558         /*
5559          * Create a new l2arc device entry.
5560          */
5561         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5562         adddev->l2ad_spa = spa;
5563         adddev->l2ad_vdev = vd;
5564         /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
5565         adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
5566         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5567         adddev->l2ad_hand = adddev->l2ad_start;
5568         adddev->l2ad_evict = adddev->l2ad_start;
5569         adddev->l2ad_first = B_TRUE;
5570         adddev->l2ad_writing = B_FALSE;
5571 
5572         /*
5573          * This is a list of all ARC buffers that are still valid on the
5574          * device.
5575          */
5576         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5577         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5578             offsetof(arc_buf_hdr_t, b_l2node));
5579 
5580         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5581 
5582         /*
5583          * Add device to global list
5584          */
5585         mutex_enter(&l2arc_dev_mtx);
5586         list_insert_head(l2arc_dev_list, adddev);
5587         atomic_inc_64(&l2arc_ndev);
5588         if (rebuild && l2arc_rebuild_enabled &&
5589             adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
5590                 /*
5591                  * Just mark the device as pending for a rebuild. We won't
5592                  * be starting a rebuild in line here as it would block pool
5593                  * import. Instead spa_load_impl will hand that off to an
5594                  * async task which will call l2arc_spa_rebuild_start.
5595                  */
5596                 adddev->l2ad_rebuild = B_TRUE;
5597         }
5598         mutex_exit(&l2arc_dev_mtx);
5599 }
5600 
5601 /*
5602  * Remove a vdev from the L2ARC.
5603  */
5604 void
5605 l2arc_remove_vdev(vdev_t *vd)
5606 {
5607         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5608 
5609         /*
5610          * Find the device by vdev
5611          */
5612         mutex_enter(&l2arc_dev_mtx);
5613         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5614                 nextdev = list_next(l2arc_dev_list, dev);
5615                 if (vd == dev->l2ad_vdev) {
5616                         remdev = dev;
5617                         break;
5618                 }
5619         }
5620         ASSERT(remdev != NULL);
5621 
5622         /*
5623          * Remove device from global list
5624          */
5625         list_remove(l2arc_dev_list, remdev);
5626         l2arc_dev_last = NULL;          /* may have been invalidated */
5627         atomic_dec_64(&l2arc_ndev);
5628         mutex_exit(&l2arc_dev_mtx);
5629 
5630         /*
5631          * Clear all buflists and ARC references.  L2ARC device flush.
5632          */
5633         l2arc_evict(remdev, 0, B_TRUE);
5634         list_destroy(remdev->l2ad_buflist);
5635         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5636         kmem_free(remdev, sizeof (l2arc_dev_t));
5637 }
5638 
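     /*
      * Sets up the global L2ARC state: locks, condition variables and the
      * device and free-on-write lists.
      */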
5639 void
5640 l2arc_init(void)
5641 {
5642         l2arc_thread_exit = 0;
5643         l2arc_ndev = 0;
5644         l2arc_writes_sent = 0;
5645         l2arc_writes_done = 0;
5646 
5647         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5648         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5649         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5650         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5651         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5652 
5653         l2arc_dev_list = &L2ARC_dev_list;
5654         l2arc_free_on_write = &L2ARC_free_on_write;
5655         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5656             offsetof(l2arc_dev_t, l2ad_node));
5657         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5658             offsetof(l2arc_data_free_t, l2df_list_node));
5659 }
5660 
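     /*
      * Tears down the global L2ARC state set up by l2arc_init().
      */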
5661 void
5662 l2arc_fini(void)
5663 {
5664         /*
5665          * This is called from dmu_fini(), which is called from spa_fini();
5666          * Because of this, we can assume that all l2arc devices have
5667          * already been removed when the pools themselves were removed.
5668          */
5669 
5670         l2arc_do_free_on_write();
5671 
5672         mutex_destroy(&l2arc_feed_thr_lock);
5673         cv_destroy(&l2arc_feed_thr_cv);
5674         mutex_destroy(&l2arc_dev_mtx);
5675         mutex_destroy(&l2arc_buflist_mtx);
5676         mutex_destroy(&l2arc_free_on_write_mtx);
5677 
5678         list_destroy(l2arc_dev_list);
5679         list_destroy(l2arc_free_on_write);
5680 }
5681 
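     /*
      * Starts the L2ARC feed thread.  This is a no-op when the pools have
      * been opened read-only.
      */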
5682 void
5683 l2arc_start(void)
5684 {
5685         if (!(spa_mode_global & FWRITE))
5686                 return;
5687 
5688         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5689             TS_RUN, minclsyspri);
5690 }
5691 
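     /*
      * Signals the L2ARC feed thread to exit and waits until it has done so.
      */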
5692 void
5693 l2arc_stop(void)
5694 {
5695         if (!(spa_mode_global & FWRITE))
5696                 return;
5697 
5698         mutex_enter(&l2arc_feed_thr_lock);
5699         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5700         l2arc_thread_exit = 1;
5701         while (l2arc_thread_exit != 0)
5702                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5703         mutex_exit(&l2arc_feed_thr_lock);
5704 }
5705 
5706 /*
5707  * Punches out rebuild threads for the L2ARC devices in a spa. This should
5708  * be called as one of the final steps of a pool import.
5709  */
5710 void
5711 l2arc_spa_rebuild_start(spa_t *spa)
5712 {
5713         l2arc_dev_t     *dev;
5714         /*
5715          * Locate the spa's l2arc devices and kick off rebuild threads.
5716          */
5717         mutex_enter(&l2arc_dev_mtx);
5718         for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
5719                 dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
5720                 ASSERT(dev != NULL);
5721                 if (dev->l2ad_rebuild) {
5722                         (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
5723                             dev, 0, &p0, TS_RUN, minclsyspri);
5724                 }
5725         }
5726         mutex_exit(&l2arc_dev_mtx);
5727 }
5728 
5729 /*
5730  * Main entry point for L2ARC rebuilding.
5731  */
5732 static void
5733 l2arc_dev_rebuild_start(l2arc_dev_t *dev)
5734 {
5735         spa_t *spa = dev->l2ad_spa;
5736         vdev_t *vd = dev->l2ad_vdev;
5737 
5738         /* Lock out device removal. */
5739         spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5740         ASSERT(dev->l2ad_rebuild);
5741         (void) l2arc_rebuild(dev);
5742         dev->l2ad_rebuild = B_FALSE;
5743         spa_config_exit(spa, SCL_L2ARC, vd);
5744         thread_exit();
5745 }
5746 
5747 /*
5748  * This function implements the actual L2ARC metadata rebuild. It:
5749  *
5750  * 1) reads the device's header
5751  * 2) if a good device header is found, starts reading the log block chain
5752  * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
5753  *
5754  * Operation stops under any of the following conditions:
5755  *
5756  * 1) We reach the end of the log blk chain (the back-reference in the blk is
5757  *    invalid or loops over our starting point).
5758  * 2) We encounter *any* error condition (cksum errors, io errors, looped
5759  *    blocks, etc.).
5760  * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
5761  *    that keeps severely fragmented L2ARC log block chains or slow L2ARC
5762  *    devices from preventing a machine from finishing a pool import (and
5763  *    thus lets the administrator take corrective action, e.g. by kicking
5764  *    the misbehaving L2ARC device out of the pool, or by reimporting the
5765  *    pool with L2ARC rebuilding disabled).
5766  */
5767 static int
5768 l2arc_rebuild(l2arc_dev_t *dev)
5769 {
5770         int                     err;
5771         l2arc_log_blk_phys_t    *this_lb, *next_lb;
5772         uint8_t                 *this_lb_buf, *next_lb_buf;
5773         zio_t                   *this_io = NULL, *next_io = NULL;
5774         int64_t                 deadline;
5775         l2arc_log_blk_ptr_t     lb_ptrs[2];
5776         boolean_t               first_pass;
5777         uint64_t                load_guid;
5778 
5779         load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
5780         deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5781         /*
5782          * Device header processing phase.
5783          */
5784         if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
5785                 /* device header corrupted, start a new one */
5786                 bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
5787                 return (err);
5788         }
5789         if (l2arc_check_rebuild_timeout_hit(deadline))
5790                 return (SET_ERROR(ETIMEDOUT));
5791 
5792         /* Retrieve the persistent L2ARC device state */
5793         dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
5794         dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5795             dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
5796             LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
5797         dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
5798             L2ARC_DEV_HDR_EVICT_FIRST);
5799 
5800         /* Prepare the rebuild processing state */
5801         bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
5802         this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
5803         next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
5804         this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5805         next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5806         first_pass = B_TRUE;
5807 
5808         /* Start the rebuild process */
5809         for (;;) {
5810                 if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
5811                         /* We hit an invalid block address, end the rebuild. */
5812                         break;
5813 
5814                 if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
5815                     this_lb, next_lb, this_lb_buf, next_lb_buf,
5816                     this_io, &next_io)) != 0)
5817                         break;
5818 
5819                 /* Protection against infinite loops of log blocks. */
5820                 if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
5821                     lb_ptrs[0].l2lbp_daddr,
5822                     dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5823                     !first_pass) {
5824                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
5825                         err = SET_ERROR(ELOOP);
5826                         break;
5827                 }
5828 
5829                 /*
5830                  * Our memory pressure valve. If the system is running low
5831                  * on memory, rather than swamping memory with new ARC buf
5832                  * hdrs, we opt not to rebuild the L2ARC. At this point,
5833                  * however, we have already set up our L2ARC dev to chain in
5834                  * new metadata log blks, so the user may choose to re-add the
5835                  * L2ARC dev at a later time to reconstruct it (when there's
5836                  * less memory pressure).
5837                  */
5838                 if (arc_reclaim_needed()) {
5839                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5840                         cmn_err(CE_NOTE, "System running low on memory, "
5841                             "aborting L2ARC rebuild.");
5842                         err = SET_ERROR(ENOMEM);
5843                         break;
5844                 }
5845 
5846                 /*
5847                  * Now that we know that the next_lb checks out alright, we
5848                  * can start reconstruction from this lb - we can be sure
5849                  * that the L2ARC write hand has not yet reached any of our
5850                  * buffers.
5851                  */
5852                 l2arc_log_blk_restore(dev, load_guid, this_lb,
5853                     LBP_GET_PSIZE(&lb_ptrs[0]));
5854 
5855                 /*
5856                  * End of list detection. We can look ahead two steps in the
5857                  * blk chain and if the 2nd blk from this_lb dips below the
5858                  * initial chain starting point, then we know two things:
5859                  *      1) it can't be valid, and
5860                  *      2) the next_lb's ARC entries might have already been
5861                  *      partially overwritten and so we should stop before
5862                  *      we restore it
5863                  */
5864                 if (l2arc_range_check_overlap(
5865                     this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
5866                     dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5867                     !first_pass)
5868                         break;
5869 
5870                 /* log blk restored, continue with next one in the list */
5871                 lb_ptrs[0] = lb_ptrs[1];
5872                 lb_ptrs[1] = this_lb->l2lb_back2_lbp;
5873                 PTR_SWAP(this_lb, next_lb);
5874                 PTR_SWAP(this_lb_buf, next_lb_buf);
5875                 this_io = next_io;
5876                 next_io = NULL;
5877                 first_pass = B_FALSE;
5878 
5879                 if (l2arc_check_rebuild_timeout_hit(deadline)) {
5880                         err = SET_ERROR(ETIMEDOUT);
5881                         break;
5882                 }
5883         }
5884         if (next_io != NULL)
5885                 l2arc_log_blk_prefetch_abort(next_io);
5886         kmem_free(this_lb, sizeof (*this_lb));
5887         kmem_free(next_lb, sizeof (*next_lb));
5888         kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
5889         kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
5890         if (err == 0)
5891                 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5892 
5893         return (err);
5894 }
5895 
5896 /*
5897  * Restores the payload of a log blk to ARC. This creates empty ARC hdr
5898  * entries which only contain an l2arc hdr, essentially restoring the
5899  * buffers to their L2ARC evicted state. This function also updates space
5900  * usage on the L2ARC vdev to make sure it tracks restored buffers.
5901  */
5902 static void
5903 l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
5904     l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
5905 {
5906         uint64_t        size = 0, psize = 0;
5907 
5908         mutex_enter(&l2arc_buflist_mtx);
5909 
5910         for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
5911                 /*
5912                  * Restore goes in the reverse direction to preserve correct
5913                  * temporal ordering of buffers in the l2ad_buflist.
5914                  */
5915                 l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
5916                 size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
5917                 psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
5918         }
5919         mutex_exit(&l2arc_buflist_mtx);
5920 
5921         /*
5922          * Record rebuild stats:
5923          *      size            In-memory size of restored buffer data in ARC
5924          *      psize           Physical size of restored buffers in the L2ARC
5925          *      bufs            # of ARC buffer headers restored
5926          *      log_blks        # of L2ARC log blocks processed during restore
5927          */
5928         ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
5929         ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
5930         ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
5931         ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
5932         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
5933         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
5934         vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5935 }
5936 
5937 /*
5938  * Restores a single ARC buf hdr from a log block. The ARC buffer is put
5939  * into a state indicating that it has been evicted to L2ARC.
5940  */
5941 static void
5942 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
5943     uint64_t load_guid)
5944 {
5945         arc_buf_hdr_t   *hdr, *exists;
5946         kmutex_t        *hash_lock;
5947         arc_buf_contents_t      type = LE_GET_TYPE(le);
5948         l2arc_buf_hdr_t         *l2hdr;
5949 
5950         hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
5951         hdr->b_dva = le->l2le_dva;
5952         hdr->b_birth = le->l2le_birth;
5953         hdr->b_cksum0 = le->l2le_cksum0;
5954         hdr->b_size = LE_GET_LSIZE(le);
5955         exists = buf_hash_insert(hdr, &hash_lock);
5956         if (exists) {
5957                 /* Buffer was already cached, no need to restore it. */
5958                 mutex_exit(hash_lock);
5959                 arc_hdr_destroy(hdr);
5960                 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5961                 return;
5962         }
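             /*
              * The restored header carries no ARC data buffer of its own;
              * mark it as hashed and as eligible for L2ARC caching.
              */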
5963         hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
5964         if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
5965                 hdr->b_flags |= ARC_L2COMPRESS;
5966         mutex_enter(&hdr->b_freeze_lock);
5967         ASSERT(hdr->b_freeze_cksum == NULL);
5968         hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
5969         *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
5970         mutex_exit(&hdr->b_freeze_lock);
5971 
5972         /* now rebuild the l2arc entry */
5973         ASSERT(hdr->b_l2hdr == NULL);
5974         l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5975         l2hdr->b_dev = dev;
5976         l2hdr->b_daddr = le->l2le_daddr;
5977         l2hdr->b_asize = LE_GET_PSIZE(le);
5978         l2hdr->b_compress = LE_GET_COMPRESS(le);
5979         hdr->b_l2hdr = l2hdr;
5980         list_insert_tail(dev->l2ad_buflist, hdr);
5981         ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5982         ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5983 
5984         arc_change_state(arc_l2c_only, hdr, hash_lock);
5985         mutex_exit(hash_lock);
5986 }
5987 
5988 /*
5989  * Attempts to read the device header on the provided L2ARC device and writes
5990  * it to `hdr'. On success, this function returns 0, otherwise the appropriate
5991  * error code is returned.
5992  */
5993 static int
5994 l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
5995 {
5996         int             err;
5997         uint64_t        guid;
5998         zio_cksum_t     cksum;
5999 
6000         guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6001 
6002         if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
6003             VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
6004             ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6005             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6006             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
6007                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6008                 return (err);
6009         }
6010 
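             /*
              * The header may have been written by a system of the opposite
              * endianness; byteswap it in that case.
              */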
6011         if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
6012                 byteswap_uint64_array(hdr, sizeof (*hdr));
6013 
6014         if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
6015             hdr->l2dh_spa_guid != guid) {
6016                 /*
6017                  * Attempt to rebuild a device containing no actual dev hdr
6018                  * or containing a header from some other pool.
6019                  */
6020                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6021                 return (SET_ERROR(ENOTSUP));
6022         }
6023 
6024         l2arc_dev_hdr_checksum(hdr, &cksum);
6025         if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
6026                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6027                 return (SET_ERROR(EINVAL));
6028         }
6029         if (hdr->l2dh_evict_tail < dev->l2ad_start ||
6030             hdr->l2dh_evict_tail >= dev->l2ad_end) {
6031                 /* Data in dev hdr is invalid for this device. */
6032                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6033                 return (SET_ERROR(EINVAL));
6034         }
6035 
6036         return (0);
6037 }
6038 
6039 /*
6040  * Reads L2ARC log blocks from storage and validates their contents.
6041  *
6042  * This function implements a simple prefetcher to make sure that while
6043  * we're processing one buffer the L2ARC is already prefetching the next
6044  * one in the chain.
6045  *
6046  * The arguments this_lbp and next_lbp point to the current and next log blk
6047  * address in the block chain. Similarly, this_lb and next_lb hold the
6048  * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
6049  * and next_lb_buf must be buffers of appropriate size to hold a raw
6050  * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
6051  * to buffer decompression).
6052  *
6053  * The `this_io' and `next_io' arguments are used for block prefetching.
6054  * When issuing the first blk IO during rebuild, you should pass NULL for
6055  * `this_io'. This function will then issue a sync IO to read the block and
6056  * also issue an async IO to fetch the next block in the block chain. The
6057  * prefetch IO is returned in `next_io'. On subsequent calls to this
6058  * function, pass the value returned in `next_io' from the previous call
6059  * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
6060  * Prior to the call, you should initialize your `next_io' pointer to be
6061  * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
6062  *
6063  * On success, this function returns 0, otherwise it returns an appropriate
6064  * error code. On error the prefetching IO is aborted and cleared before
6065  * returning from this function. Therefore, if we return `success', the
6066  * caller can assume that we have taken care of cleanup of prefetch IOs.
6067  */
6068 static int
6069 l2arc_log_blk_read(l2arc_dev_t *dev,
6070     const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
6071     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
6072     uint8_t *this_lb_buf, uint8_t *next_lb_buf,
6073     zio_t *this_io, zio_t **next_io)
6074 {
6075         int err = 0;
6076         zio_cksum_t cksum;
6077 
6078         ASSERT(this_lbp != NULL && next_lbp != NULL);
6079         ASSERT(this_lb != NULL && next_lb != NULL);
6080         ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
6081         ASSERT(next_io != NULL && *next_io == NULL);
6082         ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
6083 
6084         /*
6085          * Check to see if we have issued the IO for this log blk in a
6086          * previous run. If not, this is the first call, so issue it now.
6087          */
6088         if (this_io == NULL) {
6089                 this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
6090                     this_lb_buf);
6091         }
6092 
6093         /*
6094          * Peek to see if we can start issuing the next IO immediately.
6095          */
6096         if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
6097                 /*
6098                  * Start issuing IO for the next log blk early - this
6099                  * should help keep the L2ARC device busy while we
6100                  * decompress and restore this log blk.
6101                  */
6102                 *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
6103                     next_lb_buf);
6104         }
6105 
6106         /* Wait for the IO to read this log block to complete */
6107         if ((err = zio_wait(this_io)) != 0) {
6108                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6109                 goto cleanup;
6110         }
6111 
6112         /* Make sure the buffer checks out */
6113         fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
6114         if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
6115                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6116                 err = SET_ERROR(EINVAL);
6117                 goto cleanup;
6118         }
6119 
6120         /* Now we can take our time decoding this buffer */
6121         switch (LBP_GET_COMPRESS(this_lbp)) {
6122         case ZIO_COMPRESS_OFF:
6123                 bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
6124                 break;
6125         case ZIO_COMPRESS_LZ4:
6126                 if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
6127                     this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
6128                     sizeof (*this_lb))) != 0) {
6129                         err = SET_ERROR(EINVAL);
6130                         goto cleanup;
6131                 }
6132                 break;
6133         default:
6134                 err = SET_ERROR(EINVAL);
6135                 goto cleanup;
6136         }
6137         if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
6138                 byteswap_uint64_array(this_lb, sizeof (*this_lb));
6139         if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
6140                 err = SET_ERROR(EINVAL);
6141                 goto cleanup;
6142         }
6143 cleanup:
6144         /* Abort an in-flight prefetch I/O in case of error */
6145         if (err != 0 && *next_io != NULL) {
6146                 l2arc_log_blk_prefetch_abort(*next_io);
6147                 *next_io = NULL;
6148         }
6149         return (err);
6150 }
6151 
6152 /*
6153  * Validates an L2ARC log blk address to make sure that it can be read
6154  * from the provided L2ARC device. Returns B_TRUE if the address is
6155  * within the device's bounds, or B_FALSE if not.
6156  */
6157 static boolean_t
6158 l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
6159 {
6160         uint64_t psize = LBP_GET_PSIZE(lbp);
6161         uint64_t end = lbp->l2lbp_daddr + psize;
6162 
6163         /*
6164          * A log block is valid if all of the following conditions are true:
6165          * - it fits entirely between l2ad_start and l2ad_end
6166          * - it has a valid size
6167          * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
6168          *      doesn't sit in the evicted region)
6169          */
6170         return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
6171             psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
6172             lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
6173 }
6174 
6175 /*
6176  * Starts an asynchronous read IO to read a log block. This is used in log
6177  * block reconstruction to start reading the next block before we are done
6178  * decoding and reconstructing the current block, to keep the l2arc device
6179  * nice and hot with read IO to process.
6180  * The returned zio will contain newly allocated memory buffers for the IO
6181  * data, which should then be freed by the caller once the zio is no longer
6182  * needed (i.e. once it has completed). If you wish to abort this
6183  * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes
6184  * care of disposing of the allocated buffers correctly.
6185  */
6186 static zio_t *
6187 l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
6188     uint8_t *lb_buf)
6189 {
6190         uint32_t psize;
6191         zio_t *pio;
6192 
6193         psize = LBP_GET_PSIZE(lbp);
6194         ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
6195         pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
6196             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6197             ZIO_FLAG_DONT_RETRY);
6198         (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
6199             lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6200             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6201             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
6202 
6203         return (pio);
6204 }
6205 
6206 /*
6207  * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
6208  * buffers allocated for it.
6209  */
6210 static void
6211 l2arc_log_blk_prefetch_abort(zio_t *zio)
6212 {
6213         (void) zio_wait(zio);
6214 }
6215 
6216 /*
6217  * Creates a zio to update the device header on an l2arc device. The zio is
6218  * initiated as a child of `pio'.
6219  */
6220 static void
6221 l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
6222 {
6223         zio_t *wzio;
6224         vdev_stat_t st;
6225         l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
6226 
6227         vdev_get_stats(dev->l2ad_vdev, &st);
6228 
6229         hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
6230         hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6231         hdr->l2dh_evict_tail = dev->l2ad_evict;
6232         hdr->l2dh_alloc_space = st.vs_alloc;
6233         hdr->l2dh_flags = 0;
6234         if (dev->l2ad_first)
6235                 hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
6236 
6237         /* checksum operation goes last */
6238         l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
6239 
6240         CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
6241             sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
6242         wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6243             sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
6244             NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6245         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6246             zio_t *, wzio);
6247         (void) zio_nowait(wzio);
6248 }
6249 
6250 /*
6251  * Commits a log block to the L2ARC device. This routine is invoked from
6252  * l2arc_write_buffers when the log block fills up.
6253  * This function allocates some memory to temporarily hold the serialized
6254  * buffer to be written. This is then released in l2arc_write_done.
6255  */
6256 static void
6257 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
6258     l2arc_write_callback_t *cb)
6259 {
6260         l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
6261         uint64_t                psize, asize;
6262         l2arc_log_blk_buf_t     *lb_buf;
6263         zio_t                   *wzio;
6264 
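             /*
              * A log block is only ever committed once it has completely
              * filled up with entries.
              */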
6265         VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6266 
6267         /* link the buffer into the block chain */
6268         lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
6269         lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
6270 
6271         /* try to compress the buffer */
6272         lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
6273         list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
6274         VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
6275             lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
6276 
6277         /*
6278          * Update the start log blk pointer in the device header to point
6279          * to the log block we're about to write.
6280          */
6281         dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
6282             dev->l2ad_dev_hdr.l2dh_start_lbps[0];
6283         dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
6284         LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
6285         LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
6286         LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6287             ZIO_CHECKSUM_FLETCHER_4);
6288         LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
6289         if (psize < sizeof (*lb)) {
6290                 /* compression succeeded */
6291                 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6292                     ZIO_COMPRESS_LZ4);
6293         } else {
6294                 /* compression failed */
6295                 bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
6296                 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6297                     ZIO_COMPRESS_OFF);
6298         }
6299         /* checksum what we're about to write */
6300         fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
6301             &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
6302 
6303         /* perform the write itself */
6304         CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
6305             L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
6306         wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
6307             psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
6308             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6309         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
6310         (void) zio_nowait(wzio);
6311 
6312         /* realign the device hand */
6313         asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
6314         dev->l2ad_hand += asize;
6315         VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
6316         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
6317 
6318         /* bump the kstats */
6319         ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
6320         ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
6321         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
6322         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
6323             dev->l2ad_log_blk_payload_asize / asize);
6324 
6325         dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
6326 }
6327 
6328 /*
6329  * Computes the checksum of `hdr' and stores it in `cksum'.
6330  */
6331 static void
6332 l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
6333 {
6334         fletcher_4_native((uint8_t *)hdr +
6335             offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6336             sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6337             cksum);
6338 }
6339 
6340 /*
6341  * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
6342  * The buffer being inserted must be present in L2ARC.
6343  * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
6344  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
6345  */
6346 static boolean_t
6347 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
6348 {
6349         l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6350         l2arc_log_ent_phys_t *le;
6351         const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
6352         int index = dev->l2ad_log_ent_idx++;
6353 
6354         ASSERT(l2hdr != NULL);
6355         ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
6356 
6357         le = &lb->l2lb_entries[index];
6358         bzero(le, sizeof (*le));
6359         le->l2le_dva = ab->b_dva;
6360         le->l2le_birth = ab->b_birth;
6361         le->l2le_cksum0 = ab->b_cksum0;
6362         le->l2le_daddr = l2hdr->b_daddr;
6363         LE_SET_LSIZE(le, ab->b_size);
6364         LE_SET_PSIZE(le, l2hdr->b_asize);
6365         LE_SET_COMPRESS(le, l2hdr->b_compress);
6366         le->l2le_freeze_cksum = *ab->b_freeze_cksum;
6367         LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
6368         LE_SET_TYPE(le, ab->b_type);
6369         dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
6370 
6371         return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6372 }
6373 
6374 /*
6375  * Checks whether a given L2ARC device address sits in a time-sequential
6376  * range. The trick here is that the L2ARC is a rotary buffer, so we can't
6377  * just do a range comparison, we need to handle the situation in which the
6378  * range wraps around the end of the L2ARC device. Arguments:
6379  *      bottom  Lower end of the range to check (written to earlier).
6380  *      top     Upper end of the range to check (written to later).
6381  *      check   The address for which we want to determine if it sits in
6382  *              between the top and bottom.
6383  *
6384  * The 3-way conditional below represents the following cases:
6385  *
6386  *      bottom < top : Sequentially ordered case:
6387  *        <check>--------+-------------------+
6388  *                       |  (overlap here?)  |
6389  *       L2ARC dev       V                   V
6390  *       |---------------<bottom>============<top>--------------|
6391  *
6392  *      bottom > top: Looped-around case:
6393  *                            <check>--------+------------------+
6394  *                                           |  (overlap here?) |
6395  *       L2ARC dev                           V                  V
6396  *       |===============<top>---------------<bottom>===========|
6397  *       ^               ^
6398  *       |  (or here?)   |
6399  *       +---------------+---------<check>
6400  *
6401  *      top == bottom : Just a single address comparison.
6402  */
6403 static inline boolean_t
6404 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
6405 {
6406         if (bottom < top)
6407                 return (bottom <= check && check <= top);
6408         else if (bottom > top)
6409                 return (check <= top || bottom <= check);
6410         else
6411                 return (check == top);
6412 }
6413 
6414 /*
6415  * Checks whether a rebuild timeout deadline has been hit and if it has,
6416  * increments the appropriate error counters.
6417  */
6418 static boolean_t
6419 l2arc_check_rebuild_timeout_hit(int64_t deadline)
6420 {
6421         if (deadline != 0 && deadline < ddi_get_lbolt64()) {
6422                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
6423                 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
6424                     "dropping remaining L2ARC metadata.");
6425                 return (B_TRUE);
6426         } else {
6427                 return (B_FALSE);
6428         }
6429 }