1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 */
27
28 /*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory. This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about. Our cache is not so simple. At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them. Blocks are only evictable
43 * when there are no external references active. This makes
44 * eviction far more problematic: we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space. In these circumstances we are unable to adjust the cache
49 * size. To prevent the cache from growing unbounded at these times, we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss. Our model has a variable sized cache. It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size. So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict. In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes). We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
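/*
 * Illustrative sketch (not part of the build; next_evictable() and
 * evict_block() are placeholder names): the variable-block-size eviction
 * described in point 3 amounts to walking an eviction list tail-first and
 * accumulating evictable blocks until roughly the requested number of
 * bytes has been reclaimed:
 *
 *	int64_t freed = 0;
 *	while (freed < bytes && (ab = next_evictable(list)) != NULL)
 *		freed += evict_block(ab);
 *
 * The in-tree eviction logic lives in arc_evict(), further down this file.
 */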
72
73 /*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists. The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2. We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes, rather they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table. It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state. When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock. Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()). Note however that the data associated
104 * with the buffer may be evicted prior to the callback. The callback
105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
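/*
 * Illustrative sketch (not part of the build) of the lock ordering rule
 * above: while holding an arc list (state) lock, a hash lock may only be
 * taken with mutex_tryenter(), and the buffer is skipped on failure,
 * which is roughly what arc_evict() does:
 *
 *	mutex_enter(&state->arcs_mtx);
 *	hash_lock = HDR_LOCK(ab);
 *	if (mutex_tryenter(hash_lock)) {
 *		(evict or move the buffer)
 *		mutex_exit(hash_lock);
 *	} else {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *	}
 *	mutex_exit(&state->arcs_mtx);
 */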
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139 #include <sys/byteorder.h>
140
141 #ifndef _KERNEL
142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
143 boolean_t arc_watch = B_FALSE;
144 int arc_procfd;
145 #endif
146
147 static kmutex_t arc_reclaim_thr_lock;
148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
149 static uint8_t arc_thread_exit;
150
151 extern int zfs_write_limit_shift;
152 extern uint64_t zfs_write_limit_max;
153 extern kmutex_t zfs_write_limit_lock;
154
155 #define ARC_REDUCE_DNLC_PERCENT 3
156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157
158 typedef enum arc_reclaim_strategy {
159 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
160 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
161 } arc_reclaim_strategy_t;
162
163 /* number of seconds before growing cache again */
164 static int arc_grow_retry = 60;
165
166 /* shift of arc_c for calculating both min and max arc_p */
167 static int arc_p_min_shift = 4;
168
169 /* log2(fraction of arc to reclaim) */
170 static int arc_shrink_shift = 5;
171
172 /*
173 * minimum lifespan of a prefetch block in clock ticks
174 * (initialized in arc_init())
175 */
176 static int arc_min_prefetch_lifespan;
177
178 static int arc_dead;
179
180 /*
181 * The arc has filled available memory and has now warmed up.
182 */
183 static boolean_t arc_warm;
184
185 /*
186 * These tunables are for performance analysis.
187 */
188 uint64_t zfs_arc_max;
189 uint64_t zfs_arc_min;
190 uint64_t zfs_arc_meta_limit = 0;
191 int zfs_arc_grow_retry = 0;
192 int zfs_arc_shrink_shift = 0;
193 int zfs_arc_p_min_shift = 0;
194 int zfs_disable_dup_eviction = 0;
195
196 /*
197 * Note that buffers can be in one of 6 states:
198 * ARC_anon - anonymous (discussed below)
199 * ARC_mru - recently used, currently cached
200 * ARC_mru_ghost - recently used, no longer in cache
201 * ARC_mfu - frequently used, currently cached
202 * ARC_mfu_ghost - frequently used, no longer in cache
203 * ARC_l2c_only - exists in L2ARC but not other states
204 * When there are no active references to the buffer, they are
205 * linked onto a list in one of these arc states. These are
206 * the only buffers that can be evicted or deleted. Within each
207 * state there are multiple lists, one for meta-data and one for
208 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
209 * etc.) is tracked separately so that it can be managed more
210 * explicitly: favored over data, limited explicitly.
211 *
212 * Anonymous buffers are buffers that are not associated with
213 * a DVA. These are buffers that hold dirty block copies
214 * before they are written to stable storage. By definition,
215 * they are "ref'd" and are considered part of arc_mru
216 * that cannot be freed. Generally, they will acquire a DVA
217 * as they are written and migrate onto the arc_mru list.
218 *
219 * The ARC_l2c_only state is for buffers that are in the second
220 * level ARC but no longer in any of the ARC_m* lists. The second
221 * level ARC itself may also contain buffers that are in any of
222 * the ARC_m* states - meaning that a buffer can exist in two
223 * places. The reason for the ARC_l2c_only state is to keep the
224 * buffer header in the hash table, so that reads that hit the
225 * second level ARC benefit from these fast lookups.
226 */
227
228 typedef struct arc_state {
229 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
230 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
231 uint64_t arcs_size; /* total amount of data in this state */
232 kmutex_t arcs_mtx;
233 } arc_state_t;
234
235 /* The 6 states: */
236 static arc_state_t ARC_anon;
237 static arc_state_t ARC_mru;
238 static arc_state_t ARC_mru_ghost;
239 static arc_state_t ARC_mfu;
240 static arc_state_t ARC_mfu_ghost;
241 static arc_state_t ARC_l2c_only;
242
243 typedef struct arc_stats {
244 kstat_named_t arcstat_hits;
245 kstat_named_t arcstat_misses;
246 kstat_named_t arcstat_demand_data_hits;
247 kstat_named_t arcstat_demand_data_misses;
248 kstat_named_t arcstat_demand_metadata_hits;
249 kstat_named_t arcstat_demand_metadata_misses;
250 kstat_named_t arcstat_prefetch_data_hits;
251 kstat_named_t arcstat_prefetch_data_misses;
252 kstat_named_t arcstat_prefetch_metadata_hits;
253 kstat_named_t arcstat_prefetch_metadata_misses;
254 kstat_named_t arcstat_mru_hits;
255 kstat_named_t arcstat_mru_ghost_hits;
256 kstat_named_t arcstat_mfu_hits;
257 kstat_named_t arcstat_mfu_ghost_hits;
258 kstat_named_t arcstat_deleted;
259 kstat_named_t arcstat_recycle_miss;
260 /*
261 * Number of buffers that could not be evicted because the hash lock
262 * was held by another thread. The lock may not necessarily be held
263 * by something using the same buffer, since hash locks are shared
264 * by multiple buffers.
265 */
266 kstat_named_t arcstat_mutex_miss;
267 /*
268 * Number of buffers skipped because they have I/O in progress, are
269 * indirect prefetch buffers that have not lived long enough, or are
270 * not from the spa we're trying to evict from.
271 */
272 kstat_named_t arcstat_evict_skip;
273 kstat_named_t arcstat_evict_l2_cached;
274 kstat_named_t arcstat_evict_l2_eligible;
275 kstat_named_t arcstat_evict_l2_ineligible;
276 kstat_named_t arcstat_hash_elements;
277 kstat_named_t arcstat_hash_elements_max;
278 kstat_named_t arcstat_hash_collisions;
279 kstat_named_t arcstat_hash_chains;
280 kstat_named_t arcstat_hash_chain_max;
281 kstat_named_t arcstat_p;
282 kstat_named_t arcstat_c;
283 kstat_named_t arcstat_c_min;
284 kstat_named_t arcstat_c_max;
285 kstat_named_t arcstat_size;
286 kstat_named_t arcstat_hdr_size;
287 kstat_named_t arcstat_data_size;
288 kstat_named_t arcstat_other_size;
289 kstat_named_t arcstat_l2_hits;
290 kstat_named_t arcstat_l2_misses;
291 kstat_named_t arcstat_l2_feeds;
292 kstat_named_t arcstat_l2_rw_clash;
293 kstat_named_t arcstat_l2_read_bytes;
294 kstat_named_t arcstat_l2_write_bytes;
295 kstat_named_t arcstat_l2_writes_sent;
296 kstat_named_t arcstat_l2_writes_done;
297 kstat_named_t arcstat_l2_writes_error;
298 kstat_named_t arcstat_l2_writes_hdr_miss;
299 kstat_named_t arcstat_l2_evict_lock_retry;
300 kstat_named_t arcstat_l2_evict_reading;
301 kstat_named_t arcstat_l2_free_on_write;
302 kstat_named_t arcstat_l2_abort_lowmem;
303 kstat_named_t arcstat_l2_cksum_bad;
304 kstat_named_t arcstat_l2_io_error;
305 kstat_named_t arcstat_l2_size;
306 kstat_named_t arcstat_l2_asize;
307 kstat_named_t arcstat_l2_hdr_size;
308 kstat_named_t arcstat_l2_compress_successes;
309 kstat_named_t arcstat_l2_compress_zeros;
310 kstat_named_t arcstat_l2_compress_failures;
311 kstat_named_t arcstat_l2_meta_writes;
312 kstat_named_t arcstat_l2_meta_avg_size;
313 kstat_named_t arcstat_l2_meta_avg_asize;
314 kstat_named_t arcstat_l2_asize_to_meta_ratio;
315 kstat_named_t arcstat_l2_rebuild_attempts;
316 kstat_named_t arcstat_l2_rebuild_successes;
317 kstat_named_t arcstat_l2_rebuild_unsupported;
318 kstat_named_t arcstat_l2_rebuild_timeout;
319 kstat_named_t arcstat_l2_rebuild_arc_bytes;
320 kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
321 kstat_named_t arcstat_l2_rebuild_bufs;
322 kstat_named_t arcstat_l2_rebuild_bufs_precached;
323 kstat_named_t arcstat_l2_rebuild_metabufs;
324 kstat_named_t arcstat_l2_rebuild_uberblk_errors;
325 kstat_named_t arcstat_l2_rebuild_io_errors;
326 kstat_named_t arcstat_l2_rebuild_cksum_errors;
327 kstat_named_t arcstat_l2_rebuild_loop_errors;
328 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
329 kstat_named_t arcstat_memory_throttle_count;
330 kstat_named_t arcstat_duplicate_buffers;
331 kstat_named_t arcstat_duplicate_buffers_size;
332 kstat_named_t arcstat_duplicate_reads;
333 kstat_named_t arcstat_meta_used;
334 kstat_named_t arcstat_meta_limit;
335 kstat_named_t arcstat_meta_max;
336 } arc_stats_t;
337
338 static arc_stats_t arc_stats = {
339 { "hits", KSTAT_DATA_UINT64 },
340 { "misses", KSTAT_DATA_UINT64 },
341 { "demand_data_hits", KSTAT_DATA_UINT64 },
342 { "demand_data_misses", KSTAT_DATA_UINT64 },
343 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
344 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
345 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
346 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
347 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
348 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
349 { "mru_hits", KSTAT_DATA_UINT64 },
350 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
351 { "mfu_hits", KSTAT_DATA_UINT64 },
352 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
353 { "deleted", KSTAT_DATA_UINT64 },
354 { "recycle_miss", KSTAT_DATA_UINT64 },
355 { "mutex_miss", KSTAT_DATA_UINT64 },
356 { "evict_skip", KSTAT_DATA_UINT64 },
357 { "evict_l2_cached", KSTAT_DATA_UINT64 },
358 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
359 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
360 { "hash_elements", KSTAT_DATA_UINT64 },
361 { "hash_elements_max", KSTAT_DATA_UINT64 },
362 { "hash_collisions", KSTAT_DATA_UINT64 },
363 { "hash_chains", KSTAT_DATA_UINT64 },
364 { "hash_chain_max", KSTAT_DATA_UINT64 },
365 { "p", KSTAT_DATA_UINT64 },
366 { "c", KSTAT_DATA_UINT64 },
367 { "c_min", KSTAT_DATA_UINT64 },
368 { "c_max", KSTAT_DATA_UINT64 },
369 { "size", KSTAT_DATA_UINT64 },
370 { "hdr_size", KSTAT_DATA_UINT64 },
371 { "data_size", KSTAT_DATA_UINT64 },
372 { "other_size", KSTAT_DATA_UINT64 },
373 { "l2_hits", KSTAT_DATA_UINT64 },
374 { "l2_misses", KSTAT_DATA_UINT64 },
375 { "l2_feeds", KSTAT_DATA_UINT64 },
376 { "l2_rw_clash", KSTAT_DATA_UINT64 },
377 { "l2_read_bytes", KSTAT_DATA_UINT64 },
378 { "l2_write_bytes", KSTAT_DATA_UINT64 },
379 { "l2_writes_sent", KSTAT_DATA_UINT64 },
380 { "l2_writes_done", KSTAT_DATA_UINT64 },
381 { "l2_writes_error", KSTAT_DATA_UINT64 },
382 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
383 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
384 { "l2_evict_reading", KSTAT_DATA_UINT64 },
385 { "l2_free_on_write", KSTAT_DATA_UINT64 },
386 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
387 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
388 { "l2_io_error", KSTAT_DATA_UINT64 },
389 { "l2_size", KSTAT_DATA_UINT64 },
390 { "l2_asize", KSTAT_DATA_UINT64 },
391 { "l2_hdr_size", KSTAT_DATA_UINT64 },
392 { "l2_compress_successes", KSTAT_DATA_UINT64 },
393 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
394 { "l2_compress_failures", KSTAT_DATA_UINT64 },
395 { "l2_meta_writes", KSTAT_DATA_UINT64 },
396 { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
397 { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
398 { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
399 { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
400 { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
401 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
402 { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
403 { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
404 { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
405 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
406 { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
407 { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
408 { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
409 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
410 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
411 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
412 { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
413 { "memory_throttle_count", KSTAT_DATA_UINT64 },
414 { "duplicate_buffers", KSTAT_DATA_UINT64 },
415 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
416 { "duplicate_reads", KSTAT_DATA_UINT64 },
417 { "arc_meta_used", KSTAT_DATA_UINT64 },
418 { "arc_meta_limit", KSTAT_DATA_UINT64 },
419 { "arc_meta_max", KSTAT_DATA_UINT64 }
420 };
421
422 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
423
424 #define ARCSTAT_INCR(stat, val) \
425 atomic_add_64(&arc_stats.stat.value.ui64, (val))
426
427 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
428 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
429
430 #define ARCSTAT_MAX(stat, val) { \
431 uint64_t m; \
432 while ((val) > (m = arc_stats.stat.value.ui64) && \
433 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
434 continue; \
435 }
436
437 #define ARCSTAT_MAXSTAT(stat) \
438 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
439
440 /*
441 * We define a macro to allow ARC hits/misses to be easily broken down by
442 * two separate conditions, giving a total of four different subtypes for
443 * each of hits and misses (so eight statistics total).
444 */
445 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
446 if (cond1) { \
447 if (cond2) { \
448 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
449 } else { \
450 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
451 } \
452 } else { \
453 if (cond2) { \
454 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
455 } else { \
456 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
457 } \
458 }
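/*
 * For example, the read hit path further down this file classifies each hit
 * by demand vs. prefetch and data vs. metadata, bumping exactly one of the
 * four arcstat_*_hits counters:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 */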
459
460 /*
461 * This macro allows us to use kstats as floating averages. Each time we
462 * update this kstat, the old value keeps a (1 - 1/ARCSTAT_F_AVG_FACTOR)
463 * weight and the new sample contributes only 1/ARCSTAT_F_AVG_FACTOR to the overall
464 * average. This macro assumes that integer loads and stores are atomic, but
465 * is not safe for multiple writers updating the kstat in parallel (only the
466 * last writer's update will remain).
467 */
468 #define ARCSTAT_F_AVG_FACTOR 3
469 #define ARCSTAT_F_AVG(stat, value) \
470 do { \
471 uint64_t x = ARCSTAT(stat); \
472 x = x - x / ARCSTAT_F_AVG_FACTOR + \
473 (value) / ARCSTAT_F_AVG_FACTOR; \
474 ARCSTAT(stat) = x; \
475 _NOTE(NOTREACHED) \
476 _NOTE(CONSTCOND) \
477 } while (0)
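/*
 * Worked example: with ARCSTAT_F_AVG_FACTOR == 3, a stored average of 900
 * updated with a new sample of 300 becomes 900 - 900/3 + 300/3 = 700, i.e.
 * each sample only moves the average a third of the way toward it.
 */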
478
479 kstat_t *arc_ksp;
480 static arc_state_t *arc_anon;
481 static arc_state_t *arc_mru;
482 static arc_state_t *arc_mru_ghost;
483 static arc_state_t *arc_mfu;
484 static arc_state_t *arc_mfu_ghost;
485 static arc_state_t *arc_l2c_only;
486
487 /*
488 * There are several ARC variables that are critical to export as kstats --
489 * but we don't want to have to grovel around in the kstat whenever we wish to
490 * manipulate them. For these variables, we therefore define them to be in
491 * terms of the statistic variable. This assures that we are not introducing
492 * the possibility of inconsistency by having shadow copies of the variables,
493 * while still allowing the code to be readable.
494 */
495 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
496 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
497 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
498 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
499 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
500 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
501 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
502 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
503
504 #define L2ARC_IS_VALID_COMPRESS(_c_) \
505 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
506
507 static int arc_no_grow; /* Don't try to grow cache size */
508 static uint64_t arc_tempreserve;
509 static uint64_t arc_loaned_bytes;
510
511 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
512
513 typedef struct arc_callback arc_callback_t;
514
515 struct arc_callback {
516 void *acb_private;
517 arc_done_func_t *acb_done;
518 arc_buf_t *acb_buf;
519 zio_t *acb_zio_dummy;
520 arc_callback_t *acb_next;
521 };
522
523 typedef struct arc_write_callback arc_write_callback_t;
524
525 struct arc_write_callback {
526 void *awcb_private;
527 arc_done_func_t *awcb_ready;
528 arc_done_func_t *awcb_done;
529 arc_buf_t *awcb_buf;
530 };
531
532 struct arc_buf_hdr {
533 /* protected by hash lock */
534 dva_t b_dva;
535 uint64_t b_birth;
536 uint64_t b_cksum0;
537
538 kmutex_t b_freeze_lock;
539 zio_cksum_t *b_freeze_cksum;
540 void *b_thawed;
541
542 arc_buf_hdr_t *b_hash_next;
543 arc_buf_t *b_buf;
544 uint32_t b_flags;
545 uint32_t b_datacnt;
546
547 arc_callback_t *b_acb;
548 kcondvar_t b_cv;
549
550 /* immutable */
551 arc_buf_contents_t b_type;
552 uint64_t b_size;
553 uint64_t b_spa;
554
555 /* protected by arc state mutex */
556 arc_state_t *b_state;
557 list_node_t b_arc_node;
558
559 /* updated atomically */
560 clock_t b_arc_access;
561
562 /* self protecting */
563 refcount_t b_refcnt;
564
565 l2arc_buf_hdr_t *b_l2hdr;
566 list_node_t b_l2node;
567 };
568
569 static arc_buf_t *arc_eviction_list;
570 static kmutex_t arc_eviction_mtx;
571 static arc_buf_hdr_t arc_eviction_hdr;
572 static void arc_get_data_buf(arc_buf_t *buf);
573 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
574 static int arc_evict_needed(arc_buf_contents_t type);
575 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
576 static void arc_buf_watch(arc_buf_t *buf);
577
578 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
579
580 #define GHOST_STATE(state) \
581 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
582 (state) == arc_l2c_only)
583
584 /*
585 * Private ARC flags. These flags are private ARC only flags that will show up
586 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
587 * be passed in as arc_flags in things like arc_read. However, these flags
588 * should never be passed and should only be set by ARC code. When adding new
589 * public flags, make sure not to smash the private ones.
590 */
591
592 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
593 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
594 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
595 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
596 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
597 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
598 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
599 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
600 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
601 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
602
603 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
604 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
605 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
606 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
607 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
608 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
609 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
610 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
611 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
612 (hdr)->b_l2hdr != NULL)
613 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
614 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
615 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
616
617 /*
618 * Other sizes
619 */
620
621 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
622 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
623
624 /*
625 * Hash table routines
626 */
627
628 #define HT_LOCK_PAD 64
629
630 struct ht_lock {
631 kmutex_t ht_lock;
632 #ifdef _KERNEL
633 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
634 #endif
635 };
636
637 #define BUF_LOCKS 256
638 typedef struct buf_hash_table {
639 uint64_t ht_mask;
640 arc_buf_hdr_t **ht_table;
641 struct ht_lock ht_locks[BUF_LOCKS];
642 } buf_hash_table_t;
643
644 static buf_hash_table_t buf_hash_table;
645
646 #define BUF_HASH_INDEX(spa, dva, birth) \
647 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
648 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
649 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
650 #define HDR_LOCK(hdr) \
651 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
652
653 uint64_t zfs_crc64_table[256];
654
655 /*
656 * Level 2 ARC
657 */
658
659 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
660 #define L2ARC_HEADROOM 2 /* num of writes */
661 /*
662 * If we discover during ARC scan any buffers to be compressed, we boost
663 * our headroom for the next scanning cycle by this percentage multiple.
664 */
665 #define L2ARC_HEADROOM_BOOST 200
666 #define L2ARC_FEED_SECS 1 /* caching interval secs */
667 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
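/*
 * Example: with l2arc_write_max at its 8MB default, each feed cycle scans
 * roughly L2ARC_HEADROOM * 8MB = 16MB ahead of the eviction point; if
 * compressible buffers were seen, the next cycle's headroom is boosted to
 * 16MB * L2ARC_HEADROOM_BOOST / 100 = 32MB.
 */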
668
669 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
670 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
671
672 /* L2ARC Performance Tunables */
673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
675 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
679 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
680 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
681 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
682
683 /*
684 * L2ARC Internals
685 */
686 typedef struct l2arc_dev l2arc_dev_t;
687 static list_t L2ARC_dev_list; /* device list */
688 static list_t *l2arc_dev_list; /* device list pointer */
689 static kmutex_t l2arc_dev_mtx; /* device list mutex */
690 static l2arc_dev_t *l2arc_dev_last; /* last device used */
691 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
692 static list_t L2ARC_free_on_write; /* free after write buf list */
693 static list_t *l2arc_free_on_write; /* free after write list ptr */
694 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
695 static uint64_t l2arc_ndev; /* number of devices */
696
697 typedef struct l2arc_read_callback {
698 arc_buf_t *l2rcb_buf; /* read buffer */
699 spa_t *l2rcb_spa; /* spa */
700 blkptr_t l2rcb_bp; /* original blkptr */
701 zbookmark_t l2rcb_zb; /* original bookmark */
702 int l2rcb_flags; /* original flags */
703 enum zio_compress l2rcb_compress; /* applied compress */
704 } l2arc_read_callback_t;
705
706 typedef struct l2arc_write_callback {
707 l2arc_dev_t *l2wcb_dev; /* device info */
708 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
709 uint8_t *l2wcb_pbuf; /* pbuf sent in this write */
710 uint32_t l2wcb_pbuf_size; /* size of committed pbuf */
711 uint8_t *l2wcb_ub_buf; /* uberblock in this write */
712 } l2arc_write_callback_t;
713
714 struct l2arc_buf_hdr {
715 /* protected by arc_buf_hdr mutex */
716 l2arc_dev_t *b_dev; /* L2ARC device */
717 uint64_t b_daddr; /* disk address, offset byte */
718 /* compression applied to buffer data */
719 enum zio_compress b_compress;
720 /* real alloc'd buffer size depending on b_compress applied */
721 int b_asize;
722 /* temporary buffer holder for in-flight compressed data */
723 void *b_tmp_cdata;
724 };
725
726 typedef struct l2arc_data_free {
727 /* protected by l2arc_free_on_write_mtx */
728 void *l2df_data;
729 size_t l2df_size;
730 void (*l2df_func)(void *, size_t);
731 list_node_t l2df_list_node;
732 } l2arc_data_free_t;
733
734 static kmutex_t l2arc_feed_thr_lock;
735 static kcondvar_t l2arc_feed_thr_cv;
736 static uint8_t l2arc_thread_exit;
737
738 static void l2arc_read_done(zio_t *zio);
739 static void l2arc_hdr_stat_add(boolean_t from_arc);
740 static void l2arc_hdr_stat_remove(void);
741
742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
744 enum zio_compress c);
745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
746
747 typedef enum {
748 L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
749 L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */
750 } l2uberblock_flags_t;
751
752 typedef struct l2uberblock {
753 uint32_t ub_magic;
754 uint8_t ub_version;
755 l2uberblock_flags_t ub_flags;
756
757 uint64_t ub_spa_guid;
758 uint64_t ub_birth;
759 uint64_t ub_evict_tail; /* current evict pointer */
760 uint64_t ub_alloc_space; /* vdev space alloc status */
761 uint64_t ub_pbuf_daddr; /* address of newest pbuf */
762 uint32_t ub_pbuf_asize; /* size of newest pbuf */
763 zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */
764
765 zio_cksum_t ub_cksum; /* cksum of uberblock */
766 } l2uberblock_t;
767
768 typedef enum {
769 L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
770 L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */
771 } l2pbuf_flags_t;
772
773 typedef struct l2pbuf {
774 uint32_t pb_magic;
775 unsigned int pb_version;
776 l2pbuf_flags_t pb_flags;
777
778 uint64_t pb_prev_daddr; /* address of previous pbuf */
779 uint32_t pb_prev_asize; /* size of previous pbuf */
780 zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */
781
782 /*
783 * This is a set of item lists that are contained in this pbuf. Each
784 * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
785 * This serves as a soft timeout feature - once the limit of the
786 * number of item lists that a pbuf can hold is reached, the pbuf is
787 * flushed to stable storage, regardless of its total size.
788 */
789 list_t *pb_buflists_list;
790
791 /*
792 * Number of compressed bytes referenced by items in this pbuf and
793 * the number of lists present.
794 * This is not actually written to storage, it is only used by
795 * internal algorithms which check for when a pbuf reaches a
796 * certain size limit, after which it is flushed in a write.
797 */
798 uint64_t pb_payload_asz;
799 /* Same thing for number of buflists */
800 int pb_nbuflists;
801
802 /*
803 * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
804 * This is then used by l2arc_pbuf_restore to update used space
805 * on the L2ARC vdev.
806 */
807 size_t pb_asize;
808 } l2pbuf_t;
809
810 typedef struct l2pbuf_buf l2pbuf_buf_t;
811 typedef struct l2pbuf_buflist {
812 uint32_t l2pbl_nbufs;
813 l2pbuf_buf_t *l2pbl_bufs;
814 list_node_t l2pbl_node;
815 } l2pbuf_buflist_t;
816
817 struct l2pbuf_buf {
818 dva_t b_dva; /* dva of buffer */
819 uint64_t b_birth; /* birth txg of buffer */
820 uint64_t b_cksum0;
821 zio_cksum_t b_freeze_cksum;
822 uint32_t b_size; /* uncompressed buf size */
823 uint64_t b_l2daddr; /* buf location on l2dev */
824 uint32_t b_l2asize; /* actual buf data size */
825 enum zio_compress b_l2compress; /* compression applied */
826 uint16_t b_contents_type;
827 uint32_t b_flags;
828 };
829
830 struct l2arc_dev {
831 vdev_t *l2ad_vdev; /* vdev */
832 spa_t *l2ad_spa; /* spa */
833 uint64_t l2ad_hand; /* next write location */
834 uint64_t l2ad_start; /* first addr on device */
835 uint64_t l2ad_end; /* last addr on device */
836 uint64_t l2ad_evict; /* last addr eviction reached */
837 boolean_t l2ad_first; /* first sweep through */
838 boolean_t l2ad_writing; /* currently writing */
839 list_t *l2ad_buflist; /* buffer list */
840 list_node_t l2ad_node; /* device list node */
841 l2pbuf_t l2ad_pbuf; /* currently open pbuf */
842 uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */
843 uint64_t l2ad_pbuf_asize; /* prev pbuf asize */
844 zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */
845 /* uberblock birth counter - incremented for each committed uberblk */
846 uint64_t l2ad_uberblock_birth;
847 /* flag indicating whether a rebuild is currently going on */
848 boolean_t l2ad_rebuilding;
849 };
850
851 /* Stores information about an L2ARC prefetch zio */
852 typedef struct l2arc_prefetch_info {
853 uint8_t *pi_buf; /* where the zio writes to */
854 uint64_t pi_buflen; /* length of pi_buf */
855 zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */
856 } l2arc_prefetch_info_t;
857
858 /* 256 x 4k of l2uberblocks */
859 #define L2UBERBLOCK_SIZE 4096
860 #define L2UBERBLOCK_MAGIC 0x12bab10c
861 #define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */
862 #define L2PBUF_MAGIC 0xdb0faba6
863 #define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */
864 #define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */
865 #define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */
866 #define L2PBUF_ENCODED_SIZE(_pb) \
867 (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
868 /*
869 * Allocation limit for the payload of a pbuf. This also fundamentally
870 * limits the number of bufs we can reference in a pbuf.
871 */
872 #define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
873 #define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
874 #define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */
875 #define L2PBUF_MAXSZ (100 * 1024 * 1024) /* maximum pbuf size */
876 #define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */
877 #define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */
878 #define L2PBUF_IS_FULL(_pb) \
879 ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
880 (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
881 /*
882 * These are the flags we allow to persist in L2ARC pbufs. The other flags
883 * of an ARC buffer pertain to the buffer's runtime behavior.
884 */
885 #define L2ARC_PERSIST_FLAGS \
886 (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
887
888 /*
889 * Used during L2ARC rebuild after each read operation to check whether we
890 * haven't exceeded the rebuild timeout value.
891 */
892 #define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
893 do { \
894 if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
895 __VA_ARGS__; \
896 ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
897 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
898 "dropping remaining L2ARC metadata."); \
899 return; \
900 } \
901 _NOTE(NOTREACHED) \
902 _NOTE(CONSTCOND) \
903 } while (0)
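/*
 * Typical use (illustrative only; the cleanup argument varies by call site):
 * the rebuild loop computes a deadline once and re-checks it after every
 * pbuf read, releasing whatever it holds before the macro returns:
 *
 *	int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
 *	...
 *	L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
 */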
904
905 /*
906 * Performance tuning of L2ARC persistency:
907 *
908 * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
909 * compressing it.
910 * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
911 * referenced from a pbuf. Once a pbuf reaches this size, it is
912 * committed to stable storage. Ideally, there should be approx.
913 * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
914 * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
915 * be buffered in a pbuf before it is committed to L2ARC. This
916 * puts a soft temporal upper bound on pbuf commit intervals.
917 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
918 * pool import or when adding one manually later) will attempt
919 * to rebuild L2ARC buffer contents. In special circumstances,
920 * the administrator may want to set this to B_FALSE, if they
921 * are having trouble importing a pool or attaching an L2ARC
922 * device (e.g. the L2ARC device is slow to read in stored pbuf
923 * metadata, or the metadata has become somehow
924 * fragmented/unusable).
925 * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
926 * avoid a slow L2ARC device from preventing pool import. If we
927 * are not done rebuilding an L2ARC device by this time, we
928 * stop the rebuild and return immediately.
929 */
930 uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
931 uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
932 uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
933 boolean_t l2arc_rebuild_enabled = B_TRUE;
934 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
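/*
 * On illumos these globals can be tuned by adding lines such as the
 * following to /etc/system (example values only, not a recommendation):
 *
 *	set zfs:l2arc_rebuild_enabled = 0
 *	set zfs:l2arc_rebuild_timeout = 120
 */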
935
936 static void l2arc_rebuild_start(l2arc_dev_t *dev);
937 static void l2arc_rebuild(l2arc_dev_t *dev);
938 static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
939 static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
940 uint64_t guid);
941
942 static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
943 static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
944 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
945 static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
946 uint32_t asize);
947 static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
948 static void l2arc_pbuf_prefetch_abort(zio_t *zio);
949
950 static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
951 static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
952 static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
953 uint64_t guid);
954 static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
955 l2arc_write_callback_t *cb);
956
957 static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
958 static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
959 l2pbuf_t *pbuf);
960 static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
961 uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
962 static void l2arc_pbuf_init(l2pbuf_t *pb);
963 static void l2arc_pbuf_destroy(l2pbuf_t *pb);
964 static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
965 l2arc_write_callback_t *cb);
966 static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
967 static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
968 const arc_buf_hdr_t *ab, int index);
969 static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
970
971 static uint64_t
972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
973 {
974 uint8_t *vdva = (uint8_t *)dva;
975 uint64_t crc = -1ULL;
976 int i;
977
978 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
979
980 for (i = 0; i < sizeof (dva_t); i++)
981 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
982
983 crc ^= (spa>>8) ^ birth;
984
985 return (crc);
986 }
987
988 #define BUF_EMPTY(buf) \
989 ((buf)->b_dva.dva_word[0] == 0 && \
990 (buf)->b_dva.dva_word[1] == 0 && \
991 (buf)->b_birth == 0)
992
993 #define BUF_EQUAL(spa, dva, birth, buf) \
994 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
995 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
996 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
997
998 static void
999 buf_discard_identity(arc_buf_hdr_t *hdr)
1000 {
1001 hdr->b_dva.dva_word[0] = 0;
1002 hdr->b_dva.dva_word[1] = 0;
1003 hdr->b_birth = 0;
1004 hdr->b_cksum0 = 0;
1005 }
1006
1007 static arc_buf_hdr_t *
1008 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
1009 {
1010 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1011 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1012 arc_buf_hdr_t *buf;
1013
1014 mutex_enter(hash_lock);
1015 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
1016 buf = buf->b_hash_next) {
1017 if (BUF_EQUAL(spa, dva, birth, buf)) {
1018 *lockp = hash_lock;
1019 return (buf);
1020 }
1021 }
1022 mutex_exit(hash_lock);
1023 *lockp = NULL;
1024 return (NULL);
1025 }
1026
1027 /*
1028 * Insert an entry into the hash table. If there is already an element
1029 * equal to elem in the hash table, then the already existing element
1030 * will be returned and the new element will not be inserted.
1031 * Otherwise returns NULL.
1032 */
1033 static arc_buf_hdr_t *
1034 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
1035 {
1036 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1037 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1038 arc_buf_hdr_t *fbuf;
1039 uint32_t i;
1040
1041 ASSERT(!HDR_IN_HASH_TABLE(buf));
1042 *lockp = hash_lock;
1043 mutex_enter(hash_lock);
1044 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
1045 fbuf = fbuf->b_hash_next, i++) {
1046 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
1047 return (fbuf);
1048 }
1049
1050 buf->b_hash_next = buf_hash_table.ht_table[idx];
1051 buf_hash_table.ht_table[idx] = buf;
1052 buf->b_flags |= ARC_IN_HASH_TABLE;
1053
1054 /* collect some hash table performance data */
1055 if (i > 0) {
1056 ARCSTAT_BUMP(arcstat_hash_collisions);
1057 if (i == 1)
1058 ARCSTAT_BUMP(arcstat_hash_chains);
1059
1060 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1061 }
1062
1063 ARCSTAT_BUMP(arcstat_hash_elements);
1064 ARCSTAT_MAXSTAT(arcstat_hash_elements);
1065
1066 return (NULL);
1067 }
1068
1069 static void
1070 buf_hash_remove(arc_buf_hdr_t *buf)
1071 {
1072 arc_buf_hdr_t *fbuf, **bufp;
1073 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1074
1075 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1076 ASSERT(HDR_IN_HASH_TABLE(buf));
1077
1078 bufp = &buf_hash_table.ht_table[idx];
1079 while ((fbuf = *bufp) != buf) {
1080 ASSERT(fbuf != NULL);
1081 bufp = &fbuf->b_hash_next;
1082 }
1083 *bufp = buf->b_hash_next;
1084 buf->b_hash_next = NULL;
1085 buf->b_flags &= ~ARC_IN_HASH_TABLE;
1086
1087 /* collect some hash table performance data */
1088 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1089
1090 if (buf_hash_table.ht_table[idx] &&
1091 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1092 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1093 }
1094
1095 /*
1096 * Global data structures and functions for the buf kmem cache.
1097 */
1098 static kmem_cache_t *hdr_cache;
1099 static kmem_cache_t *buf_cache;
1100
1101 static void
1102 buf_fini(void)
1103 {
1104 int i;
1105
1106 kmem_free(buf_hash_table.ht_table,
1107 (buf_hash_table.ht_mask + 1) * sizeof (void *));
1108 for (i = 0; i < BUF_LOCKS; i++)
1109 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1110 kmem_cache_destroy(hdr_cache);
1111 kmem_cache_destroy(buf_cache);
1112 }
1113
1114 /*
1115 * Constructor callback - called when the cache is empty
1116 * and a new buf is requested.
1117 */
1118 /* ARGSUSED */
1119 static int
1120 hdr_cons(void *vbuf, void *unused, int kmflag)
1121 {
1122 arc_buf_hdr_t *buf = vbuf;
1123
1124 bzero(buf, sizeof (arc_buf_hdr_t));
1125 refcount_create(&buf->b_refcnt);
1126 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1127 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1128 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1129
1130 return (0);
1131 }
1132
1133 /* ARGSUSED */
1134 static int
1135 buf_cons(void *vbuf, void *unused, int kmflag)
1136 {
1137 arc_buf_t *buf = vbuf;
1138
1139 bzero(buf, sizeof (arc_buf_t));
1140 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1141 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1142
1143 return (0);
1144 }
1145
1146 /*
1147 * Destructor callback - called when a cached buf is
1148 * no longer required.
1149 */
1150 /* ARGSUSED */
1151 static void
1152 hdr_dest(void *vbuf, void *unused)
1153 {
1154 arc_buf_hdr_t *buf = vbuf;
1155
1156 ASSERT(BUF_EMPTY(buf));
1157 refcount_destroy(&buf->b_refcnt);
1158 cv_destroy(&buf->b_cv);
1159 mutex_destroy(&buf->b_freeze_lock);
1160 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1161 }
1162
1163 /* ARGSUSED */
1164 static void
1165 buf_dest(void *vbuf, void *unused)
1166 {
1167 arc_buf_t *buf = vbuf;
1168
1169 mutex_destroy(&buf->b_evict_lock);
1170 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1171 }
1172
1173 /*
1174 * Reclaim callback -- invoked when memory is low.
1175 */
1176 /* ARGSUSED */
1177 static void
1178 hdr_recl(void *unused)
1179 {
1180 dprintf("hdr_recl called\n");
1181 /*
1182 * umem calls the reclaim func when we destroy the buf cache,
1183 * which is after we do arc_fini().
1184 */
1185 if (!arc_dead)
1186 cv_signal(&arc_reclaim_thr_cv);
1187 }
1188
1189 static void
1190 buf_init(void)
1191 {
1192 uint64_t *ct;
1193 uint64_t hsize = 1ULL << 12;
1194 int i, j;
1195
1196 /*
1197 * The hash table is big enough to fill all of physical memory
1198 * with an average 64K block size. The table will take up
1199 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
1200 */
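	/*
	 * Worked example: on a 64GB machine with 8-byte pointers the loop
	 * below settles on 2^20 buckets (1M * 64K covers 64GB), so the
	 * table itself consumes about 8MB.
	 */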
1201 while (hsize * 65536 < physmem * PAGESIZE)
1202 hsize <<= 1;
1203 retry:
1204 buf_hash_table.ht_mask = hsize - 1;
1205 buf_hash_table.ht_table =
1206 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1207 if (buf_hash_table.ht_table == NULL) {
1208 ASSERT(hsize > (1ULL << 8));
1209 hsize >>= 1;
1210 goto retry;
1211 }
1212
1213 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1214 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1215 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1216 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1217
1218 for (i = 0; i < 256; i++)
1219 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1220 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1221
1222 for (i = 0; i < BUF_LOCKS; i++) {
1223 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1224 NULL, MUTEX_DEFAULT, NULL);
1225 }
1226 }
1227
1228 #define ARC_MINTIME (hz>>4) /* 62 ms */
1229
1230 static void
1231 arc_cksum_verify(arc_buf_t *buf)
1232 {
1233 zio_cksum_t zc;
1234
1235 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1236 return;
1237
1238 mutex_enter(&buf->b_hdr->b_freeze_lock);
1239 if (buf->b_hdr->b_freeze_cksum == NULL ||
1240 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1241 mutex_exit(&buf->b_hdr->b_freeze_lock);
1242 return;
1243 }
1244 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1245 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1246 panic("buffer modified while frozen!");
1247 mutex_exit(&buf->b_hdr->b_freeze_lock);
1248 }
1249
1250 static int
1251 arc_cksum_equal(arc_buf_t *buf)
1252 {
1253 zio_cksum_t zc;
1254 int equal;
1255
1256 mutex_enter(&buf->b_hdr->b_freeze_lock);
1257 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1258 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1259 mutex_exit(&buf->b_hdr->b_freeze_lock);
1260
1261 return (equal);
1262 }
1263
1264 static void
1265 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1266 {
1267 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1268 return;
1269
1270 mutex_enter(&buf->b_hdr->b_freeze_lock);
1271 if (buf->b_hdr->b_freeze_cksum != NULL) {
1272 mutex_exit(&buf->b_hdr->b_freeze_lock);
1273 return;
1274 }
1275 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1276 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1277 buf->b_hdr->b_freeze_cksum);
1278 mutex_exit(&buf->b_hdr->b_freeze_lock);
1279 arc_buf_watch(buf);
1280 }
1281
1282 #ifndef _KERNEL
1283 typedef struct procctl {
1284 long cmd;
1285 prwatch_t prwatch;
1286 } procctl_t;
1287 #endif
1288
1289 /* ARGSUSED */
1290 static void
1291 arc_buf_unwatch(arc_buf_t *buf)
1292 {
1293 #ifndef _KERNEL
1294 if (arc_watch) {
1295 int result;
1296 procctl_t ctl;
1297 ctl.cmd = PCWATCH;
1298 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1299 ctl.prwatch.pr_size = 0;
1300 ctl.prwatch.pr_wflags = 0;
1301 result = write(arc_procfd, &ctl, sizeof (ctl));
1302 ASSERT3U(result, ==, sizeof (ctl));
1303 }
1304 #endif
1305 }
1306
1307 /* ARGSUSED */
1308 static void
1309 arc_buf_watch(arc_buf_t *buf)
1310 {
1311 #ifndef _KERNEL
1312 if (arc_watch) {
1313 int result;
1314 procctl_t ctl;
1315 ctl.cmd = PCWATCH;
1316 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1317 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1318 ctl.prwatch.pr_wflags = WA_WRITE;
1319 result = write(arc_procfd, &ctl, sizeof (ctl));
1320 ASSERT3U(result, ==, sizeof (ctl));
1321 }
1322 #endif
1323 }
1324
1325 void
1326 arc_buf_thaw(arc_buf_t *buf)
1327 {
1328 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1329 if (buf->b_hdr->b_state != arc_anon)
1330 panic("modifying non-anon buffer!");
1331 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1332 panic("modifying buffer while i/o in progress!");
1333 arc_cksum_verify(buf);
1334 }
1335
1336 mutex_enter(&buf->b_hdr->b_freeze_lock);
1337 if (buf->b_hdr->b_freeze_cksum != NULL) {
1338 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1339 buf->b_hdr->b_freeze_cksum = NULL;
1340 }
1341
1342 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1343 if (buf->b_hdr->b_thawed)
1344 kmem_free(buf->b_hdr->b_thawed, 1);
1345 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1346 }
1347
1348 mutex_exit(&buf->b_hdr->b_freeze_lock);
1349
1350 arc_buf_unwatch(buf);
1351 }
1352
1353 void
1354 arc_buf_freeze(arc_buf_t *buf)
1355 {
1356 kmutex_t *hash_lock;
1357
1358 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1359 return;
1360
1361 hash_lock = HDR_LOCK(buf->b_hdr);
1362 mutex_enter(hash_lock);
1363
1364 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1365 buf->b_hdr->b_state == arc_anon);
1366 arc_cksum_compute(buf, B_FALSE);
1367 mutex_exit(hash_lock);
1368
1369 }
1370
1371 static void
1372 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1373 {
1374 ASSERT(MUTEX_HELD(hash_lock));
1375
1376 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1377 (ab->b_state != arc_anon)) {
1378 uint64_t delta = ab->b_size * ab->b_datacnt;
1379 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1380 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1381
1382 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1383 mutex_enter(&ab->b_state->arcs_mtx);
1384 ASSERT(list_link_active(&ab->b_arc_node));
1385 list_remove(list, ab);
1386 if (GHOST_STATE(ab->b_state)) {
1387 ASSERT0(ab->b_datacnt);
1388 ASSERT3P(ab->b_buf, ==, NULL);
1389 delta = ab->b_size;
1390 }
1391 ASSERT(delta > 0);
1392 ASSERT3U(*size, >=, delta);
1393 atomic_add_64(size, -delta);
1394 mutex_exit(&ab->b_state->arcs_mtx);
1395 /* remove the prefetch flag if we get a reference */
1396 if (ab->b_flags & ARC_PREFETCH)
1397 ab->b_flags &= ~ARC_PREFETCH;
1398 }
1399 }
1400
1401 static int
1402 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1403 {
1404 int cnt;
1405 arc_state_t *state = ab->b_state;
1406
1407 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1408 ASSERT(!GHOST_STATE(state));
1409
1410 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1411 (state != arc_anon)) {
1412 uint64_t *size = &state->arcs_lsize[ab->b_type];
1413
1414 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1415 mutex_enter(&state->arcs_mtx);
1416 ASSERT(!list_link_active(&ab->b_arc_node));
1417 list_insert_head(&state->arcs_list[ab->b_type], ab);
1418 ASSERT(ab->b_datacnt > 0);
1419 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1420 mutex_exit(&state->arcs_mtx);
1421 }
1422 return (cnt);
1423 }
1424
1425 /*
1426 * Move the supplied buffer to the indicated state. The mutex
1427 * for the buffer must be held by the caller.
1428 */
1429 static void
1430 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1431 {
1432 arc_state_t *old_state = ab->b_state;
1433 int64_t refcnt = refcount_count(&ab->b_refcnt);
1434 uint64_t from_delta, to_delta;
1435
1436 ASSERT(MUTEX_HELD(hash_lock));
1437 ASSERT(new_state != old_state);
1438 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1439 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1440 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1441
1442 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1443
1444 /*
1445 * If this buffer is evictable, transfer it from the
1446 * old state list to the new state list.
1447 */
1448 if (refcnt == 0) {
1449 if (old_state != arc_anon) {
1450 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1451 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1452
1453 if (use_mutex)
1454 mutex_enter(&old_state->arcs_mtx);
1455
1456 ASSERT(list_link_active(&ab->b_arc_node));
1457 list_remove(&old_state->arcs_list[ab->b_type], ab);
1458
1459 /*
1460 * If prefetching out of the ghost cache,
1461 * we will have a non-zero datacnt.
1462 */
1463 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1464 /* ghost elements have a ghost size */
1465 ASSERT(ab->b_buf == NULL);
1466 from_delta = ab->b_size;
1467 }
1468 ASSERT3U(*size, >=, from_delta);
1469 atomic_add_64(size, -from_delta);
1470
1471 if (use_mutex)
1472 mutex_exit(&old_state->arcs_mtx);
1473 }
1474 if (new_state != arc_anon) {
1475 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1476 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1477
1478 if (use_mutex)
1479 mutex_enter(&new_state->arcs_mtx);
1480
1481 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1482
1483 /* ghost elements have a ghost size */
1484 if (GHOST_STATE(new_state)) {
1485 ASSERT(ab->b_datacnt == 0);
1486 ASSERT(ab->b_buf == NULL);
1487 to_delta = ab->b_size;
1488 }
1489 atomic_add_64(size, to_delta);
1490
1491 if (use_mutex)
1492 mutex_exit(&new_state->arcs_mtx);
1493 }
1494 }
1495
1496 ASSERT(!BUF_EMPTY(ab));
1497 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1498 buf_hash_remove(ab);
1499
1500 /* adjust state sizes */
1501 if (to_delta)
1502 atomic_add_64(&new_state->arcs_size, to_delta);
1503 if (from_delta) {
1504 ASSERT3U(old_state->arcs_size, >=, from_delta);
1505 atomic_add_64(&old_state->arcs_size, -from_delta);
1506 }
1507 ab->b_state = new_state;
1508
1509 /* adjust l2arc hdr stats */
1510 if (new_state == arc_l2c_only)
1511 l2arc_hdr_stat_add(old_state != arc_anon);
1512 else if (old_state == arc_l2c_only)
1513 l2arc_hdr_stat_remove();
1514 }
1515
1516 void
1517 arc_space_consume(uint64_t space, arc_space_type_t type)
1518 {
1519 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1520
1521 switch (type) {
1522 case ARC_SPACE_DATA:
1523 ARCSTAT_INCR(arcstat_data_size, space);
1524 break;
1525 case ARC_SPACE_OTHER:
1526 ARCSTAT_INCR(arcstat_other_size, space);
1527 break;
1528 case ARC_SPACE_HDRS:
1529 ARCSTAT_INCR(arcstat_hdr_size, space);
1530 break;
1531 case ARC_SPACE_L2HDRS:
1532 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1533 break;
1534 }
1535
1536 ARCSTAT_INCR(arcstat_meta_used, space);
1537 atomic_add_64(&arc_size, space);
1538 }
1539
1540 void
1541 arc_space_return(uint64_t space, arc_space_type_t type)
1542 {
1543 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1544
1545 switch (type) {
1546 case ARC_SPACE_DATA:
1547 ARCSTAT_INCR(arcstat_data_size, -space);
1548 break;
1549 case ARC_SPACE_OTHER:
1550 ARCSTAT_INCR(arcstat_other_size, -space);
1551 break;
1552 case ARC_SPACE_HDRS:
1553 ARCSTAT_INCR(arcstat_hdr_size, -space);
1554 break;
1555 case ARC_SPACE_L2HDRS:
1556 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1557 break;
1558 }
1559
1560 ASSERT(arc_meta_used >= space);
1561 if (arc_meta_max < arc_meta_used)
1562 arc_meta_max = arc_meta_used;
1563 ARCSTAT_INCR(arcstat_meta_used, -space);
1564 ASSERT(arc_size >= space);
1565 atomic_add_64(&arc_size, -space);
1566 }
1567
1568 void *
1569 arc_data_buf_alloc(uint64_t size)
1570 {
1571 if (arc_evict_needed(ARC_BUFC_DATA))
1572 cv_signal(&arc_reclaim_thr_cv);
1573 atomic_add_64(&arc_size, size);
1574 return (zio_data_buf_alloc(size));
1575 }
1576
1577 void
1578 arc_data_buf_free(void *buf, uint64_t size)
1579 {
1580 zio_data_buf_free(buf, size);
1581 ASSERT(arc_size >= size);
1582 atomic_add_64(&arc_size, -size);
1583 }
1584
1585 arc_buf_t *
1586 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1587 {
1588 arc_buf_hdr_t *hdr;
1589 arc_buf_t *buf;
1590
1591 ASSERT3U(size, >, 0);
1592 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1593 ASSERT(BUF_EMPTY(hdr));
1594 hdr->b_size = size;
1595 hdr->b_type = type;
1596 hdr->b_spa = spa_load_guid(spa);
1597 hdr->b_state = arc_anon;
1598 hdr->b_arc_access = 0;
1599 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1600 buf->b_hdr = hdr;
1601 buf->b_data = NULL;
1602 buf->b_efunc = NULL;
1603 buf->b_private = NULL;
1604 buf->b_next = NULL;
1605 hdr->b_buf = buf;
1606 arc_get_data_buf(buf);
1607 hdr->b_datacnt = 1;
1608 hdr->b_flags = 0;
1609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1610 (void) refcount_add(&hdr->b_refcnt, tag);
1611
1612 return (buf);
1613 }
1614
1615 /*
1616 * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1617 * This is used during l2arc reconstruction to make empty ARC buffers
1618 * which circumvent the regular disk->arc->l2arc path and instead come
1619 * into being in the reverse order, i.e. l2arc->arc->(disk).
1620 */
1621 arc_buf_hdr_t *
1622 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1623 {
1624 arc_buf_hdr_t *hdr;
1625
1626 ASSERT3U(size, >, 0);
1627 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1628 ASSERT(BUF_EMPTY(hdr));
1629 hdr->b_size = size;
1630 hdr->b_type = type;
1631 hdr->b_spa = guid;
1632 hdr->b_state = arc_anon;
1633 hdr->b_arc_access = 0;
1634 hdr->b_buf = NULL;
1635 hdr->b_datacnt = 0;
1636 hdr->b_flags = 0;
1637 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1638
1639 return (hdr);
1640 }
1641
1642 static char *arc_onloan_tag = "onloan";
1643
1644 /*
1645 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1646 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1647 * buffers must be returned to the arc before they can be used by the DMU or
1648 * freed.
1649 */
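/*
 * A rough caller-side sketch of the loan cycle (illustrative only; the
 * tag name and the layer doing the borrowing are made up, not a claim
 * about any particular DMU routine):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, blksz);
 *	... fill abuf->b_data with blksz bytes ...
 *	arc_return_buf(abuf, my_tag);	the buffer is now held under my_tag
 *					and is again counted by the arc
 */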
1650 arc_buf_t *
1651 arc_loan_buf(spa_t *spa, int size)
1652 {
1653 arc_buf_t *buf;
1654
1655 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1656
1657 atomic_add_64(&arc_loaned_bytes, size);
1658 return (buf);
1659 }
1660
1661 /*
1662 * Return a loaned arc buffer to the arc.
1663 */
1664 void
1665 arc_return_buf(arc_buf_t *buf, void *tag)
1666 {
1667 arc_buf_hdr_t *hdr = buf->b_hdr;
1668
1669 ASSERT(buf->b_data != NULL);
1670 (void) refcount_add(&hdr->b_refcnt, tag);
1671 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1672
1673 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1674 }
1675
1676 /* Detach an arc_buf from a dbuf (tag) */
1677 void
1678 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1679 {
1680 arc_buf_hdr_t *hdr;
1681
1682 ASSERT(buf->b_data != NULL);
1683 hdr = buf->b_hdr;
1684 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1685 (void) refcount_remove(&hdr->b_refcnt, tag);
1686 buf->b_efunc = NULL;
1687 buf->b_private = NULL;
1688
1689 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1690 }
1691
1692 static arc_buf_t *
1693 arc_buf_clone(arc_buf_t *from)
1694 {
1695 arc_buf_t *buf;
1696 arc_buf_hdr_t *hdr = from->b_hdr;
1697 uint64_t size = hdr->b_size;
1698
1699 ASSERT(hdr->b_state != arc_anon);
1700
1701 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1702 buf->b_hdr = hdr;
1703 buf->b_data = NULL;
1704 buf->b_efunc = NULL;
1705 buf->b_private = NULL;
1706 buf->b_next = hdr->b_buf;
1707 hdr->b_buf = buf;
1708 arc_get_data_buf(buf);
1709 bcopy(from->b_data, buf->b_data, size);
1710
1711 /*
1712 * This buffer already exists in the arc so create a duplicate
1713 * copy for the caller. If the buffer is associated with user data
1714 * then track the size and number of duplicates. These stats will be
1715 * updated as duplicate buffers are created and destroyed.
1716 */
1717 if (hdr->b_type == ARC_BUFC_DATA) {
1718 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1719 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1720 }
1721 hdr->b_datacnt += 1;
1722 return (buf);
1723 }
1724
1725 void
1726 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1727 {
1728 arc_buf_hdr_t *hdr;
1729 kmutex_t *hash_lock;
1730
1731 /*
1732 * Check to see if this buffer is evicted. Callers
1733 * must verify b_data != NULL to know if the add_ref
1734 * was successful.
1735 */
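/*
 * A caller-side sketch (illustrative; "db" stands for whatever structure
 * owns the buffer, not a specific dbuf field):
 *
 *	arc_buf_add_ref(db->db_buf, db);
 *	if (db->db_buf->b_data == NULL)
 *		... the buffer was evicted underneath us; treat this as a
 *		    miss and re-read the block ...
 */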
1736 mutex_enter(&buf->b_evict_lock);
1737 if (buf->b_data == NULL) {
1738 mutex_exit(&buf->b_evict_lock);
1739 return;
1740 }
1741 hash_lock = HDR_LOCK(buf->b_hdr);
1742 mutex_enter(hash_lock);
1743 hdr = buf->b_hdr;
1744 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1745 mutex_exit(&buf->b_evict_lock);
1746
1747 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1748 add_reference(hdr, hash_lock, tag);
1749 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1750 arc_access(hdr, hash_lock);
1751 mutex_exit(hash_lock);
1752 ARCSTAT_BUMP(arcstat_hits);
1753 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1754 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1755 data, metadata, hits);
1756 }
1757
1758 /*
1759 * Free the arc data buffer. If it is an l2arc write in progress,
1760 * the buffer is placed on l2arc_free_on_write to be freed later.
1761 */
1762 static void
1763 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1764 {
1765 arc_buf_hdr_t *hdr = buf->b_hdr;
1766
1767 if (HDR_L2_WRITING(hdr)) {
1768 l2arc_data_free_t *df;
1769 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1770 df->l2df_data = buf->b_data;
1771 df->l2df_size = hdr->b_size;
1772 df->l2df_func = free_func;
1773 mutex_enter(&l2arc_free_on_write_mtx);
1774 list_insert_head(l2arc_free_on_write, df);
1775 mutex_exit(&l2arc_free_on_write_mtx);
1776 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1777 } else {
1778 free_func(buf->b_data, hdr->b_size);
1779 }
1780 }
1781
1782 static void
1783 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1784 {
1785 arc_buf_t **bufp;
1786
1787 /* free up data associated with the buf */
1788 if (buf->b_data) {
1789 arc_state_t *state = buf->b_hdr->b_state;
1790 uint64_t size = buf->b_hdr->b_size;
1791 arc_buf_contents_t type = buf->b_hdr->b_type;
1792
1793 arc_cksum_verify(buf);
1794 arc_buf_unwatch(buf);
1795
1796 if (!recycle) {
1797 if (type == ARC_BUFC_METADATA) {
1798 arc_buf_data_free(buf, zio_buf_free);
1799 arc_space_return(size, ARC_SPACE_DATA);
1800 } else {
1801 ASSERT(type == ARC_BUFC_DATA);
1802 arc_buf_data_free(buf, zio_data_buf_free);
1803 ARCSTAT_INCR(arcstat_data_size, -size);
1804 atomic_add_64(&arc_size, -size);
1805 }
1806 }
1807 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1808 uint64_t *cnt = &state->arcs_lsize[type];
1809
1810 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1811 ASSERT(state != arc_anon);
1812
1813 ASSERT3U(*cnt, >=, size);
1814 atomic_add_64(cnt, -size);
1815 }
1816 ASSERT3U(state->arcs_size, >=, size);
1817 atomic_add_64(&state->arcs_size, -size);
1818 buf->b_data = NULL;
1819
1820 /*
1821 * If we're destroying a duplicate buffer make sure
1822 * that the appropriate statistics are updated.
1823 */
1824 if (buf->b_hdr->b_datacnt > 1 &&
1825 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1826 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1827 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1828 }
1829 ASSERT(buf->b_hdr->b_datacnt > 0);
1830 buf->b_hdr->b_datacnt -= 1;
1831 }
1832
1833 /* only remove the buf if requested */
1834 if (!all)
1835 return;
1836
1837 /* remove the buf from the hdr list */
1838 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1839 continue;
1840 *bufp = buf->b_next;
1841 buf->b_next = NULL;
1842
1843 ASSERT(buf->b_efunc == NULL);
1844
1845 /* clean up the buf */
1846 buf->b_hdr = NULL;
1847 kmem_cache_free(buf_cache, buf);
1848 }
1849
1850 static void
1851 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1852 {
1853 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1854 ASSERT3P(hdr->b_state, ==, arc_anon);
1855 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1856 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1857
1858 if (l2hdr != NULL) {
1859 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1860 /*
1861 * To prevent arc_free() and l2arc_evict() from
1862 * attempting to free the same buffer at the same time,
1863 * a FREE_IN_PROGRESS flag is given to arc_free() to
1864 * give it priority. l2arc_evict() can't destroy this
1865 * header while we are waiting on l2arc_buflist_mtx.
1866 *
1867 * The hdr may be removed from l2ad_buflist before we
1868 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1869 */
1870 if (!buflist_held) {
1871 mutex_enter(&l2arc_buflist_mtx);
1872 l2hdr = hdr->b_l2hdr;
1873 }
1874
1875 if (l2hdr != NULL) {
1876 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1877 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1878 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1879 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1880 if (hdr->b_state == arc_l2c_only)
1881 l2arc_hdr_stat_remove();
1882 hdr->b_l2hdr = NULL;
1883 }
1884
1885 if (!buflist_held)
1886 mutex_exit(&l2arc_buflist_mtx);
1887 }
1888
1889 if (!BUF_EMPTY(hdr)) {
1890 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1891 buf_discard_identity(hdr);
1892 }
1893 while (hdr->b_buf) {
1894 arc_buf_t *buf = hdr->b_buf;
1895
1896 if (buf->b_efunc) {
1897 mutex_enter(&arc_eviction_mtx);
1898 mutex_enter(&buf->b_evict_lock);
1899 ASSERT(buf->b_hdr != NULL);
1900 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1901 hdr->b_buf = buf->b_next;
1902 buf->b_hdr = &arc_eviction_hdr;
1903 buf->b_next = arc_eviction_list;
1904 arc_eviction_list = buf;
1905 mutex_exit(&buf->b_evict_lock);
1906 mutex_exit(&arc_eviction_mtx);
1907 } else {
1908 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1909 }
1910 }
1911 if (hdr->b_freeze_cksum != NULL) {
1912 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1913 hdr->b_freeze_cksum = NULL;
1914 }
1915 if (hdr->b_thawed) {
1916 kmem_free(hdr->b_thawed, 1);
1917 hdr->b_thawed = NULL;
1918 }
1919
1920 ASSERT(!list_link_active(&hdr->b_arc_node));
1921 ASSERT3P(hdr->b_hash_next, ==, NULL);
1922 ASSERT3P(hdr->b_acb, ==, NULL);
1923 kmem_cache_free(hdr_cache, hdr);
1924 }
1925
1926 void
1927 arc_buf_free(arc_buf_t *buf, void *tag)
1928 {
1929 arc_buf_hdr_t *hdr = buf->b_hdr;
1930 int hashed = hdr->b_state != arc_anon;
1931
1932 ASSERT(buf->b_efunc == NULL);
1933 ASSERT(buf->b_data != NULL);
1934
1935 if (hashed) {
1936 kmutex_t *hash_lock = HDR_LOCK(hdr);
1937
1938 mutex_enter(hash_lock);
1939 hdr = buf->b_hdr;
1940 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1941
1942 (void) remove_reference(hdr, hash_lock, tag);
1943 if (hdr->b_datacnt > 1) {
1944 arc_buf_destroy(buf, FALSE, TRUE);
1945 } else {
1946 ASSERT(buf == hdr->b_buf);
1947 ASSERT(buf->b_efunc == NULL);
1948 hdr->b_flags |= ARC_BUF_AVAILABLE;
1949 }
1950 mutex_exit(hash_lock);
1951 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1952 int destroy_hdr;
1953 /*
1954 * We are in the middle of an async write. Don't destroy
1955 * this buffer unless the write completes before we finish
1956 * decrementing the reference count.
1957 */
1958 mutex_enter(&arc_eviction_mtx);
1959 (void) remove_reference(hdr, NULL, tag);
1960 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1961 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1962 mutex_exit(&arc_eviction_mtx);
1963 if (destroy_hdr)
1964 arc_hdr_destroy(hdr);
1965 } else {
1966 if (remove_reference(hdr, NULL, tag) > 0)
1967 arc_buf_destroy(buf, FALSE, TRUE);
1968 else
1969 arc_hdr_destroy(hdr);
1970 }
1971 }
1972
1973 boolean_t
1974 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1975 {
1976 arc_buf_hdr_t *hdr = buf->b_hdr;
1977 kmutex_t *hash_lock = HDR_LOCK(hdr);
1978 boolean_t no_callback = (buf->b_efunc == NULL);
1979
1980 if (hdr->b_state == arc_anon) {
1981 ASSERT(hdr->b_datacnt == 1);
1982 arc_buf_free(buf, tag);
1983 return (no_callback);
1984 }
1985
1986 mutex_enter(hash_lock);
1987 hdr = buf->b_hdr;
1988 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1989 ASSERT(hdr->b_state != arc_anon);
1990 ASSERT(buf->b_data != NULL);
1991
1992 (void) remove_reference(hdr, hash_lock, tag);
1993 if (hdr->b_datacnt > 1) {
1994 if (no_callback)
1995 arc_buf_destroy(buf, FALSE, TRUE);
1996 } else if (no_callback) {
1997 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1998 ASSERT(buf->b_efunc == NULL);
1999 hdr->b_flags |= ARC_BUF_AVAILABLE;
2000 }
2001 ASSERT(no_callback || hdr->b_datacnt > 1 ||
2002 refcount_is_zero(&hdr->b_refcnt));
2003 mutex_exit(hash_lock);
2004 return (no_callback);
2005 }
2006
2007 int
2008 arc_buf_size(arc_buf_t *buf)
2009 {
2010 return (buf->b_hdr->b_size);
2011 }
2012
2013 /*
2014 * Called from the DMU to determine if the current buffer should be
2015 * evicted. In order to ensure proper locking, the eviction must be initiated
2016 * from the DMU. Return true if the buffer is associated with user data and
2017 * duplicate buffers still exist.
2018 */
2019 boolean_t
2020 arc_buf_eviction_needed(arc_buf_t *buf)
2021 {
2022 arc_buf_hdr_t *hdr;
2023 boolean_t evict_needed = B_FALSE;
2024
2025 if (zfs_disable_dup_eviction)
2026 return (B_FALSE);
2027
2028 mutex_enter(&buf->b_evict_lock);
2029 hdr = buf->b_hdr;
2030 if (hdr == NULL) {
2031 /*
2032 * We are in arc_do_user_evicts(); let that function
2033 * perform the eviction.
2034 */
2035 ASSERT(buf->b_data == NULL);
2036 mutex_exit(&buf->b_evict_lock);
2037 return (B_FALSE);
2038 } else if (buf->b_data == NULL) {
2039 /*
2040 * We have already been added to the arc eviction list;
2041 * recommend eviction.
2042 */
2043 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2044 mutex_exit(&buf->b_evict_lock);
2045 return (B_TRUE);
2046 }
2047
2048 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2049 evict_needed = B_TRUE;
2050
2051 mutex_exit(&buf->b_evict_lock);
2052 return (evict_needed);
2053 }
2054
2055 /*
2056 * Evict buffers from list until we've removed the specified number of
2057 * bytes. Move the removed buffers to the appropriate evict state.
2058 * If the recycle flag is set, then attempt to "recycle" a buffer:
2059 * - look for a buffer to evict that is `bytes' long.
2060 * - return the data block from this buffer rather than freeing it.
2061 * This flag is used by callers that are trying to make space for a
2062 * new buffer in a full arc cache.
2063 *
2064 * This function makes a "best effort". It skips over any buffers
2065 * it can't get a hash_lock on, and so may not catch all candidates.
2066 * It may also return without evicting as much space as requested.
2067 */
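/*
 * For example (a sketch of how arc_get_data_buf() below uses the recycle
 * flag), a caller needing a "size"-byte block of the given type might try:
 *
 *	buf->b_data = arc_evict(state, NULL, size, TRUE, type);
 *	if (buf->b_data == NULL)
 *		... recycling failed; allocate a fresh block instead ...
 */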
2068 static void *
2069 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2070 arc_buf_contents_t type)
2071 {
2072 arc_state_t *evicted_state;
2073 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2074 arc_buf_hdr_t *ab, *ab_prev = NULL;
2075 list_t *list = &state->arcs_list[type];
2076 kmutex_t *hash_lock;
2077 boolean_t have_lock;
2078 void *stolen = NULL;
2079
2080 ASSERT(state == arc_mru || state == arc_mfu);
2081
2082 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2083
2084 mutex_enter(&state->arcs_mtx);
2085 mutex_enter(&evicted_state->arcs_mtx);
2086
2087 for (ab = list_tail(list); ab; ab = ab_prev) {
2088 ab_prev = list_prev(list, ab);
2089 /* prefetch buffers have a minimum lifespan */
2090 if (HDR_IO_IN_PROGRESS(ab) ||
2091 (spa && ab->b_spa != spa) ||
2092 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2093 ddi_get_lbolt() - ab->b_arc_access <
2094 arc_min_prefetch_lifespan)) {
2095 skipped++;
2096 continue;
2097 }
2098 /* "lookahead" for better eviction candidate */
2099 if (recycle && ab->b_size != bytes &&
2100 ab_prev && ab_prev->b_size == bytes)
2101 continue;
2102 hash_lock = HDR_LOCK(ab);
2103 have_lock = MUTEX_HELD(hash_lock);
2104 if (have_lock || mutex_tryenter(hash_lock)) {
2105 ASSERT0(refcount_count(&ab->b_refcnt));
2106 ASSERT(ab->b_datacnt > 0);
2107 while (ab->b_buf) {
2108 arc_buf_t *buf = ab->b_buf;
2109 if (!mutex_tryenter(&buf->b_evict_lock)) {
2110 missed += 1;
2111 break;
2112 }
2113 if (buf->b_data) {
2114 bytes_evicted += ab->b_size;
2115 if (recycle && ab->b_type == type &&
2116 ab->b_size == bytes &&
2117 !HDR_L2_WRITING(ab)) {
2118 stolen = buf->b_data;
2119 recycle = FALSE;
2120 }
2121 }
2122 if (buf->b_efunc) {
2123 mutex_enter(&arc_eviction_mtx);
2124 arc_buf_destroy(buf,
2125 buf->b_data == stolen, FALSE);
2126 ab->b_buf = buf->b_next;
2127 buf->b_hdr = &arc_eviction_hdr;
2128 buf->b_next = arc_eviction_list;
2129 arc_eviction_list = buf;
2130 mutex_exit(&arc_eviction_mtx);
2131 mutex_exit(&buf->b_evict_lock);
2132 } else {
2133 mutex_exit(&buf->b_evict_lock);
2134 arc_buf_destroy(buf,
2135 buf->b_data == stolen, TRUE);
2136 }
2137 }
2138
2139 if (ab->b_l2hdr) {
2140 ARCSTAT_INCR(arcstat_evict_l2_cached,
2141 ab->b_size);
2142 } else {
2143 if (l2arc_write_eligible(ab->b_spa, ab)) {
2144 ARCSTAT_INCR(arcstat_evict_l2_eligible,
2145 ab->b_size);
2146 } else {
2147 ARCSTAT_INCR(
2148 arcstat_evict_l2_ineligible,
2149 ab->b_size);
2150 }
2151 }
2152
2153 if (ab->b_datacnt == 0) {
2154 arc_change_state(evicted_state, ab, hash_lock);
2155 ASSERT(HDR_IN_HASH_TABLE(ab));
2156 ab->b_flags |= ARC_IN_HASH_TABLE;
2157 ab->b_flags &= ~ARC_BUF_AVAILABLE;
2158 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2159 }
2160 if (!have_lock)
2161 mutex_exit(hash_lock);
2162 if (bytes >= 0 && bytes_evicted >= bytes)
2163 break;
2164 } else {
2165 missed += 1;
2166 }
2167 }
2168
2169 mutex_exit(&evicted_state->arcs_mtx);
2170 mutex_exit(&state->arcs_mtx);
2171
2172 if (bytes_evicted < bytes)
2173 dprintf("only evicted %lld bytes from %p",
2174 (longlong_t)bytes_evicted, state);
2175
2176 if (skipped)
2177 ARCSTAT_INCR(arcstat_evict_skip, skipped);
2178
2179 if (missed)
2180 ARCSTAT_INCR(arcstat_mutex_miss, missed);
2181
2182 /*
2183 * We have just evicted some data into the ghost state, make
2184 * sure we also adjust the ghost state size if necessary.
2185 */
2186 if (arc_no_grow &&
2187 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
2188 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
2189 arc_mru_ghost->arcs_size - arc_c;
2190
2191 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2192 int64_t todelete =
2193 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
2194 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
2195 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
2196 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
2197 arc_mru_ghost->arcs_size +
2198 arc_mfu_ghost->arcs_size - arc_c);
2199 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
2200 }
2201 }
2202
2203 return (stolen);
2204 }
2205
2206 /*
2207 * Remove buffers from list until we've removed the specified number of
2208 * bytes. Destroy the buffers that are removed.
2209 */
2210 static void
2211 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2212 {
2213 arc_buf_hdr_t *ab, *ab_prev;
2214 arc_buf_hdr_t marker = { 0 };
2215 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
2216 kmutex_t *hash_lock;
2217 uint64_t bytes_deleted = 0;
2218 uint64_t bufs_skipped = 0;
2219
2220 ASSERT(GHOST_STATE(state));
2221 top:
2222 mutex_enter(&state->arcs_mtx);
2223 for (ab = list_tail(list); ab; ab = ab_prev) {
2224 ab_prev = list_prev(list, ab);
2225 if (spa && ab->b_spa != spa)
2226 continue;
2227
2228 /* ignore markers */
2229 if (ab->b_spa == 0)
2230 continue;
2231
2232 hash_lock = HDR_LOCK(ab);
2233 /* caller may be trying to modify this buffer, skip it */
2234 if (MUTEX_HELD(hash_lock))
2235 continue;
2236 if (mutex_tryenter(hash_lock)) {
2237 ASSERT(!HDR_IO_IN_PROGRESS(ab));
2238 ASSERT(ab->b_buf == NULL);
2239 ARCSTAT_BUMP(arcstat_deleted);
2240 bytes_deleted += ab->b_size;
2241
2242 if (ab->b_l2hdr != NULL) {
2243 /*
2244 * This buffer is cached on the 2nd Level ARC;
2245 * don't destroy the header.
2246 */
2247 arc_change_state(arc_l2c_only, ab, hash_lock);
2248 mutex_exit(hash_lock);
2249 } else {
2250 arc_change_state(arc_anon, ab, hash_lock);
2251 mutex_exit(hash_lock);
2252 arc_hdr_destroy(ab);
2253 }
2254
2255 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2256 if (bytes >= 0 && bytes_deleted >= bytes)
2257 break;
2258 } else if (bytes < 0) {
2259 /*
2260 * Insert a list marker and then wait for the
2261 * hash lock to become available. Once it's
2262 * available, restart from where we left off.
2263 */
2264 list_insert_after(list, ab, &marker);
2265 mutex_exit(&state->arcs_mtx);
2266 mutex_enter(hash_lock);
2267 mutex_exit(hash_lock);
2268 mutex_enter(&state->arcs_mtx);
2269 ab_prev = list_prev(list, &marker);
2270 list_remove(list, &marker);
2271 } else
2272 bufs_skipped += 1;
2273 }
2274 mutex_exit(&state->arcs_mtx);
2275
2276 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2277 (bytes < 0 || bytes_deleted < bytes)) {
2278 list = &state->arcs_list[ARC_BUFC_METADATA];
2279 goto top;
2280 }
2281
2282 if (bufs_skipped) {
2283 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2284 ASSERT(bytes >= 0);
2285 }
2286
2287 if (bytes_deleted < bytes)
2288 dprintf("only deleted %lld bytes from %p",
2289 (longlong_t)bytes_deleted, state);
2290 }
2291
2292 static void
2293 arc_adjust(void)
2294 {
2295 int64_t adjustment, delta;
2296
2297 /*
2298 * Adjust MRU size
2299 */
2300
2301 adjustment = MIN((int64_t)(arc_size - arc_c),
2302 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2303 arc_p));
2304
2305 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2306 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2307 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2308 adjustment -= delta;
2309 }
2310
2311 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2312 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2313 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2314 ARC_BUFC_METADATA);
2315 }
2316
2317 /*
2318 * Adjust MFU size
2319 */
2320
2321 adjustment = arc_size - arc_c;
2322
2323 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2324 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2325 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2326 adjustment -= delta;
2327 }
2328
2329 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2330 int64_t delta = MIN(adjustment,
2331 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2332 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2333 ARC_BUFC_METADATA);
2334 }
2335
2336 /*
2337 * Adjust ghost lists
2338 */
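/*
 * The two passes below keep the ghost lists roughly bounded: first
 * mru + mru_ghost is trimmed back toward arc_c, then
 * mru_ghost + mfu_ghost is trimmed back toward arc_c.
 */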
2339
2340 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2341
2342 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2343 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2344 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2345 }
2346
2347 adjustment =
2348 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2349
2350 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2351 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2352 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2353 }
2354 }
2355
2356 static void
2357 arc_do_user_evicts(void)
2358 {
2359 mutex_enter(&arc_eviction_mtx);
2360 while (arc_eviction_list != NULL) {
2361 arc_buf_t *buf = arc_eviction_list;
2362 arc_eviction_list = buf->b_next;
2363 mutex_enter(&buf->b_evict_lock);
2364 buf->b_hdr = NULL;
2365 mutex_exit(&buf->b_evict_lock);
2366 mutex_exit(&arc_eviction_mtx);
2367
2368 if (buf->b_efunc != NULL)
2369 VERIFY(buf->b_efunc(buf) == 0);
2370
2371 buf->b_efunc = NULL;
2372 buf->b_private = NULL;
2373 kmem_cache_free(buf_cache, buf);
2374 mutex_enter(&arc_eviction_mtx);
2375 }
2376 mutex_exit(&arc_eviction_mtx);
2377 }
2378
2379 /*
2380 * Flush all *evictable* data from the cache for the given spa.
2381 * NOTE: this will not touch "active" (i.e. referenced) data.
2382 */
2383 void
2384 arc_flush(spa_t *spa)
2385 {
2386 uint64_t guid = 0;
2387
2388 if (spa)
2389 guid = spa_load_guid(spa);
2390
2391 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2392 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2393 if (spa)
2394 break;
2395 }
2396 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2397 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2398 if (spa)
2399 break;
2400 }
2401 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2402 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2403 if (spa)
2404 break;
2405 }
2406 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2407 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2408 if (spa)
2409 break;
2410 }
2411
2412 arc_evict_ghost(arc_mru_ghost, guid, -1);
2413 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2414
2415 mutex_enter(&arc_reclaim_thr_lock);
2416 arc_do_user_evicts();
2417 mutex_exit(&arc_reclaim_thr_lock);
2418 ASSERT(spa || arc_eviction_list == NULL);
2419 }
2420
2421 void
2422 arc_shrink(void)
2423 {
2424 if (arc_c > arc_c_min) {
2425 uint64_t to_free;
2426
2427 #ifdef _KERNEL
2428 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2429 #else
2430 to_free = arc_c >> arc_shrink_shift;
2431 #endif
2432 if (arc_c > arc_c_min + to_free)
2433 atomic_add_64(&arc_c, -to_free);
2434 else
2435 arc_c = arc_c_min;
2436
2437 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2438 if (arc_c > arc_size)
2439 arc_c = MAX(arc_size, arc_c_min);
2440 if (arc_p > arc_c)
2441 arc_p = (arc_c >> 1);
2442 ASSERT(arc_c >= arc_c_min);
2443 ASSERT((int64_t)arc_p >= 0);
2444 }
2445
2446 if (arc_size > arc_c)
2447 arc_adjust();
2448 }
2449
2450 /*
2451 * Determine if the system is under memory pressure and is asking
2452 * to reclaim memory. A return value of 1 indicates that the system
2453 * is under memory pressure and that the arc should adjust accordingly.
2454 */
2455 static int
2456 arc_reclaim_needed(void)
2457 {
2458 uint64_t extra;
2459
2460 #ifdef _KERNEL
2461
2462 if (needfree)
2463 return (1);
2464
2465 /*
2466 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2467 */
2468 extra = desfree;
2469
2470 /*
2471 * check that we're out of range of the pageout scanner. It starts to
2472 * schedule paging if freemem is less than lotsfree and needfree.
2473 * lotsfree is the high-water mark for pageout, and needfree is the
2474 * number of needed free pages. We add extra pages here to make sure
2475 * the scanner doesn't start up while we're freeing memory.
2476 */
2477 if (freemem < lotsfree + needfree + extra)
2478 return (1);
2479
2480 /*
2481 * check to make sure that swapfs has enough space so that anon
2482 * reservations can still succeed. anon_resvmem() checks that the
2483 * availrmem is greater than swapfs_minfree, and the number of reserved
2484 * swap pages. We also add a bit of extra here just to prevent
2485 * circumstances from getting really dire.
2486 */
2487 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2488 return (1);
2489
2490 #if defined(__i386)
2491 /*
2492 * If we're on an i386 platform, it's possible that we'll exhaust the
2493 * kernel heap space before we ever run out of available physical
2494 * memory. Most checks of the size of the heap_area compare against
2495 * tune.t_minarmem, which is the minimum available real memory that we
2496 * can have in the system. However, this is generally fixed at 25 pages
2497 * which is so low that it's useless. In this comparison, we seek to
2498 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2499 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2500 * free)
2501 */
2502 if (vmem_size(heap_arena, VMEM_FREE) <
2503 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2504 return (1);
2505 #endif
2506
2507 /*
2508 * If zio data pages are being allocated out of a separate heap segment,
2509 * then enforce that the size of available vmem for this arena remains
2510 * above about 1/16th free.
2511 *
2512 * Note: The 1/16th arena free requirement was put in place
2513 * to aggressively evict memory from the arc in order to avoid
2514 * memory fragmentation issues.
2515 */
2516 if (zio_arena != NULL &&
2517 vmem_size(zio_arena, VMEM_FREE) <
2518 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2519 return (1);
2520 #else
2521 if (spa_get_random(100) == 0)
2522 return (1);
2523 #endif
2524 return (0);
2525 }
2526
2527 static void
2528 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2529 {
2530 size_t i;
2531 kmem_cache_t *prev_cache = NULL;
2532 kmem_cache_t *prev_data_cache = NULL;
2533 extern kmem_cache_t *zio_buf_cache[];
2534 extern kmem_cache_t *zio_data_buf_cache[];
2535
2536 #ifdef _KERNEL
2537 if (arc_meta_used >= arc_meta_limit) {
2538 /*
2539 * We are exceeding our meta-data cache limit.
2540 * Purge some DNLC entries to release holds on meta-data.
2541 */
2542 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2543 }
2544 #if defined(__i386)
2545 /*
2546 * Reclaim unused memory from all kmem caches.
2547 */
2548 kmem_reap();
2549 #endif
2550 #endif
2551
2552 /*
2553 * An aggressive reclamation will shrink the cache size as well as
2554 * reap free buffers from the arc kmem caches.
2555 */
2556 if (strat == ARC_RECLAIM_AGGR)
2557 arc_shrink();
2558
2559 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2560 if (zio_buf_cache[i] != prev_cache) {
2561 prev_cache = zio_buf_cache[i];
2562 kmem_cache_reap_now(zio_buf_cache[i]);
2563 }
2564 if (zio_data_buf_cache[i] != prev_data_cache) {
2565 prev_data_cache = zio_data_buf_cache[i];
2566 kmem_cache_reap_now(zio_data_buf_cache[i]);
2567 }
2568 }
2569 kmem_cache_reap_now(buf_cache);
2570 kmem_cache_reap_now(hdr_cache);
2571
2572 /*
2573 * Ask the vmem arena to reclaim unused memory from its
2574 * quantum caches.
2575 */
2576 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2577 vmem_qcache_reap(zio_arena);
2578 }
2579
2580 static void
2581 arc_reclaim_thread(void)
2582 {
2583 clock_t growtime = 0;
2584 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2585 callb_cpr_t cpr;
2586
2587 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2588
2589 mutex_enter(&arc_reclaim_thr_lock);
2590 while (arc_thread_exit == 0) {
2591 if (arc_reclaim_needed()) {
2592
2593 if (arc_no_grow) {
2594 if (last_reclaim == ARC_RECLAIM_CONS) {
2595 last_reclaim = ARC_RECLAIM_AGGR;
2596 } else {
2597 last_reclaim = ARC_RECLAIM_CONS;
2598 }
2599 } else {
2600 arc_no_grow = TRUE;
2601 last_reclaim = ARC_RECLAIM_AGGR;
2602 membar_producer();
2603 }
2604
2605 /* reset the growth delay for every reclaim */
2606 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2607
2608 arc_kmem_reap_now(last_reclaim);
2609 arc_warm = B_TRUE;
2610
2611 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2612 arc_no_grow = FALSE;
2613 }
2614
2615 arc_adjust();
2616
2617 if (arc_eviction_list != NULL)
2618 arc_do_user_evicts();
2619
2620 /* block until needed, or one second, whichever is shorter */
2621 CALLB_CPR_SAFE_BEGIN(&cpr);
2622 (void) cv_timedwait(&arc_reclaim_thr_cv,
2623 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2624 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2625 }
2626
2627 arc_thread_exit = 0;
2628 cv_broadcast(&arc_reclaim_thr_cv);
2629 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2630 thread_exit();
2631 }
2632
2633 /*
2634 * Adapt arc info given the number of bytes we are trying to add and
2635 * the state that we are coming from. This function is only called
2636 * when we are adding new content to the cache.
2637 */
2638 static void
2639 arc_adapt(int bytes, arc_state_t *state)
2640 {
2641 int mult;
2642 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2643
2644 if (state == arc_l2c_only)
2645 return;
2646
2647 ASSERT(bytes > 0);
2648 /*
2649 * Adapt the target size of the MRU list:
2650 * - if we just hit in the MRU ghost list, then increase
2651 * the target size of the MRU list.
2652 * - if we just hit in the MFU ghost list, then increase
2653 * the target size of the MFU list by decreasing the
2654 * target size of the MRU list.
2655 */
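/*
 * Worked example (illustrative numbers): if the MFU ghost list is four
 * times the size of the MRU ghost list, a 16K hit in the MRU ghost list
 * grows arc_p by 16K * 4 = 64K, capped at arc_c - arc_p_min. The MFU
 * ghost case is symmetric: arc_p shrinks by bytes * mult, floored at
 * arc_p_min.
 */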
2656 if (state == arc_mru_ghost) {
2657 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2658 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2659 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2660
2661 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2662 } else if (state == arc_mfu_ghost) {
2663 uint64_t delta;
2664
2665 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2666 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2667 mult = MIN(mult, 10);
2668
2669 delta = MIN(bytes * mult, arc_p);
2670 arc_p = MAX(arc_p_min, arc_p - delta);
2671 }
2672 ASSERT((int64_t)arc_p >= 0);
2673
2674 if (arc_reclaim_needed()) {
2675 cv_signal(&arc_reclaim_thr_cv);
2676 return;
2677 }
2678
2679 if (arc_no_grow)
2680 return;
2681
2682 if (arc_c >= arc_c_max)
2683 return;
2684
2685 /*
2686 * If we're within (2 * maxblocksize) bytes of the target
2687 * cache size, increment the target cache size
2688 */
2689 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2690 atomic_add_64(&arc_c, (int64_t)bytes);
2691 if (arc_c > arc_c_max)
2692 arc_c = arc_c_max;
2693 else if (state == arc_anon)
2694 atomic_add_64(&arc_p, (int64_t)bytes);
2695 if (arc_p > arc_c)
2696 arc_p = arc_c;
2697 }
2698 ASSERT((int64_t)arc_p >= 0);
2699 }
2700
2701 /*
2702 * Check if the cache has reached its limits and eviction is required
2703 * prior to insert.
2704 */
2705 static int
2706 arc_evict_needed(arc_buf_contents_t type)
2707 {
2708 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2709 return (1);
2710
2711 if (arc_reclaim_needed())
2712 return (1);
2713
2714 return (arc_size > arc_c);
2715 }
2716
2717 /*
2718 * The buffer, supplied as the first argument, needs a data block.
2719 * So, if we are at cache max, determine which cache should be victimized.
2720 * We have the following cases:
2721 *
2722 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2723 * In this situation if we're out of space, but the resident size of the MFU is
2724 * under the limit, victimize the MFU cache to satisfy this insertion request.
2725 *
2726 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2727 * Here, we've used up all of the available space for the MRU, so we need to
2728 * evict from our own cache instead. Evict from the set of resident MRU
2729 * entries.
2730 *
2731 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2732 * c minus p represents the MFU space in the cache, since p is the size of the
2733 * cache that is dedicated to the MRU. In this situation there's still space on
2734 * the MFU side, so the MRU side needs to be victimized.
2735 *
2736 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2737 * MFU's resident set is consuming more space than it has been allotted. In
2738 * this situation, we must victimize our own cache, the MFU, for this insertion.
2739 */
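/*
 * A concrete (made up) example: with arc_c = 1GB and arc_p = 600MB, an
 * MRU insert while arc_anon + arc_mru already hold 700MB falls under
 * case 2 above (p <= sizeof(arc_anon + arc_mru)), so the MRU list itself
 * is victimized; if they held only 400MB it would be case 1 and the MFU
 * side would be victimized instead (assuming the MFU has enough
 * evictable data of this type).
 */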
2740 static void
2741 arc_get_data_buf(arc_buf_t *buf)
2742 {
2743 arc_state_t *state = buf->b_hdr->b_state;
2744 uint64_t size = buf->b_hdr->b_size;
2745 arc_buf_contents_t type = buf->b_hdr->b_type;
2746
2747 arc_adapt(size, state);
2748
2749 /*
2750 * We have not yet reached cache maximum size,
2751 * just allocate a new buffer.
2752 */
2753 if (!arc_evict_needed(type)) {
2754 if (type == ARC_BUFC_METADATA) {
2755 buf->b_data = zio_buf_alloc(size);
2756 arc_space_consume(size, ARC_SPACE_DATA);
2757 } else {
2758 ASSERT(type == ARC_BUFC_DATA);
2759 buf->b_data = zio_data_buf_alloc(size);
2760 ARCSTAT_INCR(arcstat_data_size, size);
2761 atomic_add_64(&arc_size, size);
2762 }
2763 goto out;
2764 }
2765
2766 /*
2767 * If we are prefetching from the mfu ghost list, this buffer
2768 * will end up on the mru list; so steal space from there.
2769 */
2770 if (state == arc_mfu_ghost)
2771 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2772 else if (state == arc_mru_ghost)
2773 state = arc_mru;
2774
2775 if (state == arc_mru || state == arc_anon) {
2776 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2777 state = (arc_mfu->arcs_lsize[type] >= size &&
2778 arc_p > mru_used) ? arc_mfu : arc_mru;
2779 } else {
2780 /* MFU cases */
2781 uint64_t mfu_space = arc_c - arc_p;
2782 state = (arc_mru->arcs_lsize[type] >= size &&
2783 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2784 }
2785 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2786 if (type == ARC_BUFC_METADATA) {
2787 buf->b_data = zio_buf_alloc(size);
2788 arc_space_consume(size, ARC_SPACE_DATA);
2789 } else {
2790 ASSERT(type == ARC_BUFC_DATA);
2791 buf->b_data = zio_data_buf_alloc(size);
2792 ARCSTAT_INCR(arcstat_data_size, size);
2793 atomic_add_64(&arc_size, size);
2794 }
2795 ARCSTAT_BUMP(arcstat_recycle_miss);
2796 }
2797 ASSERT(buf->b_data != NULL);
2798 out:
2799 /*
2800 * Update the state size. Note that ghost states have a
2801 * "ghost size" and so don't need to be updated.
2802 */
2803 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2804 arc_buf_hdr_t *hdr = buf->b_hdr;
2805
2806 atomic_add_64(&hdr->b_state->arcs_size, size);
2807 if (list_link_active(&hdr->b_arc_node)) {
2808 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2809 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2810 }
2811 /*
2812 * If we are growing the cache, and we are adding anonymous
2813 * data, and we have outgrown arc_p, update arc_p
2814 */
2815 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2816 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2817 arc_p = MIN(arc_c, arc_p + size);
2818 }
2819 }
2820
2821 /*
2822 * This routine is called whenever a buffer is accessed.
2823 * NOTE: the hash lock is dropped in this function.
2824 */
2825 static void
2826 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2827 {
2828 clock_t now;
2829
2830 ASSERT(MUTEX_HELD(hash_lock));
2831
2832 if (buf->b_state == arc_anon) {
2833 /*
2834 * This buffer is not in the cache, and does not
2835 * appear in our "ghost" list. Add the new buffer
2836 * to the MRU state.
2837 */
2838
2839 ASSERT(buf->b_arc_access == 0);
2840 buf->b_arc_access = ddi_get_lbolt();
2841 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2842 arc_change_state(arc_mru, buf, hash_lock);
2843
2844 } else if (buf->b_state == arc_mru) {
2845 now = ddi_get_lbolt();
2846
2847 /*
2848 * If this buffer is here because of a prefetch, then either:
2849 * - clear the flag if this is a "referencing" read
2850 * (any subsequent access will bump this into the MFU state).
2851 * or
2852 * - move the buffer to the head of the list if this is
2853 * another prefetch (to make it less likely to be evicted).
2854 */
2855 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2856 if (refcount_count(&buf->b_refcnt) == 0) {
2857 ASSERT(list_link_active(&buf->b_arc_node));
2858 } else {
2859 buf->b_flags &= ~ARC_PREFETCH;
2860 ARCSTAT_BUMP(arcstat_mru_hits);
2861 }
2862 buf->b_arc_access = now;
2863 return;
2864 }
2865
2866 /*
2867 * This buffer has been "accessed" only once so far,
2868 * but it is still in the cache. Move it to the MFU
2869 * state.
2870 */
2871 if (now > buf->b_arc_access + ARC_MINTIME) {
2872 /*
2873 * More than 125ms have passed since we
2874 * instantiated this buffer. Move it to the
2875 * most frequently used state.
2876 */
2877 buf->b_arc_access = now;
2878 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2879 arc_change_state(arc_mfu, buf, hash_lock);
2880 }
2881 ARCSTAT_BUMP(arcstat_mru_hits);
2882 } else if (buf->b_state == arc_mru_ghost) {
2883 arc_state_t *new_state;
2884 /*
2885 * This buffer has been "accessed" recently, but
2886 * was evicted from the cache. Move it to the
2887 * MFU state.
2888 */
2889
2890 if (buf->b_flags & ARC_PREFETCH) {
2891 new_state = arc_mru;
2892 if (refcount_count(&buf->b_refcnt) > 0)
2893 buf->b_flags &= ~ARC_PREFETCH;
2894 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2895 } else {
2896 new_state = arc_mfu;
2897 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2898 }
2899
2900 buf->b_arc_access = ddi_get_lbolt();
2901 arc_change_state(new_state, buf, hash_lock);
2902
2903 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2904 } else if (buf->b_state == arc_mfu) {
2905 /*
2906 * This buffer has been accessed more than once and is
2907 * still in the cache. Keep it in the MFU state.
2908 *
2909 * NOTE: an add_reference() that occurred when we did
2910 * the arc_read() will have kicked this off the list.
2911 * If it was a prefetch, we will explicitly move it to
2912 * the head of the list now.
2913 */
2914 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2915 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2916 ASSERT(list_link_active(&buf->b_arc_node));
2917 }
2918 ARCSTAT_BUMP(arcstat_mfu_hits);
2919 buf->b_arc_access = ddi_get_lbolt();
2920 } else if (buf->b_state == arc_mfu_ghost) {
2921 arc_state_t *new_state = arc_mfu;
2922 /*
2923 * This buffer has been accessed more than once but has
2924 * been evicted from the cache. Move it back to the
2925 * MFU state.
2926 */
2927
2928 if (buf->b_flags & ARC_PREFETCH) {
2929 /*
2930 * This is a prefetch access...
2931 * move this block back to the MRU state.
2932 */
2933 ASSERT0(refcount_count(&buf->b_refcnt));
2934 new_state = arc_mru;
2935 }
2936
2937 buf->b_arc_access = ddi_get_lbolt();
2938 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2939 arc_change_state(new_state, buf, hash_lock);
2940
2941 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2942 } else if (buf->b_state == arc_l2c_only) {
2943 /*
2944 * This buffer is on the 2nd Level ARC.
2945 */
2946
2947 buf->b_arc_access = ddi_get_lbolt();
2948 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2949 arc_change_state(arc_mfu, buf, hash_lock);
2950 } else {
2951 ASSERT(!"invalid arc state");
2952 }
2953 }
2954
2955 /* a generic arc_done_func_t which you can use */
2956 /* ARGSUSED */
2957 void
2958 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2959 {
2960 if (zio == NULL || zio->io_error == 0)
2961 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2962 VERIFY(arc_buf_remove_ref(buf, arg));
2963 }
2964
2965 /* a generic arc_done_func_t */
2966 void
2967 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2968 {
2969 arc_buf_t **bufp = arg;
2970 if (zio && zio->io_error) {
2971 VERIFY(arc_buf_remove_ref(buf, arg));
2972 *bufp = NULL;
2973 } else {
2974 *bufp = buf;
2975 ASSERT(buf->b_data);
2976 }
2977 }
2978
2979 static void
2980 arc_read_done(zio_t *zio)
2981 {
2982 arc_buf_hdr_t *hdr, *found;
2983 arc_buf_t *buf;
2984 arc_buf_t *abuf; /* buffer we're assigning to callback */
2985 kmutex_t *hash_lock;
2986 arc_callback_t *callback_list, *acb;
2987 int freeable = FALSE;
2988
2989 buf = zio->io_private;
2990 hdr = buf->b_hdr;
2991
2992 /*
2993 * The hdr was inserted into hash-table and removed from lists
2994 * prior to starting I/O. We should find this header, since
2995 * it's in the hash table, and it should be legit since it's
2996 * not possible to evict it during the I/O. The only possible
2997 * reason for it not to be found is if we were freed during the
2998 * read.
2999 */
3000 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
3001 &hash_lock);
3002
3003 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
3004 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3005 (found == hdr && HDR_L2_READING(hdr)));
3006
3007 hdr->b_flags &= ~ARC_L2_EVICTED;
3008 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3009 hdr->b_flags &= ~ARC_L2CACHE;
3010
3011 /* byteswap if necessary */
3012 callback_list = hdr->b_acb;
3013 ASSERT(callback_list != NULL);
3014 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3015 dmu_object_byteswap_t bswap =
3016 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3017 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3018 byteswap_uint64_array :
3019 dmu_ot_byteswap[bswap].ob_func;
3020 func(buf->b_data, hdr->b_size);
3021 }
3022
3023 arc_cksum_compute(buf, B_FALSE);
3024 arc_buf_watch(buf);
3025
3026 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3027 /*
3028 * Only call arc_access on anonymous buffers. This is because
3029 * if we've issued an I/O for an evicted buffer, we've already
3030 * called arc_access (to prevent any simultaneous readers from
3031 * getting confused).
3032 */
3033 arc_access(hdr, hash_lock);
3034 }
3035
3036 /* create copies of the data buffer for the callers */
3037 abuf = buf;
3038 for (acb = callback_list; acb; acb = acb->acb_next) {
3039 if (acb->acb_done) {
3040 if (abuf == NULL) {
3041 ARCSTAT_BUMP(arcstat_duplicate_reads);
3042 abuf = arc_buf_clone(buf);
3043 }
3044 acb->acb_buf = abuf;
3045 abuf = NULL;
3046 }
3047 }
3048 hdr->b_acb = NULL;
3049 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3050 ASSERT(!HDR_BUF_AVAILABLE(hdr));
3051 if (abuf == buf) {
3052 ASSERT(buf->b_efunc == NULL);
3053 ASSERT(hdr->b_datacnt == 1);
3054 hdr->b_flags |= ARC_BUF_AVAILABLE;
3055 }
3056
3057 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3058
3059 if (zio->io_error != 0) {
3060 hdr->b_flags |= ARC_IO_ERROR;
3061 if (hdr->b_state != arc_anon)
3062 arc_change_state(arc_anon, hdr, hash_lock);
3063 if (HDR_IN_HASH_TABLE(hdr))
3064 buf_hash_remove(hdr);
3065 freeable = refcount_is_zero(&hdr->b_refcnt);
3066 }
3067
3068 /*
3069 * Broadcast before we drop the hash_lock to avoid the possibility
3070 * that the hdr (and hence the cv) might be freed before we get to
3071 * the cv_broadcast().
3072 */
3073 cv_broadcast(&hdr->b_cv);
3074
3075 if (hash_lock) {
3076 mutex_exit(hash_lock);
3077 } else {
3078 /*
3079 * This block was freed while we waited for the read to
3080 * complete. It has been removed from the hash table and
3081 * moved to the anonymous state (so that it won't show up
3082 * in the cache).
3083 */
3084 ASSERT3P(hdr->b_state, ==, arc_anon);
3085 freeable = refcount_is_zero(&hdr->b_refcnt);
3086 }
3087
3088 /* execute each callback and free its structure */
3089 while ((acb = callback_list) != NULL) {
3090 if (acb->acb_done)
3091 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3092
3093 if (acb->acb_zio_dummy != NULL) {
3094 acb->acb_zio_dummy->io_error = zio->io_error;
3095 zio_nowait(acb->acb_zio_dummy);
3096 }
3097
3098 callback_list = acb->acb_next;
3099 kmem_free(acb, sizeof (arc_callback_t));
3100 }
3101
3102 if (freeable)
3103 arc_hdr_destroy(hdr);
3104 }
3105
3106 /*
3107 * "Read" the block at the specified DVA (in bp) via the
3108 * cache. If the block is found in the cache, invoke the provided
3109 * callback immediately and return. Note that the `zio' parameter
3110 * in the callback will be NULL in this case, since no IO was
3111 * required. If the block is not in the cache pass the read request
3112 * on to the spa with a substitute callback function, so that the
3113 * requested block will be added to the cache.
3114 *
3115 * If a read request arrives for a block that has a read in-progress,
3116 * either wait for the in-progress read to complete (and return the
3117 * results); or, if this is a read with a "done" func, add a record
3118 * to the read to invoke the "done" func when the read completes,
3119 * and return; or just return.
3120 *
3121 * arc_read_done() will invoke all the requested "done" functions
3122 * for readers of this block.
3123 */
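/*
 * A minimal caller-side sketch of a synchronous read (illustrative only;
 * error handling and bookmark setup are elided, and the priority and
 * zio flag shown are merely plausible choices, not a prescription):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *
 *	(void) arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (abuf != NULL) {
 *		... consume abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */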
3124 int
3125 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3126 void *private, int priority, int zio_flags, uint32_t *arc_flags,
3127 const zbookmark_t *zb)
3128 {
3129 arc_buf_hdr_t *hdr;
3130 arc_buf_t *buf = NULL;
3131 kmutex_t *hash_lock;
3132 zio_t *rzio;
3133 uint64_t guid = spa_load_guid(spa);
3134
3135 top:
3136 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3137 &hash_lock);
3138 if (hdr && hdr->b_datacnt > 0) {
3139
3140 *arc_flags |= ARC_CACHED;
3141
3142 if (HDR_IO_IN_PROGRESS(hdr)) {
3143
3144 if (*arc_flags & ARC_WAIT) {
3145 cv_wait(&hdr->b_cv, hash_lock);
3146 mutex_exit(hash_lock);
3147 goto top;
3148 }
3149 ASSERT(*arc_flags & ARC_NOWAIT);
3150
3151 if (done) {
3152 arc_callback_t *acb = NULL;
3153
3154 acb = kmem_zalloc(sizeof (arc_callback_t),
3155 KM_SLEEP);
3156 acb->acb_done = done;
3157 acb->acb_private = private;
3158 if (pio != NULL)
3159 acb->acb_zio_dummy = zio_null(pio,
3160 spa, NULL, NULL, NULL, zio_flags);
3161
3162 ASSERT(acb->acb_done != NULL);
3163 acb->acb_next = hdr->b_acb;
3164 hdr->b_acb = acb;
3165 add_reference(hdr, hash_lock, private);
3166 mutex_exit(hash_lock);
3167 return (0);
3168 }
3169 mutex_exit(hash_lock);
3170 return (0);
3171 }
3172
3173 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3174
3175 if (done) {
3176 add_reference(hdr, hash_lock, private);
3177 /*
3178 * If this block is already in use, create a new
3179 * copy of the data so that we will be guaranteed
3180 * that arc_release() will always succeed.
3181 */
3182 buf = hdr->b_buf;
3183 ASSERT(buf);
3184 ASSERT(buf->b_data);
3185 if (HDR_BUF_AVAILABLE(hdr)) {
3186 ASSERT(buf->b_efunc == NULL);
3187 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3188 } else {
3189 buf = arc_buf_clone(buf);
3190 }
3191
3192 } else if (*arc_flags & ARC_PREFETCH &&
3193 refcount_count(&hdr->b_refcnt) == 0) {
3194 hdr->b_flags |= ARC_PREFETCH;
3195 }
3196 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3197 arc_access(hdr, hash_lock);
3198 if (*arc_flags & ARC_L2CACHE)
3199 hdr->b_flags |= ARC_L2CACHE;
3200 if (*arc_flags & ARC_L2COMPRESS)
3201 hdr->b_flags |= ARC_L2COMPRESS;
3202 mutex_exit(hash_lock);
3203 ARCSTAT_BUMP(arcstat_hits);
3204 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3205 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3206 data, metadata, hits);
3207
3208 if (done)
3209 done(NULL, buf, private);
3210 } else {
3211 uint64_t size = BP_GET_LSIZE(bp);
3212 arc_callback_t *acb;
3213 vdev_t *vd = NULL;
3214 uint64_t addr = 0;
3215 boolean_t devw = B_FALSE;
3216
3217 if (hdr == NULL) {
3218 /* this block is not in the cache */
3219 arc_buf_hdr_t *exists;
3220 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3221 buf = arc_buf_alloc(spa, size, private, type);
3222 hdr = buf->b_hdr;
3223 hdr->b_dva = *BP_IDENTITY(bp);
3224 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3225 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3226 exists = buf_hash_insert(hdr, &hash_lock);
3227 if (exists) {
3228 /* somebody beat us to the hash insert */
3229 mutex_exit(hash_lock);
3230 buf_discard_identity(hdr);
3231 (void) arc_buf_remove_ref(buf, private);
3232 goto top; /* restart the IO request */
3233 }
3234 /* if this is a prefetch, we don't have a reference */
3235 if (*arc_flags & ARC_PREFETCH) {
3236 (void) remove_reference(hdr, hash_lock,
3237 private);
3238 hdr->b_flags |= ARC_PREFETCH;
3239 }
3240 if (*arc_flags & ARC_L2CACHE)
3241 hdr->b_flags |= ARC_L2CACHE;
3242 if (*arc_flags & ARC_L2COMPRESS)
3243 hdr->b_flags |= ARC_L2COMPRESS;
3244 if (BP_GET_LEVEL(bp) > 0)
3245 hdr->b_flags |= ARC_INDIRECT;
3246 } else {
3247 /* this block is in the ghost cache */
3248 ASSERT(GHOST_STATE(hdr->b_state));
3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3250 ASSERT0(refcount_count(&hdr->b_refcnt));
3251 ASSERT(hdr->b_buf == NULL);
3252
3253 /* if this is a prefetch, we don't have a reference */
3254 if (*arc_flags & ARC_PREFETCH)
3255 hdr->b_flags |= ARC_PREFETCH;
3256 else
3257 add_reference(hdr, hash_lock, private);
3258 if (*arc_flags & ARC_L2CACHE)
3259 hdr->b_flags |= ARC_L2CACHE;
3260 if (*arc_flags & ARC_L2COMPRESS)
3261 hdr->b_flags |= ARC_L2COMPRESS;
3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3263 buf->b_hdr = hdr;
3264 buf->b_data = NULL;
3265 buf->b_efunc = NULL;
3266 buf->b_private = NULL;
3267 buf->b_next = NULL;
3268 hdr->b_buf = buf;
3269 ASSERT(hdr->b_datacnt == 0);
3270 hdr->b_datacnt = 1;
3271 arc_get_data_buf(buf);
3272 arc_access(hdr, hash_lock);
3273 }
3274
3275 ASSERT(!GHOST_STATE(hdr->b_state));
3276
3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3278 acb->acb_done = done;
3279 acb->acb_private = private;
3280
3281 ASSERT(hdr->b_acb == NULL);
3282 hdr->b_acb = acb;
3283 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3284
3285 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3288 addr = hdr->b_l2hdr->b_daddr;
3289 /*
3290 * Lock out device removal.
3291 */
3292 if (vdev_is_dead(vd) ||
3293 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3294 vd = NULL;
3295 }
3296
3297 mutex_exit(hash_lock);
3298
3299 /*
3300 * At this point, we have a level 1 cache miss. Try again in
3301 * L2ARC if possible.
3302 */
3303 ASSERT3U(hdr->b_size, ==, size);
3304 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3305 uint64_t, size, zbookmark_t *, zb);
3306 ARCSTAT_BUMP(arcstat_misses);
3307 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3308 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3309 data, metadata, misses);
3310
3311 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3312 /*
3313 * Read from the L2ARC if the following are true:
3314 * 1. The L2ARC vdev was previously cached.
3315 * 2. This buffer still has L2ARC metadata.
3316 * 3. This buffer isn't currently writing to the L2ARC.
3317 * 4. The L2ARC entry wasn't evicted, which may
3318 * also have invalidated the vdev.
3319 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3320 */
3321 if (hdr->b_l2hdr != NULL &&
3322 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3323 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3324 l2arc_read_callback_t *cb;
3325
3326 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3327 ARCSTAT_BUMP(arcstat_l2_hits);
3328
3329 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3330 KM_SLEEP);
3331 cb->l2rcb_buf = buf;
3332 cb->l2rcb_spa = spa;
3333 cb->l2rcb_bp = *bp;
3334 cb->l2rcb_zb = *zb;
3335 cb->l2rcb_flags = zio_flags;
3336 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3337
3338 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3339 addr + size < vd->vdev_psize -
3340 VDEV_LABEL_END_SIZE);
3341
3342 /*
3343 * l2arc read. The SCL_L2ARC lock will be
3344 * released by l2arc_read_done().
3345 * Issue a null zio if the underlying buffer
3346 * was squashed to zero size by compression.
3347 */
3348 if (hdr->b_l2hdr->b_compress ==
3349 ZIO_COMPRESS_EMPTY) {
3350 rzio = zio_null(pio, spa, vd,
3351 l2arc_read_done, cb,
3352 zio_flags | ZIO_FLAG_DONT_CACHE |
3353 ZIO_FLAG_CANFAIL |
3354 ZIO_FLAG_DONT_PROPAGATE |
3355 ZIO_FLAG_DONT_RETRY);
3356 } else {
3357 rzio = zio_read_phys(pio, vd, addr,
3358 hdr->b_l2hdr->b_asize,
3359 buf->b_data, ZIO_CHECKSUM_OFF,
3360 l2arc_read_done, cb, priority,
3361 zio_flags | ZIO_FLAG_DONT_CACHE |
3362 ZIO_FLAG_CANFAIL |
3363 ZIO_FLAG_DONT_PROPAGATE |
3364 ZIO_FLAG_DONT_RETRY, B_FALSE);
3365 }
3366 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3367 zio_t *, rzio);
3368 ARCSTAT_INCR(arcstat_l2_read_bytes,
3369 hdr->b_l2hdr->b_asize);
3370
3371 if (*arc_flags & ARC_NOWAIT) {
3372 zio_nowait(rzio);
3373 return (0);
3374 }
3375
3376 ASSERT(*arc_flags & ARC_WAIT);
3377 if (zio_wait(rzio) == 0)
3378 return (0);
3379
3380 /* l2arc read error; goto zio_read() */
3381 } else {
3382 DTRACE_PROBE1(l2arc__miss,
3383 arc_buf_hdr_t *, hdr);
3384 ARCSTAT_BUMP(arcstat_l2_misses);
3385 if (HDR_L2_WRITING(hdr))
3386 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3387 spa_config_exit(spa, SCL_L2ARC, vd);
3388 }
3389 } else {
3390 if (vd != NULL)
3391 spa_config_exit(spa, SCL_L2ARC, vd);
3392 if (l2arc_ndev != 0) {
3393 DTRACE_PROBE1(l2arc__miss,
3394 arc_buf_hdr_t *, hdr);
3395 ARCSTAT_BUMP(arcstat_l2_misses);
3396 }
3397 }
3398
3399 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3400 arc_read_done, buf, priority, zio_flags, zb);
3401
3402 if (*arc_flags & ARC_WAIT)
3403 return (zio_wait(rzio));
3404
3405 ASSERT(*arc_flags & ARC_NOWAIT);
3406 zio_nowait(rzio);
3407 }
3408 return (0);
3409 }
3410
3411 void
3412 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3413 {
3414 ASSERT(buf->b_hdr != NULL);
3415 ASSERT(buf->b_hdr->b_state != arc_anon);
3416 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3417 ASSERT(buf->b_efunc == NULL);
3418 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3419
3420 buf->b_efunc = func;
3421 buf->b_private = private;
3422 }
3423
3424 /*
3425 * Notify the arc that a block was freed, and thus will never be used again.
3426 */
3427 void
3428 arc_freed(spa_t *spa, const blkptr_t *bp)
3429 {
3430 arc_buf_hdr_t *hdr;
3431 kmutex_t *hash_lock;
3432 uint64_t guid = spa_load_guid(spa);
3433
3434 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3435 &hash_lock);
3436 if (hdr == NULL)
3437 return;
3438 if (HDR_BUF_AVAILABLE(hdr)) {
3439 arc_buf_t *buf = hdr->b_buf;
3440 add_reference(hdr, hash_lock, FTAG);
3441 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3442 mutex_exit(hash_lock);
3443
3444 arc_release(buf, FTAG);
3445 (void) arc_buf_remove_ref(buf, FTAG);
3446 } else {
3447 mutex_exit(hash_lock);
3448 }
3449
3450 }
3451
3452 /*
3453 * This is used by the DMU to let the ARC know that a buffer is
3454 * being evicted, so the ARC should clean up. If this arc buf
3455 * is not yet in the evicted state, it will be put there.
3456 */
3457 int
3458 arc_buf_evict(arc_buf_t *buf)
3459 {
3460 arc_buf_hdr_t *hdr;
3461 kmutex_t *hash_lock;
3462 arc_buf_t **bufp;
3463
3464 mutex_enter(&buf->b_evict_lock);
3465 hdr = buf->b_hdr;
3466 if (hdr == NULL) {
3467 /*
3468 * We are in arc_do_user_evicts().
3469 */
3470 ASSERT(buf->b_data == NULL);
3471 mutex_exit(&buf->b_evict_lock);
3472 return (0);
3473 } else if (buf->b_data == NULL) {
3474 arc_buf_t copy = *buf; /* structure assignment */
3475 /*
3476 * We are on the eviction list; process this buffer now
3477 * but let arc_do_user_evicts() do the reaping.
3478 */
3479 buf->b_efunc = NULL;
3480 mutex_exit(&buf->b_evict_lock);
3481 VERIFY(copy.b_efunc(&copy) == 0);
3482 return (1);
3483 }
3484 hash_lock = HDR_LOCK(hdr);
3485 mutex_enter(hash_lock);
3486 hdr = buf->b_hdr;
3487 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3488
3489 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3490 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3491
3492 /*
3493 * Pull this buffer off of the hdr
3494 */
3495 bufp = &hdr->b_buf;
3496 while (*bufp != buf)
3497 bufp = &(*bufp)->b_next;
3498 *bufp = buf->b_next;
3499
3500 ASSERT(buf->b_data != NULL);
3501 arc_buf_destroy(buf, FALSE, FALSE);
3502
3503 if (hdr->b_datacnt == 0) {
3504 arc_state_t *old_state = hdr->b_state;
3505 arc_state_t *evicted_state;
3506
3507 ASSERT(hdr->b_buf == NULL);
3508 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3509
3510 evicted_state =
3511 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3512
3513 mutex_enter(&old_state->arcs_mtx);
3514 mutex_enter(&evicted_state->arcs_mtx);
3515
3516 arc_change_state(evicted_state, hdr, hash_lock);
3517 ASSERT(HDR_IN_HASH_TABLE(hdr));
3518 hdr->b_flags |= ARC_IN_HASH_TABLE;
3519 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3520
3521 mutex_exit(&evicted_state->arcs_mtx);
3522 mutex_exit(&old_state->arcs_mtx);
3523 }
3524 mutex_exit(hash_lock);
3525 mutex_exit(&buf->b_evict_lock);
3526
3527 VERIFY(buf->b_efunc(buf) == 0);
3528 buf->b_efunc = NULL;
3529 buf->b_private = NULL;
3530 buf->b_hdr = NULL;
3531 buf->b_next = NULL;
3532 kmem_cache_free(buf_cache, buf);
3533 return (1);
3534 }
3535
3536 /*
3537 * Release this buffer from the cache, making it an anonymous buffer. This
3538 * must be done after a read and prior to modifying the buffer contents.
3539 * If the buffer has more than one reference, we must make
3540 * a new hdr for the buffer.
3541 */
3542 void
3543 arc_release(arc_buf_t *buf, void *tag)
3544 {
3545 arc_buf_hdr_t *hdr;
3546 kmutex_t *hash_lock = NULL;
3547 l2arc_buf_hdr_t *l2hdr;
3548 uint64_t buf_size;
3549
3550 /*
3551 * It would be nice to assert that if it's DMU metadata (level >
3552 * 0 || it's the dnode file), then it must be syncing context.
3553 * But we don't know that information at this level.
3554 */
3555
3556 mutex_enter(&buf->b_evict_lock);
3557 hdr = buf->b_hdr;
3558
3559 /* this buffer is not on any list */
3560 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3561
3562 if (hdr->b_state == arc_anon) {
3563 /* this buffer is already released */
3564 ASSERT(buf->b_efunc == NULL);
3565 } else {
3566 hash_lock = HDR_LOCK(hdr);
3567 mutex_enter(hash_lock);
3568 hdr = buf->b_hdr;
3569 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3570 }
3571
3572 l2hdr = hdr->b_l2hdr;
3573 if (l2hdr) {
3574 mutex_enter(&l2arc_buflist_mtx);
3575 hdr->b_l2hdr = NULL;
3576 }
3577 buf_size = hdr->b_size;
3578
3579 /*
3580 * Do we have more than one buf?
3581 */
3582 if (hdr->b_datacnt > 1) {
3583 arc_buf_hdr_t *nhdr;
3584 arc_buf_t **bufp;
3585 uint64_t blksz = hdr->b_size;
3586 uint64_t spa = hdr->b_spa;
3587 arc_buf_contents_t type = hdr->b_type;
3588 uint32_t flags = hdr->b_flags;
3589
3590 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3591 /*
3592 * Pull the data off of this hdr and attach it to
3593 * a new anonymous hdr.
3594 */
3595 (void) remove_reference(hdr, hash_lock, tag);
3596 bufp = &hdr->b_buf;
3597 while (*bufp != buf)
3598 bufp = &(*bufp)->b_next;
3599 *bufp = buf->b_next;
3600 buf->b_next = NULL;
3601
3602 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3603 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3604 if (refcount_is_zero(&hdr->b_refcnt)) {
3605 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3606 ASSERT3U(*size, >=, hdr->b_size);
3607 atomic_add_64(size, -hdr->b_size);
3608 }
3609
3610 /*
3611 * We're releasing a duplicate user data buffer, so update
3612 * our statistics accordingly.
3613 */
3614 if (hdr->b_type == ARC_BUFC_DATA) {
3615 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3616 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3617 -hdr->b_size);
3618 }
3619 hdr->b_datacnt -= 1;
3620 arc_cksum_verify(buf);
3621 arc_buf_unwatch(buf);
3622
3623 mutex_exit(hash_lock);
3624
3625 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3626 nhdr->b_size = blksz;
3627 nhdr->b_spa = spa;
3628 nhdr->b_type = type;
3629 nhdr->b_buf = buf;
3630 nhdr->b_state = arc_anon;
3631 nhdr->b_arc_access = 0;
3632 nhdr->b_flags = flags & ARC_L2_WRITING;
3633 nhdr->b_l2hdr = NULL;
3634 nhdr->b_datacnt = 1;
3635 nhdr->b_freeze_cksum = NULL;
3636 (void) refcount_add(&nhdr->b_refcnt, tag);
3637 buf->b_hdr = nhdr;
3638 mutex_exit(&buf->b_evict_lock);
3639 atomic_add_64(&arc_anon->arcs_size, blksz);
3640 } else {
3641 mutex_exit(&buf->b_evict_lock);
3642 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3643 ASSERT(!list_link_active(&hdr->b_arc_node));
3644 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3645 if (hdr->b_state != arc_anon)
3646 arc_change_state(arc_anon, hdr, hash_lock);
3647 hdr->b_arc_access = 0;
3648 if (hash_lock)
3649 mutex_exit(hash_lock);
3650
3651 buf_discard_identity(hdr);
3652 arc_buf_thaw(buf);
3653 }
3654 buf->b_efunc = NULL;
3655 buf->b_private = NULL;
3656
3657 if (l2hdr) {
3658 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3659 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3660 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3661 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3662 mutex_exit(&l2arc_buflist_mtx);
3663 }
3664 }
3665
3666 int
3667 arc_released(arc_buf_t *buf)
3668 {
3669 int released;
3670
3671 mutex_enter(&buf->b_evict_lock);
3672 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3673 mutex_exit(&buf->b_evict_lock);
3674 return (released);
3675 }
3676
3677 int
3678 arc_has_callback(arc_buf_t *buf)
3679 {
3680 int callback;
3681
3682 mutex_enter(&buf->b_evict_lock);
3683 callback = (buf->b_efunc != NULL);
3684 mutex_exit(&buf->b_evict_lock);
3685 return (callback);
3686 }
3687
3688 #ifdef ZFS_DEBUG
3689 int
3690 arc_referenced(arc_buf_t *buf)
3691 {
3692 int referenced;
3693
3694 mutex_enter(&buf->b_evict_lock);
3695 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3696 mutex_exit(&buf->b_evict_lock);
3697 return (referenced);
3698 }
3699 #endif
3700
3701 static void
3702 arc_write_ready(zio_t *zio)
3703 {
3704 arc_write_callback_t *callback = zio->io_private;
3705 arc_buf_t *buf = callback->awcb_buf;
3706 arc_buf_hdr_t *hdr = buf->b_hdr;
3707
3708 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3709 callback->awcb_ready(zio, buf, callback->awcb_private);
3710
3711 /*
3712 * If the IO is already in progress, then this is a re-write
3713 * attempt, so we need to thaw and re-compute the cksum.
3714 * It is the responsibility of the callback to handle the
3715 * accounting for any re-write attempt.
3716 */
3717 if (HDR_IO_IN_PROGRESS(hdr)) {
3718 mutex_enter(&hdr->b_freeze_lock);
3719 if (hdr->b_freeze_cksum != NULL) {
3720 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3721 hdr->b_freeze_cksum = NULL;
3722 }
3723 mutex_exit(&hdr->b_freeze_lock);
3724 }
3725 arc_cksum_compute(buf, B_FALSE);
3726 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3727 }
3728
3729 static void
3730 arc_write_done(zio_t *zio)
3731 {
3732 arc_write_callback_t *callback = zio->io_private;
3733 arc_buf_t *buf = callback->awcb_buf;
3734 arc_buf_hdr_t *hdr = buf->b_hdr;
3735
3736 ASSERT(hdr->b_acb == NULL);
3737
3738 if (zio->io_error == 0) {
3739 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3740 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3741 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3742 } else {
3743 ASSERT(BUF_EMPTY(hdr));
3744 }
3745
3746 /*
3747 * If the block to be written was all-zero, we may have
3748 * compressed it away. In this case no write was performed
3749 * so there will be no dva/birth/checksum. The buffer must
3750 * therefore remain anonymous (and uncached).
3751 */
3752 if (!BUF_EMPTY(hdr)) {
3753 arc_buf_hdr_t *exists;
3754 kmutex_t *hash_lock;
3755
3756 ASSERT(zio->io_error == 0);
3757
3758 arc_cksum_verify(buf);
3759
3760 exists = buf_hash_insert(hdr, &hash_lock);
3761 if (exists) {
3762 /*
3763 * This can only happen if we overwrite for
3764 * sync-to-convergence, because we remove
3765 * buffers from the hash table when we arc_free().
3766 */
3767 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3768 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3769 panic("bad overwrite, hdr=%p exists=%p",
3770 (void *)hdr, (void *)exists);
3771 ASSERT(refcount_is_zero(&exists->b_refcnt));
3772 arc_change_state(arc_anon, exists, hash_lock);
3773 mutex_exit(hash_lock);
3774 arc_hdr_destroy(exists);
3775 exists = buf_hash_insert(hdr, &hash_lock);
3776 ASSERT3P(exists, ==, NULL);
3777 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3778 /* nopwrite */
3779 ASSERT(zio->io_prop.zp_nopwrite);
3780 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3781 panic("bad nopwrite, hdr=%p exists=%p",
3782 (void *)hdr, (void *)exists);
3783 } else {
3784 /* Dedup */
3785 ASSERT(hdr->b_datacnt == 1);
3786 ASSERT(hdr->b_state == arc_anon);
3787 ASSERT(BP_GET_DEDUP(zio->io_bp));
3788 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3789 }
3790 }
3791 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3792 /* if it's not anon, we are doing a scrub */
3793 if (!exists && hdr->b_state == arc_anon)
3794 arc_access(hdr, hash_lock);
3795 mutex_exit(hash_lock);
3796 } else {
3797 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3798 }
3799
3800 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3801 callback->awcb_done(zio, buf, callback->awcb_private);
3802
3803 kmem_free(callback, sizeof (arc_write_callback_t));
3804 }
3805
3806 zio_t *
3807 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3808 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3809 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3810 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3811 {
3812 arc_buf_hdr_t *hdr = buf->b_hdr;
3813 arc_write_callback_t *callback;
3814 zio_t *zio;
3815
3816 ASSERT(ready != NULL);
3817 ASSERT(done != NULL);
3818 ASSERT(!HDR_IO_ERROR(hdr));
3819 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3820 ASSERT(hdr->b_acb == NULL);
3821 if (l2arc)
3822 hdr->b_flags |= ARC_L2CACHE;
3823 if (l2arc_compress)
3824 hdr->b_flags |= ARC_L2COMPRESS;
3825 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3826 callback->awcb_ready = ready;
3827 callback->awcb_done = done;
3828 callback->awcb_private = private;
3829 callback->awcb_buf = buf;
3830
3831 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3832 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3833
3834 return (zio);
3835 }
3836
3837 static int
3838 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3839 {
3840 #ifdef _KERNEL
3841 uint64_t available_memory = ptob(freemem);
3842 static uint64_t page_load = 0;
3843 static uint64_t last_txg = 0;
3844
3845 #if defined(__i386)
3846 available_memory =
3847 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3848 #endif
3849 if (available_memory >= zfs_write_limit_max)
3850 return (0);
3851
3852 if (txg > last_txg) {
3853 last_txg = txg;
3854 page_load = 0;
3855 }
3856 /*
3857 * If we are in pageout, we know that memory is already tight
3858 * and the ARC is already going to be evicting, so we just want
3859 * to continue to let page writes occur as quickly as possible.
3860 */
3861 if (curproc == proc_pageout) {
3862 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3863 return (SET_ERROR(ERESTART));
3864 /* Note: reserve is inflated, so we deflate */
3865 page_load += reserve / 8;
3866 return (0);
3867 } else if (page_load > 0 && arc_reclaim_needed()) {
3868 /* memory is low, delay before restarting */
3869 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3870 return (SET_ERROR(EAGAIN));
3871 }
3872 page_load = 0;
3873
3874 if (arc_size > arc_c_min) {
3875 uint64_t evictable_memory =
3876 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3877 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3878 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3879 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3880 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3881 }
3882
3883 if (inflight_data > available_memory / 4) {
3884 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3885 return (SET_ERROR(ERESTART));
3886 }
3887 #endif
3888 return (0);
3889 }
3890
3891 void
3892 arc_tempreserve_clear(uint64_t reserve)
3893 {
3894 atomic_add_64(&arc_tempreserve, -reserve);
3895 ASSERT((int64_t)arc_tempreserve >= 0);
3896 }
3897
3898 int
3899 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3900 {
3901 int error;
3902 uint64_t anon_size;
3903
3904 #ifdef ZFS_DEBUG
3905 /*
3906 * Once in a while, fail for no reason. Everything should cope.
3907 */
3908 if (spa_get_random(10000) == 0) {
3909 dprintf("forcing random failure\n");
3910 return (SET_ERROR(ERESTART));
3911 }
3912 #endif
3913 if (reserve > arc_c/4 && !arc_no_grow)
3914 arc_c = MIN(arc_c_max, reserve * 4);
3915 if (reserve > arc_c)
3916 return (SET_ERROR(ENOMEM));
3917
3918 /*
3919 * Don't count loaned bufs as in flight dirty data to prevent long
3920 * network delays from blocking transactions that are ready to be
3921 * assigned to a txg.
3922 */
3923 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3924
3925 /*
3926 * Writes will almost always require additional memory allocations
3927 * in order to compress/encrypt/etc. the data. We therefore need to
3928 * make sure that there is sufficient available memory for this.
3929 */
3930 if (error = arc_memory_throttle(reserve, anon_size, txg))
3931 return (error);
3932
3933 /*
3934 * Throttle writes when the amount of dirty data in the cache
3935 * gets too large. We try to keep the cache less than half full
3936 * of dirty blocks so that our sync times don't grow too large.
3937 * Note: if two requests come in concurrently, we might let them
3938 * both succeed, when one of them should fail. Not a huge deal.
3939 */
3940
3941 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3942 anon_size > arc_c / 4) {
3943 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3944 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3945 arc_tempreserve>>10,
3946 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3947 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3948 reserve>>10, arc_c>>10);
3949 return (SET_ERROR(ERESTART));
3950 }
3951 atomic_add_64(&arc_tempreserve, reserve);
3952 return (0);
3953 }
3954
3955 void
3956 arc_init(void)
3957 {
3958 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3959 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3960
3961 /* Convert seconds to clock ticks */
3962 arc_min_prefetch_lifespan = 1 * hz;
3963
3964 /* Start out with 1/8 of all memory */
3965 arc_c = physmem * PAGESIZE / 8;
3966
3967 #ifdef _KERNEL
3968 /*
3969 * On architectures where the physical memory can be larger
3970 * than the addressable space (intel in 32-bit mode), we may
3971 * need to limit the cache to 1/8 of VM size.
3972 */
3973 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3974 #endif
3975
3976 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3977 arc_c_min = MAX(arc_c / 4, 64<<20);
3978 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3979 if (arc_c * 8 >= 1<<30)
3980 arc_c_max = (arc_c * 8) - (1<<30);
3981 else
3982 arc_c_max = arc_c_min;
3983 arc_c_max = MAX(arc_c * 6, arc_c_max);
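/*
 * Worked example (hypothetical 16 GB machine, for illustration): arc_c
 * starts at 16 GB / 8 = 2 GB, so arc_c_min = MAX(2 GB / 4, 64 MB) = 512 MB
 * and arc_c_max = MAX(2 GB * 6, 16 GB - 1 GB) = 15 GB.
 */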
3984
3985 /*
3986 * Allow the tunables to override our calculations if they are
3987 * reasonable (i.e. over 64MB)
3988 */
3989 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3990 arc_c_max = zfs_arc_max;
3991 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3992 arc_c_min = zfs_arc_min;
3993
3994 arc_c = arc_c_max;
3995 arc_p = (arc_c >> 1);
3996
3997 /* limit meta-data to 1/4 of the arc capacity */
3998 arc_meta_limit = arc_c_max / 4;
3999
4000 /* Allow the tunable to override if it is reasonable */
4001 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4002 arc_meta_limit = zfs_arc_meta_limit;
4003
4004 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4005 arc_c_min = arc_meta_limit / 2;
4006
4007 if (zfs_arc_grow_retry > 0)
4008 arc_grow_retry = zfs_arc_grow_retry;
4009
4010 if (zfs_arc_shrink_shift > 0)
4011 arc_shrink_shift = zfs_arc_shrink_shift;
4012
4013 if (zfs_arc_p_min_shift > 0)
4014 arc_p_min_shift = zfs_arc_p_min_shift;
4015
4016 /* if kmem_flags are set, let's try to use less memory */
4017 if (kmem_debugging())
4018 arc_c = arc_c / 2;
4019 if (arc_c < arc_c_min)
4020 arc_c = arc_c_min;
4021
4022 arc_anon = &ARC_anon;
4023 arc_mru = &ARC_mru;
4024 arc_mru_ghost = &ARC_mru_ghost;
4025 arc_mfu = &ARC_mfu;
4026 arc_mfu_ghost = &ARC_mfu_ghost;
4027 arc_l2c_only = &ARC_l2c_only;
4028 arc_size = 0;
4029
4030 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4031 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4032 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4033 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4034 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4035 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4036
4037 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4038 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4039 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4040 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4041 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4042 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4043 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4044 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4045 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4046 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4047 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4048 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4049 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4050 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4051 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4052 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4053 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4054 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4055 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4056 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4057
4058 buf_init();
4059
4060 arc_thread_exit = 0;
4061 arc_eviction_list = NULL;
4062 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4063 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4064
4065 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4066 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4067
4068 if (arc_ksp != NULL) {
4069 arc_ksp->ks_data = &arc_stats;
4070 kstat_install(arc_ksp);
4071 }
4072
4073 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4074 TS_RUN, minclsyspri);
4075
4076 arc_dead = FALSE;
4077 arc_warm = B_FALSE;
4078
4079 if (zfs_write_limit_max == 0)
4080 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
4081 else
4082 zfs_write_limit_shift = 0;
4083 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
4084 }
4085
4086 void
4087 arc_fini(void)
4088 {
4089 mutex_enter(&arc_reclaim_thr_lock);
4090 arc_thread_exit = 1;
4091 while (arc_thread_exit != 0)
4092 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4093 mutex_exit(&arc_reclaim_thr_lock);
4094
4095 arc_flush(NULL);
4096
4097 arc_dead = TRUE;
4098
4099 if (arc_ksp != NULL) {
4100 kstat_delete(arc_ksp);
4101 arc_ksp = NULL;
4102 }
4103
4104 mutex_destroy(&arc_eviction_mtx);
4105 mutex_destroy(&arc_reclaim_thr_lock);
4106 cv_destroy(&arc_reclaim_thr_cv);
4107
4108 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4109 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4110 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4111 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4112 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4113 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4114 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4115 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4116
4117 mutex_destroy(&arc_anon->arcs_mtx);
4118 mutex_destroy(&arc_mru->arcs_mtx);
4119 mutex_destroy(&arc_mru_ghost->arcs_mtx);
4120 mutex_destroy(&arc_mfu->arcs_mtx);
4121 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4122 mutex_destroy(&arc_l2c_only->arcs_mtx);
4123
4124 mutex_destroy(&zfs_write_limit_lock);
4125
4126 buf_fini();
4127
4128 ASSERT(arc_loaned_bytes == 0);
4129 }
4130
4131 /*
4132 * Level 2 ARC
4133 *
4134 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4135 * It uses dedicated storage devices to hold cached data, which are populated
4136 * using large infrequent writes. The main role of this cache is to boost
4137 * the performance of random read workloads. The intended L2ARC devices
4138 * include short-stroked disks, solid state disks, and other media with
4139 * substantially faster read latency than disk.
4140 *
4141 * +-----------------------+
4142 * | ARC |
4143 * +-----------------------+
4144 * | ^ ^
4145 * | | |
4146 * l2arc_feed_thread() arc_read()
4147 * | | |
4148 * | l2arc read |
4149 * V | |
4150 * +---------------+ |
4151 * | L2ARC | |
4152 * +---------------+ |
4153 * | ^ |
4154 * l2arc_write() | |
4155 * | | |
4156 * V | |
4157 * +-------+ +-------+
4158 * | vdev | | vdev |
4159 * | cache | | cache |
4160 * +-------+ +-------+
4161 * +=========+ .-----.
4162 * : L2ARC : |-_____-|
4163 * : devices : | Disks |
4164 * +=========+ `-_____-'
4165 *
4166 * Read requests are satisfied from the following sources, in order:
4167 *
4168 * 1) ARC
4169 * 2) vdev cache of L2ARC devices
4170 * 3) L2ARC devices
4171 * 4) vdev cache of disks
4172 * 5) disks
4173 *
4174 * Some L2ARC device types exhibit extremely slow write performance.
4175 * To accommodate this, there are some significant differences between
4176 * the L2ARC and traditional cache design:
4177 *
4178 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4179 * the ARC behave as usual, freeing buffers and placing headers on ghost
4180 * lists. The ARC does not send buffers to the L2ARC during eviction as
4181 * this would add inflated write latencies for all ARC memory pressure.
4182 *
4183 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4184 * It does this by periodically scanning buffers from the eviction-end of
4185 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4186 * not already there. It scans until a headroom of buffers is satisfied,
4187 * which itself is a buffer for ARC eviction. If a compressible buffer is
4188 * found during scanning and selected for writing to an L2ARC device, we
4189 * temporarily boost scanning headroom during the next scan cycle to make
4190 * sure we adapt to compression effects (which might significantly reduce
4191 * the data volume we write to L2ARC). The thread that does this is
4192 * l2arc_feed_thread(), illustrated below; example sizes are included to
4193 * provide a better sense of ratio than this diagram:
4194 *
4195 * head --> tail
4196 * +---------------------+----------+
4197 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4198 * +---------------------+----------+ | o L2ARC eligible
4199 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4200 * +---------------------+----------+ |
4201 * 15.9 Gbytes ^ 32 Mbytes |
4202 * headroom |
4203 * l2arc_feed_thread()
4204 * |
4205 * l2arc write hand <--[oooo]--'
4206 * | 8 Mbyte
4207 * | write max
4208 * V
4209 * +==============================+
4210 * L2ARC dev |####|#|###|###| |####| ... |
4211 * +==============================+
4212 * 32 Gbytes
4213 *
4214 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4215 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4216 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4217 * safe to say that this is an uncommon case, since buffers at the end of
4218 * the ARC lists have moved there due to inactivity.
4219 *
4220 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4221 * then the L2ARC simply misses copying some buffers. This serves as a
4222 * pressure valve to prevent heavy read workloads from both stalling the ARC
4223 * with waits and clogging the L2ARC with writes. This also helps prevent
4224 * the potential for the L2ARC to churn if it attempts to cache content too
4225 * quickly, such as during backups of the entire pool.
4226 *
4227 * 5. After system boot and before the ARC has filled main memory, there are
4228 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4229 * lists can remain mostly static. Instead of searching from tail of these
4230 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4231 * for eligible buffers, greatly increasing its chance of finding them.
4232 *
4233 * The L2ARC device write speed is also boosted during this time so that
4234 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4235 * there are no L2ARC reads, and no fear of degrading read performance
4236 * through increased writes.
4237 *
4238 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4239 * the vdev queue can aggregate them into larger and fewer writes. Each
4240 * device is written to in a rotor fashion, sweeping writes through
4241 * available space then repeating.
4242 *
4243 * 7. The L2ARC does not store dirty content. It never needs to flush
4244 * write buffers back to disk based storage.
4245 *
4246 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4247 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4248 *
4249 * The performance of the L2ARC can be tweaked by a number of tunables, which
4250 * may be necessary for different workloads:
4251 *
4252 * l2arc_write_max max write bytes per interval
4253 * l2arc_write_boost extra write bytes during device warmup
4254 * l2arc_noprefetch skip caching prefetched buffers
4255 * l2arc_headroom number of max device writes to precache
4256 * l2arc_headroom_boost when we find compressed buffers during ARC
4257 * scanning, we multiply headroom by this
4258 * percentage factor for the next scan cycle,
4259 * since more compressed buffers are likely to
4260 * be present
4261 * l2arc_feed_secs seconds between L2ARC writing
4262 *
4263 * Tunables may be removed or added as future performance improvements are
4264 * integrated, and also may become zpool properties.
4265 *
4266 * There are three key functions that control how the L2ARC warms up:
4267 *
4268 * l2arc_write_eligible() check if a buffer is eligible to cache
4269 * l2arc_write_size() calculate how much to write
4270 * l2arc_write_interval() calculate sleep delay between writes
4271 *
4272 * These three functions determine what to write, how much, and how quickly
4273 * to send writes.
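 *
 * A simplified sketch of how one feed cycle ties these together (see
 * l2arc_feed_thread() later in this file; locking, CPR handling and error
 * paths are omitted, so this is illustrative rather than exact):
 *
 *	begin = ddi_get_lbolt();
 *	dev = l2arc_dev_get_next();	pick a device, config lock held
 *	spa = dev->l2ad_spa;
 *	size = l2arc_write_size();	how many bytes to try to write
 *	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 *	next = l2arc_write_interval(begin, size, wrote);
 *	sleep until `next', then repeat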
4274 *
4275 * L2ARC persistency:
4276 *
4277 * When writing buffers to L2ARC, we periodically add some metadata to
4278 * make sure we can pick them up after reboot, thus dramatically reducing
4279 * the impact that any downtime has on the performance of storage systems
4280 * with large caches.
4281 *
4282 * The implementation works fairly simply by integrating the following two
4283 * modifications:
4284 *
4285 * *) Every now and then, at end of an L2ARC feed cycle, we append a piece
4286 * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
4287 * write. This allows us to understand what's been written, so that
4288 * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
4289 * The pbuf also includes a "back-reference" pointer to the previous
4290 * pbuf, forming a linked list of pbufs on the L2ARC device.
4291 *
4292 * *) We reserve 4k of space at the start of each L2ARC device for our
4293 * header bookkeeping purposes. This contains a single 4k uberblock, which
4294 * contains our top-level reference structures. We update it on each pbuf
4295 * write. If this write results in an inconsistent uberblock (e.g. due to
4296 * power failure), we detect this by verifying the uberblock's checksum
4297 * and simply drop the entries from L2ARC. Once an L2ARC pbuf update
4298 * completes, we update the uberblock to point to it.
4299 *
4300 * Implementation diagram:
4301 *
4302 * +=== L2ARC device (not to scale) ======================================+
4303 * | ____________newest pbuf pointer_____________ |
4304 * | / \ |
4305 * | / V |
4306 * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
4307 * | ^ / ^ / ^ / |
4308 * | `-prev-' `-prev-' `-prev-' |
4309 * | pbuf pbuf pbuf |
4310 * +======================================================================+
4311 *
4312 * On-device data structures:
4313 *
4314 * (L2ARC persistent uberblock)
4315 * struct l2uberblock {
4316 * (these fields are in network byte order)
4317 * uint32_t magic = 0x12bab10c; l2-ber-block
4318 * uint8_t version = 0x1;
4319 * uint8_t reserved = 0x0;
4320 * uint16_t ublk_flags; see l2uberblock_flags_t
4321 *
4322 * (byte order of fields below determined by `ublk_flags')
4323 * uint64_t spa_guid; what pool this l2arc dev belongs to
4324 * uint64_t birth_txg; ublk with highest birth_txg is newest
4325 * uint64_t evict_tail; current evict pointer on l2arc dev
4326 * uint64_t alloc_space; how much space is alloc'd on the dev
4327 * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t
4328 * uint32_t pbuf_asize; size of newest pbuf
4329 * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf
4330 *
4331 * uint8_t reserved[3996] = {0x0, 0x0, ... 0x0};
4332 *
4333 * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
4334 * } l2dev_uberblock;
4335 *
4336 * (L2ARC persistent buffer list)
4337 * typedef struct l2pbuf_t {
4338 * (these fields are in network byte order)
4339 * uint32_t magic = 0xdb0faba6; the-buffer-bag
4340 * uint8_t version = 0x1;
4341 * uint8_t reserved = 0x0;
4342 * uint16_t pbuf_flags; see l2pbuf_flags_t
4343 *
4344 * (byte order of fields below determined by `pbuf_flags')
4345 * uint64_t prev_pbuf_daddr; previous pbuf dev addr
4346 * uint32_t prev_pbuf_asize; previous pbuf size
4347 * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf)
4348 *
4349 * uint32_t items_size; uncompressed size of `items' below
4350 * (if (pbuf_flags & compress) decompress `items' prior to decoding)
4351 * struct l2pbuf_buf_item {
4352 * (these fields mirror [l2]arc_buf_hdr fields)
4353 * uint64_t dva[2]; buffer's DVA
4354 * uint64_t birth; buffer's birth TXG in ARC
4355 * uint64_t cksum0; lower 64-bits of buffer's cksum
4356 * uint64_t freeze_cksum[4]; buffer's freeze cksum
4357 * uint32_t size; uncompressed buffer data size
4358 * uint64_t l2daddr; device address (offset) of buf
4359 * uint32_t l2asize; actual space occupied by buf
4360 * uint8_t compress; compress algo used on data
4361 * uint8_t contents_type; buffer's contents type
4362 * uint16_t reserved = 0x0; for alignment and future use
4363 * uint32_t flags; buffer's persistent flags
4364 * } items[]; continues for remainder of pbuf
4365 * } l2pbuf_t;
4366 *
4367 * L2ARC reconstruction:
4368 *
4369 * When writing data, we simply write in the standard rotary fashion,
4370 * evicting buffers as we go and simply writing new data over them (appending
4371 * an updated l2pbuf_t every now and then). This obviously means that once we
4372 * loop around the end of the device, we will start cutting into an already
4373 * committed l2pbuf (and its referenced data buffers), like so:
4374 *
4375 * current write head__ __old tail
4376 * \ /
4377 * V V
4378 * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|-->
4379 * ^ ^^^^^^^^^_____________________________
4380 * | \
4381 * <<nextwrite>> - will overwrite this pbuf --/
4382 *
4383 * When importing the pool, we detect this situation and use it to stop
4384 * our scanning process:
4385 * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
4386 * previous one.
4387 * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
4388 * then the pbuf is invalid and stop scanning (goto step 3 below).
4389 * 3) if (this is the last valid pbuf)
4390 * discard this pbuf as well (its ARC bufs may have been damaged by a
4391 * partial overwrite).
4392 * (We could potentially salvage the remaining good arc bufs above in step 3,
4393 * but the cost of doing so probably outweighs the value of the entire pbuf).
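 *
 * A sketch of this backward walk (illustrative pseudo-code only; the
 * helper and variable names, e.g. read_pbuf() and restore_arc_hdrs(),
 * are hypothetical and do not match the actual rebuild code):
 *
 *	this_pbuf = read_pbuf(ublk->pbuf_daddr, ublk->pbuf_asize);
 *	verify fletcher4(this_pbuf) == ublk->pbuf_cksum;
 *	for (;;) {
 *		prev_pbuf = read_pbuf(this_pbuf->prev_pbuf_daddr,
 *		    this_pbuf->prev_pbuf_asize);
 *		if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
 *			break;	chain cut by the write hand; this_pbuf
 *				is also dropped (step 3 above)
 *		restore_arc_hdrs(this_pbuf->items);
 *		this_pbuf = prev_pbuf;
 *	}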
4394 *
4395 * There is one significant caveat to consider when rebuilding ARC contents
4396 * from an L2ARC device: what about invalidated buffers? Given the above
4397 * construction, we cannot update pbufs which we've already written to amend
4398 * them to remove buffers which were invalidated. Thus, during reconstruction,
4399 * we might be populating the cache with buffers for data that's not on the
4400 * main pool anymore, or may have been overwritten!
4401 *
4402 * As it turns out, this isn't a problem. Every arc_read request includes
4403 * both the DVA and, crucially, the birth TXG of the BP the caller is
4404 * looking for. So even if the cache were populated by completely rotten
4405 * blocks for data that had been long deleted and/or overwritten, we'll
4406 * never actually return bad data from the cache, since the DVA together
4407 * with the birth TXG uniquely identifies a block in space and time - once
4408 * created, a block is immutable on disk. The worst we will have done is
4409 * waste some time and memory at l2arc rebuild to reconstruct outdated ARC
4410 * entries that will get dropped from the l2arc as it is being updated
4411 * with new blocks.
4412 */
4413
4414 static boolean_t
4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4416 {
4417 /*
4418 * A buffer is *not* eligible for the L2ARC if it:
4419 * 1. belongs to a different spa.
4420 * 2. is already cached on the L2ARC.
4421 * 3. has an I/O in progress (it may be an incomplete read).
4422 * 4. is flagged not eligible (zfs property).
4423 */
4424 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4425 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4426 return (B_FALSE);
4427
4428 return (B_TRUE);
4429 }
4430
4431 static uint64_t
4432 l2arc_write_size(void)
4433 {
4434 uint64_t size;
4435
4436 /*
4437 * Make sure our globals have meaningful values in case the user
4438 * altered them.
4439 */
4440 size = l2arc_write_max;
4441 if (size == 0) {
4442 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4443 "be greater than zero, resetting it to the default (%d)",
4444 L2ARC_WRITE_SIZE);
4445 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4446 }
4447
4448 if (arc_warm == B_FALSE)
4449 size += l2arc_write_boost;
4450
4451 return (size);
4453 }
4454
4455 static clock_t
4456 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4457 {
4458 clock_t interval, next, now;
4459
4460 /*
4461 * If the ARC lists are busy, increase our write rate; if the
4462 * lists are stale, idle back. This is achieved by checking
4463 * how much we previously wrote - if it was more than half of
4464 * what we wanted, schedule the next write much sooner.
4465 */
4466 if (l2arc_feed_again && wrote > (wanted / 2))
4467 interval = (hz * l2arc_feed_min_ms) / 1000;
4468 else
4469 interval = hz * l2arc_feed_secs;
4470
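/*
 * Schedule the next write `interval' ticks after the previous one began,
 * but never in the past: since began <= now, the MIN below reduces to
 * began + interval, and the MAX clamps the result to at least now.
 */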
4471 now = ddi_get_lbolt();
4472 next = MAX(now, MIN(now + interval, began + interval));
4473
4474 return (next);
4475 }
4476
4477 static void
4478 l2arc_hdr_stat_add(boolean_t from_arc)
4479 {
4480 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4481 if (from_arc)
4482 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4483 }
4484
4485 static void
4486 l2arc_hdr_stat_remove(void)
4487 {
4488 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4489 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4490 }
4491
4492 /*
4493 * Cycle through L2ARC devices. This is how L2ARC load balances.
4494 * If a device is returned, this also returns holding the spa config lock.
4495 */
4496 static l2arc_dev_t *
4497 l2arc_dev_get_next(void)
4498 {
4499 l2arc_dev_t *first, *next = NULL;
4500
4501 /*
4502 * Lock out the removal of spas (spa_namespace_lock), then removal
4503 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4504 * both locks will be dropped and a spa config lock held instead.
4505 */
4506 mutex_enter(&spa_namespace_lock);
4507 mutex_enter(&l2arc_dev_mtx);
4508
4509 /* if there are no vdevs, there is nothing to do */
4510 if (l2arc_ndev == 0)
4511 goto out;
4512
4513 first = NULL;
4514 next = l2arc_dev_last;
4515 do {
4516 /*
4517 * Loop around the list looking for a non-faulted vdev
4518 * and one that isn't currently doing an L2ARC rebuild.
4519 */
4520 if (next == NULL) {
4521 next = list_head(l2arc_dev_list);
4522 } else {
4523 next = list_next(l2arc_dev_list, next);
4524 if (next == NULL)
4525 next = list_head(l2arc_dev_list);
4526 }
4527
4528 /* if we have come back to the start, bail out */
4529 if (first == NULL)
4530 first = next;
4531 else if (next == first)
4532 break;
4533
4534 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4535
4536 /* if we were unable to find any usable vdevs, return NULL */
4537 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4538 next = NULL;
4539
4540 l2arc_dev_last = next;
4541
4542 out:
4543 mutex_exit(&l2arc_dev_mtx);
4544
4545 /*
4546 * Grab the config lock to prevent the 'next' device from being
4547 * removed while we are writing to it.
4548 */
4549 if (next != NULL)
4550 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4551 mutex_exit(&spa_namespace_lock);
4552
4553 return (next);
4554 }
4555
4556 /*
4557 * Free buffers that were tagged for destruction.
4558 */
4559 static void
4560 l2arc_do_free_on_write()
4561 {
4562 list_t *buflist;
4563 l2arc_data_free_t *df, *df_prev;
4564
4565 mutex_enter(&l2arc_free_on_write_mtx);
4566 buflist = l2arc_free_on_write;
4567
4568 for (df = list_tail(buflist); df; df = df_prev) {
4569 df_prev = list_prev(buflist, df);
4570 ASSERT(df->l2df_data != NULL);
4571 ASSERT(df->l2df_func != NULL);
4572 df->l2df_func(df->l2df_data, df->l2df_size);
4573 list_remove(buflist, df);
4574 kmem_free(df, sizeof (l2arc_data_free_t));
4575 }
4576
4577 mutex_exit(&l2arc_free_on_write_mtx);
4578 }
4579
4580 /*
4581 * A write to a cache device has completed. Update all headers to allow
4582 * reads from these buffers to begin.
4583 */
4584 static void
4585 l2arc_write_done(zio_t *zio)
4586 {
4587 l2arc_write_callback_t *cb;
4588 l2arc_dev_t *dev;
4589 list_t *buflist;
4590 arc_buf_hdr_t *head, *ab, *ab_prev;
4591 l2arc_buf_hdr_t *abl2;
4592 kmutex_t *hash_lock;
4593
4594 cb = zio->io_private;
4595 ASSERT(cb != NULL);
4596 dev = cb->l2wcb_dev;
4597 ASSERT(dev != NULL);
4598 head = cb->l2wcb_head;
4599 ASSERT(head != NULL);
4600 buflist = dev->l2ad_buflist;
4601 ASSERT(buflist != NULL);
4602 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4603 l2arc_write_callback_t *, cb);
4604
4605 if (zio->io_error != 0)
4606 ARCSTAT_BUMP(arcstat_l2_writes_error);
4607
4608 mutex_enter(&l2arc_buflist_mtx);
4609
4610 /*
4611 * All writes completed, or an error was hit.
4612 */
4613 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4614 ab_prev = list_prev(buflist, ab);
4615 abl2 = ab->b_l2hdr;
4616
4617 /*
4618 * Release the temporary compressed buffer as soon as possible.
4619 */
4620 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4621 l2arc_release_cdata_buf(ab);
4622
4623 hash_lock = HDR_LOCK(ab);
4624 if (!mutex_tryenter(hash_lock)) {
4625 /*
4626 * This buffer misses out. It may be in a stage
4627 * of eviction. Its ARC_L2_WRITING flag will be
4628 * left set, denying reads to this buffer.
4629 */
4630 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4631 continue;
4632 }
4633
4634 if (zio->io_error != 0) {
4635 /*
4636 * Error - drop L2ARC entry.
4637 */
4638 list_remove(buflist, ab);
4639 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4640 ab->b_l2hdr = NULL;
4641 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4642 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4643 }
4644
4645 /*
4646 * Allow ARC to begin reads to this L2ARC entry.
4647 */
4648 ab->b_flags &= ~ARC_L2_WRITING;
4649
4650 mutex_exit(hash_lock);
4651 }
4652
4653 atomic_inc_64(&l2arc_writes_done);
4654 list_remove(buflist, head);
4655 kmem_cache_free(hdr_cache, head);
4656 mutex_exit(&l2arc_buflist_mtx);
4657
4658 l2arc_do_free_on_write();
4659
4660 if (cb->l2wcb_pbuf)
4661 kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
4662 if (cb->l2wcb_ub_buf)
4663 kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4664 kmem_free(cb, sizeof (l2arc_write_callback_t));
4665 }
4666
4667 /*
4668 * A read to a cache device completed. Validate buffer contents before
4669 * handing over to the regular ARC routines.
4670 */
4671 static void
4672 l2arc_read_done(zio_t *zio)
4673 {
4674 l2arc_read_callback_t *cb;
4675 arc_buf_hdr_t *hdr;
4676 arc_buf_t *buf;
4677 kmutex_t *hash_lock;
4678 int equal;
4679
4680 ASSERT(zio->io_vd != NULL);
4681 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4682
4683 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4684
4685 cb = zio->io_private;
4686 ASSERT(cb != NULL);
4687 buf = cb->l2rcb_buf;
4688 ASSERT(buf != NULL);
4689
4690 hash_lock = HDR_LOCK(buf->b_hdr);
4691 mutex_enter(hash_lock);
4692 hdr = buf->b_hdr;
4693 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4694
4695 /*
4696 * If the buffer was compressed, decompress it first.
4697 */
4698 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4699 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4700 ASSERT(zio->io_data != NULL);
4701
4702 /*
4703 * Check this survived the L2ARC journey.
4704 */
4705 equal = arc_cksum_equal(buf);
4706 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4707 mutex_exit(hash_lock);
4708 zio->io_private = buf;
4709 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4710 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4711 arc_read_done(zio);
4712 } else {
4713 mutex_exit(hash_lock);
4714 /*
4715 * Buffer didn't survive caching. Increment stats and
4716 * reissue to the original storage device.
4717 */
4718 if (zio->io_error != 0) {
4719 ARCSTAT_BUMP(arcstat_l2_io_error);
4720 } else {
4721 zio->io_error = SET_ERROR(EIO);
4722 }
4723 if (!equal)
4724 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4725
4726 /*
4727 * If there's no waiter, issue an async i/o to the primary
4728 * storage now. If there *is* a waiter, the caller must
4729 * issue the i/o in a context where it's OK to block.
4730 */
4731 if (zio->io_waiter == NULL) {
4732 zio_t *pio = zio_unique_parent(zio);
4733
4734 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4735
4736 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4737 buf->b_data, zio->io_size, arc_read_done, buf,
4738 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4739 }
4740 }
4741
4742 kmem_free(cb, sizeof (l2arc_read_callback_t));
4743 }
4744
4745 /*
4746 * This is the list priority from which the L2ARC will search for pages to
4747 * cache. This is used within loops (0..3) to cycle through lists in the
4748 * desired order. This order can have a significant effect on cache
4749 * performance.
4750 *
4751 * Currently the metadata lists are hit first, MFU then MRU, followed by
4752 * the data lists. This function returns a locked list, and also returns
4753 * the lock pointer.
4754 */
4755 static list_t *
4756 l2arc_list_locked(int list_num, kmutex_t **lock)
4757 {
4758 list_t *list = NULL;
4759
4760 ASSERT(list_num >= 0 && list_num <= 3);
4761
4762 switch (list_num) {
4763 case 0:
4764 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4765 *lock = &arc_mfu->arcs_mtx;
4766 break;
4767 case 1:
4768 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4769 *lock = &arc_mru->arcs_mtx;
4770 break;
4771 case 2:
4772 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4773 *lock = &arc_mfu->arcs_mtx;
4774 break;
4775 case 3:
4776 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4777 *lock = &arc_mru->arcs_mtx;
4778 break;
4779 }
4780
4781 ASSERT(!(MUTEX_HELD(*lock)));
4782 mutex_enter(*lock);
4783 return (list);
4784 }
4785
4786 /*
4787 * Evict buffers from the device write hand to the distance specified in
4788 * bytes. This distance may span populated buffers, it may span nothing.
4789 * This is clearing a region on the L2ARC device ready for writing.
4790 * If the 'all' boolean is set, every buffer is evicted.
4791 */
4792 static void
4793 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4794 {
4795 list_t *buflist;
4796 l2arc_buf_hdr_t *abl2;
4797 arc_buf_hdr_t *ab, *ab_prev;
4798 kmutex_t *hash_lock;
4799 uint64_t taddr;
4800
4801 buflist = dev->l2ad_buflist;
4802
4803 if (buflist == NULL)
4804 return;
4805
4806 if (!all && dev->l2ad_first) {
4807 /*
4808 * This is the first sweep through the device. There is
4809 * nothing to evict.
4810 */
4811 return;
4812 }
4813
4814 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4815 /*
4816 * When nearing the end of the device, evict to the end
4817 * before the device write hand jumps to the start.
4818 */
4819 taddr = dev->l2ad_end;
4820 } else {
4821 taddr = dev->l2ad_hand + distance;
4822 }
4823 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4824 uint64_t, taddr, boolean_t, all);
4825
4826 top:
4827 mutex_enter(&l2arc_buflist_mtx);
4828 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4829 ab_prev = list_prev(buflist, ab);
4830
4831 hash_lock = HDR_LOCK(ab);
4832 if (!mutex_tryenter(hash_lock)) {
4833 /*
4834 * Missed the hash lock. Retry.
4835 */
4836 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4837 mutex_exit(&l2arc_buflist_mtx);
4838 mutex_enter(hash_lock);
4839 mutex_exit(hash_lock);
4840 goto top;
4841 }
4842
4843 if (HDR_L2_WRITE_HEAD(ab)) {
4844 /*
4845 * We hit a write head node. Leave it for
4846 * l2arc_write_done().
4847 */
4848 list_remove(buflist, ab);
4849 mutex_exit(hash_lock);
4850 continue;
4851 }
4852
4853 if (!all && ab->b_l2hdr != NULL &&
4854 (ab->b_l2hdr->b_daddr > taddr ||
4855 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4856 /*
4857 * We've evicted to the target address,
4858 * or the end of the device.
4859 */
4860 mutex_exit(hash_lock);
4861 break;
4862 }
4863
4864 if (HDR_FREE_IN_PROGRESS(ab)) {
4865 /*
4866 * Already on the path to destruction.
4867 */
4868 mutex_exit(hash_lock);
4869 continue;
4870 }
4871
4872 if (ab->b_state == arc_l2c_only) {
4873 ASSERT(!HDR_L2_READING(ab));
4874 /*
4875 * This doesn't exist in the ARC. Destroy.
4876 * arc_hdr_destroy() will call list_remove()
4877 * and decrement arcstat_l2_size.
4878 */
4879 arc_change_state(arc_anon, ab, hash_lock);
4880 arc_hdr_destroy(ab);
4881 } else {
4882 /*
4883 * Invalidate issued or about to be issued
4884 * reads, since we may be about to write
4885 * over this location.
4886 */
4887 if (HDR_L2_READING(ab)) {
4888 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4889 ab->b_flags |= ARC_L2_EVICTED;
4890 }
4891
4892 /*
4893 * Tell ARC this no longer exists in L2ARC.
4894 */
4895 if (ab->b_l2hdr != NULL) {
4896 abl2 = ab->b_l2hdr;
4897 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4898 ab->b_l2hdr = NULL;
4899 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4900 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4901 }
4902 list_remove(buflist, ab);
4903
4904 /*
4905 * This may have been leftover after a
4906 * failed write.
4907 */
4908 ab->b_flags &= ~ARC_L2_WRITING;
4909 }
4910 mutex_exit(hash_lock);
4911 }
4912 mutex_exit(&l2arc_buflist_mtx);
4913
4914 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4915 dev->l2ad_evict = taddr;
4916 }
4917
4918 /*
4919 * Find and write ARC buffers to the L2ARC device.
4920 *
4921 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4922 * for reading until they have completed writing.
4923 * The headroom_boost is an in-out parameter used to maintain headroom boost
4924 * state between calls to this function.
4925 *
4926 * Returns the number of bytes actually written (which may be smaller than
4927 * the delta by which the device hand has changed due to alignment).
4928 */
4929 static uint64_t
4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4931 boolean_t *headroom_boost)
4932 {
4933 arc_buf_hdr_t *ab, *ab_prev, *head;
4934 list_t *list;
4935 uint64_t write_asize, write_psize, write_sz, headroom,
4936 buf_compress_minsz;
4937 void *buf_data;
4938 kmutex_t *list_lock;
4939 boolean_t full;
4940 l2arc_write_callback_t *cb;
4941 zio_t *pio, *wzio;
4942 uint64_t guid = spa_load_guid(spa);
4943 const boolean_t do_headroom_boost = *headroom_boost;
4944
4945 /* persistency-related */
4946 l2pbuf_t *pb;
4947 l2pbuf_buflist_t *pb_buflist;
4948 int num_bufs, buf_index;
4949
4950 ASSERT(dev->l2ad_vdev != NULL);
4951
4952 /* Lower the flag now, we might want to raise it again later. */
4953 *headroom_boost = B_FALSE;
4954
4955 pio = NULL;
4956 cb = NULL;
4957 write_sz = write_asize = write_psize = 0;
4958 full = B_FALSE;
4959 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4960 head->b_flags |= ARC_L2_WRITE_HEAD;
4961
4962 /*
4963 * We will want to try to compress buffers that are at least 2x the
4964 * device sector size.
4965 */
4966 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4967
4968 pb = &dev->l2ad_pbuf;
4969 num_bufs = 0;
4970
4977 /*
4978 * Copy buffers for L2ARC writing.
4979 */
4980 mutex_enter(&l2arc_buflist_mtx);
4981 for (int try = 0; try <= 3; try++) {
4982 uint64_t passed_sz = 0;
4983
4984 list = l2arc_list_locked(try, &list_lock);
4985
4986 /*
4987 * L2ARC fast warmup.
4988 *
4989 * Until the ARC is warm and starts to evict, read from the
4990 * head of the ARC lists rather than the tail.
4991 */
4992 if (arc_warm == B_FALSE)
4993 ab = list_head(list);
4994 else
4995 ab = list_tail(list);
4996
4997 headroom = target_sz * l2arc_headroom;
4998 if (do_headroom_boost)
4999 headroom = (headroom * l2arc_headroom_boost) / 100;
5000
5001 for (; ab; ab = ab_prev) {
5002 l2arc_buf_hdr_t *l2hdr;
5003 kmutex_t *hash_lock;
5004 uint64_t buf_sz;
5005
5006 if (arc_warm == B_FALSE)
5007 ab_prev = list_next(list, ab);
5008 else
5009 ab_prev = list_prev(list, ab);
5010
5011 hash_lock = HDR_LOCK(ab);
5012 if (!mutex_tryenter(hash_lock)) {
5013 /*
5014 * Skip this buffer rather than waiting.
5015 */
5016 continue;
5017 }
5018
5019 passed_sz += ab->b_size;
5020 if (passed_sz > headroom) {
5021 /*
5022 * Searched too far.
5023 */
5024 mutex_exit(hash_lock);
5025 break;
5026 }
5027
5028 if (!l2arc_write_eligible(guid, ab)) {
5029 mutex_exit(hash_lock);
5030 continue;
5031 }
5032
5033 if ((write_sz + ab->b_size) > target_sz) {
5034 full = B_TRUE;
5035 mutex_exit(hash_lock);
5036 break;
5037 }
5038
5039 if (pio == NULL) {
5040 /*
5041 * Insert a dummy header on the buflist so
5042 * l2arc_write_done() can find where the
5043 * write buffers begin without searching.
5044 */
5045 list_insert_head(dev->l2ad_buflist, head);
5046
5047 cb = kmem_zalloc(
5048 sizeof (l2arc_write_callback_t), KM_SLEEP);
5049 cb->l2wcb_dev = dev;
5050 cb->l2wcb_head = head;
5051 pio = zio_root(spa, l2arc_write_done, cb,
5052 ZIO_FLAG_CANFAIL);
5053 }
5054
5055 /*
5056 * Create and add a new L2ARC header.
5057 */
5058 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5059 l2hdr->b_dev = dev;
5060 ab->b_flags |= ARC_L2_WRITING;
5061
5062 /*
5063 * Temporarily stash the data buffer in b_tmp_cdata.
5064 * The subsequent write step will pick it up from
5065 * there. This is because we can't access ab->b_buf
5066 * without holding the hash_lock, which we in turn
5067 * can't access without holding the ARC list locks
5068 * (which we want to avoid during compression/writing).
5069 */
5070 l2hdr->b_compress = ZIO_COMPRESS_OFF;
5071 l2hdr->b_asize = ab->b_size;
5072 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5073
5074 buf_sz = ab->b_size;
5075 ab->b_l2hdr = l2hdr;
5076
5077 list_insert_head(dev->l2ad_buflist, ab);
5078
5079 /*
5080 * Compute and store the buffer cksum before
5081 * writing. On debug the cksum is verified first.
5082 */
5083 arc_cksum_verify(ab->b_buf);
5084 arc_cksum_compute(ab->b_buf, B_TRUE);
5085
5086 mutex_exit(hash_lock);
5087
5088 write_sz += buf_sz;
5089 num_bufs++;
5090 }
5091
5092 mutex_exit(list_lock);
5093
5094 if (full == B_TRUE)
5095 break;
5096 }
5097
5098 /* No buffers selected for writing? */
5099 if (pio == NULL) {
5100 ASSERT0(write_sz);
5101 mutex_exit(&l2arc_buflist_mtx);
5102 kmem_cache_free(hdr_cache, head);
5103 return (0);
5104 }
5105
5106 /* expand the pbuf to include a new list */
5107 pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
5108
5109 /*
5110 * Now start writing the buffers. We're starting at the write head
5111 * and work backwards, retracing the course of the buffer selector
5112 * loop above.
5113 */
5114 for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
5115 ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
5116 l2arc_buf_hdr_t *l2hdr;
5117 uint64_t buf_sz;
5118
5119 /*
5120 * We shouldn't need to lock the buffer here, since we flagged
5121 * it as ARC_L2_WRITING in the previous step, but we must take
5122 * care to only access its L2 cache parameters. In particular,
5123 * ab->b_buf may be invalid by now due to ARC eviction.
5124 */
5125 l2hdr = ab->b_l2hdr;
5126 l2hdr->b_daddr = dev->l2ad_hand;
5127
5128 if ((ab->b_flags & ARC_L2COMPRESS) &&
5129 l2hdr->b_asize >= buf_compress_minsz) {
5130 if (l2arc_compress_buf(l2hdr)) {
5131 /*
5132 * If compression succeeded, enable headroom
5133 * boost on the next scan cycle.
5134 */
5135 *headroom_boost = B_TRUE;
5136 }
5137 }
5138
5139 /*
5140 * Pick up the buffer data we had previously stashed away
5141 * (and now potentially also compressed).
5142 */
5143 buf_data = l2hdr->b_tmp_cdata;
5144 buf_sz = l2hdr->b_asize;
5145
5146 /* Compression may have squashed the buffer to zero length. */
5147 if (buf_sz != 0) {
5148 uint64_t buf_p_sz;
5149
5150 wzio = zio_write_phys(pio, dev->l2ad_vdev,
5151 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5152 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5153 ZIO_FLAG_CANFAIL, B_FALSE);
5154
5155 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5156 zio_t *, wzio);
5157 (void) zio_nowait(wzio);
5158
5159 write_asize += buf_sz;
5160 /*
5161 * Keep the clock hand suitably device-aligned.
5162 */
5163 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5164 write_psize += buf_p_sz;
5165 dev->l2ad_hand += buf_p_sz;
5166 }
5167
5168 l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
5169 }
5170 ASSERT(buf_index == num_bufs);
5171 mutex_exit(&l2arc_buflist_mtx);
5172
5173 ASSERT3U(write_asize, <=, target_sz);
5174 ARCSTAT_BUMP(arcstat_l2_writes_sent);
5175 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5176 ARCSTAT_INCR(arcstat_l2_size, write_sz);
5177 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5178 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5179
5180 /* Is it time to commit this pbuf? */
5181 if (L2PBUF_IS_FULL(pb) &&
5182 dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
5183 l2arc_pbuf_commit(dev, pio, cb);
5184 l2arc_pbuf_destroy(pb);
5185 l2arc_pbuf_init(pb);
5186 }
5187
5188 /*
5189 * Bump device hand to the device start if it is approaching the end.
5190 * l2arc_evict() will already have evicted ahead for this case.
5191 */
5192 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5193 vdev_space_update(dev->l2ad_vdev,
5194 dev->l2ad_end - dev->l2ad_hand, 0, 0);
5195 dev->l2ad_hand = dev->l2ad_start;
5196 dev->l2ad_evict = dev->l2ad_start;
5197 dev->l2ad_first = B_FALSE;
5198 }
5199
5200 dev->l2ad_writing = B_TRUE;
5201 (void) zio_wait(pio);
5202 dev->l2ad_writing = B_FALSE;
5203
5204 return (write_asize);
5205 }
5206
5207 /*
5208 * Compresses an L2ARC buffer.
5209 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5210 * size in l2hdr->b_asize. This routine tries to compress the data and
5211 * depending on the compression result there are three possible outcomes:
5212 * *) The buffer was incompressible. The original l2hdr contents were left
5213 * untouched and are ready for writing to an L2 device.
5214 * *) The buffer was all-zeros, so there is no need to write it to an L2
5215 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5216 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5217 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5218 * data buffer which holds the compressed data to be written, and b_asize
5219 * tells us how much data there is. b_compress is set to the appropriate
5220 * compression algorithm. Once writing is done, invoke
5221 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5222 *
5223 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5224 * buffer was incompressible).
5225 */
5226 static boolean_t
5227 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5228 {
5229 void *cdata;
5230 size_t csize, len;
5231
5232 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5233 ASSERT(l2hdr->b_tmp_cdata != NULL);
5234
5235 len = l2hdr->b_asize;
5236 cdata = zio_data_buf_alloc(len);
5237 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5238 cdata, l2hdr->b_asize);
5239
5240 if (csize == 0) {
5241 /* zero block, indicate that there's nothing to write */
5242 zio_data_buf_free(cdata, len);
5243 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5244 l2hdr->b_asize = 0;
5245 l2hdr->b_tmp_cdata = NULL;
5246 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5247 return (B_TRUE);
5248 } else if (csize > 0 && csize < len) {
5249 /*
5250 * Compression succeeded, we'll keep the cdata around for
5251 * writing and release it afterwards.
5252 */
5253 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5254 l2hdr->b_asize = csize;
5255 l2hdr->b_tmp_cdata = cdata;
5256 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5257 return (B_TRUE);
5258 } else {
5259 /*
5260 * Compression failed, release the compressed buffer.
5261 * l2hdr will be left unmodified.
5262 */
5263 zio_data_buf_free(cdata, len);
5264 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5265 return (B_FALSE);
5266 }
5267 }
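
/*
 * For reference, a summary of the l2hdr state after l2arc_compress_buf()
 * returns, derived from the three outcomes documented above (informational
 * only, no additional behavior is implied):
 *
 *	outcome		b_compress		b_asize		b_tmp_cdata
 *	incompressible	ZIO_COMPRESS_OFF	unchanged	original data
 *	all-zeros	ZIO_COMPRESS_EMPTY	0		NULL
 *	compressed	ZIO_COMPRESS_LZ4	compressed size	temporary buffer
 */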
5268
5269 /*
5270 * Decompresses a zio read back from an l2arc device. On success, the
5271 * underlying zio's io_data buffer is overwritten by the uncompressed
5272 * version. On decompression error (corrupt compressed stream), the
5273 * zio->io_error value is set to signal an I/O error.
5274 *
5275 * Please note that the compressed data stream is not checksummed, so
5276 * if the underlying device is experiencing data corruption, we may feed
5277 * corrupt data to the decompressor, so the decompressor needs to be
5278 * able to handle this situation (LZ4 does).
5279 */
5280 static void
5281 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5282 {
5283 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5284
5285 if (zio->io_error != 0) {
5286 /*
		 * An io error has occurred, just restore the original io
5288 * size in preparation for a main pool read.
5289 */
5290 zio->io_orig_size = zio->io_size = hdr->b_size;
5291 return;
5292 }
5293
5294 if (c == ZIO_COMPRESS_EMPTY) {
5295 /*
5296 * An empty buffer results in a null zio, which means we
5297 * need to fill its io_data after we're done restoring the
5298 * buffer's contents.
5299 */
5300 ASSERT(hdr->b_buf != NULL);
5301 bzero(hdr->b_buf->b_data, hdr->b_size);
5302 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5303 } else {
5304 ASSERT(zio->io_data != NULL);
5305 /*
5306 * We copy the compressed data from the start of the arc buffer
5307 * (the zio_read will have pulled in only what we need, the
5308 * rest is garbage which we will overwrite at decompression)
5309 * and then decompress back to the ARC data buffer. This way we
5310 * can minimize copying by simply decompressing back over the
5311 * original compressed data (rather than decompressing to an
5312 * aux buffer and then copying back the uncompressed buffer,
5313 * which is likely to be much larger).
5314 */
5315 uint64_t csize;
5316 void *cdata;
5317
5318 csize = zio->io_size;
5319 cdata = zio_data_buf_alloc(csize);
5320 bcopy(zio->io_data, cdata, csize);
5321 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5322 hdr->b_size) != 0)
5323 zio->io_error = EIO;
5324 zio_data_buf_free(cdata, csize);
5325 }
5326
5327 /* Restore the expected uncompressed IO size. */
5328 zio->io_orig_size = zio->io_size = hdr->b_size;
5329 }
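
/*
 * Schematic of the in-place decompression scheme used above (an
 * illustration of the comment inside the function, not additional logic):
 *
 *	io_data:  [ csize compressed bytes | garbage up to hdr->b_size ]
 *	  1) copy the first csize bytes aside into a temporary cdata buffer
 *	  2) zio_decompress_data(cdata -> io_data, csize -> hdr->b_size)
 *	  3) free cdata; io_data now holds hdr->b_size uncompressed bytes
 */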
5330
5331 /*
5332 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5333 * This buffer serves as a temporary holder of compressed data while
5334 * the buffer entry is being written to an l2arc device. Once that is
5335 * done, we can dispose of it.
5336 */
5337 static void
5338 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5339 {
5340 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5341
5342 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5343 /*
5344 * If the data was compressed, then we've allocated a
5345 * temporary buffer for it, so now we need to release it.
5346 */
5347 ASSERT(l2hdr->b_tmp_cdata != NULL);
5348 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5349 }
5350 l2hdr->b_tmp_cdata = NULL;
5351 }
5352
5353 /*
5354 * This thread feeds the L2ARC at regular intervals. This is the beating
5355 * heart of the L2ARC.
5356 */
5357 static void
5358 l2arc_feed_thread(void)
5359 {
5360 callb_cpr_t cpr;
5361 l2arc_dev_t *dev;
5362 spa_t *spa;
5363 uint64_t size, wrote;
5364 clock_t begin, next = ddi_get_lbolt();
5365 boolean_t headroom_boost = B_FALSE;
5366
5367 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5368
5369 mutex_enter(&l2arc_feed_thr_lock);
5370
5371 while (l2arc_thread_exit == 0) {
5372 CALLB_CPR_SAFE_BEGIN(&cpr);
5373 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5374 next);
5375 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5376 next = ddi_get_lbolt() + hz;
5377
5378 /*
5379 * Quick check for L2ARC devices.
5380 */
5381 mutex_enter(&l2arc_dev_mtx);
5382 if (l2arc_ndev == 0) {
5383 mutex_exit(&l2arc_dev_mtx);
5384 continue;
5385 }
5386 mutex_exit(&l2arc_dev_mtx);
5387 begin = ddi_get_lbolt();
5388
5389 /*
5390 * This selects the next l2arc device to write to, and in
5391 * doing so the next spa to feed from: dev->l2ad_spa. This
5392 * will return NULL if there are now no l2arc devices or if
5393 * they are all faulted.
5394 *
5395 * If a device is returned, its spa's config lock is also
5396 * held to prevent device removal. l2arc_dev_get_next()
5397 * will grab and release l2arc_dev_mtx.
5398 */
5399 if ((dev = l2arc_dev_get_next()) == NULL)
5400 continue;
5401
5402 spa = dev->l2ad_spa;
5403 ASSERT(spa != NULL);
5404
5405 /*
5406 * If the pool is read-only then force the feed thread to
5407 * sleep a little longer.
5408 */
5409 if (!spa_writeable(spa)) {
5410 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5411 spa_config_exit(spa, SCL_L2ARC, dev);
5412 continue;
5413 }
5414
5415 /*
5416 * Avoid contributing to memory pressure.
5417 */
5418 if (arc_reclaim_needed()) {
5419 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5420 spa_config_exit(spa, SCL_L2ARC, dev);
5421 continue;
5422 }
5423
5424 ARCSTAT_BUMP(arcstat_l2_feeds);
5425
5426 size = l2arc_write_size();
5427
5428 /*
5429 * Evict L2ARC buffers that will be overwritten.
5430 */
5431 l2arc_evict(dev, size, B_FALSE);
5432
5433 /*
5434 * Write ARC buffers.
5435 */
5436 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5437
5438 /*
5439 * Calculate interval between writes.
5440 */
5441 next = l2arc_write_interval(begin, size, wrote);
5442 spa_config_exit(spa, SCL_L2ARC, dev);
5443 }
5444
5445 l2arc_thread_exit = 0;
5446 cv_broadcast(&l2arc_feed_thr_cv);
5447 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5448 thread_exit();
5449 }
5450
5451 boolean_t
5452 l2arc_vdev_present(vdev_t *vd)
5453 {
5454 l2arc_dev_t *dev;
5455
5456 mutex_enter(&l2arc_dev_mtx);
5457 for (dev = list_head(l2arc_dev_list); dev != NULL;
5458 dev = list_next(l2arc_dev_list, dev)) {
5459 if (dev->l2ad_vdev == vd)
5460 break;
5461 }
5462 mutex_exit(&l2arc_dev_mtx);
5463
5464 return (dev != NULL);
5465 }
5466
5467 /*
5468 * Add a vdev for use by the L2ARC. By this point the spa has already
5469 * validated the vdev and opened it. The `rebuild' flag indicates whether
5470 * we should attempt an L2ARC persistency rebuild.
5471 */
5472 void
5473 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5474 {
5475 l2arc_dev_t *adddev;
5476
5477 ASSERT(!l2arc_vdev_present(vd));
5478
5479 /*
5480 * Create a new l2arc device entry.
5481 */
5482 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5483 adddev->l2ad_spa = spa;
5484 adddev->l2ad_vdev = vd;
5485 adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5486 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5487 adddev->l2ad_hand = adddev->l2ad_start;
5488 adddev->l2ad_evict = adddev->l2ad_start;
5489 adddev->l2ad_first = B_TRUE;
5490 adddev->l2ad_writing = B_FALSE;
5491 l2arc_pbuf_init(&adddev->l2ad_pbuf);
5492
5493 /*
5494 * This is a list of all ARC buffers that are still valid on the
5495 * device.
5496 */
5497 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5498 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5499 offsetof(arc_buf_hdr_t, b_l2node));
5500
5501 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5502
5503 /*
5504 * Add device to global list
5505 */
5506 mutex_enter(&l2arc_dev_mtx);
5507 list_insert_head(l2arc_dev_list, adddev);
5508 atomic_inc_64(&l2arc_ndev);
5509 if (rebuild && l2arc_rebuild_enabled) {
5510 adddev->l2ad_rebuilding = B_TRUE;
5511 (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
5512 0, &p0, TS_RUN, minclsyspri);
5513 }
5514 mutex_exit(&l2arc_dev_mtx);
5515 }
5516
5517 /*
5518 * Remove a vdev from the L2ARC.
5519 */
5520 void
5521 l2arc_remove_vdev(vdev_t *vd)
5522 {
5523 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5524
5525 /*
5526 * Find the device by vdev
5527 */
5528 mutex_enter(&l2arc_dev_mtx);
5529 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5530 nextdev = list_next(l2arc_dev_list, dev);
5531 if (vd == dev->l2ad_vdev) {
5532 remdev = dev;
5533 break;
5534 }
5535 }
5536 ASSERT(remdev != NULL);
5537
5538 /*
5539 * Remove device from global list
5540 */
5541 list_remove(l2arc_dev_list, remdev);
5542 l2arc_dev_last = NULL; /* may have been invalidated */
5543 atomic_dec_64(&l2arc_ndev);
5544 mutex_exit(&l2arc_dev_mtx);
5545
5546 /*
5547 * Clear all buflists and ARC references. L2ARC device flush.
5548 */
5549 l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5550 l2arc_evict(remdev, 0, B_TRUE);
5551 list_destroy(remdev->l2ad_buflist);
5552 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5553 kmem_free(remdev, sizeof (l2arc_dev_t));
5554 }
5555
5556 void
5557 l2arc_init(void)
5558 {
5559 l2arc_thread_exit = 0;
5560 l2arc_ndev = 0;
5561 l2arc_writes_sent = 0;
5562 l2arc_writes_done = 0;
5563
5564 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5565 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5566 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5567 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5568 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5569
5570 l2arc_dev_list = &L2ARC_dev_list;
5571 l2arc_free_on_write = &L2ARC_free_on_write;
5572 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5573 offsetof(l2arc_dev_t, l2ad_node));
5574 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5575 offsetof(l2arc_data_free_t, l2df_list_node));
5576 }
5577
5578 void
5579 l2arc_fini(void)
5580 {
5581 /*
5582 * This is called from dmu_fini(), which is called from spa_fini();
5583 * Because of this, we can assume that all l2arc devices have
5584 * already been removed when the pools themselves were removed.
5585 */
5586
5587 l2arc_do_free_on_write();
5588
5589 mutex_destroy(&l2arc_feed_thr_lock);
5590 cv_destroy(&l2arc_feed_thr_cv);
5591 mutex_destroy(&l2arc_dev_mtx);
5592 mutex_destroy(&l2arc_buflist_mtx);
5593 mutex_destroy(&l2arc_free_on_write_mtx);
5594
5595 list_destroy(l2arc_dev_list);
5596 list_destroy(l2arc_free_on_write);
5597 }
5598
5599 void
5600 l2arc_start(void)
5601 {
5602 if (!(spa_mode_global & FWRITE))
5603 return;
5604
5605 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5606 TS_RUN, minclsyspri);
5607 }
5608
5609 void
5610 l2arc_stop(void)
5611 {
5612 if (!(spa_mode_global & FWRITE))
5613 return;
5614
5615 mutex_enter(&l2arc_feed_thr_lock);
5616 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5617 l2arc_thread_exit = 1;
5618 while (l2arc_thread_exit != 0)
5619 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5620 mutex_exit(&l2arc_feed_thr_lock);
5621 }
5622
5623 /*
5624 * Main entry point for L2ARC metadata rebuilding. This function must be
5625 * called via thread_create so that the L2ARC metadata rebuild doesn't block
5626 * pool import and may proceed in parallel on all available L2ARC devices.
5627 */
5628 static void
5629 l2arc_rebuild_start(l2arc_dev_t *dev)
5630 {
5631 vdev_t *vd = dev->l2ad_vdev;
5632 spa_t *spa = dev->l2ad_spa;
5633
5634 /* Lock out device removal. */
5635 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5636 ASSERT(dev->l2ad_rebuilding == B_TRUE);
5637 l2arc_rebuild(dev);
5638 dev->l2ad_rebuilding = B_FALSE;
5639 spa_config_exit(spa, SCL_L2ARC, vd);
5640 thread_exit();
5641 }
5642
5643 /*
5644 * This function implements the actual L2ARC metadata rebuild. It:
5645 *
5646 * 1) scans the device for valid l2uberblocks
5647 * 2) if it finds a good uberblock, starts reading the pbuf chain
5648 * 3) restores each pbuf's contents to memory
5649 *
5650 * Operation stops under any of the following conditions:
5651 *
5652 * 1) We reach the end of the pbuf chain (the previous-buffer reference
5653 * in the pbuf is zero).
5654 * 2) We encounter *any* error condition (cksum errors, io errors, looped
5655 * pbufs, etc.).
 * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
 *    to keep severely fragmented L2ARC pbufs or slow L2ARC devices from
 *    preventing a machine from importing the pool (and to let the
5659 * administrator take corrective action, e.g. by kicking the misbehaving
5660 * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661 * rebuilding disabled).
5662 */
5663 static void
5664 l2arc_rebuild(l2arc_dev_t *dev)
5665 {
5666 int err;
5667 l2uberblock_t ub;
5668 l2pbuf_t pb;
5669 zio_t *this_io = NULL, *next_io = NULL;
5670 int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671
5672 if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673 return;
5674 L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675
5676 /* set up uberblock update info */
5677 dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678
5679 /* initial sanity checks */
5680 l2arc_pbuf_init(&pb);
5681 if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682 ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683 /* root pbuf is bad, we can't do anything about that */
5684 if (err == EINVAL) {
5685 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686 } else {
5687 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688 }
5689 l2arc_pbuf_destroy(&pb);
5690 return;
5691 }
5692 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693
5694 dev->l2ad_evict = ub.ub_evict_tail;
5695
5696 /* keep on chaining in new blocks */
5697 dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698 dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699 dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701 ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702 dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703
5704 /* start the rebuild process */
5705 for (;;) {
5706 l2pbuf_t pb_prev;
5707
5708 l2arc_pbuf_init(&pb_prev);
5709 if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710 pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711 &next_io)) != 0) {
5712 /*
5713 * We are done reading, discard the last good buffer.
5714 */
5715 if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716 pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717 /* this is an error, we stopped too early */
5718 if (err == EINVAL) {
5719 ARCSTAT_BUMP(
5720 arcstat_l2_rebuild_cksum_errors);
5721 } else {
5722 ARCSTAT_BUMP(
5723 arcstat_l2_rebuild_io_errors);
5724 }
5725 }
5726 l2arc_pbuf_destroy(&pb_prev);
5727 l2arc_pbuf_destroy(&pb);
5728 break;
5729 }
5730
5731 /*
5732 * Protection against infinite loops of pbufs. This is also
5733 * our primary termination mechanism - once the buffer list
5734 * loops around our starting pbuf, we can stop.
5735 */
5736 if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737 pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739 l2arc_pbuf_destroy(&pb);
5740 l2arc_pbuf_destroy(&pb_prev);
5741 if (next_io)
5742 l2arc_pbuf_prefetch_abort(next_io);
5743 return;
5744 }
5745
5746 /*
5747 * Our memory pressure valve. If the system is running low
5748 * on memory, rather than swamping memory with new ARC buf
5749 * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750 * however, we have already set up our L2ARC dev to chain in
5751 * new metadata pbufs, so the user may choose to re-add the
5752 * L2ARC dev at a later time to reconstruct it (when there's
5753 * less memory pressure).
5754 */
5755 if (arc_reclaim_needed()) {
5756 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5757 cmn_err(CE_NOTE, "System running low on memory, "
5758 "aborting L2ARC rebuild.");
5759 l2arc_pbuf_destroy(&pb);
5760 l2arc_pbuf_destroy(&pb_prev);
5761 if (next_io)
5762 l2arc_pbuf_prefetch_abort(next_io);
5763 break;
5764 }
5765
5766 /*
5767 * Now that we know that the prev_pbuf checks out alright, we
5768 * can start reconstruction from this pbuf - we can be sure
5769 * that the L2ARC write hand has not yet reached any of our
5770 * buffers.
5771 */
5772 l2arc_pbuf_restore(dev, &pb);
5773
5774 /* pbuf restored, continue with next one in the list */
5775 l2arc_pbuf_destroy(&pb);
5776 pb = pb_prev;
5777 this_io = next_io;
5778 next_io = NULL;
5779
5780 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5781 }
5782
5783 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5784 }
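
/*
 * Schematic of the on-device metadata chain walked by l2arc_rebuild above
 * (illustrative only; the exact on-disk formats are defined by the encode
 * and decode routines further below):
 *
 *	uberblock (at VDEV_LABEL_START_SIZE)
 *	    ub_pbuf_daddr --> newest pbuf
 *	                          pb_prev_daddr --> older pbuf
 *	                                                pb_prev_daddr --> ...
 *
 * The walk proceeds from newest to oldest and terminates on the conditions
 * listed in the comment above l2arc_rebuild.
 */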
5785
5786 /*
5787 * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
5788 * which only contain an l2arc hdr, essentially restoring the buffers to
5789 * their L2ARC evicted state. This function also updates space usage on the
5790 * L2ARC vdev to make sure it tracks restored buffers.
5791 */
5792 static void
5793 l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
5794 {
5795 spa_t *spa;
5796 uint64_t guid;
5797 list_t *buflists_list;
5798 l2pbuf_buflist_t *buflist;
5799
5800 mutex_enter(&l2arc_buflist_mtx);
5801 spa = dev->l2ad_vdev->vdev_spa;
5802 guid = spa_load_guid(spa);
5803 buflists_list = pb->pb_buflists_list;
5804 for (buflist = list_head(buflists_list); buflist;
5805 buflist = list_next(buflists_list, buflist)) {
5806 int i;
5807 uint64_t size, asize, psize;
5808
5809 size = asize = psize = 0;
5810 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
5811 l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
5812 guid);
5813 size += buflist->l2pbl_bufs[i].b_size;
5814 asize += buflist->l2pbl_bufs[i].b_l2asize;
5815 psize += vdev_psize_to_asize(dev->l2ad_vdev,
5816 buflist->l2pbl_bufs[i].b_l2asize);
5817 }
5818 ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
5819 ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
5820 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
5821 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5822 }
5823 mutex_exit(&l2arc_buflist_mtx);
5824 ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
5825 vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
5826 pb->pb_asize), 0, 0);
5827 }
5828
5829 /*
5830 * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
5831 * a state indicating that it has been evicted to L2ARC.
5832 * The `guid' here is the ARC-load-guid from spa_load_guid.
5833 */
5834 static void
5835 l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
5836 {
5837 arc_buf_hdr_t *hdr;
5838 kmutex_t *hash_lock;
5839 dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
5840
5841 hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
5842 if (hdr == NULL) {
5843 /* not in cache, try to insert */
5844 arc_buf_hdr_t *exists;
5845 arc_buf_contents_t type = buf->b_contents_type;
5846 l2arc_buf_hdr_t *l2hdr;
5847
5848 hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
5849 hdr->b_dva = buf->b_dva;
5850 hdr->b_birth = buf->b_birth;
5851 hdr->b_cksum0 = buf->b_cksum0;
5852 hdr->b_size = buf->b_size;
5853 exists = buf_hash_insert(hdr, &hash_lock);
5854 if (exists) {
5855 /* somebody beat us to the hash insert */
5856 mutex_exit(hash_lock);
5857 arc_hdr_destroy(hdr);
5858 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5859 return;
5860 }
5861 hdr->b_flags = buf->b_flags;
5862 mutex_enter(&hdr->b_freeze_lock);
5863 ASSERT(hdr->b_freeze_cksum == NULL);
5864 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
5865 KM_SLEEP);
5866 *hdr->b_freeze_cksum = buf->b_freeze_cksum;
5867 mutex_exit(&hdr->b_freeze_lock);
5868
5869 /* now rebuild the l2arc entry */
5870 ASSERT(hdr->b_l2hdr == NULL);
5871 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5872 l2hdr->b_dev = dev;
5873 l2hdr->b_daddr = buf->b_l2daddr;
5874 l2hdr->b_asize = buf->b_l2asize;
5875 l2hdr->b_compress = buf->b_l2compress;
5876 hdr->b_l2hdr = l2hdr;
5877 list_insert_head(dev->l2ad_buflist, hdr);
5878 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5879 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5880
5881 arc_change_state(arc_l2c_only, hdr, hash_lock);
5882 }
5883 mutex_exit(hash_lock);
5884 }
5885
5886 /*
5887 * Attempts to locate and read the newest valid uberblock on the provided
5888 * L2ARC device and writes it to `ub'. On success, this function returns 0,
5889 * otherwise the appropriate error code is returned.
5890 */
5891 static int
5892 l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
5893 {
5894 int err = 0;
5895 uint8_t *ub_buf;
5896 uint64_t guid;
5897
5898 ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
5899 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
5900 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
5901
5902 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
5903 VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
5904 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
5905 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
5906 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
5907 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5908 goto cleanup;
5909 }
5910
5911 /*
5912 * Initial peek - does the device even have any usable uberblocks?
5913 * If not, don't bother continuing.
5914 */
5915 l2arc_uberblock_decode(ub_buf, ub);
5916 if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
5917 ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
5918 ub->ub_spa_guid != guid) {
5919 err = ENOTSUP;
5920 ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
5921 goto cleanup;
5922 }
5923
5924 /* now check to make sure that what we selected is okay */
5925 if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
5926 if (err == EINVAL) {
5927 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5928 } else {
5929 ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
5930 }
5931 goto cleanup;
5932 }
5933
5934 /* this uberblock is valid */
5935
5936 cleanup:
5937 kmem_free(ub_buf, L2UBERBLOCK_SIZE);
5938 return (err);
5939 }
5940
5941 /*
5942 * Reads a pbuf from storage, decodes it and validates its contents against
5943 * the provided checksum. The result is placed in `pb'.
5944 *
5945 * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
5946 * When issuing the first pbuf IO during rebuild, you should pass NULL for
5947 * `this_io'. This function will then issue a sync IO to read the pbuf and
5948 * also issue an async IO to fetch the next pbuf in the pbuf chain. The
 * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
5950 * function, pass the value returned in `prefetch_io' from the previous
5951 * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
5952 * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
5953 * pointer to be NULL. If no prefetch IO was issued, the pointer is left
5954 * set at NULL.
5955 *
5956 * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
5957 * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
 * IO is used internally in this function to `peek' at the next buffer's
 * header before the main IO that reads it in completely has finished.
5960 * We can then begin to issue the IO for the next buffer in the chain before
5961 * we are done reading, keeping the L2ARC device's pipeline saturated with
5962 * reads (rather than issuing an IO, waiting for it to complete, validating
5963 * the returned buffer and issuing the next one). This will make sure that
5964 * the rebuild proceeds at maximum read throughput.
5965 *
5966 * On success, this function returns 0, otherwise it returns an appropriate
5967 * error code. On error the prefetching IO is aborted and cleared before
5968 * returning from this function. Therefore, if we return `success', the
5969 * caller can assume that we have taken care of cleanup of prefetch IOs.
5970 */
5971 static int
5972 l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
5973 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
5974 {
5975 int err = 0;
5976 uint64_t prev_pb_start;
5977 uint32_t prev_pb_asize;
5978 zio_cksum_t calc_cksum, prev_pb_cksum;
5979 l2arc_prefetch_info_t *pi = NULL;
5980
5981 ASSERT(dev != NULL);
5982 ASSERT(pb != NULL);
5983 ASSERT(*prefetch_io == NULL);
5984
5985 if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
5986 /* We could not have issued a prefetch IO for this */
5987 ASSERT(this_io == NULL);
5988 return (EINVAL);
5989 }
5990
5991 /*
5992 * Check to see if we have issued the IO for this pbuf in a previous
5993 * run. If not, issue it now.
5994 */
5995 if (this_io == NULL)
5996 this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
5997
5998 /* Pick up the prefetch info buffer and read its contents */
5999 pi = this_io->io_private;
6000 ASSERT(pi != NULL);
6001 ASSERT(asize <= pi->pi_buflen);
6002
6003 /* Wait for the IO to read this pbuf's header to complete */
6004 if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
6005 (void) zio_wait(this_io);
6006 goto cleanup;
6007 }
6008
6009 /*
6010 * Peek to see if we can start issuing the next pbuf IO immediately.
6011 * At this point, only the current pbuf's header has been read.
6012 */
6013 if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
6014 &prev_pb_asize, &prev_pb_cksum) == 0) {
6015 uint64_t this_pb_start, this_pb_end, prev_pb_end;
6016 /* Detect malformed pbuf references and loops */
6017 this_pb_start = daddr;
6018 this_pb_end = daddr + asize;
6019 prev_pb_end = prev_pb_start + prev_pb_asize;
6020 if ((prev_pb_start >= this_pb_start && prev_pb_start <
6021 this_pb_end) ||
6022 (prev_pb_end >= this_pb_start && prev_pb_end <
6023 this_pb_end)) {
6024 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
6025 cmn_err(CE_WARN, "Looping L2ARC metadata reference "
6026 "detected, aborting rebuild.");
6027 err = EINVAL;
6028 goto cleanup;
6029 }
6030 /*
6031 * Start issuing IO for the next pbuf early - this should
6032 * help keep the L2ARC device busy while we read, decode
6033 * and restore this pbuf.
6034 */
6035 if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
6036 *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
6037 prev_pb_start, prev_pb_asize);
6038 }
6039
6040 /* Wait for the main pbuf IO to complete */
6041 if ((err = zio_wait(this_io)) != 0)
6042 goto cleanup;
6043
6044 /* Make sure the buffer checks out ok */
6045 fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
6046 if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
6047 err = EINVAL;
6048 goto cleanup;
6049 }
6050
6051 /* Now we can take our time decoding this buffer */
6052 if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
6053 goto cleanup;
6054
6055 /* This will be used in l2arc_pbuf_restore for space accounting */
6056 pb->pb_asize = asize;
6057
6058 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
6059 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
6060 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6061 pb->pb_payload_asz / asize);
6062
6063 cleanup:
6064 kmem_free(pi->pi_buf, pi->pi_buflen);
6065 pi->pi_buf = NULL;
6066 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6067 /* Abort an in-flight prefetch in case of error */
6068 if (err != 0 && *prefetch_io != NULL) {
6069 l2arc_pbuf_prefetch_abort(*prefetch_io);
6070 *prefetch_io = NULL;
6071 }
6072 return (err);
6073 }
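
/*
 * Sketch of the calling convention described above (l2arc_rebuild is the
 * actual caller; the variable names here are placeholders):
 *
 *	zio_t *this_io = NULL, *next_io = NULL;
 *
 *	err = l2arc_pbuf_read(dev, daddr, asize, cksum, &pb, NULL, &this_io);
 *	while (err == 0) {
 *		... restore pb, pick up pb's previous-pbuf pointer ...
 *		err = l2arc_pbuf_read(dev, prev_daddr, prev_asize,
 *		    prev_cksum, &pb_prev, this_io, &next_io);
 *		this_io = next_io;
 *		next_io = NULL;
 *	}
 */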
6074
6075 /*
6076 * Validates a pbuf device address to make sure that it can be read
6077 * from the provided L2ARC device. Returns 1 if the address is within
6078 * the device's bounds, or 0 if not.
6079 */
6080 static int
6081 l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
6082 {
6083 uint32_t psize;
6084 uint64_t end;
6085
6086 psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
6087 end = daddr + psize;
6088
6089 if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
6090 asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
6091 /* check that the buffer address is correctly aligned */
6092 (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
6093 SPA_MINBLOCKSIZE) - 1)) != 0)
6094 return (0);
6095 else
6096 return (1);
6097 }
6098
6099 /*
6100 * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
6101 * reconstruction to start reading the next pbuf before we are done
6102 * decoding and reconstructing the current pbuf, to keep the l2arc device
6103 * nice and hot with read IO to process.
 * The returned zio will contain a newly allocated memory buffer for the IO
 * data, which should then be freed by the caller once the zio is no longer
6106 * needed (i.e. due to it having completed). If you wish to abort this
6107 * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
6108 * of disposing of the allocated buffers correctly.
6109 */
6110 static zio_t *
6111 l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
6112 {
6113 uint32_t i, psize;
6114 zio_t *pio, *hdr_io;
6115 uint64_t hdr_rsize;
6116 uint8_t *buf;
6117 l2arc_prefetch_info_t *pinfo;
6118
6119 psize = vdev_psize_to_asize(vd, asize);
6120 buf = kmem_alloc(psize, KM_SLEEP);
6121 pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
6122 pinfo->pi_buf = buf;
6123 pinfo->pi_buflen = psize;
6124
6125 /*
6126 * We start issuing the IO for the pbuf header early. This
6127 * allows l2arc_pbuf_read to start issuing IO for the next
6128 * buffer before the current pbuf is read in completely.
6129 */
6130
6131 hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
6132 ASSERT(hdr_rsize <= psize);
6133 pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
6134 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6135 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
6136 hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
6137 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
6138 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6139 ZIO_FLAG_DONT_RETRY, B_FALSE);
6140 (void) zio_nowait(hdr_io);
6141
6142 /*
6143 * Read in the rest of the pbuf - this can take longer than just
6144 * having a peek at the header.
6145 */
6146 pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
6147 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6148 ZIO_FLAG_DONT_RETRY);
6149 for (i = hdr_rsize; i < psize; ) {
6150 uint64_t rsize = psize - i;
6151 zio_t *rzio;
6152
6153 if (psize - i > SPA_MAXBLOCKSIZE)
6154 rsize = SPA_MAXBLOCKSIZE;
6155 ASSERT(rsize >= SPA_MINBLOCKSIZE);
6156 rzio = zio_read_phys(pio, vd, daddr + i,
6157 rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6158 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
6159 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6160 ZIO_FLAG_DONT_RETRY, B_FALSE);
6161 (void) zio_nowait(rzio);
6162 i += rsize;
6163 }
6164
6165 return (pio);
6166 }
6167
6168 /*
6169 * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
6170 * buffers allocated for it.
6171 */
6172 static void
6173 l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 {
6175 l2arc_prefetch_info_t *pi;
6176
6177 pi = zio->io_private;
6178 ASSERT(pi != NULL);
6179 if (pi->pi_hdr_io != NULL)
6180 (void) zio_wait(pi->pi_hdr_io);
6181 (void) zio_wait(zio);
6182 kmem_free(pi->pi_buf, pi->pi_buflen);
6183 pi->pi_buf = NULL;
6184 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 }
6186
6187 /*
6188 * Encodes an l2uberblock_t structure into a destination buffer. This
6189 * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190 * uberblock is always of this constant size.
6191 */
6192 static void
6193 l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 {
6195 zio_cksum_t cksum;
6196
6197 bzero(buf, L2UBERBLOCK_SIZE);
6198
#if defined(_BIG_ENDIAN)
	*(uint32_t *)buf = L2UBERBLOCK_MAGIC;
	*(uint16_t *)(buf + 6) = ub->ub_flags | L2UBLK_BIG_ENDIAN;
#else /* !defined(_BIG_ENDIAN) */
	*(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
	/* persist the flags (e.g. L2UBLK_EVICT_FIRST) in big endian as well */
	*(uint16_t *)(buf + 6) = BSWAP_16(ub->ub_flags);
#endif /* !defined(_BIG_ENDIAN) */
6206 buf[4] = L2UBERBLOCK_MAX_VERSION;
6207
6208 /* rest in native byte order */
6209 *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210 *(uint64_t *)(buf + 16) = ub->ub_birth;
6211 *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212 *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213 *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214 *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215 bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216
6217 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218 bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 }
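
/*
 * For reference, the on-disk uberblock layout as written by
 * l2arc_uberblock_encode above (byte offsets into the L2UBERBLOCK_SIZE
 * buffer; magic, version and flags are stored big-endian, the remaining
 * fields in the writer's native byte order):
 *
 *	0	uint32_t	magic
 *	4	uint8_t		version
 *	6	uint16_t	flags
 *	8	uint64_t	spa guid
 *	16	uint64_t	birth
 *	24	uint64_t	evict tail
 *	32	uint64_t	alloc space
 *	40	uint64_t	pbuf daddr
 *	48	uint32_t	pbuf asize
 *	52	zio_cksum_t	pbuf checksum (32 bytes)
 *	L2UBERBLOCK_SIZE - 32	fletcher4 checksum of the preceding bytes
 */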
6220
6221 /*
6222 * Decodes an l2uberblock_t from an on-disk representation. Please note
6223 * that this function does not perform any uberblock validation and
6224 * checksumming - call l2arc_uberblock_verify() for that.
6225 */
6226 static void
6227 l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 {
6229 boolean_t bswap_needed;
6230
6231 /* these always come in big endian */
6232 #if defined(_BIG_ENDIAN)
6233 ub->ub_magic = *(uint32_t *)buf;
6234 ub->ub_flags = *(uint16_t *)(buf + 6);
6235 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 1);
6236 #else /* !defined(_BIG_ENDIAN) */
6237 ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238 ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 #endif /* !defined(_BIG_ENDIAN) */
6241 ub->ub_version = buf[4];
6242
6243 ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244 ub->ub_birth = *(uint64_t *)(buf + 16);
6245 ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246 ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247 ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248 ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
	bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250 bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251
6252 /* swap the rest if endianness doesn't match us */
6253 if (bswap_needed) {
6254 ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255 ub->ub_birth = BSWAP_64(ub->ub_birth);
6256 ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257 ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258 ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259 ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260 ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261 ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262 }
6263 }
6264
6265 /*
6266 * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267 * valid and matches its checksum.
6268 */
6269 static int
6270 l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
6271 uint64_t guid)
6272 {
6273 zio_cksum_t cksum;
6274
6275 if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
6276 ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
6277 /*
6278 * bad magic or invalid version => persistent l2arc not
6279 * supported
6280 */
6281 return (ENOTSUP);
6282
6283 if (ub->ub_spa_guid != guid)
6284 /* this l2arc dev isn't ours */
6285 return (EINVAL);
6286
6287 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6288 if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
6289 /* bad checksum, corrupt uberblock */
6290 return (EINVAL);
6291
6292 return (0);
6293 }
6294
6295 /*
6296 * Schedules a zio to update the uberblock on an l2arc device. The zio is
6297 * initiated as a child of `pio' and `cb' is filled with the information
6298 * needed to free the uberblock data buffer after writing.
6299 */
6300 static void
6301 l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6302 {
6303 uint8_t *ub_buf;
6304 l2uberblock_t ub;
6305 zio_t *wzio;
6306 vdev_stat_t st;
6307
6308 ASSERT(cb->l2wcb_ub_buf == NULL);
6309 vdev_get_stats(dev->l2ad_vdev, &st);
6310
6311 bzero(&ub, sizeof (ub));
6312 ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6313 ub.ub_birth = dev->l2ad_uberblock_birth++;
6314 ub.ub_evict_tail = dev->l2ad_evict;
6315 ub.ub_alloc_space = st.vs_alloc;
6316 ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
6317 ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
6318 ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
6319 if (dev->l2ad_first)
6320 ub.ub_flags |= L2UBLK_EVICT_FIRST;
6321
6322 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
6323 cb->l2wcb_ub_buf = ub_buf;
6324 l2arc_uberblock_encode(&ub, ub_buf);
6325 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6326 L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
6327 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6328 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6329 zio_t *, wzio);
6330 (void) zio_nowait(wzio);
6331 }
6332
6333 /*
6334 * Encodes a l2pbuf_t structure into the portable on-disk format. The
6335 * `buf' buffer must be suitably sized to hold the entire uncompressed
 * structure (use L2PBUF_ENCODED_SIZE()). If the encoded structure is large
 * enough (at least l2arc_pbuf_compress_minsz bytes), this function also
 * compresses the buffer.
6338 *
6339 * The return value is the length of the resulting encoded pbuf structure.
6340 * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
6341 * was applied, or smaller if compression was applied. In either case,
6342 * prior to writing to disk, the caller must suitably pad the output
6343 * buffer so that it is aligned on a multiple of the underlying storage
6344 * system's block size.
6345 */
6346 static uint32_t
6347 l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
6348 {
6349 uint16_t flags = 0;
6350 uint8_t *dst_buf;
6351 uint32_t enclen;
6352 l2pbuf_buflist_t *buflist;
6353
6354 enclen = L2PBUF_ENCODED_SIZE(pb);
6355 ASSERT(buflen >= enclen);
6356 bzero(buf, enclen);
6357
6358 /* non-header portions of pbufs are in native byte order */
6359 *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
6360 *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
6361 bcopy(&pb->pb_prev_cksum, buf + 20, 32);
6362 *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
6363
6364 /* first we encode the buflists uncompressed */
6365 dst_buf = buf + L2PBUF_HDR_SIZE;
6366 for (buflist = list_head(pb->pb_buflists_list); buflist;
6367 buflist = list_next(pb->pb_buflists_list, buflist)) {
6368 int i;
6369
6370 ASSERT(buflist->l2pbl_nbufs != 0);
6371 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
6372 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6373
6374 ASSERT(pbl_buf->b_size != 0);
6375 *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
6376 *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
6377 *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
6378 *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
6379 bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
6380 *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
6381 *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
6382 *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
6383 dst_buf[80] = pbl_buf->b_l2compress;
6384 dst_buf[81] = pbl_buf->b_contents_type;
6385 *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
6386 dst_buf += L2PBUF_BUF_SIZE;
6387 }
6388 }
6389 ASSERT((uint32_t)(dst_buf - buf) == enclen);
6390
6391 /* and then compress them if necessary */
6392 if (enclen >= l2arc_pbuf_compress_minsz) {
6393 uint8_t *cbuf;
6394 size_t slen, clen;
6395
6396 slen = l2arc_pbuf_items_encoded_size(pb);
6397 cbuf = kmem_alloc(slen, KM_SLEEP);
6398 clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
6399 ASSERT(clen != 0);
6400 if (clen < slen) {
6401 bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
6402 flags |= L2PBUF_COMPRESSED;
6403 /* zero out the rest of the input buffer */
6404 bzero(buf + L2PBUF_HDR_SIZE + clen,
6405 buflen - (L2PBUF_HDR_SIZE + clen));
6406 /* adjust our buffer length now that it's shortened */
6407 enclen = L2PBUF_HDR_SIZE + clen;
6408 }
6409 kmem_free(cbuf, slen);
6410 }
6411
6412 /* the header goes last since `flags' may change due to compression */
6413 #if defined(_BIG_ENDIAN)
6414 *(uint32_t *)buf = L2PBUF_MAGIC;
6415 flags |= L2PBUF_BIG_ENDIAN;
6416 *(uint16_t *)(buf + 6) = flags;
6417 #else /* !defined(_BIG_ENDIAN) */
6418 *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
6419 *(uint16_t *)(buf + 6) = BSWAP_16(flags);
6420 #endif /* !defined(_BIG_ENDIAN) */
6421 buf[4] = L2PBUF_MAX_VERSION;
6422
6423 return (enclen);
6424 }
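
/*
 * For reference, the encoded pbuf layout produced by l2arc_pbuf_encode
 * above. The header occupies L2PBUF_HDR_SIZE bytes; magic, version and
 * flags are stored big-endian, everything else in the writer's native
 * byte order:
 *
 *	0	uint32_t	magic
 *	4	uint8_t		version
 *	6	uint16_t	flags
 *	8	uint64_t	previous pbuf daddr
 *	16	uint32_t	previous pbuf asize
 *	20	zio_cksum_t	previous pbuf checksum (32 bytes)
 *	52	uint32_t	payload size (uncompressed)
 *
 * Each payload entry is L2PBUF_BUF_SIZE bytes:
 *
 *	0	uint64_t	b_dva word 0
 *	8	uint64_t	b_dva word 1
 *	16	uint64_t	b_birth
 *	24	uint64_t	b_cksum0
 *	32	zio_cksum_t	b_freeze_cksum (32 bytes)
 *	64	uint32_t	b_size
 *	68	uint64_t	b_l2daddr
 *	76	uint32_t	b_l2asize
 *	80	uint8_t		b_l2compress
 *	81	uint8_t		b_contents_type
 *	84	uint32_t	b_flags
 */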
6425
6426 /*
6427 * Decodes a stored l2pbuf_t structure previously encoded using
6428 * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
6429 * must be initialized by l2arc_pbuf_init by the caller beforehand, but
6430 * must not have been used to store any buffers yet.
6431 *
6432 * Please note that we don't do checksum verification here, as we don't
 * know our own checksum (that's known by the previous block in the linked
6434 * list, or by the uberblock). This should be performed by the caller
6435 * prior to calling l2arc_pbuf_decode.
6436 */
6437 static int
6438 l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 {
6440 boolean_t bswap_needed;
6441 uint32_t payload_sz, payload_asz;
6442 uint8_t *src_bufs;
6443 l2pbuf_buflist_t *buflist;
6444 int i, nbufs;
6445
6446 ASSERT(input_buf != NULL);
6447 ASSERT(pb != NULL);
6448 ASSERT(pb->pb_version != 0);
6449 ASSERT(pb->pb_nbuflists == 0);
6450
6451 /* no valid buffer can be this small */
6452 if (buflen < L2PBUF_HDR_SIZE)
6453 return (EINVAL);
6454
6455 /* these always come in big endian */
6456 #if defined(_BIG_ENDIAN)
6457 pb->pb_magic = *(uint32_t *)input_buf;
6458 pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 1);
6460 #else /* !defined(_BIG_ENDIAN) */
6461 pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462 pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 #endif /* !defined(_BIG_ENDIAN) */
6465 pb->pb_version = input_buf[4];
6466
6467 if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468 return (EINVAL);
6469 if (pb->pb_version > L2PBUF_MAX_VERSION)
6470 return (ENOTSUP);
6471
6472 /* remainder of pbuf may need bswap'ping */
6473 pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
	pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475 bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476 payload_sz = *(uint32_t *)(input_buf + 52);
6477 payload_asz = buflen - L2PBUF_HDR_SIZE;
6478
6479 if (bswap_needed) {
6480 pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
		pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482 ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483 payload_sz = BSWAP_32(payload_sz);
6484 }
6485
6486 /* check for sensible buffer allocation limits */
6487 if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488 (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489 (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490 return (EINVAL);
6491 nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492
6493 /* decompression might be needed */
6494 if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495 src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496 if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497 payload_asz, payload_sz, 0) != 0) {
6498 kmem_free(src_bufs, payload_sz);
6499 return (EINVAL);
6500 }
6501 } else {
6502 src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503 }
6504
6505 /* Decode individual pbuf items from our source buffer. */
6506 buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507 for (i = 0; i < nbufs; i++) {
6508 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509 const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510
6511 pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512 pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513 pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514 pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515 bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516 pbl_buf->b_size = *(uint32_t *)(src + 64);
6517 pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518 pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519 pbl_buf->b_l2compress = src[80];
6520 pbl_buf->b_contents_type = src[81];
6521 pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522
6523 if (bswap_needed) {
6524 pbl_buf->b_dva.dva_word[0] =
6525 BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526 pbl_buf->b_dva.dva_word[1] =
6527 BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528 pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529 pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530 ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531 pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532 pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533 pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534 pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535 }
6536
6537 pb->pb_payload_asz += pbl_buf->b_l2asize;
6538 }
6539
6540 if (pb->pb_flags & L2PBUF_COMPRESSED)
6541 kmem_free(src_bufs, payload_sz);
6542
6543 return (0);
6544 }
6545
6546 /*
6547 * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548 * during L2ARC reconstruction to "peek" at the next buffer and start
6549 * issuing IO to fetch it early, before decoding of the current buffer
6550 * is done (which can take time due to decompression).
6551 * Returns 0 on success (and fills in the return parameters `daddr',
6552 * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553 * on error.
6554 */
6555 static int
6556 l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557 uint32_t *asize, zio_cksum_t *cksum)
6558 {
6559 boolean_t bswap_needed;
6560 uint16_t version, flags;
6561 uint32_t magic;
6562
6563 ASSERT(buf != NULL);
6564
6565 /* no valid buffer can be this small */
6566 if (buflen <= L2PBUF_HDR_SIZE)
6567 return (EINVAL);
6568
6569 /* these always come in big endian */
6570 #if defined(_BIG_ENDIAN)
6571 magic = *(uint32_t *)buf;
6572 flags = *(uint16_t *)(buf + 6);
6573 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 1);
6574 #else /* !defined(_BIG_ENDIAN) */
6575 magic = BSWAP_32(*(uint32_t *)buf);
6576 flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 #endif /* !defined(_BIG_ENDIAN) */
6579 version = buf[4];
6580
6581 if (magic != L2PBUF_MAGIC || version == 0)
6582 return (EINVAL);
6583 if (version > L2PBUF_MAX_VERSION)
6584 return (ENOTSUP);
6585
	/* these offsets must mirror the layout written by l2arc_pbuf_encode */
	*daddr = *(uint64_t *)(buf + 8);
	*asize = *(uint32_t *)(buf + 16);
	bcopy(buf + 20, cksum, 32);
6589
6590 if (bswap_needed) {
6591 *daddr = BSWAP_64(*daddr);
		*asize = BSWAP_32(*asize);
6593 ZIO_CHECKSUM_BSWAP(cksum);
6594 }
6595
6596 return (0);
6597 }
6598
6599 /*
6600 * Initializes a pbuf structure into a clean state. All version and flags
6601 * fields are filled in as appropriate for this architecture.
6602 * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603 * as this function assumes the structure is uninitialized.
6604 */
6605 static void
6606 l2arc_pbuf_init(l2pbuf_t *pb)
6607 {
6608 bzero(pb, sizeof (l2pbuf_t));
6609 pb->pb_version = L2PBUF_MAX_VERSION;
6610 #if defined(_BIG_ENDIAN)
	pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 #endif
6613 pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614 list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615 offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 }
6617
6618 /*
6619 * Destroys a pbuf structure and puts it into a clean state ready to be
6620 * initialized by l2arc_pbuf_init. All buflists created by
6621 * l2arc_pbuf_buflist_alloc are released as well.
6622 */
6623 static void
6624 l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 {
6626 list_t *buflist_list = pb->pb_buflists_list;
6627 l2pbuf_buflist_t *buflist;
6628
6629 while ((buflist = list_head(buflist_list)) != NULL) {
6630 ASSERT(buflist->l2pbl_nbufs > 0);
6631 kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632 buflist->l2pbl_nbufs);
6633 list_remove(buflist_list, buflist);
6634 kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635 }
6636 pb->pb_nbuflists = 0;
6637 list_destroy(pb->pb_buflists_list);
6638 kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639 bzero(pb, sizeof (l2pbuf_t));
6640 }
6641
6642 /*
6643 * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644 * buffers. This is used during the buffer write cycle - each cycle allocates
6645 * a new buflist and fills it with buffers it writes. Then, when the pbuf
 * reaches its buflist limit, it is committed to stable storage.
6647 */
6648 static l2pbuf_buflist_t *
6649 l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 {
6651 l2pbuf_buflist_t *buflist;
6652
6653 ASSERT(pb->pb_buflists_list != NULL);
6654 buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655 buflist->l2pbl_nbufs = nbufs;
6656 buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657 KM_SLEEP);
6658 list_insert_tail(pb->pb_buflists_list, buflist);
6659 pb->pb_nbuflists++;
6660
6661 return (buflist);
6662 }
6663
6664 /*
6665 * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `idx'.
6666 * The buffer being inserted must be present in L2ARC.
6667 */
6668 static void
6669 l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670 const arc_buf_hdr_t *ab, int index)
6671 {
6672 l2pbuf_buf_t *pb_buf;
6673 const l2arc_buf_hdr_t *l2hdr;
6674
6675 l2hdr = ab->b_l2hdr;
6676 ASSERT(l2hdr != NULL);
6677 ASSERT(pbl->l2pbl_nbufs > index);
6678
6679 pb_buf = &pbl->l2pbl_bufs[index];
6680 pb_buf->b_dva = ab->b_dva;
6681 pb_buf->b_birth = ab->b_birth;
6682 pb_buf->b_cksum0 = ab->b_cksum0;
6683 pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684 pb_buf->b_size = ab->b_size;
6685 pb_buf->b_l2daddr = l2hdr->b_daddr;
6686 pb_buf->b_l2asize = l2hdr->b_asize;
6687 pb_buf->b_l2compress = l2hdr->b_compress;
6688 pb_buf->b_contents_type = ab->b_type;
6689 pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690 pb->pb_payload_asz += l2hdr->b_asize;
6691 }
6692
6693 /*
6694 * Commits a pbuf to stable storage. This routine is invoked when writing
6695 * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696 * has reached its limits (either in size or in number of writes), it is
6697 * scheduled here for writing.
6698 * This function allocates some memory to temporarily hold the serialized
6699 * buffer to be written. This is then released in l2arc_write_done.
6700 */
6701 static void
6702 l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6703 {
6704 l2pbuf_t *pb = &dev->l2ad_pbuf;
6705 uint64_t i, est_encsize, bufsize, encsize, io_size;
6706 uint8_t *pb_buf;
6707
6708 pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
6709 pb->pb_prev_asize = dev->l2ad_pbuf_asize;
6710 pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
6711
6712 est_encsize = L2PBUF_ENCODED_SIZE(pb);
6713 bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
6714 pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
6715 encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
6716 cb->l2wcb_pbuf = pb_buf;
6717 cb->l2wcb_pbuf_size = bufsize;
6718
6719 dev->l2ad_pbuf_daddr = dev->l2ad_hand;
6720 dev->l2ad_pbuf_asize = encsize;
6721 fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
6722
6723 io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
6724 for (i = 0; i < io_size; ) {
6725 zio_t *wzio;
6726 uint64_t wsize = io_size - i;
6727
6728 if (wsize > SPA_MAXBLOCKSIZE)
6729 wsize = SPA_MAXBLOCKSIZE;
6730 ASSERT(wsize >= SPA_MINBLOCKSIZE);
6731 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
6732 wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6733 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6734 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6735 zio_t *, wzio);
6736 (void) zio_nowait(wzio);
6737 i += wsize;
6738 }
6739
6740 dev->l2ad_hand += io_size;
6741 vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
6742 l2arc_uberblock_update(dev, pio, cb);
6743
6744 ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
6745 ARCSTAT_BUMP(arcstat_l2_meta_writes);
6746 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
6747 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
6748 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6749 pb->pb_payload_asz / encsize);
6750 }
6751
6752 /*
6753 * Returns the number of bytes occupied by the payload buffer items of
6754 * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
6755 * L2PBUF_HDR_SIZE.
6756 */
6757 static uint32_t
6758 l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
6759 {
6760 uint32_t size = 0;
6761 l2pbuf_buflist_t *buflist;
6762
6763 for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
6764 buflist = list_next(pb->pb_buflists_list, buflist))
6765 size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
6766
6767 return (size);
6768 }