dlpx-os-diff New usr/src/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  26  */
  27 
  28 /*
  29  * DVA-based Adjustable Replacement Cache
  30  *
  31  * While much of the theory of operation used here is
  32  * based on the self-tuning, low overhead replacement cache
  33  * presented by Megiddo and Modha at FAST 2003, there are some
  34  * significant differences:
  35  *
  36  * 1. The Megiddo and Modha model assumes any page is evictable.
  37  * Pages in its cache cannot be "locked" into memory.  This makes
  38  * the eviction algorithm simple: evict the last page in the list.
  39  * This also makes the performance characteristics easy to reason
  40  * about.  Our cache is not so simple.  At any given moment, some
  41  * subset of the blocks in the cache are un-evictable because we
  42  * have handed out a reference to them.  Blocks are only evictable
  43  * when there are no external references active.  This makes
  44  * eviction far more problematic:  we choose to evict the evictable
  45  * blocks that are the "lowest" in the list.
  46  *
  47  * There are times when it is not possible to evict the requested
  48  * space.  In these circumstances we are unable to adjust the cache
  49  * size.  To prevent the cache growing unbounded at these times we
  50  * implement a "cache throttle" that slows the flow of new data
  51  * into the cache until we can make space available.
  52  *
  53  * 2. The Megiddo and Modha model assumes a fixed cache size.
  54  * Pages are evicted when the cache is full and there is a cache
  55  * miss.  Our model has a variable sized cache.  It grows with
  56  * high use, but also tries to react to memory pressure from the
  57  * operating system: decreasing its size when system memory is
  58  * tight.
  59  *
  60  * 3. The Megiddo and Modha model assumes a fixed page size. All
  61  * elements of the cache are therefore exactly the same size.  So
  62  * when adjusting the cache size following a cache miss, it's simply
  63  * a matter of choosing a single page to evict.  In our model, we
  64  * have variable sized cache blocks (ranging from 512 bytes to
  65  * 128K bytes).  We therefore choose a set of blocks to evict to make
  66  * space for a cache miss that approximates as closely as possible
  67  * the space used by the new block.
  68  *
  69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  70  * by N. Megiddo & D. Modha, FAST 2003
  71  */
  72 
  73 /*
  74  * The locking model:
  75  *
  76  * A new reference to a cache buffer can be obtained in two
  77  * ways: 1) via a hash table lookup using the DVA as a key,
  78  * or 2) via one of the ARC lists.  The arc_read() interface
  79  * uses method 1, while the internal arc algorithms for
  80  * adjusting the cache use method 2.  We therefore provide two
  81  * types of locks: 1) the hash table lock array, and 2) the
  82  * arc list locks.
  83  *
  84  * Buffers do not have their own mutexes, rather they rely on the
  85  * hash table mutexes for the bulk of their protection (i.e. most
  86  * fields in the arc_buf_hdr_t are protected by these mutexes).
  87  *
  88  * buf_hash_find() returns the appropriate mutex (held) when it
  89  * locates the requested buffer in the hash table.  It returns
  90  * NULL for the mutex if the buffer was not in the table.
  91  *
  92  * buf_hash_remove() expects the appropriate hash mutex to be
  93  * already held before it is invoked.
  94  *
  95  * Each arc state also has a mutex which is used to protect the
  96  * buffer list associated with the state.  When attempting to
  97  * obtain a hash table lock while holding an arc list lock you
  98  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  99  * the active state mutex must be held before the ghost state mutex.
 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_buf_evict()
 108  * and arc_do_user_evicts().
 109  *
 110  * Note that the majority of the performance stats are manipulated
 111  * with atomic operations.
 112  *
 113  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 114  *
 115  *      - L2ARC buflist creation
 116  *      - L2ARC buflist eviction
 117  *      - L2ARC write completion, which walks L2ARC buflists
 118  *      - ARC header destruction, as it removes from L2ARC buflists
 119  *      - ARC header release, as it removes from L2ARC buflists
 120  */
 121 
 122 #include <sys/spa.h>
 123 #include <sys/zio.h>
 124 #include <sys/zio_compress.h>
 125 #include <sys/zfs_context.h>
 126 #include <sys/arc.h>
 127 #include <sys/refcount.h>
 128 #include <sys/vdev.h>
 129 #include <sys/vdev_impl.h>
 130 #include <sys/dsl_pool.h>
 131 #ifdef _KERNEL
 132 #include <sys/vmsystm.h>
 133 #include <vm/anon.h>
 134 #include <sys/fs/swapnode.h>
 135 #include <sys/dnlc.h>
 136 #endif
 137 #include <sys/callb.h>
 138 #include <sys/kstat.h>
 139 #include <zfs_fletcher.h>
 140 
 141 #ifndef _KERNEL
 142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 143 boolean_t arc_watch = B_FALSE;
 144 int arc_procfd;
 145 #endif
 146 
 147 static kmutex_t         arc_reclaim_thr_lock;
 148 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 149 static uint8_t          arc_thread_exit;
 150 
 151 #define ARC_REDUCE_DNLC_PERCENT 3
 152 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 153 
 154 typedef enum arc_reclaim_strategy {
 155         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 156         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 157 } arc_reclaim_strategy_t;
 158 
 159 /*
 160  * The number of iterations through arc_evict_*() before we
 161  * drop & reacquire the lock.
 162  */
 163 int arc_evict_iterations = 100;
 164 
 165 /* number of seconds before growing cache again */
 166 static int              arc_grow_retry = 60;
 167 
 168 /* shift of arc_c for calculating both min and max arc_p */
 169 static int              arc_p_min_shift = 4;
 170 
 171 /* log2(fraction of arc to reclaim) */
 172 static int              arc_shrink_shift = 5;
 173 
 174 /*
 175  * minimum lifespan of a prefetch block in clock ticks
 176  * (initialized in arc_init())
 177  */
 178 static int              arc_min_prefetch_lifespan;
 179 
 180 /*
 181  * If this percent of memory is free, don't throttle.
 182  */
 183 int arc_lotsfree_percent = 10;
 184 
 185 static int arc_dead;
 186 
 187 /*
 188  * The arc has filled available memory and has now warmed up.
 189  */
 190 static boolean_t arc_warm;
 191 
 192 /*
 193  * These tunables are for performance analysis.
 194  */
 195 uint64_t zfs_arc_max;
 196 uint64_t zfs_arc_min;
 197 uint64_t zfs_arc_meta_limit = 0;
 198 int zfs_arc_grow_retry = 0;
 199 int zfs_arc_shrink_shift = 0;
 200 int zfs_arc_p_min_shift = 0;
 201 int zfs_disable_dup_eviction = 0;
 202 
 203 /*
 204  * Note that buffers can be in one of 6 states:
 205  *      ARC_anon        - anonymous (discussed below)
 206  *      ARC_mru         - recently used, currently cached
  207  *      ARC_mru_ghost   - recently used, no longer in cache
 208  *      ARC_mfu         - frequently used, currently cached
 209  *      ARC_mfu_ghost   - frequently used, no longer in cache
 210  *      ARC_l2c_only    - exists in L2ARC but not other states
  211  * When there are no active references to the buffer, they are
  212  * linked onto a list in one of these arc states.  These are
 213  * the only buffers that can be evicted or deleted.  Within each
 214  * state there are multiple lists, one for meta-data and one for
 215  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 216  * etc.) is tracked separately so that it can be managed more
 217  * explicitly: favored over data, limited explicitly.
 218  *
 219  * Anonymous buffers are buffers that are not associated with
 220  * a DVA.  These are buffers that hold dirty block copies
 221  * before they are written to stable storage.  By definition,
 222  * they are "ref'd" and are considered part of arc_mru
  223  * that cannot be freed.  Generally, they will acquire a DVA
 224  * as they are written and migrate onto the arc_mru list.
 225  *
 226  * The ARC_l2c_only state is for buffers that are in the second
 227  * level ARC but no longer in any of the ARC_m* lists.  The second
 228  * level ARC itself may also contain buffers that are in any of
 229  * the ARC_m* states - meaning that a buffer can exist in two
 230  * places.  The reason for the ARC_l2c_only state is to keep the
 231  * buffer header in the hash table, so that reads that hit the
 232  * second level ARC benefit from these fast lookups.
 233  */
 234 
  235 typedef struct arc_state {
  236         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
  237         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
  238         uint64_t arcs_size;     /* total amount of data in this state */
  239         kmutex_t arcs_mtx;      /* protects this state's buffer lists */
  240 } arc_state_t;
 241 
 242 /* The 6 states: */
 243 static arc_state_t ARC_anon;
 244 static arc_state_t ARC_mru;
 245 static arc_state_t ARC_mru_ghost;
 246 static arc_state_t ARC_mfu;
 247 static arc_state_t ARC_mfu_ghost;
 248 static arc_state_t ARC_l2c_only;
 249 
  250 typedef struct arc_stats {
  251         kstat_named_t arcstat_hits;
  252         kstat_named_t arcstat_misses;
  253         kstat_named_t arcstat_demand_data_hits;
  254         kstat_named_t arcstat_demand_data_misses;
  255         kstat_named_t arcstat_demand_metadata_hits;
  256         kstat_named_t arcstat_demand_metadata_misses;
  257         kstat_named_t arcstat_prefetch_data_hits;
  258         kstat_named_t arcstat_prefetch_data_misses;
  259         kstat_named_t arcstat_prefetch_metadata_hits;
  260         kstat_named_t arcstat_prefetch_metadata_misses;
  261         kstat_named_t arcstat_mru_hits;
  262         kstat_named_t arcstat_mru_ghost_hits;
  263         kstat_named_t arcstat_mfu_hits;
  264         kstat_named_t arcstat_mfu_ghost_hits;
  265         kstat_named_t arcstat_deleted;
  266         kstat_named_t arcstat_recycle_miss;
  267         /*
  268          * Number of buffers that could not be evicted because the hash lock
  269          * was held by another thread.  The lock may not necessarily be held
  270          * by something using the same buffer, since hash locks are shared
  271          * by multiple buffers.
  272          */
  273         kstat_named_t arcstat_mutex_miss;
  274         /*
  275          * Number of buffers skipped because they have I/O in progress, are
  276          * indirect prefetch buffers that have not lived long enough, or are
  277          * not from the spa we're trying to evict from.
  278          */
  279         kstat_named_t arcstat_evict_skip;
  280         kstat_named_t arcstat_evict_l2_cached;
  281         kstat_named_t arcstat_evict_l2_eligible;
  282         kstat_named_t arcstat_evict_l2_ineligible;
  283         kstat_named_t arcstat_hash_elements;
  284         kstat_named_t arcstat_hash_elements_max;
  285         kstat_named_t arcstat_hash_collisions;
  286         kstat_named_t arcstat_hash_chains;
  287         kstat_named_t arcstat_hash_chain_max;
  288         kstat_named_t arcstat_p;
  289         kstat_named_t arcstat_c;
  290         kstat_named_t arcstat_c_min;
  291         kstat_named_t arcstat_c_max;
  292         kstat_named_t arcstat_size;
  293         kstat_named_t arcstat_hdr_size;
  294         kstat_named_t arcstat_data_size;
  295         kstat_named_t arcstat_other_size;
  296         kstat_named_t arcstat_l2_hits;
  297         kstat_named_t arcstat_l2_misses;
  298         kstat_named_t arcstat_l2_feeds;
  299         kstat_named_t arcstat_l2_rw_clash;
  300         kstat_named_t arcstat_l2_read_bytes;
  301         kstat_named_t arcstat_l2_write_bytes;
  302         kstat_named_t arcstat_l2_writes_sent;
  303         kstat_named_t arcstat_l2_writes_done;
  304         kstat_named_t arcstat_l2_writes_error;
  305         kstat_named_t arcstat_l2_writes_hdr_miss;
  306         kstat_named_t arcstat_l2_evict_lock_retry;
  307         kstat_named_t arcstat_l2_evict_reading;
  308         kstat_named_t arcstat_l2_free_on_write;
  309         kstat_named_t arcstat_l2_abort_lowmem;
  310         kstat_named_t arcstat_l2_cksum_bad;
  311         kstat_named_t arcstat_l2_io_error;
  312         kstat_named_t arcstat_l2_size;
  313         kstat_named_t arcstat_l2_asize;
  314         kstat_named_t arcstat_l2_hdr_size;
  315         kstat_named_t arcstat_l2_compress_successes;
  316         kstat_named_t arcstat_l2_compress_zeros;
  317         kstat_named_t arcstat_l2_compress_failures;
  318         kstat_named_t arcstat_memory_throttle_count;
  319         kstat_named_t arcstat_duplicate_buffers;
  320         kstat_named_t arcstat_duplicate_buffers_size;
  321         kstat_named_t arcstat_duplicate_reads;
  322         kstat_named_t arcstat_meta_used;
  323         kstat_named_t arcstat_meta_limit;
  324         kstat_named_t arcstat_meta_max;
  325 } arc_stats_t;
 326 
  327 static arc_stats_t arc_stats = {
              /*
               * These initializers are positional (no designators), so the
               * entries must stay in the same order as the members of
               * arc_stats_t.  Every entry is declared KSTAT_DATA_UINT64.
               */
  328         { "hits",                       KSTAT_DATA_UINT64 },
  329         { "misses",                     KSTAT_DATA_UINT64 },
  330         { "demand_data_hits",           KSTAT_DATA_UINT64 },
  331         { "demand_data_misses",         KSTAT_DATA_UINT64 },
  332         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
  333         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
  334         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
  335         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
  336         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
  337         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
  338         { "mru_hits",                   KSTAT_DATA_UINT64 },
  339         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
  340         { "mfu_hits",                   KSTAT_DATA_UINT64 },
  341         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
  342         { "deleted",                    KSTAT_DATA_UINT64 },
  343         { "recycle_miss",               KSTAT_DATA_UINT64 },
  344         { "mutex_miss",                 KSTAT_DATA_UINT64 },
  345         { "evict_skip",                 KSTAT_DATA_UINT64 },
  346         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
  347         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
  348         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
  349         { "hash_elements",              KSTAT_DATA_UINT64 },
  350         { "hash_elements_max",          KSTAT_DATA_UINT64 },
  351         { "hash_collisions",            KSTAT_DATA_UINT64 },
  352         { "hash_chains",                KSTAT_DATA_UINT64 },
  353         { "hash_chain_max",             KSTAT_DATA_UINT64 },
  354         { "p",                          KSTAT_DATA_UINT64 },
  355         { "c",                          KSTAT_DATA_UINT64 },
  356         { "c_min",                      KSTAT_DATA_UINT64 },
  357         { "c_max",                      KSTAT_DATA_UINT64 },
  358         { "size",                       KSTAT_DATA_UINT64 },
  359         { "hdr_size",                   KSTAT_DATA_UINT64 },
  360         { "data_size",                  KSTAT_DATA_UINT64 },
  361         { "other_size",                 KSTAT_DATA_UINT64 },
  362         { "l2_hits",                    KSTAT_DATA_UINT64 },
  363         { "l2_misses",                  KSTAT_DATA_UINT64 },
  364         { "l2_feeds",                   KSTAT_DATA_UINT64 },
  365         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
  366         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
  367         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
  368         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
  369         { "l2_writes_done",             KSTAT_DATA_UINT64 },
  370         { "l2_writes_error",            KSTAT_DATA_UINT64 },
  371         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
  372         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
  373         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
  374         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
  375         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
  376         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
  377         { "l2_io_error",                KSTAT_DATA_UINT64 },
  378         { "l2_size",                    KSTAT_DATA_UINT64 },
  379         { "l2_asize",                   KSTAT_DATA_UINT64 },
  380         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
  381         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
  382         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
  383         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
  384         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
  385         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
  386         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
  387         { "duplicate_reads",            KSTAT_DATA_UINT64 },
  388         { "arc_meta_used",              KSTAT_DATA_UINT64 },
  389         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
  390         { "arc_meta_max",               KSTAT_DATA_UINT64 }
  391 };
 392 
  393 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
  394 
      /*
       * ARCSTAT() names the 64-bit value of one member of arc_stats.
       * ARCSTAT_INCR() adds val to that value with atomic_add_64();
       * ARCSTAT_BUMP() and ARCSTAT_BUMPDOWN() are the +1 and -1 cases.
       */
  395 #define ARCSTAT_INCR(stat, val) \
  396         atomic_add_64(&arc_stats.stat.value.ui64, (val))
  397 
  398 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
  399 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
  400 
      /*
       * Raise stat to val when val is greater than the stat's current value.
       * The loop re-reads the stat and retries for as long as val is still
       * greater and atomic_cas_64() does not return the value that was read
       * (atomic_cas_64() is expected to return the previous contents).
       *
       * NOTE(review): val is expanded more than once, so it should be free of
       * side effects.  The expansion is a bare { } block, not a single
       * statement, so take care when using it as the body of an if/else.
       */
  401 #define ARCSTAT_MAX(stat, val) {                                        \
  402         uint64_t m;                                                     \
  403         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
  404             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
  405                 continue;                                               \
  406 }
  407 
      /*
       * Apply ARCSTAT_MAX() to the companion statistic named <stat>_max,
       * using the current value of <stat> as the candidate maximum.
       */
  408 #define ARCSTAT_MAXSTAT(stat) \
  409         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 410 
 411 /*
 412  * We define a macro to allow ARC hits/misses to be easily broken down by
 413  * two separate conditions, giving a total of four different subtypes for
 414  * each of hits and misses (so eight statistics total).
 415  */
 416 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 417         if (cond1) {                                                    \
 418                 if (cond2) {                                            \
 419                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 420                 } else {                                                \
 421                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 422                 }                                                       \
 423         } else {                                                        \
 424                 if (cond2) {                                            \
 425                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 426                 } else {                                                \
 427                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 428                 }                                                       \
 429         }
 430 
 431 kstat_t                 *arc_ksp;
 432 static arc_state_t      *arc_anon;
 433 static arc_state_t      *arc_mru;
 434 static arc_state_t      *arc_mru_ghost;
 435 static arc_state_t      *arc_mfu;
 436 static arc_state_t      *arc_mfu_ghost;
 437 static arc_state_t      *arc_l2c_only;
 438 
 439 /*
 440  * There are several ARC variables that are critical to export as kstats --
 441  * but we don't want to have to grovel around in the kstat whenever we wish to
 442  * manipulate them.  For these variables, we therefore define them to be in
 443  * terms of the statistic variable.  This assures that we are not introducing
 444  * the possibility of inconsistency by having shadow copies of the variables,
 445  * while still allowing the code to be readable.
 446  */
 447 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 448 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 449 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 450 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 451 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 452 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 453 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 454 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 455 
 456 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 457         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 458 
 459 static int              arc_no_grow;    /* Don't try to grow cache size */
 460 static uint64_t         arc_tempreserve;
 461 static uint64_t         arc_loaned_bytes;
 462 
 463 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 464 
 465 typedef struct arc_callback arc_callback_t;
 466 
 467 struct arc_callback {
 468         void                    *acb_private;
 469         arc_done_func_t         *acb_done;
 470         arc_buf_t               *acb_buf;
 471         zio_t                   *acb_zio_dummy;
 472         arc_callback_t          *acb_next;
 473 };
 474 
 475 typedef struct arc_write_callback arc_write_callback_t;
 476 
 477 struct arc_write_callback {
 478         void            *awcb_private;
 479         arc_done_func_t *awcb_ready;
 480         arc_done_func_t *awcb_physdone;
 481         arc_done_func_t *awcb_done;
 482         arc_buf_t       *awcb_buf;
 483 };
 484 
 485 struct arc_buf_hdr {
 486         /* protected by hash lock */
 487         dva_t                   b_dva;
 488         uint64_t                b_birth;
 489         uint64_t                b_cksum0;
 490 
 491         kmutex_t                b_freeze_lock;
 492         zio_cksum_t             *b_freeze_cksum;
 493         void                    *b_thawed;
 494 
 495         arc_buf_hdr_t           *b_hash_next;
 496         arc_buf_t               *b_buf;
 497         uint32_t                b_flags;
 498         uint32_t                b_datacnt;
 499 
 500         arc_callback_t          *b_acb;
 501         kcondvar_t              b_cv;
 502 
 503         /* immutable */
 504         arc_buf_contents_t      b_type;
 505         uint64_t                b_size;
 506         uint64_t                b_spa;
 507 
 508         /* protected by arc state mutex */
 509         arc_state_t             *b_state;
 510         list_node_t             b_arc_node;
 511 
 512         /* updated atomically */
 513         clock_t                 b_arc_access;
 514 
 515         /* self protecting */
 516         refcount_t              b_refcnt;
 517 
 518         l2arc_buf_hdr_t         *b_l2hdr;
 519         list_node_t             b_l2node;
 520 };
 521 
 522 static arc_buf_t *arc_eviction_list;
 523 static kmutex_t arc_eviction_mtx;
 524 static arc_buf_hdr_t arc_eviction_hdr;
 525 static void arc_get_data_buf(arc_buf_t *buf);
 526 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 527 static int arc_evict_needed(arc_buf_contents_t type);
 528 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 529 static void arc_buf_watch(arc_buf_t *buf);
 530 
 531 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 532 
 533 #define GHOST_STATE(state)      \
 534         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 535         (state) == arc_l2c_only)
 536 
 537 /*
 538  * Private ARC flags.  These flags are private ARC only flags that will show up
  539  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 540  * be passed in as arc_flags in things like arc_read.  However, these flags
 541  * should never be passed and should only be set by ARC code.  When adding new
 542  * public flags, make sure not to smash the private ones.
 543  */
 544 
 545 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
 546 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
 547 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
 548 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
 549 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
 550 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
 551 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
 552 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
 553 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
 554 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
 555 
 556 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 557 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 558 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 559 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 560 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 561 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 562 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 563 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 564 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
 565                                     (hdr)->b_l2hdr != NULL)
 566 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 567 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 568 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 569 
 570 /*
 571  * Other sizes
 572  */
 573 
 574 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 575 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 576 
 577 /*
 578  * Hash table routines
 579  */
 580 
 581 #define HT_LOCK_PAD     64
 582 
 583 struct ht_lock {
 584         kmutex_t        ht_lock;
 585 #ifdef _KERNEL
 586         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 587 #endif
 588 };
 589 
 590 #define BUF_LOCKS 256
 591 typedef struct buf_hash_table {
 592         uint64_t ht_mask;
 593         arc_buf_hdr_t **ht_table;
 594         struct ht_lock ht_locks[BUF_LOCKS];
 595 } buf_hash_table_t;
 596 
 597 static buf_hash_table_t buf_hash_table;
 598 
 599 #define BUF_HASH_INDEX(spa, dva, birth) \
 600         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 601 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 602 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 603 #define HDR_LOCK(hdr) \
 604         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 605 
 606 uint64_t zfs_crc64_table[256];
 607 
 608 /*
 609  * Level 2 ARC
 610  */
 611 
 612 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 613 #define L2ARC_HEADROOM          2                       /* num of writes */
 614 /*
 615  * If we discover during ARC scan any buffers to be compressed, we boost
 616  * our headroom for the next scanning cycle by this percentage multiple.
 617  */
 618 #define L2ARC_HEADROOM_BOOST    200
 619 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 620 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 621 
 622 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 623 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 624 
 625 /* L2ARC Performance Tunables */
 626 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 627 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 628 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 629 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 630 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 631 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 632 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 633 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 634 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 635 
 636 /*
 637  * L2ARC Internals
 638  */
 639 typedef struct l2arc_dev {
 640         vdev_t                  *l2ad_vdev;     /* vdev */
 641         spa_t                   *l2ad_spa;      /* spa */
 642         uint64_t                l2ad_hand;      /* next write location */
 643         uint64_t                l2ad_start;     /* first addr on device */
 644         uint64_t                l2ad_end;       /* last addr on device */
 645         uint64_t                l2ad_evict;     /* last addr eviction reached */
 646         boolean_t               l2ad_first;     /* first sweep through */
 647         boolean_t               l2ad_writing;   /* currently writing */
 648         list_t                  *l2ad_buflist;  /* buffer list */
 649         list_node_t             l2ad_node;      /* device list node */
 650 } l2arc_dev_t;
 651 
 652 static list_t L2ARC_dev_list;                   /* device list */
 653 static list_t *l2arc_dev_list;                  /* device list pointer */
 654 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 655 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 656 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 657 static list_t L2ARC_free_on_write;              /* free after write buf list */
 658 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 659 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 660 static uint64_t l2arc_ndev;                     /* number of devices */
 661 
/*
 * Context attached to an L2ARC read.  Besides the destination buffer it
 * records the original blkptr, bookmark and flags of the request together
 * with the compression that was applied to the data.
 * NOTE(review): the "original" fields are presumably kept so the read can
 * be reissued to the main pool on failure -- confirm in l2arc_read_done().
 */
typedef struct l2arc_read_callback {
        arc_buf_t               *l2rcb_buf;             /* read buffer */
        spa_t                   *l2rcb_spa;             /* spa */
        blkptr_t                l2rcb_bp;               /* original blkptr */
        zbookmark_t             l2rcb_zb;               /* original bookmark */
        int                     l2rcb_flags;            /* original flags */
        enum zio_compress       l2rcb_compress;         /* applied compress */
} l2arc_read_callback_t;
 670 
/*
 * Context attached to an L2ARC write: the target device and the header
 * that marks the head of the write's buffer list.
 */
typedef struct l2arc_write_callback {
        l2arc_dev_t     *l2wcb_dev;             /* device info */
        arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
} l2arc_write_callback_t;
 675 
/*
 * L2ARC-specific portion of a buffer header, reached through the
 * b_l2hdr pointer of an arc_buf_hdr_t.
 */
struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr mutex */
        l2arc_dev_t             *b_dev;         /* L2ARC device */
        uint64_t                b_daddr;        /* disk address, offset byte */
        /* compression applied to buffer data */
        enum zio_compress       b_compress;
        /* real alloc'd buffer size depending on b_compress applied */
        int                     b_asize;
        /* temporary buffer holder for in-flight compressed data */
        void                    *b_tmp_cdata;
};
 687 
/*
 * Deferred-free record.  arc_buf_data_free() fills one in (data pointer,
 * size and the function to free it with) and inserts it on
 * l2arc_free_on_write when the owning header has an L2ARC write in
 * progress, instead of freeing the data immediately.
 */
typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
        void            (*l2df_func)(void *, size_t);
        list_node_t     l2df_list_node;
} l2arc_data_free_t;
 695 
/*
 * NOTE(review): judging by the names these synchronize with, and request
 * the exit of, the L2ARC feed thread -- confirm against the thread code.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

/*
 * Forward declarations of L2ARC helpers; some are referenced before their
 * definitions (e.g. arc_change_state() calls the hdr_stat routines).
 */
static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);

static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
    enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 708 
 709 static uint64_t
 710 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 711 {
 712         uint8_t *vdva = (uint8_t *)dva;
 713         uint64_t crc = -1ULL;
 714         int i;
 715 
 716         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 717 
 718         for (i = 0; i < sizeof (dva_t); i++)
 719                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 720 
 721         crc ^= (spa>>8) ^ birth;
 722 
 723         return (crc);
 724 }
 725 
 726 #define BUF_EMPTY(buf)                                          \
 727         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 728         (buf)->b_dva.dva_word[1] == 0 &&                     \
 729         (buf)->b_birth == 0)
 730 
 731 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 732         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 733         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 734         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 735 
 736 static void
 737 buf_discard_identity(arc_buf_hdr_t *hdr)
 738 {
 739         hdr->b_dva.dva_word[0] = 0;
 740         hdr->b_dva.dva_word[1] = 0;
 741         hdr->b_birth = 0;
 742         hdr->b_cksum0 = 0;
 743 }
 744 
 745 static arc_buf_hdr_t *
 746 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 747 {
 748         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 749         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 750         arc_buf_hdr_t *buf;
 751 
 752         mutex_enter(hash_lock);
 753         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 754             buf = buf->b_hash_next) {
 755                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 756                         *lockp = hash_lock;
 757                         return (buf);
 758                 }
 759         }
 760         mutex_exit(hash_lock);
 761         *lockp = NULL;
 762         return (NULL);
 763 }
 764 
 765 /*
 766  * Insert an entry into the hash table.  If there is already an element
 767  * equal to elem in the hash table, then the already existing element
 768  * will be returned and the new element will not be inserted.
 769  * Otherwise returns NULL.
 770  */
 771 static arc_buf_hdr_t *
 772 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 773 {
 774         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 775         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 776         arc_buf_hdr_t *fbuf;
 777         uint32_t i;
 778 
 779         ASSERT(!HDR_IN_HASH_TABLE(buf));
 780         *lockp = hash_lock;
 781         mutex_enter(hash_lock);
 782         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 783             fbuf = fbuf->b_hash_next, i++) {
 784                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 785                         return (fbuf);
 786         }
 787 
 788         buf->b_hash_next = buf_hash_table.ht_table[idx];
 789         buf_hash_table.ht_table[idx] = buf;
 790         buf->b_flags |= ARC_IN_HASH_TABLE;
 791 
 792         /* collect some hash table performance data */
 793         if (i > 0) {
 794                 ARCSTAT_BUMP(arcstat_hash_collisions);
 795                 if (i == 1)
 796                         ARCSTAT_BUMP(arcstat_hash_chains);
 797 
 798                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 799         }
 800 
 801         ARCSTAT_BUMP(arcstat_hash_elements);
 802         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 803 
 804         return (NULL);
 805 }
 806 
 807 static void
 808 buf_hash_remove(arc_buf_hdr_t *buf)
 809 {
 810         arc_buf_hdr_t *fbuf, **bufp;
 811         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 812 
 813         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 814         ASSERT(HDR_IN_HASH_TABLE(buf));
 815 
 816         bufp = &buf_hash_table.ht_table[idx];
 817         while ((fbuf = *bufp) != buf) {
 818                 ASSERT(fbuf != NULL);
 819                 bufp = &fbuf->b_hash_next;
 820         }
 821         *bufp = buf->b_hash_next;
 822         buf->b_hash_next = NULL;
 823         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 824 
 825         /* collect some hash table performance data */
 826         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 827 
 828         if (buf_hash_table.ht_table[idx] &&
 829             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 830                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 831 }
 832 
/*
 * Global data structures and functions for the buf kmem cache.
 *
 * hdr_cache supplies arc_buf_hdr_t objects and buf_cache supplies
 * arc_buf_t objects; both are created in buf_init() and destroyed in
 * buf_fini().
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;
 838 
 839 static void
 840 buf_fini(void)
 841 {
 842         int i;
 843 
 844         kmem_free(buf_hash_table.ht_table,
 845             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 846         for (i = 0; i < BUF_LOCKS; i++)
 847                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 848         kmem_cache_destroy(hdr_cache);
 849         kmem_cache_destroy(buf_cache);
 850 }
 851 
 852 /*
 853  * Constructor callback - called when the cache is empty
 854  * and a new buf is requested.
 855  */
 856 /* ARGSUSED */
 857 static int
 858 hdr_cons(void *vbuf, void *unused, int kmflag)
 859 {
 860         arc_buf_hdr_t *buf = vbuf;
 861 
 862         bzero(buf, sizeof (arc_buf_hdr_t));
 863         refcount_create(&buf->b_refcnt);
 864         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 865         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 866         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 867 
 868         return (0);
 869 }
 870 
 871 /* ARGSUSED */
 872 static int
 873 buf_cons(void *vbuf, void *unused, int kmflag)
 874 {
 875         arc_buf_t *buf = vbuf;
 876 
 877         bzero(buf, sizeof (arc_buf_t));
 878         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 879         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 880 
 881         return (0);
 882 }
 883 
 884 /*
 885  * Destructor callback - called when a cached buf is
 886  * no longer required.
 887  */
 888 /* ARGSUSED */
 889 static void
 890 hdr_dest(void *vbuf, void *unused)
 891 {
 892         arc_buf_hdr_t *buf = vbuf;
 893 
 894         ASSERT(BUF_EMPTY(buf));
 895         refcount_destroy(&buf->b_refcnt);
 896         cv_destroy(&buf->b_cv);
 897         mutex_destroy(&buf->b_freeze_lock);
 898         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 899 }
 900 
 901 /* ARGSUSED */
 902 static void
 903 buf_dest(void *vbuf, void *unused)
 904 {
 905         arc_buf_t *buf = vbuf;
 906 
 907         mutex_destroy(&buf->b_evict_lock);
 908         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 909 }
 910 
 911 /*
 912  * Reclaim callback -- invoked when memory is low.
 913  */
 914 /* ARGSUSED */
 915 static void
 916 hdr_recl(void *unused)
 917 {
 918         dprintf("hdr_recl called\n");
 919         /*
 920          * umem calls the reclaim func when we destroy the buf cache,
 921          * which is after we do arc_fini().
 922          */
 923         if (!arc_dead)
 924                 cv_signal(&arc_reclaim_thr_cv);
 925 }
 926 
 927 static void
 928 buf_init(void)
 929 {
 930         uint64_t *ct;
 931         uint64_t hsize = 1ULL << 12;
 932         int i, j;
 933 
 934         /*
 935          * The hash table is big enough to fill all of physical memory
 936          * with an average 64K block size.  The table will take up
 937          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 938          */
 939         while (hsize * 65536 < physmem * PAGESIZE)
 940                 hsize <<= 1;
 941 retry:
 942         buf_hash_table.ht_mask = hsize - 1;
 943         buf_hash_table.ht_table =
 944             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 945         if (buf_hash_table.ht_table == NULL) {
 946                 ASSERT(hsize > (1ULL << 8));
 947                 hsize >>= 1;
 948                 goto retry;
 949         }
 950 
 951         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 952             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 953         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 954             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 955 
 956         for (i = 0; i < 256; i++)
 957                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 958                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 959 
 960         for (i = 0; i < BUF_LOCKS; i++) {
 961                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 962                     NULL, MUTEX_DEFAULT, NULL);
 963         }
 964 }
 965 
/*
 * hz >> 4 is one sixteenth of hz.
 * NOTE(review): the "62 ms" figure presumably treats hz as clock ticks
 * per second (1/16 s = 62.5 ms) -- confirm.
 */
#define ARC_MINTIME     (hz>>4) /* 62 ms */
 967 
 968 static void
 969 arc_cksum_verify(arc_buf_t *buf)
 970 {
 971         zio_cksum_t zc;
 972 
 973         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 974                 return;
 975 
 976         mutex_enter(&buf->b_hdr->b_freeze_lock);
 977         if (buf->b_hdr->b_freeze_cksum == NULL ||
 978             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 979                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 980                 return;
 981         }
 982         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 983         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 984                 panic("buffer modified while frozen!");
 985         mutex_exit(&buf->b_hdr->b_freeze_lock);
 986 }
 987 
 988 static int
 989 arc_cksum_equal(arc_buf_t *buf)
 990 {
 991         zio_cksum_t zc;
 992         int equal;
 993 
 994         mutex_enter(&buf->b_hdr->b_freeze_lock);
 995         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 996         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 997         mutex_exit(&buf->b_hdr->b_freeze_lock);
 998 
 999         return (equal);
1000 }
1001 
1002 static void
1003 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1004 {
1005         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1006                 return;
1007 
1008         mutex_enter(&buf->b_hdr->b_freeze_lock);
1009         if (buf->b_hdr->b_freeze_cksum != NULL) {
1010                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1011                 return;
1012         }
1013         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1014         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1015             buf->b_hdr->b_freeze_cksum);
1016         mutex_exit(&buf->b_hdr->b_freeze_lock);
1017         arc_buf_watch(buf);
1018 }
1019 
#ifndef _KERNEL
/*
 * Non-kernel builds only: the control message that arc_buf_watch() and
 * arc_buf_unwatch() write to arc_procfd -- a command word followed by the
 * prwatch_t describing the watched area.
 */
typedef struct procctl {
        long cmd;
        prwatch_t prwatch;
} procctl_t;
#endif
1026 
1027 /* ARGSUSED */
1028 static void
1029 arc_buf_unwatch(arc_buf_t *buf)
1030 {
1031 #ifndef _KERNEL
1032         if (arc_watch) {
1033                 int result;
1034                 procctl_t ctl;
1035                 ctl.cmd = PCWATCH;
1036                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1037                 ctl.prwatch.pr_size = 0;
1038                 ctl.prwatch.pr_wflags = 0;
1039                 result = write(arc_procfd, &ctl, sizeof (ctl));
1040                 ASSERT3U(result, ==, sizeof (ctl));
1041         }
1042 #endif
1043 }
1044 
1045 /* ARGSUSED */
1046 static void
1047 arc_buf_watch(arc_buf_t *buf)
1048 {
1049 #ifndef _KERNEL
1050         if (arc_watch) {
1051                 int result;
1052                 procctl_t ctl;
1053                 ctl.cmd = PCWATCH;
1054                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1055                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1056                 ctl.prwatch.pr_wflags = WA_WRITE;
1057                 result = write(arc_procfd, &ctl, sizeof (ctl));
1058                 ASSERT3U(result, ==, sizeof (ctl));
1059         }
1060 #endif
1061 }
1062 
1063 void
1064 arc_buf_thaw(arc_buf_t *buf)
1065 {
1066         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1067                 if (buf->b_hdr->b_state != arc_anon)
1068                         panic("modifying non-anon buffer!");
1069                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1070                         panic("modifying buffer while i/o in progress!");
1071                 arc_cksum_verify(buf);
1072         }
1073 
1074         mutex_enter(&buf->b_hdr->b_freeze_lock);
1075         if (buf->b_hdr->b_freeze_cksum != NULL) {
1076                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1077                 buf->b_hdr->b_freeze_cksum = NULL;
1078         }
1079 
1080         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1081                 if (buf->b_hdr->b_thawed)
1082                         kmem_free(buf->b_hdr->b_thawed, 1);
1083                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1084         }
1085 
1086         mutex_exit(&buf->b_hdr->b_freeze_lock);
1087 
1088         arc_buf_unwatch(buf);
1089 }
1090 
1091 void
1092 arc_buf_freeze(arc_buf_t *buf)
1093 {
1094         kmutex_t *hash_lock;
1095 
1096         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1097                 return;
1098 
1099         hash_lock = HDR_LOCK(buf->b_hdr);
1100         mutex_enter(hash_lock);
1101 
1102         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1103             buf->b_hdr->b_state == arc_anon);
1104         arc_cksum_compute(buf, B_FALSE);
1105         mutex_exit(hash_lock);
1106 
1107 }
1108 
1109 static void
1110 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1111 {
1112         ASSERT(MUTEX_HELD(hash_lock));
1113 
1114         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1115             (ab->b_state != arc_anon)) {
1116                 uint64_t delta = ab->b_size * ab->b_datacnt;
1117                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1118                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1119 
1120                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1121                 mutex_enter(&ab->b_state->arcs_mtx);
1122                 ASSERT(list_link_active(&ab->b_arc_node));
1123                 list_remove(list, ab);
1124                 if (GHOST_STATE(ab->b_state)) {
1125                         ASSERT0(ab->b_datacnt);
1126                         ASSERT3P(ab->b_buf, ==, NULL);
1127                         delta = ab->b_size;
1128                 }
1129                 ASSERT(delta > 0);
1130                 ASSERT3U(*size, >=, delta);
1131                 atomic_add_64(size, -delta);
1132                 mutex_exit(&ab->b_state->arcs_mtx);
1133                 /* remove the prefetch flag if we get a reference */
1134                 if (ab->b_flags & ARC_PREFETCH)
1135                         ab->b_flags &= ~ARC_PREFETCH;
1136         }
1137 }
1138 
1139 static int
1140 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1141 {
1142         int cnt;
1143         arc_state_t *state = ab->b_state;
1144 
1145         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1146         ASSERT(!GHOST_STATE(state));
1147 
1148         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1149             (state != arc_anon)) {
1150                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1151 
1152                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1153                 mutex_enter(&state->arcs_mtx);
1154                 ASSERT(!list_link_active(&ab->b_arc_node));
1155                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1156                 ASSERT(ab->b_datacnt > 0);
1157                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1158                 mutex_exit(&state->arcs_mtx);
1159         }
1160         return (cnt);
1161 }
1162 
/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 *
 * Steps, in order:
 *  - if the header has no references, unlink it from the old state's
 *    list and link it at the head of the new state's list, adjusting the
 *    per-type arcs_lsize counters (anonymous states have no list);
 *  - if the new state is arc_anon and the header is hashed, remove it
 *    from the hash table;
 *  - adjust arcs_size of both states and record the new state;
 *  - update the l2arc header statistics when entering or leaving
 *    arc_l2c_only.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
        arc_state_t *old_state = ab->b_state;
        int64_t refcnt = refcount_count(&ab->b_refcnt);
        uint64_t from_delta, to_delta;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
        ASSERT(refcnt == 0 || ab->b_datacnt > 0);
        ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
        ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);

        /* default accounting: the data actually attached to the header */
        from_delta = to_delta = ab->b_datacnt * ab->b_size;

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon) {
                        /* take the list mutex only if not already held */
                        int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
                        uint64_t *size = &old_state->arcs_lsize[ab->b_type];

                        if (use_mutex)
                                mutex_enter(&old_state->arcs_mtx);

                        ASSERT(list_link_active(&ab->b_arc_node));
                        list_remove(&old_state->arcs_list[ab->b_type], ab);

                        /*
                         * If prefetching out of the ghost cache,
                         * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                /* ghost elements have a ghost size */
                                ASSERT(ab->b_buf == NULL);
                                from_delta = ab->b_size;
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);

                        if (use_mutex)
                                mutex_exit(&old_state->arcs_mtx);
                }
                if (new_state != arc_anon) {
                        /* take the list mutex only if not already held */
                        int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
                        uint64_t *size = &new_state->arcs_lsize[ab->b_type];

                        if (use_mutex)
                                mutex_enter(&new_state->arcs_mtx);

                        list_insert_head(&new_state->arcs_list[ab->b_type], ab);

                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
                                ASSERT(ab->b_datacnt == 0);
                                ASSERT(ab->b_buf == NULL);
                                to_delta = ab->b_size;
                        }
                        atomic_add_64(size, to_delta);

                        if (use_mutex)
                                mutex_exit(&new_state->arcs_mtx);
                }
        }

        ASSERT(!BUF_EMPTY(ab));
        /* anonymous headers are not kept in the hash table */
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                buf_hash_remove(ab);

        /* adjust state sizes */
        if (to_delta)
                atomic_add_64(&new_state->arcs_size, to_delta);
        if (from_delta) {
                ASSERT3U(old_state->arcs_size, >=, from_delta);
                atomic_add_64(&old_state->arcs_size, -from_delta);
        }
        ab->b_state = new_state;

        /* adjust l2arc hdr stats */
        if (new_state == arc_l2c_only)
                l2arc_hdr_stat_add();
        else if (old_state == arc_l2c_only)
                l2arc_hdr_stat_remove();
}
1253 
1254 void
1255 arc_space_consume(uint64_t space, arc_space_type_t type)
1256 {
1257         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1258 
1259         switch (type) {
1260         case ARC_SPACE_DATA:
1261                 ARCSTAT_INCR(arcstat_data_size, space);
1262                 break;
1263         case ARC_SPACE_OTHER:
1264                 ARCSTAT_INCR(arcstat_other_size, space);
1265                 break;
1266         case ARC_SPACE_HDRS:
1267                 ARCSTAT_INCR(arcstat_hdr_size, space);
1268                 break;
1269         case ARC_SPACE_L2HDRS:
1270                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1271                 break;
1272         }
1273 
1274         ARCSTAT_INCR(arcstat_meta_used, space);
1275         atomic_add_64(&arc_size, space);
1276 }
1277 
1278 void
1279 arc_space_return(uint64_t space, arc_space_type_t type)
1280 {
1281         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1282 
1283         switch (type) {
1284         case ARC_SPACE_DATA:
1285                 ARCSTAT_INCR(arcstat_data_size, -space);
1286                 break;
1287         case ARC_SPACE_OTHER:
1288                 ARCSTAT_INCR(arcstat_other_size, -space);
1289                 break;
1290         case ARC_SPACE_HDRS:
1291                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1292                 break;
1293         case ARC_SPACE_L2HDRS:
1294                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1295                 break;
1296         }
1297 
1298         ASSERT(arc_meta_used >= space);
1299         if (arc_meta_max < arc_meta_used)
1300                 arc_meta_max = arc_meta_used;
1301         ARCSTAT_INCR(arcstat_meta_used, -space);
1302         ASSERT(arc_size >= space);
1303         atomic_add_64(&arc_size, -space);
1304 }
1305 
1306 void *
1307 arc_data_buf_alloc(uint64_t size)
1308 {
1309         if (arc_evict_needed(ARC_BUFC_DATA))
1310                 cv_signal(&arc_reclaim_thr_cv);
1311         atomic_add_64(&arc_size, size);
1312         return (zio_data_buf_alloc(size));
1313 }
1314 
1315 void
1316 arc_data_buf_free(void *buf, uint64_t size)
1317 {
1318         zio_data_buf_free(buf, size);
1319         ASSERT(arc_size >= size);
1320         atomic_add_64(&arc_size, -size);
1321 }
1322 
1323 arc_buf_t *
1324 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1325 {
1326         arc_buf_hdr_t *hdr;
1327         arc_buf_t *buf;
1328 
1329         ASSERT3U(size, >, 0);
1330         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1331         ASSERT(BUF_EMPTY(hdr));
1332         hdr->b_size = size;
1333         hdr->b_type = type;
1334         hdr->b_spa = spa_load_guid(spa);
1335         hdr->b_state = arc_anon;
1336         hdr->b_arc_access = 0;
1337         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1338         buf->b_hdr = hdr;
1339         buf->b_data = NULL;
1340         buf->b_efunc = NULL;
1341         buf->b_private = NULL;
1342         buf->b_next = NULL;
1343         hdr->b_buf = buf;
1344         arc_get_data_buf(buf);
1345         hdr->b_datacnt = 1;
1346         hdr->b_flags = 0;
1347         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1348         (void) refcount_add(&hdr->b_refcnt, tag);
1349 
1350         return (buf);
1351 }
1352 
/*
 * Reference tag under which loaned buffers are held; see arc_loan_buf(),
 * arc_return_buf() and arc_loan_inuse_buf().
 */
static char *arc_onloan_tag = "onloan";
1354 
1355 /*
1356  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1357  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1358  * buffers must be returned to the arc before they can be used by the DMU or
1359  * freed.
1360  */
1361 arc_buf_t *
1362 arc_loan_buf(spa_t *spa, int size)
1363 {
1364         arc_buf_t *buf;
1365 
1366         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1367 
1368         atomic_add_64(&arc_loaned_bytes, size);
1369         return (buf);
1370 }
1371 
1372 /*
1373  * Return a loaned arc buffer to the arc.
1374  */
1375 void
1376 arc_return_buf(arc_buf_t *buf, void *tag)
1377 {
1378         arc_buf_hdr_t *hdr = buf->b_hdr;
1379 
1380         ASSERT(buf->b_data != NULL);
1381         (void) refcount_add(&hdr->b_refcnt, tag);
1382         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1383 
1384         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1385 }
1386 
1387 /* Detach an arc_buf from a dbuf (tag) */
1388 void
1389 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1390 {
1391         arc_buf_hdr_t *hdr;
1392 
1393         ASSERT(buf->b_data != NULL);
1394         hdr = buf->b_hdr;
1395         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1396         (void) refcount_remove(&hdr->b_refcnt, tag);
1397         buf->b_efunc = NULL;
1398         buf->b_private = NULL;
1399 
1400         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1401 }
1402 
1403 static arc_buf_t *
1404 arc_buf_clone(arc_buf_t *from)
1405 {
1406         arc_buf_t *buf;
1407         arc_buf_hdr_t *hdr = from->b_hdr;
1408         uint64_t size = hdr->b_size;
1409 
1410         ASSERT(hdr->b_state != arc_anon);
1411 
1412         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1413         buf->b_hdr = hdr;
1414         buf->b_data = NULL;
1415         buf->b_efunc = NULL;
1416         buf->b_private = NULL;
1417         buf->b_next = hdr->b_buf;
1418         hdr->b_buf = buf;
1419         arc_get_data_buf(buf);
1420         bcopy(from->b_data, buf->b_data, size);
1421 
1422         /*
1423          * This buffer already exists in the arc so create a duplicate
1424          * copy for the caller.  If the buffer is associated with user data
1425          * then track the size and number of duplicates.  These stats will be
1426          * updated as duplicate buffers are created and destroyed.
1427          */
1428         if (hdr->b_type == ARC_BUFC_DATA) {
1429                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1430                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1431         }
1432         hdr->b_datacnt += 1;
1433         return (buf);
1434 }
1435 
1436 void
1437 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1438 {
1439         arc_buf_hdr_t *hdr;
1440         kmutex_t *hash_lock;
1441 
1442         /*
1443          * Check to see if this buffer is evicted.  Callers
1444          * must verify b_data != NULL to know if the add_ref
1445          * was successful.
1446          */
1447         mutex_enter(&buf->b_evict_lock);
1448         if (buf->b_data == NULL) {
1449                 mutex_exit(&buf->b_evict_lock);
1450                 return;
1451         }
1452         hash_lock = HDR_LOCK(buf->b_hdr);
1453         mutex_enter(hash_lock);
1454         hdr = buf->b_hdr;
1455         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1456         mutex_exit(&buf->b_evict_lock);
1457 
1458         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1459         add_reference(hdr, hash_lock, tag);
1460         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1461         arc_access(hdr, hash_lock);
1462         mutex_exit(hash_lock);
1463         ARCSTAT_BUMP(arcstat_hits);
1464         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1465             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1466             data, metadata, hits);
1467 }
1468 
1469 /*
1470  * Free the arc data buffer.  If it is an l2arc write in progress,
1471  * the buffer is placed on l2arc_free_on_write to be freed later.
1472  */
1473 static void
1474 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1475 {
1476         arc_buf_hdr_t *hdr = buf->b_hdr;
1477 
1478         if (HDR_L2_WRITING(hdr)) {
1479                 l2arc_data_free_t *df;
1480                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1481                 df->l2df_data = buf->b_data;
1482                 df->l2df_size = hdr->b_size;
1483                 df->l2df_func = free_func;
1484                 mutex_enter(&l2arc_free_on_write_mtx);
1485                 list_insert_head(l2arc_free_on_write, df);
1486                 mutex_exit(&l2arc_free_on_write_mtx);
1487                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1488         } else {
1489                 free_func(buf->b_data, hdr->b_size);
1490         }
1491 }
1492 
/*
 * Release the data attached to `buf` and, optionally, the buf itself.
 *
 * recycle - when set, the data buffer is not freed and the ARC size
 *           accounting for it is left alone (NOTE(review): presumably the
 *           caller reuses the data buffer -- confirm against callers);
 *           state sizes and b_datacnt are still adjusted.
 * all     - when set, the buf is also unlinked from its header's buf list
 *           and returned to buf_cache; otherwise only the data is
 *           released.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
        arc_buf_t **bufp;

        /* free up data associated with the buf */
        if (buf->b_data) {
                arc_state_t *state = buf->b_hdr->b_state;
                uint64_t size = buf->b_hdr->b_size;
                arc_buf_contents_t type = buf->b_hdr->b_type;

                arc_cksum_verify(buf);
                arc_buf_unwatch(buf);

                if (!recycle) {
                        if (type == ARC_BUFC_METADATA) {
                                arc_buf_data_free(buf, zio_buf_free);
                                arc_space_return(size, ARC_SPACE_DATA);
                        } else {
                                ASSERT(type == ARC_BUFC_DATA);
                                arc_buf_data_free(buf, zio_data_buf_free);
                                ARCSTAT_INCR(arcstat_data_size, -size);
                                atomic_add_64(&arc_size, -size);
                        }
                }
                /* header is on a state list: shrink that list's size too */
                if (list_link_active(&buf->b_hdr->b_arc_node)) {
                        uint64_t *cnt = &state->arcs_lsize[type];

                        ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
                        ASSERT(state != arc_anon);

                        ASSERT3U(*cnt, >=, size);
                        atomic_add_64(cnt, -size);
                }
                ASSERT3U(state->arcs_size, >=, size);
                atomic_add_64(&state->arcs_size, -size);
                buf->b_data = NULL;

                /*
                 * If we're destroying a duplicate buffer make sure
                 * that the appropriate statistics are updated.
                 */
                if (buf->b_hdr->b_datacnt > 1 &&
                    buf->b_hdr->b_type == ARC_BUFC_DATA) {
                        ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
                        ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
                }
                ASSERT(buf->b_hdr->b_datacnt > 0);
                buf->b_hdr->b_datacnt -= 1;
        }

        /* only remove the buf if requested */
        if (!all)
                return;

        /* remove the buf from the hdr list */
        for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
                continue;
        *bufp = buf->b_next;
        buf->b_next = NULL;

        ASSERT(buf->b_efunc == NULL);

        /* clean up the buf */
        buf->b_hdr = NULL;
        kmem_cache_free(buf_cache, buf);
}
1560 
/*
 * Tear down and free an ARC buffer header.
 *
 * The header must have no outstanding references, be in the arc_anon
 * state and have no I/O in progress (all three are asserted).  Any L2ARC
 * header hanging off hdr is unlinked and freed under l2arc_buflist_mtx,
 * the header's identity is discarded, every attached buf is destroyed or
 * queued on arc_eviction_list, the freeze-checksum and thawed bookkeeping
 * are freed, and finally hdr itself is returned to hdr_cache.
 */
1561 static void
1562 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1563 {
1564         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1565         ASSERT3P(hdr->b_state, ==, arc_anon);
1566         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1567         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1568 
1569         if (l2hdr != NULL) {
                     /*
                      * Remember whether l2arc_buflist_mtx was already held on
                      * entry so that it is only taken and dropped here when
                      * it was not.
                      */
1570                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1571                 /*
1572                  * To prevent arc_free() and l2arc_evict() from
1573                  * attempting to free the same buffer at the same time,
1574                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1575                  * give it priority.  l2arc_evict() can't destroy this
1576                  * header while we are waiting on l2arc_buflist_mtx.
1577                  *
1578                  * The hdr may be removed from l2ad_buflist before we
1579                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1580                  */
1581                 if (!buflist_held) {
1582                         mutex_enter(&l2arc_buflist_mtx);
1583                         l2hdr = hdr->b_l2hdr;
1584                 }
1585 
1586                 if (l2hdr != NULL) {
1587                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1588                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1589                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1590                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1591                         if (hdr->b_state == arc_l2c_only)
1592                                 l2arc_hdr_stat_remove();
1593                         hdr->b_l2hdr = NULL;
1594                 }
1595 
1596                 if (!buflist_held)
1597                         mutex_exit(&l2arc_buflist_mtx);
1598         }
1599 
1600         if (!BUF_EMPTY(hdr)) {
1601                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1602                 buf_discard_identity(hdr);
1603         }
             /*
              * Dispose of every buf still attached to the header.  A buf
              * with an eviction callback (b_efunc) goes through
              * arc_buf_destroy() but the arc_buf_t itself is kept and pushed
              * onto arc_eviction_list, re-pointed at arc_eviction_hdr; a buf
              * without a callback is destroyed outright.
              */
1604         while (hdr->b_buf) {
1605                 arc_buf_t *buf = hdr->b_buf;
1606 
1607                 if (buf->b_efunc) {
1608                         mutex_enter(&arc_eviction_mtx);
1609                         mutex_enter(&buf->b_evict_lock);
1610                         ASSERT(buf->b_hdr != NULL);
1611                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1612                         hdr->b_buf = buf->b_next;
1613                         buf->b_hdr = &arc_eviction_hdr;
1614                         buf->b_next = arc_eviction_list;
1615                         arc_eviction_list = buf;
1616                         mutex_exit(&buf->b_evict_lock);
1617                         mutex_exit(&arc_eviction_mtx);
1618                 } else {
1619                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1620                 }
1621         }
1622         if (hdr->b_freeze_cksum != NULL) {
1623                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1624                 hdr->b_freeze_cksum = NULL;
1625         }
1626         if (hdr->b_thawed) {
1627                 kmem_free(hdr->b_thawed, 1);
1628                 hdr->b_thawed = NULL;
1629         }
1630 
             /* The header must be off every list before it is freed. */
1631         ASSERT(!list_link_active(&hdr->b_arc_node));
1632         ASSERT3P(hdr->b_hash_next, ==, NULL);
1633         ASSERT3P(hdr->b_acb, ==, NULL);
1634         kmem_cache_free(hdr_cache, hdr);
1635 }
1636 
/*
 * Drop the reference identified by `tag' on buf and release the buf.
 *
 * The buf must have data and must not have an eviction callback (both
 * asserted).  Three cases are handled:
 *  - header not in arc_anon: under the hash lock, remove the reference;
 *    if other bufs share the header (b_datacnt > 1) destroy this buf,
 *    otherwise keep it and flag the header ARC_BUF_AVAILABLE.
 *  - anonymous header with I/O in progress: remove the reference under
 *    arc_eviction_mtx and destroy the header only if the I/O flag has
 *    cleared by then.
 *  - any other anonymous header: destroy just the buf if references
 *    remain after removing ours, otherwise destroy the whole header.
 */
1637 void
1638 arc_buf_free(arc_buf_t *buf, void *tag)
1639 {
1640         arc_buf_hdr_t *hdr = buf->b_hdr;
1641         int hashed = hdr->b_state != arc_anon;
1642 
1643         ASSERT(buf->b_efunc == NULL);
1644         ASSERT(buf->b_data != NULL);
1645 
1646         if (hashed) {
1647                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1648 
1649                 mutex_enter(hash_lock);
                     /* Re-read b_hdr now that the hash lock is held. */
1650                 hdr = buf->b_hdr;
1651                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1652 
1653                 (void) remove_reference(hdr, hash_lock, tag);
1654                 if (hdr->b_datacnt > 1) {
1655                         arc_buf_destroy(buf, FALSE, TRUE);
1656                 } else {
1657                         ASSERT(buf == hdr->b_buf);
1658                         ASSERT(buf->b_efunc == NULL);
1659                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1660                 }
1661                 mutex_exit(hash_lock);
1662         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1663                 int destroy_hdr;
1664                 /*
1665                  * We are in the middle of an async write.  Don't destroy
1666                  * this buffer unless the write completes before we finish
1667                  * decrementing the reference count.
1668                  */
1669                 mutex_enter(&arc_eviction_mtx);
1670                 (void) remove_reference(hdr, NULL, tag);
1671                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1672                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1673                 mutex_exit(&arc_eviction_mtx);
1674                 if (destroy_hdr)
1675                         arc_hdr_destroy(hdr);
1676         } else {
1677                 if (remove_reference(hdr, NULL, tag) > 0)
1678                         arc_buf_destroy(buf, FALSE, TRUE);
1679                 else
1680                         arc_hdr_destroy(hdr);
1681         }
1682 }
1683 
/*
 * Drop the reference identified by `tag' on buf.
 *
 * Returns B_TRUE if buf had no eviction callback (b_efunc == NULL) when
 * the function was entered, B_FALSE otherwise.
 *
 * An anonymous header is handed to arc_buf_free().  Otherwise the
 * reference is removed under the hash lock; when there is no callback the
 * buf is either destroyed (other bufs share the header) or kept with the
 * header flagged ARC_BUF_AVAILABLE.  When a callback is registered the buf
 * is left attached to the header.
 */
1684 boolean_t
1685 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1686 {
1687         arc_buf_hdr_t *hdr = buf->b_hdr;
1688         kmutex_t *hash_lock = HDR_LOCK(hdr);
             /* Sampled before any lock is taken; this is the return value. */
1689         boolean_t no_callback = (buf->b_efunc == NULL);
1690 
1691         if (hdr->b_state == arc_anon) {
1692                 ASSERT(hdr->b_datacnt == 1);
1693                 arc_buf_free(buf, tag);
1694                 return (no_callback);
1695         }
1696 
1697         mutex_enter(hash_lock);
             /* Re-read b_hdr now that the hash lock is held. */
1698         hdr = buf->b_hdr;
1699         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1700         ASSERT(hdr->b_state != arc_anon);
1701         ASSERT(buf->b_data != NULL);
1702 
1703         (void) remove_reference(hdr, hash_lock, tag);
1704         if (hdr->b_datacnt > 1) {
1705                 if (no_callback)
1706                         arc_buf_destroy(buf, FALSE, TRUE);
1707         } else if (no_callback) {
1708                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1709                 ASSERT(buf->b_efunc == NULL);
1710                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1711         }
1712         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1713             refcount_is_zero(&hdr->b_refcnt));
1714         mutex_exit(hash_lock);
1715         return (no_callback);
1716 }
1717 
/*
 * Return the b_size recorded in the header that buf is attached to.
 * No locking is done here and buf->b_hdr is dereferenced unconditionally.
 */
1718 int
1719 arc_buf_size(arc_buf_t *buf)
1720 {
1721         return (buf->b_hdr->b_size);
1722 }
1723 
1724 /*
1725  * Called from the DMU to determine if the current buffer should be
1726  * evicted. In order to ensure proper locking, the eviction must be initiated
1727  * from the DMU. Return true if the buffer is associated with user data and
1728  * duplicate buffers still exist.
      *
      * Always returns B_FALSE when zfs_disable_dup_eviction is set.  All
      * inspection of buf is done while holding buf->b_evict_lock.
1729  */
1730 boolean_t
1731 arc_buf_eviction_needed(arc_buf_t *buf)
1732 {
1733         arc_buf_hdr_t *hdr;
1734         boolean_t evict_needed = B_FALSE;
1735 
1736         if (zfs_disable_dup_eviction)
1737                 return (B_FALSE);
1738 
1739         mutex_enter(&buf->b_evict_lock);
1740         hdr = buf->b_hdr;
1741         if (hdr == NULL) {
1742                 /*
1743                  * We are in arc_do_user_evicts(); let that function
1744                  * perform the eviction.
1745                  */
1746                 ASSERT(buf->b_data == NULL);
1747                 mutex_exit(&buf->b_evict_lock);
1748                 return (B_FALSE);
1749         } else if (buf->b_data == NULL) {
1750                 /*
1751                  * We have already been added to the arc eviction list;
1752                  * recommend eviction.
1753                  */
1754                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1755                 mutex_exit(&buf->b_evict_lock);
1756                 return (B_TRUE);
1757         }
1758 
             /* More than one buf on a data (not metadata) header: a duplicate. */
1759         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1760                 evict_needed = B_TRUE;
1761 
1762         mutex_exit(&buf->b_evict_lock);
1763         return (evict_needed);
1764 }
1765 
1766 /*
1767  * Evict buffers from list until we've removed the specified number of
1768  * bytes.  Move the removed buffers to the appropriate evict state.
1769  * If the recycle flag is set, then attempt to "recycle" a buffer:
1770  * - look for a buffer to evict that is `bytes' long.
1771  * - return the data block from this buffer rather than freeing it.
1772  * This flag is used by callers that are trying to make space for a
1773  * new buffer in a full arc cache.
1774  *
1775  * This function makes a "best effort".  It skips over any buffers
1776  * it can't get a hash_lock on, and so may not catch all candidates.
1777  * It may also return without evicting as much space as requested.
      *
      * `state' must be arc_mru or arc_mfu (asserted); evicted headers move
      * to the matching ghost state.  A non-zero `spa' restricts eviction to
      * headers whose b_spa matches.  A negative `bytes' disables the byte
      * limit so the whole list is walked.  Returns the recycled data block,
      * or NULL if none was taken.
1778  */
1779 static void *
1780 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1781     arc_buf_contents_t type)
1782 {
1783         arc_state_t *evicted_state;
1784         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1785         arc_buf_hdr_t *ab, *ab_prev = NULL;
1786         list_t *list = &state->arcs_list[type];
1787         kmutex_t *hash_lock;
1788         boolean_t have_lock;
1789         void *stolen = NULL;
             /* Zeroed placeholder header; its b_spa of 0 marks it as a marker. */
1790         arc_buf_hdr_t marker = { 0 };
1791         int count = 0;
1792 
1793         ASSERT(state == arc_mru || state == arc_mfu);
1794 
1795         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1796 
             /* Lock order: the source state's list lock, then the ghost's. */
1797         mutex_enter(&state->arcs_mtx);
1798         mutex_enter(&evicted_state->arcs_mtx);
1799 
             /* Walk from the tail of the list toward the head. */
1800         for (ab = list_tail(list); ab; ab = ab_prev) {
1801                 ab_prev = list_prev(list, ab);
1802                 /* prefetch buffers have a minimum lifespan */
1803                 if (HDR_IO_IN_PROGRESS(ab) ||
1804                     (spa && ab->b_spa != spa) ||
1805                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1806                     ddi_get_lbolt() - ab->b_arc_access <
1807                     arc_min_prefetch_lifespan)) {
1808                         skipped++;
1809                         continue;
1810                 }
1811                 /* "lookahead" for better eviction candidate */
1812                 if (recycle && ab->b_size != bytes &&
1813                     ab_prev && ab_prev->b_size == bytes)
1814                         continue;
1815 
1816                 /* ignore markers */
1817                 if (ab->b_spa == 0)
1818                         continue;
1819 
1820                 /*
1821                  * It may take a long time to evict all the bufs requested.
1822                  * To avoid blocking all arc activity, periodically drop
1823                  * the arcs_mtx and give other threads a chance to run
1824                  * before reacquiring the lock.
1825                  *
1826                  * If we are looking for a buffer to recycle, we are in
1827                  * the hot code path, so don't sleep.
1828                  */
1829                 if (!recycle && count++ > arc_evict_iterations) {
                             /*
                              * The marker holds our place in the list while
                              * the locks are dropped; the walk resumes from
                              * whatever precedes it afterwards.
                              */
1830                         list_insert_after(list, ab, &marker);
1831                         mutex_exit(&evicted_state->arcs_mtx);
1832                         mutex_exit(&state->arcs_mtx);
1833                         kpreempt(KPREEMPT_SYNC);
1834                         mutex_enter(&state->arcs_mtx);
1835                         mutex_enter(&evicted_state->arcs_mtx);
1836                         ab_prev = list_prev(list, &marker);
1837                         list_remove(list, &marker);
1838                         count = 0;
1839                         continue;
1840                 }
1841 
1842                 hash_lock = HDR_LOCK(ab);
                     /*
                      * Remember whether the hash lock was already held so that
                      * only a lock acquired here is released below.
                      */
1843                 have_lock = MUTEX_HELD(hash_lock);
1844                 if (have_lock || mutex_tryenter(hash_lock)) {
1845                         ASSERT0(refcount_count(&ab->b_refcnt));
1846                         ASSERT(ab->b_datacnt > 0);
1847                         while (ab->b_buf) {
1848                                 arc_buf_t *buf = ab->b_buf;
1849                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1850                                         missed += 1;
1851                                         break;
1852                                 }
1853                                 if (buf->b_data) {
1854                                         bytes_evicted += ab->b_size;
                                             /*
                                              * Take at most one data block for
                                              * the caller; clearing `recycle'
                                              * stops further stealing.
                                              */
1855                                         if (recycle && ab->b_type == type &&
1856                                             ab->b_size == bytes &&
1857                                             !HDR_L2_WRITING(ab)) {
1858                                                 stolen = buf->b_data;
1859                                                 recycle = FALSE;
1860                                         }
1861                                 }
                                     /*
                                      * A buf with an eviction callback is kept
                                      * and queued on arc_eviction_list; any
                                      * other buf is destroyed completely.
                                      */
1862                                 if (buf->b_efunc) {
1863                                         mutex_enter(&arc_eviction_mtx);
1864                                         arc_buf_destroy(buf,
1865                                             buf->b_data == stolen, FALSE);
1866                                         ab->b_buf = buf->b_next;
1867                                         buf->b_hdr = &arc_eviction_hdr;
1868                                         buf->b_next = arc_eviction_list;
1869                                         arc_eviction_list = buf;
1870                                         mutex_exit(&arc_eviction_mtx);
1871                                         mutex_exit(&buf->b_evict_lock);
1872                                 } else {
1873                                         mutex_exit(&buf->b_evict_lock);
1874                                         arc_buf_destroy(buf,
1875                                             buf->b_data == stolen, TRUE);
1876                                 }
1877                         }
1878 
1879                         if (ab->b_l2hdr) {
1880                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1881                                     ab->b_size);
1882                         } else {
1883                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1884                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1885                                             ab->b_size);
1886                                 } else {
1887                                         ARCSTAT_INCR(
1888                                             arcstat_evict_l2_ineligible,
1889                                             ab->b_size);
1890                                 }
1891                         }
1892 
                             /* Only a header with no bufs left moves to the ghost. */
1893                         if (ab->b_datacnt == 0) {
1894                                 arc_change_state(evicted_state, ab, hash_lock);
1895                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1896                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1897                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1898                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1899                         }
1900                         if (!have_lock)
1901                                 mutex_exit(hash_lock);
1902                         if (bytes >= 0 && bytes_evicted >= bytes)
1903                                 break;
1904                 } else {
1905                         missed += 1;
1906                 }
1907         }
1908 
1909         mutex_exit(&evicted_state->arcs_mtx);
1910         mutex_exit(&state->arcs_mtx);
1911 
             /*
              * NOTE(review): bytes_evicted is unsigned and bytes is signed, so
              * this comparison is carried out unsigned; a caller passing
              * bytes == -1 (see arc_flush()) effectively always takes this
              * debug path.
              */
1912         if (bytes_evicted < bytes)
1913                 dprintf("only evicted %lld bytes from %p",
1914                     (longlong_t)bytes_evicted, (void *)state);
1915 
1916         if (skipped)
1917                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1918 
1919         if (missed)
1920                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1921 
1922         /*
1923          * Note: we have just evicted some data into the ghost state,
1924          * potentially putting the ghost size over the desired size.  Rather
1925          * than evicting from the ghost list in this hot code path, leave
1926          * this chore to the arc_reclaim_thread().
1927          */
1928 
1929         return (stolen);
1930 }
1931 
1932 /*
1933  * Remove buffers from list until we've removed the specified number of
1934  * bytes.  Destroy the buffers that are removed.
      *
      * `state' must be a ghost state (asserted).  The ARC_BUFC_DATA list is
      * processed first; the ARC_BUFC_METADATA list is processed as well when
      * `bytes' is negative or the data list did not yield enough.  A non-zero
      * `spa' restricts the walk to headers whose b_spa matches.  A negative
      * `bytes' means no byte limit; in that mode a busy hash lock is waited
      * for rather than skipped.  Headers that still have an L2ARC header are
      * moved to arc_l2c_only instead of being destroyed.
1935  */
1936 static void
1937 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1938 {
1939         arc_buf_hdr_t *ab, *ab_prev;
             /* Zeroed placeholder header; its b_spa of 0 marks it as a marker. */
1940         arc_buf_hdr_t marker = { 0 };
1941         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1942         kmutex_t *hash_lock;
1943         uint64_t bytes_deleted = 0;
1944         uint64_t bufs_skipped = 0;
1945         int count = 0;
1946 
1947         ASSERT(GHOST_STATE(state));
1948 top:
1949         mutex_enter(&state->arcs_mtx);
             /* Walk from the tail of the list toward the head. */
1950         for (ab = list_tail(list); ab; ab = ab_prev) {
1951                 ab_prev = list_prev(list, ab);
1952                 if (ab->b_type > ARC_BUFC_NUMTYPES)
1953                         panic("invalid ab=%p", (void *)ab);
1954                 if (spa && ab->b_spa != spa)
1955                         continue;
1956 
1957                 /* ignore markers */
1958                 if (ab->b_spa == 0)
1959                         continue;
1960 
1961                 hash_lock = HDR_LOCK(ab);
1962                 /* caller may be trying to modify this buffer, skip it */
1963                 if (MUTEX_HELD(hash_lock))
1964                         continue;
1965 
1966                 /*
1967                  * It may take a long time to evict all the bufs requested.
1968                  * To avoid blocking all arc activity, periodically drop
1969                  * the arcs_mtx and give other threads a chance to run
1970                  * before reacquiring the lock.
1971                  */
1972                 if (count++ > arc_evict_iterations) {
1973                         list_insert_after(list, ab, &marker);
1974                         mutex_exit(&state->arcs_mtx);
1975                         kpreempt(KPREEMPT_SYNC);
1976                         mutex_enter(&state->arcs_mtx);
1977                         ab_prev = list_prev(list, &marker);
1978                         list_remove(list, &marker);
1979                         count = 0;
1980                         continue;
1981                 }
1982                 if (mutex_tryenter(hash_lock)) {
1983                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1984                         ASSERT(ab->b_buf == NULL);
1985                         ARCSTAT_BUMP(arcstat_deleted);
1986                         bytes_deleted += ab->b_size;
1987 
1988                         if (ab->b_l2hdr != NULL) {
1989                                 /*
1990                                  * This buffer is cached on the 2nd Level ARC;
1991                                  * don't destroy the header.
1992                                  */
1993                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1994                                 mutex_exit(hash_lock);
1995                         } else {
                                     /*
                                      * The header is moved to arc_anon and the
                                      * hash lock dropped before it is freed.
                                      */
1996                                 arc_change_state(arc_anon, ab, hash_lock);
1997                                 mutex_exit(hash_lock);
1998                                 arc_hdr_destroy(ab);
1999                         }
2000 
2001                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2002                         if (bytes >= 0 && bytes_deleted >= bytes)
2003                                 break;
2004                 } else if (bytes < 0) {
2005                         /*
2006                          * Insert a list marker and then wait for the
2007                          * hash lock to become available. Once it's
2008                          * available, restart from where we left off.
2009                          */
2010                         list_insert_after(list, ab, &marker);
2011                         mutex_exit(&state->arcs_mtx);
2012                         mutex_enter(hash_lock);
2013                         mutex_exit(hash_lock);
2014                         mutex_enter(&state->arcs_mtx);
2015                         ab_prev = list_prev(list, &marker);
2016                         list_remove(list, &marker);
2017                 } else {
2018                         bufs_skipped += 1;
2019                 }
2020 
2021         }
2022         mutex_exit(&state->arcs_mtx);
2023 
             /* Second pass over the metadata list, at most once. */
2024         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2025             (bytes < 0 || bytes_deleted < bytes)) {
2026                 list = &state->arcs_list[ARC_BUFC_METADATA];
2027                 goto top;
2028         }
2029 
2030         if (bufs_skipped) {
2031                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
                     /* Skipping only happens in the bounded (bytes >= 0) mode. */
2032                 ASSERT(bytes >= 0);
2033         }
2034 
2035         if (bytes_deleted < bytes)
2036                 dprintf("only deleted %lld bytes from %p",
2037                     (longlong_t)bytes_deleted, state);
2038 }
2039 
/*
 * Evict from the MRU, MFU and ghost lists to bring the cache back within
 * its targets.
 *
 * MRU: the amount to evict is the smaller of (arc_size - arc_c) and
 * (anon + MRU + arc_meta_used - arc_p); data is evicted first, then
 * metadata for whatever remains.
 * MFU: the amount is (arc_size - arc_c), again data first, then metadata.
 * Ghosts: the MRU ghost list is trimmed by the amount MRU + MRU-ghost
 * exceeds arc_c, and the MFU ghost list by the amount the two ghost lists
 * together exceed arc_c.
 *
 * Every eviction call passes 0 as the spa guid, i.e. no per-pool filter.
 */
2040 static void
2041 arc_adjust(void)
2042 {
2043         int64_t adjustment, delta;
2044 
2045         /*
2046          * Adjust MRU size
2047          */
2048 
2049         adjustment = MIN((int64_t)(arc_size - arc_c),
2050             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2051             arc_p));
2052 
2053         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2054                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2055                 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2056                 adjustment -= delta;
2057         }
2058 
2059         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2060                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2061                 (void) arc_evict(arc_mru, 0, delta, FALSE,
2062                     ARC_BUFC_METADATA);
2063         }
2064 
2065         /*
2066          * Adjust MFU size
2067          */
2068 
2069         adjustment = arc_size - arc_c;
2070 
2071         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2072                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2073                 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2074                 adjustment -= delta;
2075         }
2076 
2077         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2078                 delta = MIN(adjustment,
2079                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2080                 (void) arc_evict(arc_mfu, 0, delta, FALSE,
2081                     ARC_BUFC_METADATA);
2082         }
2083 
2084         /*
2085          * Adjust ghost lists
2086          */
2087 
2088         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2089 
2090         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2091                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2092                 arc_evict_ghost(arc_mru_ghost, 0, delta);
2093         }
2094 
2095         adjustment =
2096             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2097 
2098         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2099                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2100                 arc_evict_ghost(arc_mfu_ghost, 0, delta);
2101         }
2102 }
2103 
/*
 * Drain arc_eviction_list, running each buf's eviction callback.
 *
 * Each buf is unlinked from the list and has b_hdr cleared (under its
 * b_evict_lock) while arc_eviction_mtx is held.  The mutex is then dropped
 * around the callback, which must return 0 (VERIFY), and around freeing
 * the buf back to buf_cache; it is retaken before the list is examined
 * again.
 */
2104 static void
2105 arc_do_user_evicts(void)
2106 {
2107         mutex_enter(&arc_eviction_mtx);
2108         while (arc_eviction_list != NULL) {
2109                 arc_buf_t *buf = arc_eviction_list;
2110                 arc_eviction_list = buf->b_next;
2111                 mutex_enter(&buf->b_evict_lock);
2112                 buf->b_hdr = NULL;
2113                 mutex_exit(&buf->b_evict_lock);
2114                 mutex_exit(&arc_eviction_mtx);
2115 
                     /* Callback runs without arc_eviction_mtx held. */
2116                 if (buf->b_efunc != NULL)
2117                         VERIFY(buf->b_efunc(buf) == 0);
2118 
2119                 buf->b_efunc = NULL;
2120                 buf->b_private = NULL;
2121                 kmem_cache_free(buf_cache, buf);
2122                 mutex_enter(&arc_eviction_mtx);
2123         }
2124         mutex_exit(&arc_eviction_mtx);
2125 }
2126 
2127 /*
2128  * Flush all *evictable* data from the cache for the given spa.
2129  * NOTE: this will not touch "active" (i.e. referenced) data.
      *
      * A NULL spa means every pool: the guid passed down is 0 and each
      * MRU/MFU list is evicted repeatedly until it is empty.  For a specific
      * spa each list is evicted in a single pass.  Both ghost lists are then
      * evicted and pending user eviction callbacks are run under
      * arc_reclaim_thr_lock.
2130  */
2131 void
2132 arc_flush(spa_t *spa)
2133 {
2134         uint64_t guid = 0;
2135 
2136         if (spa)
2137                 guid = spa_load_guid(spa);
2138 
             /* bytes == -1 asks arc_evict() to evict without a byte limit. */
2139         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2140                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2141                 if (spa)
2142                         break;
2143         }
2144         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2145                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2146                 if (spa)
2147                         break;
2148         }
2149         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2150                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2151                 if (spa)
2152                         break;
2153         }
2154         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2155                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2156                 if (spa)
2157                         break;
2158         }
2159 
2160         arc_evict_ghost(arc_mru_ghost, guid, -1);
2161         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2162 
2163         mutex_enter(&arc_reclaim_thr_lock);
2164         arc_do_user_evicts();
2165         mutex_exit(&arc_reclaim_thr_lock);
             /* A full flush (spa == NULL) must leave the eviction list empty. */
2166         ASSERT(spa || arc_eviction_list == NULL);
2167 }
2168 
/*
 * Lower the cache size targets and evict if the cache is now too big.
 *
 * When arc_c is above arc_c_min it is reduced by
 * arc_c >> arc_shrink_shift (in the kernel, by at least ptob(needfree)),
 * but never below arc_c_min.  arc_p is reduced by
 * arc_p >> arc_shrink_shift.  arc_c is then clamped so it does not exceed
 * the larger of arc_size and arc_c_min, and arc_p is reset to half of
 * arc_c if it ended up larger than arc_c.  Finally arc_adjust() is called
 * if arc_size exceeds arc_c.
 */
2169 void
2170 arc_shrink(void)
2171 {
2172         if (arc_c > arc_c_min) {
2173                 uint64_t to_free;
2174 
2175 #ifdef _KERNEL
2176                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2177 #else
2178                 to_free = arc_c >> arc_shrink_shift;
2179 #endif
2180                 if (arc_c > arc_c_min + to_free)
2181                         atomic_add_64(&arc_c, -to_free);
2182                 else
2183                         arc_c = arc_c_min;
2184 
2185                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2186                 if (arc_c > arc_size)
2187                         arc_c = MAX(arc_size, arc_c_min);
2188                 if (arc_p > arc_c)
2189                         arc_p = (arc_c >> 1);
2190                 ASSERT(arc_c >= arc_c_min);
2191                 ASSERT((int64_t)arc_p >= 0);
2192         }
2193 
2194         if (arc_size > arc_c)
2195                 arc_adjust();
2196 }
2197 
2198 /*
2199  * Determine if the system is under memory pressure and is asking
2200  * to reclaim memory. A return value of 1 indicates that the system
2201  * is under memory pressure and that the arc should adjust accordingly.
      *
      * Returns 0 otherwise.  In a kernel build the checks are, in order:
      * needfree is non-zero; freemem is below lotsfree + needfree + desfree;
      * availrmem is below swapfs_minfree + swapfs_reserve + desfree; on i386,
      * less than a quarter of heap_arena is free; and, when zio_arena exists,
      * its free space is below 1/16th of its allocated space.  In a
      * non-kernel build the result depends only on spa_get_random(100)
      * returning 0 (presumably about one call in a hundred -- confirm
      * against spa_get_random()).
2202  */
2203 static int
2204 arc_reclaim_needed(void)
2205 {
2206         uint64_t extra;
2207 
2208 #ifdef _KERNEL
2209 
2210         if (needfree)
2211                 return (1);
2212 
2213         /*
2214          * take 'desfree' extra pages, so we reclaim sooner, rather than later
2215          */
2216         extra = desfree;
2217 
2218         /*
2219          * check that we're out of range of the pageout scanner.  It starts to
2220          * schedule paging if freemem is less than lotsfree and needfree.
2221          * lotsfree is the high-water mark for pageout, and needfree is the
2222          * number of needed free pages.  We add extra pages here to make sure
2223          * the scanner doesn't start up while we're freeing memory.
2224          */
2225         if (freemem < lotsfree + needfree + extra)
2226                 return (1);
2227 
2228         /*
2229          * check to make sure that swapfs has enough space so that anon
2230          * reservations can still succeed. anon_resvmem() checks that the
2231          * availrmem is greater than swapfs_minfree, and the number of reserved
2232          * swap pages.  We also add a bit of extra here just to prevent
2233          * circumstances from getting really dire.
2234          */
2235         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2236                 return (1);
2237 
2238 #if defined(__i386)
2239         /*
2240          * If we're on an i386 platform, it's possible that we'll exhaust the
2241          * kernel heap space before we ever run out of available physical
2242          * memory.  Most checks of the size of the heap_area compare against
2243          * tune.t_minarmem, which is the minimum available real memory that we
2244          * can have in the system.  However, this is generally fixed at 25 pages
2245          * which is so low that it's useless.  In this comparison, we seek to
2246          * calculate the total heap-size, and reclaim if more than 3/4ths of the
2247          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2248          * free)
2249          */
2250         if (vmem_size(heap_arena, VMEM_FREE) <
2251             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2252                 return (1);
2253 #endif
2254 
2255         /*
2256          * If zio data pages are being allocated out of a separate heap segment,
2257          * then enforce that the size of available vmem for this arena remains
2258          * above about 1/16th free.
2259          *
2260          * Note: The 1/16th arena free requirement was put in place
2261          * to aggressively evict memory from the arc in order to avoid
2262          * memory fragmentation issues.
2263          */
2264         if (zio_arena != NULL &&
2265             vmem_size(zio_arena, VMEM_FREE) <
2266             (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2267                 return (1);
2268 #else
2269         if (spa_get_random(100) == 0)
2270                 return (1);
2271 #endif
2272         return (0);
2273 }
2274 
/*
 * Return unused memory held by the ARC-related kmem caches.
 *
 * In a kernel build, DNLC entries are purged first when arc_meta_used has
 * reached arc_meta_limit, and on i386 all kmem caches are reaped.  With
 * the ARC_RECLAIM_AGGR strategy the cache targets are also shrunk via
 * arc_shrink().  Every distinct zio buf and zio data buf cache is then
 * reaped, followed by buf_cache and hdr_cache, and for the aggressive
 * strategy the zio_arena quantum caches as well (when zio_arena exists).
 */
2275 static void
2276 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2277 {
2278         size_t                  i;
2279         kmem_cache_t            *prev_cache = NULL;
2280         kmem_cache_t            *prev_data_cache = NULL;
2281         extern kmem_cache_t     *zio_buf_cache[];
2282         extern kmem_cache_t     *zio_data_buf_cache[];
2283 
2284 #ifdef _KERNEL
2285         if (arc_meta_used >= arc_meta_limit) {
2286                 /*
2287                  * We are exceeding our meta-data cache limit.
2288                  * Purge some DNLC entries to release holds on meta-data.
2289                  */
2290                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2291         }
2292 #if defined(__i386)
2293         /*
2294          * Reclaim unused memory from all kmem caches.
2295          */
2296         kmem_reap();
2297 #endif
2298 #endif
2299 
2300         /*
2301          * An aggressive reclamation will shrink the cache size as well as
2302          * reap free buffers from the arc kmem caches.
2303          */
2304         if (strat == ARC_RECLAIM_AGGR)
2305                 arc_shrink();
2306 
             /*
              * Adjacent slots of the two cache arrays may hold the same cache
              * pointer; remembering the previous entry avoids reaping one
              * cache several times in a row.
              */
2307         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2308                 if (zio_buf_cache[i] != prev_cache) {
2309                         prev_cache = zio_buf_cache[i];
2310                         kmem_cache_reap_now(zio_buf_cache[i]);
2311                 }
2312                 if (zio_data_buf_cache[i] != prev_data_cache) {
2313                         prev_data_cache = zio_data_buf_cache[i];
2314                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2315                 }
2316         }
2317         kmem_cache_reap_now(buf_cache);
2318         kmem_cache_reap_now(hdr_cache);
2319 
2320         /*
2321          * Ask the vmem arena to reclaim unused memory from its
2322          * quantum caches.
2323          */
2324         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2325                 vmem_qcache_reap(zio_arena);
2326 }
2327 
2328 static void
2329 arc_reclaim_thread(void)
2330 {
             /*
              * Main loop of the ARC reclaim thread.  Until arc_thread_exit is
              * set, each pass reaps the kmem caches if arc_reclaim_needed(),
              * calls arc_adjust(), runs any pending user evictions, and then
              * waits on arc_reclaim_thr_cv for at most hz ticks.  The loop
              * body is entered with arc_reclaim_thr_lock held.
              */
2331         clock_t                 growtime = 0;
2332         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2333         callb_cpr_t             cpr;
2334 
2335         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2336 
2337         mutex_enter(&arc_reclaim_thr_lock);
2338         while (arc_thread_exit == 0) {
2339                 if (arc_reclaim_needed()) {
2340 
                             /*
                              * Pick a strategy.  If growth was already
                              * disabled by an earlier pass, alternate
                              * between CONS and AGGR; otherwise disable
                              * growth and start with an aggressive reclaim.
                              */
2341                         if (arc_no_grow) {
2342                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2343                                         last_reclaim = ARC_RECLAIM_AGGR;
2344                                 } else {
2345                                         last_reclaim = ARC_RECLAIM_CONS;
2346                                 }
2347                         } else {
2348                                 arc_no_grow = TRUE;
2349                                 last_reclaim = ARC_RECLAIM_AGGR;
2350                                 membar_producer();
2351                         }
2352 
2353                         /* reset the growth delay for every reclaim */
2354                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2355 
2356                         arc_kmem_reap_now(last_reclaim);
2357                         arc_warm = B_TRUE;
2358 
2359                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
                             /*
                              * No reclaim is needed and the growth delay has
                              * expired: allow the cache to grow again.
                              */
2360                         arc_no_grow = FALSE;
2361                 }
2362 
2363                 arc_adjust();
2364 
2365                 if (arc_eviction_list != NULL)
2366                         arc_do_user_evicts();
2367 
2368                 /* block until needed, or one second, whichever is shorter */
2369                 CALLB_CPR_SAFE_BEGIN(&cpr);
2370                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2371                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2372                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2373         }
2374 
             /*
              * Exit was requested: clear the flag and broadcast on the cv,
              * presumably to tell whoever set arc_thread_exit (and is waiting
              * on arc_reclaim_thr_cv) that this thread is done -- confirm
              * against the code that sets the flag.
              */
2375         arc_thread_exit = 0;
2376         cv_broadcast(&arc_reclaim_thr_cv);
2377         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2378         thread_exit();
2379 }
2380 
2381 /*
2382  * Adapt arc info given the number of bytes we are trying to add and
2383  * the state that we are coming from.  This function is only called
2384  * when we are adding new content to the cache.
      *
      * bytes - size of the content being added; asserted to be > 0 (the
      *         arc_l2c_only case returns before the assertion).
      * state - state the content is coming from.  Only arc_mru_ghost and
      *         arc_mfu_ghost move arc_p in the first step; arc_anon may
      *         additionally grow arc_p when arc_c is grown.
2385  */
2386 static void
2387 arc_adapt(int bytes, arc_state_t *state)
2388 {
2389         int mult;
2390         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2391 
2392         if (state == arc_l2c_only)
2393                 return;
2394 
2395         ASSERT(bytes > 0);
2396         /*
2397          * Adapt the target size of the MRU list:
2398          *      - if we just hit in the MRU ghost list, then increase
2399          *        the target size of the MRU list.
2400          *      - if we just hit in the MFU ghost list, then increase
2401          *        the target size of the MFU list by decreasing the
2402          *        target size of the MRU list.
          *
          * In both cases the step is `bytes' scaled by the ratio of the
          * larger ghost list to the one that was hit (at least 1, capped
          * at 10), and arc_p is kept within [arc_p_min, arc_c - arc_p_min].
2403          */
2404         if (state == arc_mru_ghost) {
2405                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2406                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2407                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2408 
2409                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2410         } else if (state == arc_mfu_ghost) {
2411                 uint64_t delta;
2412 
2413                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2414                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2415                 mult = MIN(mult, 10);
2416 
                     /* clamp to arc_p so the subtraction cannot go below zero */
2417                 delta = MIN(bytes * mult, arc_p);
2418                 arc_p = MAX(arc_p_min, arc_p - delta);
2419         }
2420         ASSERT((int64_t)arc_p >= 0);
2421 
             /*
              * If a reclaim is needed, wake the reclaim thread and leave the
              * target cache size alone.
              */
2422         if (arc_reclaim_needed()) {
2423                 cv_signal(&arc_reclaim_thr_cv);
2424                 return;
2425         }
2426 
2427         if (arc_no_grow)
2428                 return;
2429 
2430         if (arc_c >= arc_c_max)
2431                 return;
2432 
2433         /*
2434          * If we're within (2 * maxblocksize) bytes of the target
2435          * cache size, increment the target cache size
          * (clamped to arc_c_max).  For anonymous content arc_p is grown
          * by the same amount, unless arc_c had to be clamped; arc_p is
          * never left larger than arc_c.
2436          */
2437         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2438                 atomic_add_64(&arc_c, (int64_t)bytes);
2439                 if (arc_c > arc_c_max)
2440                         arc_c = arc_c_max;
2441                 else if (state == arc_anon)
2442                         atomic_add_64(&arc_p, (int64_t)bytes);
2443                 if (arc_p > arc_c)
2444                         arc_p = arc_c;
2445         }
2446         ASSERT((int64_t)arc_p >= 0);
2447 }
2448 
2449 /*
2450  * Check if the cache has reached its limits and eviction is required
2451  * prior to insert.
      *
      * Returns nonzero if any of the following holds: `type' is
      * ARC_BUFC_METADATA and arc_meta_used has reached arc_meta_limit;
      * arc_reclaim_needed() returns nonzero; or arc_size exceeds the
      * target size arc_c.  Returns 0 otherwise.
2452  */
2453 static int
2454 arc_evict_needed(arc_buf_contents_t type)
2455 {
2456         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2457                 return (1);
2458 
2459         if (arc_reclaim_needed())
2460                 return (1);
2461 
2462         return (arc_size > arc_c);
2463 }
2464 
2465 /*
2466  * The buffer, supplied as the first argument, needs a data block.
2467  * So, if we are at cache max, determine which cache should be victimized.
2468  * We have the following cases:
2469  *
2470  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2471  * In this situation if we're out of space, but the resident size of the MFU is
2472  * under the limit, victimize the MFU cache to satisfy this insertion request.
2473  *
2474  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2475  * Here, we've used up all of the available space for the MRU, so we need to
2476  * evict from our own cache instead.  Evict from the set of resident MRU
2477  * entries.
2478  *
2479  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2480  * c minus p represents the MFU space in the cache, since p is the size of the
2481  * cache that is dedicated to the MRU.  In this situation there's still space on
2482  * the MFU side, so the MRU side needs to be victimized.
2483  *
2484  * 4. Insert for MFU (c - p) <= sizeof(arc_mfu) ->
2485  * MFU's resident set is consuming more space than it has been allotted.  In
2486  * this situation, we must victimize our own cache, the MFU, for this insertion.
      *
      * In cases 1 and 3 the other list is only victimized if it currently
      * accounts for at least `size' bytes of this buffer type in
      * arcs_lsize[type]; otherwise we evict from our own list.
      *
      * On return buf->b_data is non-NULL (asserted on the eviction path) and,
      * for non-ghost states, the state's size counters have been updated.
2487  */
2488 static void
2489 arc_get_data_buf(arc_buf_t *buf)
2490 {
2491         arc_state_t             *state = buf->b_hdr->b_state;
2492         uint64_t                size = buf->b_hdr->b_size;
2493         arc_buf_contents_t      type = buf->b_hdr->b_type;
2494 
2495         arc_adapt(size, state);
2496 
2497         /*
2498          * We have not yet reached cache maximum size,
2499          * just allocate a new buffer.
2500          */
2501         if (!arc_evict_needed(type)) {
2502                 if (type == ARC_BUFC_METADATA) {
2503                         buf->b_data = zio_buf_alloc(size);
2504                         arc_space_consume(size, ARC_SPACE_DATA);
2505                 } else {
2506                         ASSERT(type == ARC_BUFC_DATA);
2507                         buf->b_data = zio_data_buf_alloc(size);
2508                         ARCSTAT_INCR(arcstat_data_size, size);
2509                         atomic_add_64(&arc_size, size);
2510                 }
2511                 goto out;
2512         }
2513 
2514         /*
2515          * If we are prefetching from the mfu ghost list, this buffer
2516          * will end up on the mru list; so steal space from there.
2517          */
2518         if (state == arc_mfu_ghost)
2519                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2520         else if (state == arc_mru_ghost)
2521                 state = arc_mru;
2522 
2523         if (state == arc_mru || state == arc_anon) {
2524                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2525                 state = (arc_mfu->arcs_lsize[type] >= size &&
2526                     arc_p > mru_used) ? arc_mfu : arc_mru;
2527         } else {
2528                 /* MFU cases */
2529                 uint64_t mfu_space = arc_c - arc_p;
2530                 state =  (arc_mru->arcs_lsize[type] >= size &&
2531                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2532         }
             /*
              * Try to obtain the data block from arc_evict() on the chosen
              * state.  If it returns NULL, fall back to a fresh allocation
              * (same accounting as the fast path above) and count a recycle
              * miss.
              */
2533         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2534                 if (type == ARC_BUFC_METADATA) {
2535                         buf->b_data = zio_buf_alloc(size);
2536                         arc_space_consume(size, ARC_SPACE_DATA);
2537                 } else {
2538                         ASSERT(type == ARC_BUFC_DATA);
2539                         buf->b_data = zio_data_buf_alloc(size);
2540                         ARCSTAT_INCR(arcstat_data_size, size);
2541                         atomic_add_64(&arc_size, size);
2542                 }
2543                 ARCSTAT_BUMP(arcstat_recycle_miss);
2544         }
2545         ASSERT(buf->b_data != NULL);
2546 out:
2547         /*
2548          * Update the state size.  Note that ghost states have a
2549          * "ghost size" and so don't need to be updated.
2550          */
2551         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2552                 arc_buf_hdr_t *hdr = buf->b_hdr;
2553 
2554                 atomic_add_64(&hdr->b_state->arcs_size, size);
                     /*
                      * arcs_lsize[] is only bumped when the header is linked
                      * on a list, in which case it must be unreferenced.
                      */
2555                 if (list_link_active(&hdr->b_arc_node)) {
2556                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2557                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2558                 }
2559                 /*
2560                  * If we are growing the cache, and we are adding anonymous
2561                  * data, and we have outgrown arc_p, update arc_p
2562                  */
2563                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2564                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2565                         arc_p = MIN(arc_c, arc_p + size);
2566         }
2567 }
2568 
2569 /*
2570  * This routine is called whenever a buffer is accessed.
2571  * NOTE: the hash lock is dropped in this function.
      *
      * NOTE(review): the caller must hold hash_lock (asserted on entry), but
      * no mutex_exit() appears in this function, and arc_read() releases
      * hash_lock itself after calling arc_access().  The "dropped" note
      * above looks stale -- confirm against arc_change_state().
      *
      * Depending on the header's current state this refreshes b_arc_access,
      * may move the header to another state via arc_change_state(), and
      * bumps the matching hit statistic.
2572  */
2573 static void
2574 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2575 {
2576         clock_t now;
2577 
2578         ASSERT(MUTEX_HELD(hash_lock));
2579 
2580         if (buf->b_state == arc_anon) {
2581                 /*
2582                  * This buffer is not in the cache, and does not
2583                  * appear in our "ghost" list.  Add the new buffer
2584                  * to the MRU state.
2585                  */
2586 
2587                 ASSERT(buf->b_arc_access == 0);
2588                 buf->b_arc_access = ddi_get_lbolt();
2589                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2590                 arc_change_state(arc_mru, buf, hash_lock);
2591 
2592         } else if (buf->b_state == arc_mru) {
2593                 now = ddi_get_lbolt();
2594 
2595                 /*
2596                  * If this buffer is here because of a prefetch, then either:
2597                  * - clear the flag if this is a "referencing" read
2598                  *   (any subsequent access will bump this into the MFU state).
2599                  * or
2600                  * - move the buffer to the head of the list if this is
2601                  *   another prefetch (to make it less likely to be evicted).
                      *
                      * NOTE(review): no list manipulation is visible here; an
                      * unreferenced prefetch is only asserted to be on a list
                      * and has its access time refreshed.  Only the
                      * referencing case counts as an MRU hit.
2602                  */
2603                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2604                         if (refcount_count(&buf->b_refcnt) == 0) {
2605                                 ASSERT(list_link_active(&buf->b_arc_node));
2606                         } else {
2607                                 buf->b_flags &= ~ARC_PREFETCH;
2608                                 ARCSTAT_BUMP(arcstat_mru_hits);
2609                         }
2610                         buf->b_arc_access = now;
2611                         return;
2612                 }
2613 
2614                 /*
2615                  * This buffer has been "accessed" only once so far,
2616                  * but it is still in the cache. Move it to the MFU
2617                  * state.
2618                  */
2619                 if (now > buf->b_arc_access + ARC_MINTIME) {
2620                         /*
2621                          * More than 125ms have passed since we
2622                          * instantiated this buffer.  Move it to the
2623                          * most frequently used state.
2624                          */
2625                         buf->b_arc_access = now;
2626                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2627                         arc_change_state(arc_mfu, buf, hash_lock);
2628                 }
2629                 ARCSTAT_BUMP(arcstat_mru_hits);
2630         } else if (buf->b_state == arc_mru_ghost) {
2631                 arc_state_t     *new_state;
2632                 /*
2633                  * This buffer has been "accessed" recently, but
2634                  * was evicted from the cache.  Move it to the
2635                  * MFU state.
                      * (A prefetch access moves it to the MRU state instead,
                      * clearing the prefetch flag if it is referenced.)
2636                  */
2637 
2638                 if (buf->b_flags & ARC_PREFETCH) {
2639                         new_state = arc_mru;
2640                         if (refcount_count(&buf->b_refcnt) > 0)
2641                                 buf->b_flags &= ~ARC_PREFETCH;
2642                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2643                 } else {
2644                         new_state = arc_mfu;
2645                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2646                 }
2647 
2648                 buf->b_arc_access = ddi_get_lbolt();
2649                 arc_change_state(new_state, buf, hash_lock);
2650 
2651                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2652         } else if (buf->b_state == arc_mfu) {
2653                 /*
2654                  * This buffer has been accessed more than once and is
2655                  * still in the cache.  Keep it in the MFU state.
2656                  *
2657                  * NOTE: an add_reference() that occurred when we did
2658                  * the arc_read() will have kicked this off the list.
2659                  * If it was a prefetch, we will explicitly move it to
2660                  * the head of the list now.
2661                  */
2662                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2663                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2664                         ASSERT(list_link_active(&buf->b_arc_node));
2665                 }
2666                 ARCSTAT_BUMP(arcstat_mfu_hits);
2667                 buf->b_arc_access = ddi_get_lbolt();
2668         } else if (buf->b_state == arc_mfu_ghost) {
2669                 arc_state_t     *new_state = arc_mfu;
2670                 /*
2671                  * This buffer has been accessed more than once but has
2672                  * been evicted from the cache.  Move it back to the
2673                  * MFU state.
2674                  */
2675 
2676                 if (buf->b_flags & ARC_PREFETCH) {
2677                         /*
2678                          * This is a prefetch access...
2679                          * move this block back to the MRU state.
2680                          */
2681                         ASSERT0(refcount_count(&buf->b_refcnt));
2682                         new_state = arc_mru;
2683                 }
2684 
2685                 buf->b_arc_access = ddi_get_lbolt();
                     /*
                      * NOTE(review): this probe is named new_state__mfu but
                      * also fires when new_state is arc_mru (the prefetch
                      * case above) -- confirm whether that is intended.
                      */
2686                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2687                 arc_change_state(new_state, buf, hash_lock);
2688 
2689                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2690         } else if (buf->b_state == arc_l2c_only) {
2691                 /*
2692                  * This buffer is on the 2nd Level ARC.
                      * Move it to the MFU state.
2693                  */
2694 
2695                 buf->b_arc_access = ddi_get_lbolt();
2696                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2697                 arc_change_state(arc_mfu, buf, hash_lock);
2698         } else {
2699                 ASSERT(!"invalid arc state");
2700         }
2701 }
2702 
2703 /* a generic arc_done_func_t which you can use */
     /*
      * If there was no zio (arc_read() passes NULL on a cache hit) or the zio
      * completed without error, copy b_size bytes of the buffer's data to
      * `arg'.  In every case the buffer is then passed to
      * arc_buf_remove_ref() with `arg' as the tag, and that call is VERIFY'd
      * to return nonzero.
      */
2704 /* ARGSUSED */
2705 void
2706 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2707 {
2708         if (zio == NULL || zio->io_error == 0)
2709                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2710         VERIFY(arc_buf_remove_ref(buf, arg));
2711 }
2712 
2713 /* a generic arc_done_func_t */
     /*
      * `arg' is treated as an arc_buf_t **.  On a failed read (zio non-NULL
      * with io_error set) the buffer is handed to arc_buf_remove_ref(), with
      * `arg' as the tag, and NULL is stored through `arg'.  Otherwise the
      * buffer pointer itself is stored there.
      */
2714 void
2715 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2716 {
2717         arc_buf_t **bufp = arg;
2718         if (zio && zio->io_error) {
2719                 VERIFY(arc_buf_remove_ref(buf, arg));
2720                 *bufp = NULL;
2721         } else {
2722                 *bufp = buf;
2723                 ASSERT(buf->b_data);
2724         }
2725 }
2726 
2727 static void
2728 arc_read_done(zio_t *zio)
2729 {
             /*
              * Completion handler for a read issued on behalf of the ARC;
              * zio->io_private is the arc_buf_t being filled.  Finishes the
              * header's I/O state, hands a buffer to every registered
              * callback, and destroys the header if the read failed (or the
              * block was freed meanwhile) and nothing references it.
              */
2730         arc_buf_hdr_t   *hdr, *found;
2731         arc_buf_t       *buf;
2732         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2733         kmutex_t        *hash_lock;
2734         arc_callback_t  *callback_list, *acb;
2735         int             freeable = FALSE;
2736 
2737         buf = zio->io_private;
2738         hdr = buf->b_hdr;
2739 
2740         /*
2741          * The hdr was inserted into hash-table and removed from lists
2742          * prior to starting I/O.  We should find this header, since
2743          * it's in the hash table, and it should be legit since it's
2744          * not possible to evict it during the I/O.  The only possible
2745          * reason for it not to be found is if we were freed during the
2746          * read.
2747          */
2748         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2749             &hash_lock);
2750 
2751         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2752             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2753             (found == hdr && HDR_L2_READING(hdr)));
2754 
2755         hdr->b_flags &= ~ARC_L2_EVICTED;
2756         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2757                 hdr->b_flags &= ~ARC_L2CACHE;
2758 
2759         /* byteswap if necessary */
2760         callback_list = hdr->b_acb;
2761         ASSERT(callback_list != NULL);
2762         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2763                 dmu_object_byteswap_t bswap =
2764                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2765                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2766                     byteswap_uint64_array :
2767                     dmu_ot_byteswap[bswap].ob_func;
2768                 func(buf->b_data, hdr->b_size);
2769         }
2770 
2771         arc_cksum_compute(buf, B_FALSE);
2772         arc_buf_watch(buf);
2773 
2774         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2775                 /*
2776                  * Only call arc_access on anonymous buffers.  This is because
2777                  * if we've issued an I/O for an evicted buffer, we've already
2778                  * called arc_access (to prevent any simultaneous readers from
2779                  * getting confused).
2780                  */
2781                 arc_access(hdr, hash_lock);
2782         }
2783 
2784         /* create copies of the data buffer for the callers */
             /*
              * The first callback with a done function receives `buf' itself;
              * every later one receives a clone, counted in
              * arcstat_duplicate_reads.
              */
2785         abuf = buf;
2786         for (acb = callback_list; acb; acb = acb->acb_next) {
2787                 if (acb->acb_done) {
2788                         if (abuf == NULL) {
2789                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2790                                 abuf = arc_buf_clone(buf);
2791                         }
2792                         acb->acb_buf = abuf;
2793                         abuf = NULL;
2794                 }
2795         }
2796         hdr->b_acb = NULL;
2797         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2798         ASSERT(!HDR_BUF_AVAILABLE(hdr));
             /*
              * abuf still equal to buf means no callback claimed the buffer,
              * so flag it as available in the header.
              */
2799         if (abuf == buf) {
2800                 ASSERT(buf->b_efunc == NULL);
2801                 ASSERT(hdr->b_datacnt == 1);
2802                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2803         }
2804 
2805         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2806 
             /*
              * On a failed read, mark the header with ARC_IO_ERROR, move it
              * to arc_anon, take it out of the hash table, and remember
              * whether it is unreferenced so it can be destroyed below.
              */
2807         if (zio->io_error != 0) {
2808                 hdr->b_flags |= ARC_IO_ERROR;
2809                 if (hdr->b_state != arc_anon)
2810                         arc_change_state(arc_anon, hdr, hash_lock);
2811                 if (HDR_IN_HASH_TABLE(hdr))
2812                         buf_hash_remove(hdr);
2813                 freeable = refcount_is_zero(&hdr->b_refcnt);
2814         }
2815 
2816         /*
2817          * Broadcast before we drop the hash_lock to avoid the possibility
2818          * that the hdr (and hence the cv) might be freed before we get to
2819          * the cv_broadcast().
2820          */
2821         cv_broadcast(&hdr->b_cv);
2822 
2823         if (hash_lock) {
2824                 mutex_exit(hash_lock);
2825         } else {
2826                 /*
2827                  * This block was freed while we waited for the read to
2828                  * complete.  It has been removed from the hash table and
2829                  * moved to the anonymous state (so that it won't show up
2830                  * in the cache).
2831                  */
2832                 ASSERT3P(hdr->b_state, ==, arc_anon);
2833                 freeable = refcount_is_zero(&hdr->b_refcnt);
2834         }
2835 
2836         /* execute each callback and free its structure */
             /*
              * hash_lock (if any) has already been released above, so the
              * done functions run without it.  A callback's dummy zio, when
              * present, inherits this zio's error and is dispatched with
              * zio_nowait().
              */
2837         while ((acb = callback_list) != NULL) {
2838                 if (acb->acb_done)
2839                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2840 
2841                 if (acb->acb_zio_dummy != NULL) {
2842                         acb->acb_zio_dummy->io_error = zio->io_error;
2843                         zio_nowait(acb->acb_zio_dummy);
2844                 }
2845 
2846                 callback_list = acb->acb_next;
2847                 kmem_free(acb, sizeof (arc_callback_t));
2848         }
2849 
2850         if (freeable)
2851                 arc_hdr_destroy(hdr);
2852 }
2853 
2854 /*
2855  * "Read" the block at the specified DVA (in bp) via the
2856  * cache.  If the block is found in the cache, invoke the provided
2857  * callback immediately and return.  Note that the `zio' parameter
2858  * in the callback will be NULL in this case, since no IO was
2859  * required.  If the block is not in the cache pass the read request
2860  * on to the spa with a substitute callback function, so that the
2861  * requested block will be added to the cache.
2862  *
2863  * If a read request arrives for a block that has a read in-progress,
2864  * either wait for the in-progress read to complete (and return the
2865  * results); or, if this is a read with a "done" func, add a record
2866  * to the read to invoke the "done" func when the read completes,
2867  * and return; or just return.
2868  *
2869  * arc_read_done() will invoke all the requested "done" functions
2870  * for readers of this block.
2871  */
2872 int
2873 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2874     void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2875     const zbookmark_t *zb)
2876 {
2877         arc_buf_hdr_t *hdr;
2878         arc_buf_t *buf = NULL;
2879         kmutex_t *hash_lock;
2880         zio_t *rzio;
2881         uint64_t guid = spa_load_guid(spa);
2882 
2883 top:
2884         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2885             &hash_lock);
2886         if (hdr && hdr->b_datacnt > 0) {
2887 
2888                 *arc_flags |= ARC_CACHED;
2889 
2890                 if (HDR_IO_IN_PROGRESS(hdr)) {
2891 
2892                         if (*arc_flags & ARC_WAIT) {
2893                                 cv_wait(&hdr->b_cv, hash_lock);
2894                                 mutex_exit(hash_lock);
2895                                 goto top;
2896                         }
2897                         ASSERT(*arc_flags & ARC_NOWAIT);
2898 
2899                         if (done) {
2900                                 arc_callback_t  *acb = NULL;
2901 
2902                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2903                                     KM_SLEEP);
2904                                 acb->acb_done = done;
2905                                 acb->acb_private = private;
2906                                 if (pio != NULL)
2907                                         acb->acb_zio_dummy = zio_null(pio,
2908                                             spa, NULL, NULL, NULL, zio_flags);
2909 
2910                                 ASSERT(acb->acb_done != NULL);
2911                                 acb->acb_next = hdr->b_acb;
2912                                 hdr->b_acb = acb;
2913                                 add_reference(hdr, hash_lock, private);
2914                                 mutex_exit(hash_lock);
2915                                 return (0);
2916                         }
2917                         mutex_exit(hash_lock);
2918                         return (0);
2919                 }
2920 
2921                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2922 
2923                 if (done) {
2924                         add_reference(hdr, hash_lock, private);
2925                         /*
2926                          * If this block is already in use, create a new
2927                          * copy of the data so that we will be guaranteed
2928                          * that arc_release() will always succeed.
2929                          */
2930                         buf = hdr->b_buf;
2931                         ASSERT(buf);
2932                         ASSERT(buf->b_data);
2933                         if (HDR_BUF_AVAILABLE(hdr)) {
2934                                 ASSERT(buf->b_efunc == NULL);
2935                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2936                         } else {
2937                                 buf = arc_buf_clone(buf);
2938                         }
2939 
2940                 } else if (*arc_flags & ARC_PREFETCH &&
2941                     refcount_count(&hdr->b_refcnt) == 0) {
2942                         hdr->b_flags |= ARC_PREFETCH;
2943                 }
2944                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2945                 arc_access(hdr, hash_lock);
2946                 if (*arc_flags & ARC_L2CACHE)
2947                         hdr->b_flags |= ARC_L2CACHE;
2948                 if (*arc_flags & ARC_L2COMPRESS)
2949                         hdr->b_flags |= ARC_L2COMPRESS;
2950                 mutex_exit(hash_lock);
2951                 ARCSTAT_BUMP(arcstat_hits);
2952                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2953                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2954                     data, metadata, hits);
2955 
2956                 if (done)
2957                         done(NULL, buf, private);
2958         } else {
2959                 uint64_t size = BP_GET_LSIZE(bp);
2960                 arc_callback_t  *acb;
2961                 vdev_t *vd = NULL;
2962                 uint64_t addr = 0;
2963                 boolean_t devw = B_FALSE;
2964 
2965                 if (hdr == NULL) {
2966                         /* this block is not in the cache */
2967                         arc_buf_hdr_t   *exists;
2968                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2969                         buf = arc_buf_alloc(spa, size, private, type);
2970                         hdr = buf->b_hdr;
2971                         hdr->b_dva = *BP_IDENTITY(bp);
2972                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2973                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2974                         exists = buf_hash_insert(hdr, &hash_lock);
2975                         if (exists) {
2976                                 /* somebody beat us to the hash insert */
2977                                 mutex_exit(hash_lock);
2978                                 buf_discard_identity(hdr);
2979                                 (void) arc_buf_remove_ref(buf, private);
2980                                 goto top; /* restart the IO request */
2981                         }
2982                         /* if this is a prefetch, we don't have a reference */
2983                         if (*arc_flags & ARC_PREFETCH) {
2984                                 (void) remove_reference(hdr, hash_lock,
2985                                     private);
2986                                 hdr->b_flags |= ARC_PREFETCH;
2987                         }
2988                         if (*arc_flags & ARC_L2CACHE)
2989                                 hdr->b_flags |= ARC_L2CACHE;
2990                         if (*arc_flags & ARC_L2COMPRESS)
2991                                 hdr->b_flags |= ARC_L2COMPRESS;
2992                         if (BP_GET_LEVEL(bp) > 0)
2993                                 hdr->b_flags |= ARC_INDIRECT;
2994                 } else {
2995                         /* this block is in the ghost cache */
2996                         ASSERT(GHOST_STATE(hdr->b_state));
2997                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2998                         ASSERT0(refcount_count(&hdr->b_refcnt));
2999                         ASSERT(hdr->b_buf == NULL);
3000 
3001                         /* if this is a prefetch, we don't have a reference */
3002                         if (*arc_flags & ARC_PREFETCH)
3003                                 hdr->b_flags |= ARC_PREFETCH;
3004                         else
3005                                 add_reference(hdr, hash_lock, private);
3006                         if (*arc_flags & ARC_L2CACHE)
3007                                 hdr->b_flags |= ARC_L2CACHE;
3008                         if (*arc_flags & ARC_L2COMPRESS)
3009                                 hdr->b_flags |= ARC_L2COMPRESS;
3010                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3011                         buf->b_hdr = hdr;
3012                         buf->b_data = NULL;
3013                         buf->b_efunc = NULL;
3014                         buf->b_private = NULL;
3015                         buf->b_next = NULL;
3016                         hdr->b_buf = buf;
3017                         ASSERT(hdr->b_datacnt == 0);
3018                         hdr->b_datacnt = 1;
3019                         arc_get_data_buf(buf);
3020                         arc_access(hdr, hash_lock);
3021                 }
3022 
3023                 ASSERT(!GHOST_STATE(hdr->b_state));
3024 
3025                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3026                 acb->acb_done = done;
3027                 acb->acb_private = private;
3028 
3029                 ASSERT(hdr->b_acb == NULL);
3030                 hdr->b_acb = acb;
3031                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3032 
3033                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3034                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3035                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3036                         addr = hdr->b_l2hdr->b_daddr;
3037                         /*
3038                          * Lock out device removal.
3039                          */
3040                         if (vdev_is_dead(vd) ||
3041                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3042                                 vd = NULL;
3043                 }
3044 
3045                 mutex_exit(hash_lock);
3046 
3047                 /*
3048                  * At this point, we have a level 1 cache miss.  Try again in
3049                  * L2ARC if possible.
3050                  */
3051                 ASSERT3U(hdr->b_size, ==, size);
3052                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3053                     uint64_t, size, zbookmark_t *, zb);
3054                 ARCSTAT_BUMP(arcstat_misses);
3055                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3056                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3057                     data, metadata, misses);
3058 
3059                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3060                         /*
3061                          * Read from the L2ARC if the following are true:
3062                          * 1. The L2ARC vdev was previously cached.
3063                          * 2. This buffer still has L2ARC metadata.
3064                          * 3. This buffer isn't currently writing to the L2ARC.
3065                          * 4. The L2ARC entry wasn't evicted, which may
3066                          *    also have invalidated the vdev.
3067                          * 5. This isn't prefetch and l2arc_noprefetch is set.
3068                          */
3069                         if (hdr->b_l2hdr != NULL &&
3070                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3071                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3072                                 l2arc_read_callback_t *cb;
3073 
3074                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3075                                 ARCSTAT_BUMP(arcstat_l2_hits);
3076 
3077                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3078                                     KM_SLEEP);
3079                                 cb->l2rcb_buf = buf;
3080                                 cb->l2rcb_spa = spa;
3081                                 cb->l2rcb_bp = *bp;
3082                                 cb->l2rcb_zb = *zb;
3083                                 cb->l2rcb_flags = zio_flags;
3084                                 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3085 
3086                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3087                                     addr + size < vd->vdev_psize -
3088                                     VDEV_LABEL_END_SIZE);
3089 
3090                                 /*
3091                                  * l2arc read.  The SCL_L2ARC lock will be
3092                                  * released by l2arc_read_done().
3093                                  * Issue a null zio if the underlying buffer
3094                                  * was squashed to zero size by compression.
3095                                  */
3096                                 if (hdr->b_l2hdr->b_compress ==
3097                                     ZIO_COMPRESS_EMPTY) {
3098                                         rzio = zio_null(pio, spa, vd,
3099                                             l2arc_read_done, cb,
3100                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3101                                             ZIO_FLAG_CANFAIL |
3102                                             ZIO_FLAG_DONT_PROPAGATE |
3103                                             ZIO_FLAG_DONT_RETRY);
3104                                 } else {
3105                                         rzio = zio_read_phys(pio, vd, addr,
3106                                             hdr->b_l2hdr->b_asize,
3107                                             buf->b_data, ZIO_CHECKSUM_OFF,
3108                                             l2arc_read_done, cb, priority,
3109                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3110                                             ZIO_FLAG_CANFAIL |
3111                                             ZIO_FLAG_DONT_PROPAGATE |
3112                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
3113                                 }
3114                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3115                                     zio_t *, rzio);
3116                                 ARCSTAT_INCR(arcstat_l2_read_bytes,
3117                                     hdr->b_l2hdr->b_asize);
3118 
3119                                 if (*arc_flags & ARC_NOWAIT) {
3120                                         zio_nowait(rzio);
3121                                         return (0);
3122                                 }
3123 
3124                                 ASSERT(*arc_flags & ARC_WAIT);
3125                                 if (zio_wait(rzio) == 0)
3126                                         return (0);
3127 
3128                                 /* l2arc read error; goto zio_read() */
3129                         } else {
3130                                 DTRACE_PROBE1(l2arc__miss,
3131                                     arc_buf_hdr_t *, hdr);
3132                                 ARCSTAT_BUMP(arcstat_l2_misses);
3133                                 if (HDR_L2_WRITING(hdr))
3134                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3135                                 spa_config_exit(spa, SCL_L2ARC, vd);
3136                         }
3137                 } else {
3138                         if (vd != NULL)
3139                                 spa_config_exit(spa, SCL_L2ARC, vd);
3140                         if (l2arc_ndev != 0) {
3141                                 DTRACE_PROBE1(l2arc__miss,
3142                                     arc_buf_hdr_t *, hdr);
3143                                 ARCSTAT_BUMP(arcstat_l2_misses);
3144                         }
3145                 }
3146 
3147                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3148                     arc_read_done, buf, priority, zio_flags, zb);
3149 
3150                 if (*arc_flags & ARC_WAIT)
3151                         return (zio_wait(rzio));
3152 
3153                 ASSERT(*arc_flags & ARC_NOWAIT);
3154                 zio_nowait(rzio);
3155         }
3156         return (0);
3157 }
3158 
     /*
      * Attach an eviction callback to an arc buf: func is stored in b_efunc
      * and private in b_private.  arc_buf_evict() later calls b_efunc on the
      * buf.  The asserts state the preconditions: the buf is attached to a
      * header that is not in the arc_anon state, it has no callback yet, and
      * HDR_BUF_AVAILABLE() is false for its header.  A non-NULL func also
      * requires the header to hold at least one reference.
      */
3159 void
3160 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3161 {
3162         ASSERT(buf->b_hdr != NULL);
3163         ASSERT(buf->b_hdr->b_state != arc_anon);
3164         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3165         ASSERT(buf->b_efunc == NULL);
3166         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3167 
3168         buf->b_efunc = func;
3169         buf->b_private = private;
3170 }
3171 
3172 /*
3173  * Tell the ARC that the block identified by bp has been freed and will
      * not be read again.  When a header for the block is cached and its buf
      * is flagged available, the buf is claimed with a temporary FTAG
      * reference, released from the cache and then dropped.  Otherwise only
      * the hash lock handed back by the lookup is released.
3174  */
3175 void
3176 arc_freed(spa_t *spa, const blkptr_t *bp)
3177 {
3178         uint64_t load_guid = spa_load_guid(spa);
3179         kmutex_t *hash_lock;
3180         arc_buf_hdr_t *hdr;
             arc_buf_t *buf;
3181 
3182         hdr = buf_hash_find(load_guid, BP_IDENTITY(bp),
3183             BP_PHYSICAL_BIRTH(bp), &hash_lock);
3184         if (hdr == NULL)
3185                 return;

             /* Only a buf that is flagged available is reclaimed here. */
3186         if (!HDR_BUF_AVAILABLE(hdr)) {
3187                 mutex_exit(hash_lock);
                     return;
             }

             /* Claim the buf before giving up the hash lock. */
3188         buf = hdr->b_buf;
             add_reference(hdr, hash_lock, FTAG);
3189         hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3190         mutex_exit(hash_lock);
3191 
3192         arc_release(buf, FTAG);
3193         (void) arc_buf_remove_ref(buf, FTAG);
3198 }
3199 
3200 /*
3201  * This is used by the DMU to let the ARC know that a buffer is
3202  * being evicted, so the ARC should clean up.  If this arc buf
3203  * is not yet in the evicted state, it will be put there.
      *
      * Returns 0 when the buf has already been detached from its header
      * (nothing is done) and 1 once the buf's eviction callback has been
      * run.  On the paths that return 1, b_efunc is invoked unconditionally,
      * so callers are presumably expected to pass only bufs that have a
      * callback registered -- TODO confirm against callers.
3204  */
3205 int
3206 arc_buf_evict(arc_buf_t *buf)
3207 {
3208         arc_buf_hdr_t *hdr;
3209         kmutex_t *hash_lock;
3210         arc_buf_t **bufp;
3211 
3212         mutex_enter(&buf->b_evict_lock);
3213         hdr = buf->b_hdr;
3214         if (hdr == NULL) {
3215                 /*
3216                  * We are in arc_do_user_evicts().
3217                  */
3218                 ASSERT(buf->b_data == NULL);
3219                 mutex_exit(&buf->b_evict_lock);
3220                 return (0);
3221         } else if (buf->b_data == NULL) {
3222                 arc_buf_t copy = *buf; /* structure assignment */
3223                 /*
3224                  * We are on the eviction list; process this buffer now
3225                  * but let arc_do_user_evicts() do the reaping.
                      * The callback is run on the stack copy, after b_efunc
                      * has been cleared on the real buf and b_evict_lock has
                      * been dropped.
3226                  */
3227                 buf->b_efunc = NULL;
3228                 mutex_exit(&buf->b_evict_lock);
3229                 VERIFY(copy.b_efunc(&copy) == 0);
3230                 return (1);
3231         }
             /*
              * The buf still has data.  Take the header's hash lock, then
              * re-read b_hdr under it and check that it still maps to the
              * same lock.
              */
3232         hash_lock = HDR_LOCK(hdr);
3233         mutex_enter(hash_lock);
3234         hdr = buf->b_hdr;
3235         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3236 
3237         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3238         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3239 
3240         /*
3241          * Pull this buffer off of the hdr
3242          */
3243         bufp = &hdr->b_buf;
3244         while (*bufp != buf)
3245                 bufp = &(*bufp)->b_next;
3246         *bufp = buf->b_next;
3247 
3248         ASSERT(buf->b_data != NULL);
3249         arc_buf_destroy(buf, FALSE, FALSE);
3250 
             /*
              * If that was the header's last data buf, move the header to the
              * ghost state matching its current state (mru -> mru_ghost,
              * otherwise mfu_ghost), holding both state mutexes.
              */
3251         if (hdr->b_datacnt == 0) {
3252                 arc_state_t *old_state = hdr->b_state;
3253                 arc_state_t *evicted_state;
3254 
3255                 ASSERT(hdr->b_buf == NULL);
3256                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3257 
3258                 evicted_state =
3259                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3260 
3261                 mutex_enter(&old_state->arcs_mtx);
3262                 mutex_enter(&evicted_state->arcs_mtx);
3263 
3264                 arc_change_state(evicted_state, hdr, hash_lock);
                     /*
                      * NOTE(review): the flag is asserted to be set on the
                      * next line and then OR-ed in again; the OR looks
                      * redundant whenever the assertion holds.
                      */
3265                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3266                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3267                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3268 
3269                 mutex_exit(&evicted_state->arcs_mtx);
3270                 mutex_exit(&old_state->arcs_mtx);
3271         }
3272         mutex_exit(hash_lock);
3273         mutex_exit(&buf->b_evict_lock);
3274 
             /*
              * Both locks are dropped before calling the eviction callback.
              * On this path the buf itself is then scrubbed and handed back
              * to buf_cache.
              */
3275         VERIFY(buf->b_efunc(buf) == 0);
3276         buf->b_efunc = NULL;
3277         buf->b_private = NULL;
3278         buf->b_hdr = NULL;
3279         buf->b_next = NULL;
3280         kmem_cache_free(buf_cache, buf);
3281         return (1);
3282 }
3283 
3284 /*
3285  * Release this buffer from the cache, making it an anonymous buffer.  This
3286  * must be done after a read and prior to modifying the buffer contents.
3287  * If the buffer has more than one reference, we must make
3288  * a new hdr for the buffer.
      *
      * Note that the code decides between the two cases by testing
      * hdr->b_datacnt > 1, i.e. whether the header carries more than one
      * data buf.  `tag' identifies the caller's reference: it is removed from
      * the old header and added to the new one when a new header is made.
      * In both cases the buf's eviction callback and private pointer are
      * cleared, and any L2ARC header attached to the old header is freed.
3289  */
3290 void
3291 arc_release(arc_buf_t *buf, void *tag)
3292 {
3293         arc_buf_hdr_t *hdr;
3294         kmutex_t *hash_lock = NULL;
3295         l2arc_buf_hdr_t *l2hdr;
3296         uint64_t buf_size;
3297 
3298         /*
3299          * It would be nice to assert that if it's DMU metadata (level >
3300          * 0 || it's the dnode file), then it must be syncing context.
3301          * But we don't know that information at this level.
3302          */
3303 
3304         mutex_enter(&buf->b_evict_lock);
3305         hdr = buf->b_hdr;
3306 
3307         /* this buffer is not on any list */
3308         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3309 
3310         if (hdr->b_state == arc_anon) {
3311                 /* this buffer is already released */
3312                 ASSERT(buf->b_efunc == NULL);
3313         } else {
                     /*
                      * Not anonymous: take the header's hash lock and re-read
                      * b_hdr under it.  hash_lock stays NULL on the anonymous
                      * path above.
                      */
3314                 hash_lock = HDR_LOCK(hdr);
3315                 mutex_enter(hash_lock);
3316                 hdr = buf->b_hdr;
3317                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3318         }
3319 
             /*
              * Detach any L2ARC header from hdr now.  l2arc_buflist_mtx is
              * taken here and held until the l2hdr has been unlinked and freed
              * at the bottom of the function.
              */
3320         l2hdr = hdr->b_l2hdr;
3321         if (l2hdr) {
3322                 mutex_enter(&l2arc_buflist_mtx);
3323                 hdr->b_l2hdr = NULL;
3324         }
             /* remembered for the arcstat_l2_size adjustment at the end */
3325         buf_size = hdr->b_size;
3326 
3327         /*
3328          * Do we have more than one buf?
3329          */
3330         if (hdr->b_datacnt > 1) {
3331                 arc_buf_hdr_t *nhdr;
3332                 arc_buf_t **bufp;
                     /* values the new header needs, captured under the lock */
3333                 uint64_t blksz = hdr->b_size;
3334                 uint64_t spa = hdr->b_spa;
3335                 arc_buf_contents_t type = hdr->b_type;
3336                 uint32_t flags = hdr->b_flags;
3337 
3338                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3339                 /*
3340                  * Pull the data off of this hdr and attach it to
3341                  * a new anonymous hdr.
3342                  */
3343                 (void) remove_reference(hdr, hash_lock, tag);
3344                 bufp = &hdr->b_buf;
3345                 while (*bufp != buf)
3346                         bufp = &(*bufp)->b_next;
3347                 *bufp = buf->b_next;
3348                 buf->b_next = NULL;
3349 
                     /*
                      * The old state no longer accounts for this buf's bytes.
                      * arcs_lsize is only adjusted when the old header is
                      * left with no references.
                      */
3350                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3351                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3352                 if (refcount_is_zero(&hdr->b_refcnt)) {
3353                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3354                         ASSERT3U(*size, >=, hdr->b_size);
3355                         atomic_add_64(size, -hdr->b_size);
3356                 }
3357 
3358                 /*
3359                  * We're releasing a duplicate user data buffer, update
3360                  * our statistics accordingly.
3361                  */
3362                 if (hdr->b_type == ARC_BUFC_DATA) {
3363                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3364                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3365                             -hdr->b_size);
3366                 }
3367                 hdr->b_datacnt -= 1;
3368                 arc_cksum_verify(buf);
3369                 arc_buf_unwatch(buf);
3370 
                     /*
                      * NOTE(review): hash_lock is dereferenced unconditionally
                      * here, yet it is NULL when the header was arc_anon on
                      * entry.  Presumably an anonymous header never has
                      * b_datacnt > 1 -- TODO confirm.
                      */
3371                 mutex_exit(hash_lock);
3372 
                     /*
                      * Build the new anonymous header for this buf.  Of the
                      * old header's flags only ARC_L2_WRITING is carried over.
                      */
3373                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3374                 nhdr->b_size = blksz;
3375                 nhdr->b_spa = spa;
3376                 nhdr->b_type = type;
3377                 nhdr->b_buf = buf;
3378                 nhdr->b_state = arc_anon;
3379                 nhdr->b_arc_access = 0;
3380                 nhdr->b_flags = flags & ARC_L2_WRITING;
3381                 nhdr->b_l2hdr = NULL;
3382                 nhdr->b_datacnt = 1;
3383                 nhdr->b_freeze_cksum = NULL;
3384                 (void) refcount_add(&nhdr->b_refcnt, tag);
3385                 buf->b_hdr = nhdr;
3386                 mutex_exit(&buf->b_evict_lock);
3387                 atomic_add_64(&arc_anon->arcs_size, blksz);
3388         } else {
                     /*
                      * This is the header's only buf: the header itself is
                      * moved to arc_anon (if it is not there already), its
                      * identity is discarded and the buf is thawed.
                      */
3389                 mutex_exit(&buf->b_evict_lock);
3390                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3391                 ASSERT(!list_link_active(&hdr->b_arc_node));
3392                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3393                 if (hdr->b_state != arc_anon)
3394                         arc_change_state(arc_anon, hdr, hash_lock);
3395                 hdr->b_arc_access = 0;
3396                 if (hash_lock)
3397                         mutex_exit(hash_lock);
3398 
3399                 buf_discard_identity(hdr);
3400                 arc_buf_thaw(buf);
3401         }
3402         buf->b_efunc = NULL;
3403         buf->b_private = NULL;
3404 
             /*
              * Finish tearing down the L2ARC header detached above: adjust the
              * L2 statistics, unlink the old hdr from the device's buflist and
              * free the l2hdr, then drop l2arc_buflist_mtx.
              */
3405         if (l2hdr) {
3406                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3407                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3408                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3409                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3410                 mutex_exit(&l2arc_buflist_mtx);
3411         }
3412 }
3413 
     /*
      * Report whether buf has been released: returns 1 when the buf still has
      * data and its header is in the arc_anon state, otherwise 0.  The check
      * is made while holding the buf's b_evict_lock.
      */
3414 int
3415 arc_released(arc_buf_t *buf)
3416 {
3417         int is_released = 0;
3418 
3419         mutex_enter(&buf->b_evict_lock);
3420         if (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon)
                     is_released = 1;
3421         mutex_exit(&buf->b_evict_lock);
3422         return (is_released);
3423 }
3424 
     /*
      * Return 1 if buf currently has an eviction callback (b_efunc) set and
      * 0 otherwise.  The field is sampled while holding the buf's
      * b_evict_lock.
      */
3425 int
3426 arc_has_callback(arc_buf_t *buf)
3427 {
3428         int has_efunc = 0;
3429 
3430         mutex_enter(&buf->b_evict_lock);
3431         if (buf->b_efunc != NULL)
                     has_efunc = 1;
3432         mutex_exit(&buf->b_evict_lock);
3433         return (has_efunc);
3434 }
3435 
3436 #ifdef ZFS_DEBUG
     /*
      * Only built when ZFS_DEBUG is defined.  Returns, as an int, the
      * reference count of buf's header, sampled while holding the buf's
      * b_evict_lock.
      */
3437 int
3438 arc_referenced(arc_buf_t *buf)
3439 {
             arc_buf_hdr_t *hdr;
3440         int refs;
3441 
3442         mutex_enter(&buf->b_evict_lock);
             hdr = buf->b_hdr;
3443         refs = (refcount_count(&hdr->b_refcnt));
3444         mutex_exit(&buf->b_evict_lock);
3445         return (refs);
3446 }
3447 #endif
3448 
     /*
      * "Ready" callback that arc_write() hands to zio_write().  It runs the
      * caller's awcb_ready hook, then computes the buffer's checksum and
      * flags the header as having I/O in progress.
      */
3449 static void
3450 arc_write_ready(zio_t *zio)
3451 {
3452         arc_write_callback_t *awcb = zio->io_private;
3453         arc_buf_t *buf = awcb->awcb_buf;
3454         arc_buf_hdr_t *hdr = buf->b_hdr;
3455 
3456         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3457         awcb->awcb_ready(zio, buf, awcb->awcb_private);
3458 
3459         /*
3460          * ARC_IO_IN_PROGRESS already being set means this is a re-write
3461          * attempt.  Thaw the buffer by dropping any checksum recorded
3462          * earlier, so that a fresh one is computed below.  Accounting
3463          * for the re-write is the awcb_ready callback's job.
3464          */
3465         if (HDR_IO_IN_PROGRESS(hdr)) {
3466                 mutex_enter(&hdr->b_freeze_lock);
3467                 if (hdr->b_freeze_cksum != NULL) {
3468                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3469                         hdr->b_freeze_cksum = NULL;
3470                 }
3471                 mutex_exit(&hdr->b_freeze_lock);
3472         }
     
3473         arc_cksum_compute(buf, B_FALSE);
3474         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3475 }
3476 
3477 /*
3478  * Per-physical-write callback: the SPA invokes it once for every physical
3479  * write issued on behalf of a logical write (see the comment in
      * dbuf_write_physdone() for details).  It forwards to the awcb_physdone
      * hook that was supplied to arc_write(), if one was given.
3480  */
3481 static void
3482 arc_write_physdone(zio_t *zio)
3483 {
3484         arc_write_callback_t *awcb = zio->io_private;
     
3485         if (awcb->awcb_physdone == NULL)
                     return;
3486         awcb->awcb_physdone(zio, awcb->awcb_buf, awcb->awcb_private);
3487 }
3488 
     /*
      * "Done" callback that arc_write() hands to zio_write().  On success it
      * records the written block's identity (DVA, physical birth and first
      * checksum word) in the header and, unless the header is still empty,
      * inserts the header into the hash table.  It then clears
      * ARC_IO_IN_PROGRESS, calls the caller's awcb_done hook and frees the
      * arc_write_callback_t allocated by arc_write().
      */
3489 static void
3490 arc_write_done(zio_t *zio)
3491 {
3492         arc_write_callback_t *callback = zio->io_private;
3493         arc_buf_t *buf = callback->awcb_buf;
3494         arc_buf_hdr_t *hdr = buf->b_hdr;
3495 
3496         ASSERT(hdr->b_acb == NULL);
3497 
3498         if (zio->io_error == 0) {
3499                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3500                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3501                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3502         } else {
                     /* a failed write must leave the header without identity */
3503                 ASSERT(BUF_EMPTY(hdr));
3504         }
3505 
3506         /*
3507          * If the block to be written was all-zero, we may have
3508          * compressed it away.  In this case no write was performed
3509          * so there will be no dva/birth/checksum.  The buffer must
3510          * therefore remain anonymous (and uncached).
3511          */
3512         if (!BUF_EMPTY(hdr)) {
3513                 arc_buf_hdr_t *exists;
3514                 kmutex_t *hash_lock;
3515 
3516                 ASSERT(zio->io_error == 0);
3517 
3518                 arc_cksum_verify(buf);
3519 
                     /*
                      * A non-NULL result means a header with the same
                      * identity is already in the hash table.  hash_lock is
                      * filled in by the call and released further down.
                      */
3520                 exists = buf_hash_insert(hdr, &hash_lock);
3521                 if (exists) {
3522                         /*
3523                          * This can only happen if we overwrite for
3524                          * sync-to-convergence, because we remove
3525                          * buffers from the hash table when we arc_free().
3526                          */
3527                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3528                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3529                                         panic("bad overwrite, hdr=%p exists=%p",
3530                                             (void *)hdr, (void *)exists);
                                     /*
                                      * Evict the stale header and retry the
                                      * insert, which must now succeed.
                                      */
3531                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3532                                 arc_change_state(arc_anon, exists, hash_lock);
3533                                 mutex_exit(hash_lock);
3534                                 arc_hdr_destroy(exists);
3535                                 exists = buf_hash_insert(hdr, &hash_lock);
3536                                 ASSERT3P(exists, ==, NULL);
3537                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3538                                 /* nopwrite */
3539                                 ASSERT(zio->io_prop.zp_nopwrite);
3540                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3541                                         panic("bad nopwrite, hdr=%p exists=%p",
3542                                             (void *)hdr, (void *)exists);
3543                         } else {
3544                                 /* Dedup */
3545                                 ASSERT(hdr->b_datacnt == 1);
3546                                 ASSERT(hdr->b_state == arc_anon);
3547                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3548                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3549                         }
3550                 }
3551                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3552                 /* if it's not anon, we are doing a scrub */
3553                 if (!exists && hdr->b_state == arc_anon)
3554                         arc_access(hdr, hash_lock);
3555                 mutex_exit(hash_lock);
3556         } else {
3557                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3558         }
3559 
3560         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3561         callback->awcb_done(zio, buf, callback->awcb_private);
3562 
3563         kmem_free(callback, sizeof (arc_write_callback_t));
3564 }
3565 
     /*
      * Create and return the zio that writes buf out through bp in txg.  The
      * zio is handed back to the caller; it is not waited on here.
      *
      * l2arc and l2arc_compress are recorded on the buf's header as the
      * ARC_L2CACHE and ARC_L2COMPRESS flags.  ready, physdone and done are the
      * caller's hooks; together with private and buf they are stashed in an
      * arc_write_callback_t, which arc_write_done() frees.  ready and done
      * must be non-NULL, while physdone may be NULL.
      */
3566 zio_t *
3567 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3568     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3569     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3570     arc_done_func_t *done, void *private, zio_priority_t priority,
3571     int zio_flags, const zbookmark_t *zb)
3572 {
3573         arc_buf_hdr_t *hdr = buf->b_hdr;
3574         arc_write_callback_t *awcb;
3576 
3577         ASSERT(ready != NULL);
3578         ASSERT(done != NULL);
3579         ASSERT(!HDR_IO_ERROR(hdr));
3580         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3581         ASSERT(hdr->b_acb == NULL);

             /* Record the l2arc / l2arc_compress requests as header flags. */
3582         if (l2arc)
3583                 hdr->b_flags |= ARC_L2CACHE;
3584         if (l2arc_compress)
3585                 hdr->b_flags |= ARC_L2COMPRESS;

             /* Bundle up everything the zio callbacks will need. */
3586         awcb = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3587         awcb->awcb_buf = buf;
3588         awcb->awcb_private = private;
3589         awcb->awcb_ready = ready;
3590         awcb->awcb_physdone = physdone;
3591         awcb->awcb_done = done;
3592 
3593         return (zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3594             arc_write_ready, arc_write_physdone, arc_write_done, awcb,
3595             priority, zio_flags, zb));
3598 }
3599 
     /*
      * Decide whether a write reserving `reserve' bytes in `txg' has to be
      * held back because free memory is short.  Returns 0 when the write may
      * proceed, ERESTART when the pageout process has already accumulated
      * too much page load, and EAGAIN when any other caller should back off
      * because page load is outstanding and the ARC needs to reclaim.
      * When _KERNEL is not defined this always returns 0.
      */
3600 static int
3601 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3602 {
3603 #ifdef _KERNEL
3604         static uint64_t last_txg = 0;
3605         static uint64_t page_load = 0;
3606         uint64_t available_memory = ptob(freemem);
3607 
3608 #if defined(__i386)
3609         available_memory =
3610             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3611 #endif
3612 
             /* More than arc_lotsfree_percent of memory is free: no throttle. */
3613         if (freemem > physmem * arc_lotsfree_percent / 100)
3614                 return (0);
3615 
             /* The accumulated page load starts over with each newer txg. */
3616         if (txg > last_txg) {
3617                 last_txg = txg;
3618                 page_load = 0;
3619         }

             if (curproc != proc_pageout) {
                     if (page_load > 0 && arc_reclaim_needed()) {
                             /* memory is low, delay before restarting */
                             ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                             return (SET_ERROR(EAGAIN));
                     }
                     page_load = 0;
                     return (0);
             }

3620         /*
3621          * We are in pageout, so we know that memory is already tight and
3622          * the arc is already going to be evicting; we just want to
3623          * continue to let page writes occur as quickly as possible.
3624          */
3626         if (page_load > MAX(ptob(minfree), available_memory) / 4)
3627                 return (SET_ERROR(ERESTART));
3628         /* Note: reserve is inflated, so we deflate */
3629         page_load += reserve / 8;
3637 #endif
3638         return (0);
3639 }
3640 
     /*
      * Give back `reserve' bytes that arc_tempreserve_space() added to
      * arc_tempreserve, by atomically subtracting them from that counter.
      * The assert checks that the counter, viewed as a signed value, has not
      * gone negative.
      */
3641 void
3642 arc_tempreserve_clear(uint64_t reserve)
3643 {
3644         atomic_add_64(&arc_tempreserve, -reserve);
3645         ASSERT((int64_t)arc_tempreserve >= 0);
3646 }
3647 
     /*
      * Try to reserve `reserve' bytes of ARC space for a write in `txg'.
      * On success the amount is added to arc_tempreserve and 0 is returned.
      * Fails with ENOMEM when the request is larger than arc_c, with the
      * error from arc_memory_throttle() when memory is short, or with
      * ERESTART when too much anonymous (dirty) data is outstanding.
      */
3648 int
3649 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3650 {
3651         uint64_t anon_size;
3652         int err;
3653 
             /* Raise the target size for a large request, if growth is allowed. */
3654         if (reserve > arc_c / 4 && !arc_no_grow)
3655                 arc_c = MIN(arc_c_max, reserve * 4);
3656         if (reserve > arc_c)
3657                 return (SET_ERROR(ENOMEM));
3658 
3659         /*
3660          * Loaned bufs are not counted as in-flight dirty data, so that
3661          * long network delays cannot block transactions that are ready
3662          * to be assigned to a txg.
3663          */
3664         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3665 
3666         /*
3667          * A write almost always needs additional memory allocations (to
3668          * compress/encrypt/etc the data), so make sure that enough memory
3669          * is available before going any further.
3670          */
3671         err = arc_memory_throttle(reserve, txg);
3672         if (err != 0)
3673                 return (err);
3674 
3675         /*
3676          * Throttle writes when the amount of dirty data in the cache
3677          * gets too large.  We try to keep the cache less than half full
3678          * of dirty blocks so that our sync times don't grow too large.
3679          * Note: if two requests come in concurrently, we might let them
3680          * both succeed, when one of them should fail.  Not a huge deal.
3681          */
3683         if (reserve + arc_tempreserve + anon_size <= arc_c / 2 ||
3684             anon_size <= arc_c / 4) {
                     atomic_add_64(&arc_tempreserve, reserve);
                     return (0);
             }

3685         dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3686             "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3687             arc_tempreserve>>10,
3688             arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3689             arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3690             reserve>>10, arc_c>>10);
3691         return (SET_ERROR(ERESTART));
3695 }
3696 
3697 void
3698 arc_init(void)
3699 {
3700         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3701         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3702 
3703         /* Convert seconds to clock ticks */
3704         arc_min_prefetch_lifespan = 1 * hz;
3705 
3706         /* Start out with 1/8 of all memory */
3707         arc_c = physmem * PAGESIZE / 8;
3708 
3709 #ifdef _KERNEL
3710         /*
3711          * On architectures where the physical memory can be larger
3712          * than the addressable space (intel in 32-bit mode), we may
3713          * need to limit the cache to 1/8 of VM size.
3714          */
3715         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3716 #endif
3717 
3718         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3719         arc_c_min = MAX(arc_c / 4, 64<<20);
3720         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3721         if (arc_c * 8 >= 1<<30)
3722                 arc_c_max = (arc_c * 8) - (1<<30);
3723         else
3724                 arc_c_max = arc_c_min;
3725         arc_c_max = MAX(arc_c * 6, arc_c_max);
3726 
3727         /*
3728          * Allow the tunables to override our calculations if they are
3729          * reasonable (ie. over 64MB)
3730          */
3731         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3732                 arc_c_max = zfs_arc_max;
3733         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3734                 arc_c_min = zfs_arc_min;
3735 
3736         arc_c = arc_c_max;
3737         arc_p = (arc_c >> 1);
3738 
3739         /* limit meta-data to 1/4 of the arc capacity */
3740         arc_meta_limit = arc_c_max / 4;
3741 
3742         /* Allow the tunable to override if it is reasonable */
3743         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3744                 arc_meta_limit = zfs_arc_meta_limit;
3745 
3746         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3747                 arc_c_min = arc_meta_limit / 2;
3748 
3749         if (zfs_arc_grow_retry > 0)
3750                 arc_grow_retry = zfs_arc_grow_retry;
3751 
3752         if (zfs_arc_shrink_shift > 0)
3753                 arc_shrink_shift = zfs_arc_shrink_shift;
3754 
3755         if (zfs_arc_p_min_shift > 0)
3756                 arc_p_min_shift = zfs_arc_p_min_shift;
3757 
3758         /* if kmem_flags are set, lets try to use less memory */
3759         if (kmem_debugging())
3760                 arc_c = arc_c / 2;
3761         if (arc_c < arc_c_min)
3762                 arc_c = arc_c_min;
3763 
3764         arc_anon = &ARC_anon;
3765         arc_mru = &ARC_mru;
3766         arc_mru_ghost = &ARC_mru_ghost;
3767         arc_mfu = &ARC_mfu;
3768         arc_mfu_ghost = &ARC_mfu_ghost;
3769         arc_l2c_only = &ARC_l2c_only;
3770         arc_size = 0;
3771 
3772         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3773         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3774         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3775         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3776         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3777         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3778 
3779         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3780             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3781         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3782             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3783         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3784             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3785         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3786             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3787         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3788             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3789         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3790             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3791         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3792             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3793         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3794             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3795         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3796             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3797         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3798             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3799 
3800         buf_init();
3801 
3802         arc_thread_exit = 0;
3803         arc_eviction_list = NULL;
3804         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3805         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3806 
3807         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3808             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3809 
3810         if (arc_ksp != NULL) {
3811                 arc_ksp->ks_data = &arc_stats;
3812                 kstat_install(arc_ksp);
3813         }
3814 
3815         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3816             TS_RUN, minclsyspri);
3817 
3818         arc_dead = FALSE;
3819         arc_warm = B_FALSE;
3820 
3821         /*
3822          * Calculate maximum amount of dirty data per pool.
3823          *
3824          * If it has been set by /etc/system, take that.
3825          * Otherwise, use a percentage of physical memory defined by
3826          * zfs_dirty_data_max_percent (default 10%) with a cap at
3827          * zfs_dirty_data_max_max (default 4GB).
3828          */
3829         if (zfs_dirty_data_max == 0) {
3830                 zfs_dirty_data_max = physmem * PAGESIZE *
3831                     zfs_dirty_data_max_percent / 100;
3832                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3833                     zfs_dirty_data_max_max);
3834         }
3835 }
3836 
     /*
      * Shut down the ARC, undoing the setup performed by arc_init().
      *
      * Sets arc_thread_exit and waits on arc_reclaim_thr_cv until that flag
      * has been cleared again, calls arc_flush(NULL), marks the ARC dead,
      * deletes the kstat, destroys the global locks, the per-state lists and
      * mutexes, and finally calls buf_fini().
      */
3837 void
3838 arc_fini(void)
3839 {
             /*
              * Request thread exit and sleep until the flag is reset; it is
              * presumably cleared by arc_reclaim_thread() on its way out (not
              * visible from here).
              */
3840         mutex_enter(&arc_reclaim_thr_lock);
3841         arc_thread_exit = 1;
3842         while (arc_thread_exit != 0)
3843                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3844         mutex_exit(&arc_reclaim_thr_lock);
3845 
3846         arc_flush(NULL);
3847 
3848         arc_dead = TRUE;
3849 
3850         if (arc_ksp != NULL) {
3851                 kstat_delete(arc_ksp);
3852                 arc_ksp = NULL;
3853         }
3854 
3855         mutex_destroy(&arc_eviction_mtx);
3856         mutex_destroy(&arc_reclaim_thr_lock);
3857         cv_destroy(&arc_reclaim_thr_cv);
3858 
3859         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3860         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3861         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3862         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3863         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3864         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3865         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3866         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
             /*
              * arc_init() also creates the arc_l2c_only lists, so destroy them
              * here as well to keep setup and teardown symmetric.
              */
             list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
             list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
3867 
3868         mutex_destroy(&arc_anon->arcs_mtx);
3869         mutex_destroy(&arc_mru->arcs_mtx);
3870         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3871         mutex_destroy(&arc_mfu->arcs_mtx);
3872         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3873         mutex_destroy(&arc_l2c_only->arcs_mtx);
3874 
3875         buf_fini();
3876 
3877         ASSERT(arc_loaned_bytes == 0);
3878 }
3879 
3880 /*
3881  * Level 2 ARC
3882  *
3883  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3884  * It uses dedicated storage devices to hold cached data, which are populated
3885  * using large infrequent writes.  The main role of this cache is to boost
3886  * the performance of random read workloads.  The intended L2ARC devices
3887  * include short-stroked disks, solid state disks, and other media with
3888  * substantially faster read latency than disk.
3889  *
3890  *                 +-----------------------+
3891  *                 |         ARC           |
3892  *                 +-----------------------+
3893  *                    |         ^     ^
3894  *                    |         |     |
3895  *      l2arc_feed_thread()    arc_read()
3896  *                    |         |     |
3897  *                    |  l2arc read   |
3898  *                    V         |     |
3899  *               +---------------+    |
3900  *               |     L2ARC     |    |
3901  *               +---------------+    |
3902  *                   |    ^           |
3903  *          l2arc_write() |           |
3904  *                   |    |           |
3905  *                   V    |           |
3906  *                 +-------+      +-------+
3907  *                 | vdev  |      | vdev  |
3908  *                 | cache |      | cache |
3909  *                 +-------+      +-------+
3910  *                 +=========+     .-----.
3911  *                 :  L2ARC  :    |-_____-|
3912  *                 : devices :    | Disks |
3913  *                 +=========+    `-_____-'
3914  *
3915  * Read requests are satisfied from the following sources, in order:
3916  *
3917  *      1) ARC
3918  *      2) vdev cache of L2ARC devices
3919  *      3) L2ARC devices
3920  *      4) vdev cache of disks
3921  *      5) disks
3922  *
3923  * Some L2ARC device types exhibit extremely slow write performance.
3924  * To accommodate this, there are some significant differences between
3925  * the L2ARC and traditional cache design:
3926  *
3927  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3928  * the ARC behave as usual, freeing buffers and placing headers on ghost
3929  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3930  * this would add inflated write latencies for all ARC memory pressure.
3931  *
3932  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3933  * It does this by periodically scanning buffers from the eviction-end of
3934  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3935  * not already there. It scans until a headroom of buffers is satisfied,
3936  * which itself is a buffer for ARC eviction. If a compressible buffer is
3937  * found during scanning and selected for writing to an L2ARC device, we
3938  * temporarily boost scanning headroom during the next scan cycle to make
3939  * sure we adapt to compression effects (which might significantly reduce
3940  * the data volume we write to L2ARC). The thread that does this is
3941  * l2arc_feed_thread(), illustrated below; example sizes are included to
3942  * provide a better sense of ratio than this diagram:
3943  *
3944  *             head -->                        tail
3945  *              +---------------------+----------+
3946  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3947  *              +---------------------+----------+   |   o L2ARC eligible
3948  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3949  *              +---------------------+----------+   |
3950  *                   15.9 Gbytes      ^ 32 Mbytes    |
3951  *                                 headroom          |
3952  *                                            l2arc_feed_thread()
3953  *                                                   |
3954  *                       l2arc write hand <--[oooo]--'
3955  *                               |           8 Mbyte
3956  *                               |          write max
3957  *                               V
3958  *                +==============================+
3959  *      L2ARC dev |####|#|###|###|    |####| ... |
3960  *                +==============================+
3961  *                           32 Gbytes
3962  *
3963  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3964  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3965  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3966  * safe to say that this is an uncommon case, since buffers at the end of
3967  * the ARC lists have moved there due to inactivity.
3968  *
3969  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3970  * then the L2ARC simply misses copying some buffers.  This serves as a
3971  * pressure valve to prevent heavy read workloads from both stalling the ARC
3972  * with waits and clogging the L2ARC with writes.  This also helps prevent
3973  * the potential for the L2ARC to churn if it attempts to cache content too
3974  * quickly, such as during backups of the entire pool.
3975  *
3976  * 5. After system boot and before the ARC has filled main memory, there are
3977  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3978  * lists can remain mostly static.  Instead of searching from the tail of
3979  * these lists as pictured, the l2arc_feed_thread() will search from the list
3980  * heads for eligible buffers, greatly increasing its chance of finding them.
3981  *
3982  * The L2ARC device write speed is also boosted during this time so that
3983  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3984  * there are no L2ARC reads, and no fear of degrading read performance
3985  * through increased writes.
3986  *
3987  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3988  * the vdev queue can aggregate them into larger and fewer writes.  Each
3989  * device is written to in a rotor fashion, sweeping writes through
3990  * available space then repeating.
3991  *
3992  * 7. The L2ARC does not store dirty content.  It never needs to flush
3993  * write buffers back to disk based storage.
3994  *
3995  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3996  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3997  *
3998  * The performance of the L2ARC can be tweaked by a number of tunables, which
3999  * may be necessary for different workloads:
4000  *
4001  *      l2arc_write_max         max write bytes per interval
4002  *      l2arc_write_boost       extra write bytes during device warmup
4003  *      l2arc_noprefetch        skip caching prefetched buffers
4004  *      l2arc_headroom          number of max device writes to precache
4005  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4006  *                              scanning, we multiply headroom by this
4007  *                              percentage factor for the next scan cycle,
4008  *                              since more compressed buffers are likely to
4009  *                              be present
4010  *      l2arc_feed_secs         seconds between L2ARC writing
4011  *
4012  * Tunables may be removed or added as future performance improvements are
4013  * integrated, and also may become zpool properties.
4014  *
4015  * There are three key functions that control how the L2ARC warms up:
4016  *
4017  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4018  *      l2arc_write_size()      calculate how much to write
4019  *      l2arc_write_interval()  calculate sleep delay between writes
4020  *
4021  * These three functions determine what to write, how much, and how quickly
4022  * to send writes.
4023  */
4024 
     /*
      * Decide whether the header 'ab' may be written to the L2ARC on behalf of
      * the spa identified by 'spa_guid'.  Returns B_TRUE when it may, B_FALSE
      * as soon as any one of the disqualifying conditions below is met.
      */
4025 static boolean_t
4026 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4027 {
             /* The buffer belongs to a different spa. */
             if (ab->b_spa != spa_guid)
                     return (B_FALSE);

             /* The buffer already has an L2ARC header, i.e. is cached there. */
             if (ab->b_l2hdr != NULL)
                     return (B_FALSE);

             /* An I/O is in progress (it may be an incomplete read). */
             if (HDR_IO_IN_PROGRESS(ab))
                     return (B_FALSE);

             /* The buffer is flagged as not eligible (zfs property). */
             if (!HDR_L2CACHE(ab))
                     return (B_FALSE);

4039         return (B_TRUE);
4040 }
4041 
     /*
      * Compute how many bytes the next L2ARC write pass should aim for:
      * l2arc_write_max, plus l2arc_write_boost while arc_warm is still B_FALSE.
      * A zero l2arc_write_max is reset to L2ARC_WRITE_SIZE with a notice.
      */
4042 static uint64_t
4043 l2arc_write_size(void)
4044 {
             uint64_t bytes = l2arc_write_max;

             /*
              * The user may have altered the tunable; zero is not a meaningful
              * value, so fall back to the default and report it.
              */
             if (bytes == 0) {
4053                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4054                     "be greater than zero, resetting it to the default (%d)",
4055                     L2ARC_WRITE_SIZE);
                     bytes = l2arc_write_max = L2ARC_WRITE_SIZE;
4057         }

             /* Until the ARC is warm, write the boosted amount. */
             return (arc_warm == B_FALSE ? bytes + l2arc_write_boost : bytes);
4064 }
4065 
     /*
      * Work out when the next L2ARC write should be issued, as a value
      * comparable with ddi_get_lbolt().
      *
      *      began   tick value at which the previous write pass started
      *      wanted  number of bytes that pass intended to write
      *      wrote   number of bytes it actually wrote
      *
      * The result is never earlier than the current tick.
      */
4066 static clock_t
4067 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4068 {
             clock_t delay, now, target;

             /*
              * A pass that wrote more than half of what it wanted suggests the
              * ARC lists are busy, so (when l2arc_feed_again is set) come back
              * after only l2arc_feed_min_ms.  Otherwise the lists are stale
              * and the regular l2arc_feed_secs delay applies.
              */
             if (!l2arc_feed_again || wrote <= (wanted / 2))
                     delay = hz * l2arc_feed_secs;
             else
                     delay = (hz * l2arc_feed_min_ms) / 1000;

4082         now = ddi_get_lbolt();
             target = MIN(now + delay, began + delay);

             return (MAX(now, target));
4086 }
4087 
     /*
      * Shift the accounting for one header from arcstat_hdr_size to
      * arcstat_l2_hdr_size: the L2 statistic grows by HDR_SIZE + L2HDR_SIZE
      * while the regular header statistic shrinks by HDR_SIZE.
      * This is the inverse of l2arc_hdr_stat_remove().
      */
4088 static void
4089 l2arc_hdr_stat_add(void)
4090 {
4091         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4092         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4093 }
4094 
     /*
      * Shift the accounting for one header from arcstat_l2_hdr_size back to
      * arcstat_hdr_size: the L2 statistic shrinks by HDR_SIZE + L2HDR_SIZE
      * while the regular header statistic grows by HDR_SIZE.
      * This is the inverse of l2arc_hdr_stat_add().
      */
4095 static void
4096 l2arc_hdr_stat_remove(void)
4097 {
4098         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4099         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4100 }
4101 
4102 /*
4103  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4104  * If a device is returned, this also returns holding the spa config lock.
      *
      * The search resumes after l2arc_dev_last, wraps around the end of
      * l2arc_dev_list, and skips devices whose vdev is dead.  Returns NULL (with
      * no config lock held) when there are no devices or every device is dead.
      * The config lock is taken as RW_READER for SCL_L2ARC with the returned
      * device as the tag; presumably the caller releases it when done.
4105  */
4106 static l2arc_dev_t *
4107 l2arc_dev_get_next(void)
4108 {
4109         l2arc_dev_t *first, *next = NULL;
4110 
4111         /*
4112          * Lock out the removal of spas (spa_namespace_lock), then removal
4113          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4114          * both locks will be dropped and a spa config lock held instead.
4115          */
4116         mutex_enter(&spa_namespace_lock);
4117         mutex_enter(&l2arc_dev_mtx);
4118 
4119         /* if there are no vdevs, there is nothing to do */
4120         if (l2arc_ndev == 0)
4121                 goto out;
4122 
             /* 'first' remembers the first candidate so a full lap is detected */
4123         first = NULL;
4124         next = l2arc_dev_last;
4125         do {
4126                 /* loop around the list looking for a non-faulted vdev */
4127                 if (next == NULL) {
4128                         next = list_head(l2arc_dev_list);
4129                 } else {
4130                         next = list_next(l2arc_dev_list, next);
4131                         if (next == NULL)
4132                                 next = list_head(l2arc_dev_list);
4133                 }
4134 
4135                 /* if we have come back to the start, bail out */
4136                 if (first == NULL)
4137                         first = next;
4138                 else if (next == first)
4139                         break;
4140 
4141         } while (vdev_is_dead(next->l2ad_vdev));
4142 
4143         /* if we were unable to find any usable vdevs, return NULL */
4144         if (vdev_is_dead(next->l2ad_vdev))
4145                 next = NULL;
4146 
             /* remember where we stopped so the next call continues from here */
4147         l2arc_dev_last = next;
4148 
4149 out:
4150         mutex_exit(&l2arc_dev_mtx);
4151 
4152         /*
4153          * Grab the config lock to prevent the 'next' device from being
4154          * removed while we are writing to it.
              * spa_namespace_lock is still held at this point, so the config
              * lock is acquired before the namespace lock is released.
4155          */
4156         if (next != NULL)
4157                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4158         mutex_exit(&spa_namespace_lock);
4159 
4160         return (next);
4161 }
4162 
4163 /*
4164  * Free buffers that were tagged for destruction.
      *
      * Drains the l2arc_free_on_write list from tail to head while holding
      * l2arc_free_on_write_mtx.  For each entry the recorded free function is
      * called on the recorded data and size, then the entry itself is unlinked
      * and freed.
4165  */
4166 static void
4167 l2arc_do_free_on_write(void)
4168 {
4169         list_t *buflist;
4170         l2arc_data_free_t *df, *df_prev;
4171 
4172         mutex_enter(&l2arc_free_on_write_mtx);
4173         buflist = l2arc_free_on_write;
4174 
4175         for (df = list_tail(buflist); df; df = df_prev) {
                     /* fetch the predecessor first; 'df' is freed below */
4176                 df_prev = list_prev(buflist, df);
4177                 ASSERT(df->l2df_data != NULL);
4178                 ASSERT(df->l2df_func != NULL);
4179                 df->l2df_func(df->l2df_data, df->l2df_size);
4180                 list_remove(buflist, df);
4181                 kmem_free(df, sizeof (l2arc_data_free_t));
4182         }
4183 
4184         mutex_exit(&l2arc_free_on_write_mtx);
4185 }
4186 
4187 /*
4188  * A write to a cache device has completed.  Update all headers to allow
4189  * reads from these buffers to begin.
      *
      * 'zio' is the completed write; its io_private holds the
      * l2arc_write_callback_t naming the device and the write head marker.
      * Every header that precedes the marker on the device's buflist is
      * visited.  On I/O error the header's L2ARC entry is dropped; in either
      * case ARC_L2_WRITING is cleared.  Finally the marker and the callback
      * are freed and the deferred frees are processed.
4190  */
4191 static void
4192 l2arc_write_done(zio_t *zio)
4193 {
4194         l2arc_write_callback_t *cb;
4195         l2arc_dev_t *dev;
4196         list_t *buflist;
4197         arc_buf_hdr_t *head, *ab, *ab_prev;
4198         l2arc_buf_hdr_t *abl2;
4199         kmutex_t *hash_lock;
4200 
4201         cb = zio->io_private;
4202         ASSERT(cb != NULL);
4203         dev = cb->l2wcb_dev;
4204         ASSERT(dev != NULL);
4205         head = cb->l2wcb_head;
4206         ASSERT(head != NULL);
4207         buflist = dev->l2ad_buflist;
4208         ASSERT(buflist != NULL);
4209         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4210             l2arc_write_callback_t *, cb);
4211 
4212         if (zio->io_error != 0)
4213                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4214 
4215         mutex_enter(&l2arc_buflist_mtx);
4216 
4217         /*
4218          * All writes completed, or an error was hit.
              * Walk from the write head marker towards the head of the list.
4219          */
4220         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
                     /* fetch the predecessor first; 'ab' may be unlinked below */
4221                 ab_prev = list_prev(buflist, ab);
4222 
4223                 hash_lock = HDR_LOCK(ab);
4224                 if (!mutex_tryenter(hash_lock)) {
4225                         /*
4226                          * This buffer misses out.  It may be in a stage
4227                          * of eviction.  Its ARC_L2_WRITING flag will be
4228                          * left set, denying reads to this buffer.
                              *
                              * NOTE(review): the compressed-buffer release
                              * below is skipped for this header as well;
                              * confirm it is released elsewhere.
4229                          */
4230                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4231                         continue;
4232                 }
4233 
4234                 abl2 = ab->b_l2hdr;
4235 
4236                 /*
4237                  * Release the temporary compressed buffer as soon as possible.
4238                  */
4239                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4240                         l2arc_release_cdata_buf(ab);
4241 
4242                 if (zio->io_error != 0) {
4243                         /*
4244                          * Error - drop L2ARC entry.
4245                          */
4246                         list_remove(buflist, ab);
4247                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4248                         ab->b_l2hdr = NULL;
4249                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4250                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4251                 }
4252 
4253                 /*
4254                  * Allow ARC to begin reads to this L2ARC entry.
4255                  */
4256                 ab->b_flags &= ~ARC_L2_WRITING;
4257 
4258                 mutex_exit(hash_lock);
4259         }
4260 
             /* the marker has served its purpose: unlink and free it */
4261         atomic_inc_64(&l2arc_writes_done);
4262         list_remove(buflist, head);
4263         kmem_cache_free(hdr_cache, head);
4264         mutex_exit(&l2arc_buflist_mtx);
4265 
4266         l2arc_do_free_on_write();
4267 
4268         kmem_free(cb, sizeof (l2arc_write_callback_t));
4269 }
4270 
4271 /*
4272  * A read to a cache device completed.  Validate buffer contents before
4273  * handing over to the regular ARC routines.
      *
      * 'zio' is the completed L2ARC read; its io_private holds the
      * l2arc_read_callback_t describing the original request.  If the data
      * passes the checksum comparison, the I/O succeeded and the header was
      * not flagged as L2-evicted, the zio is passed to arc_read_done().
      * Otherwise the failure is counted and, unless a waiter is present, the
      * read is reissued with zio_read() using the original block pointer.
      * The callback structure is freed in every case.
4274  */
4275 static void
4276 l2arc_read_done(zio_t *zio)
4277 {
4278         l2arc_read_callback_t *cb;
4279         arc_buf_hdr_t *hdr;
4280         arc_buf_t *buf;
4281         kmutex_t *hash_lock;
4282         int equal;
4283 
4284         ASSERT(zio->io_vd != NULL);
4285         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4286 
             /* the device I/O is over, so the SCL_L2ARC config lock can go */
4287         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4288 
4289         cb = zio->io_private;
4290         ASSERT(cb != NULL);
4291         buf = cb->l2rcb_buf;
4292         ASSERT(buf != NULL);
4293 
4294         hash_lock = HDR_LOCK(buf->b_hdr);
4295         mutex_enter(hash_lock);
4296         hdr = buf->b_hdr;
4297         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4298 
4299         /*
4300          * If the buffer was compressed, decompress it first.
4301          */
4302         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4303                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4304         ASSERT(zio->io_data != NULL);
4305 
4306         /*
4307          * Check this survived the L2ARC journey.
4308          */
4309         equal = arc_cksum_equal(buf);
4310         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4311                 mutex_exit(hash_lock);
                     /* hand arc_read_done() the buffer and the original bp */
4312                 zio->io_private = buf;
4313                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4314                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4315                 arc_read_done(zio);
4316         } else {
4317                 mutex_exit(hash_lock);
4318                 /*
4319                  * Buffer didn't survive caching.  Increment stats and
4320                  * reissue to the original storage device.
4321                  */
4322                 if (zio->io_error != 0) {
4323                         ARCSTAT_BUMP(arcstat_l2_io_error);
4324                 } else {
                             /* no I/O error was reported, so record EIO */
4325                         zio->io_error = SET_ERROR(EIO);
4326                 }
4327                 if (!equal)
4328                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4329 
4330                 /*
4331                  * If there's no waiter, issue an async i/o to the primary
4332                  * storage now.  If there *is* a waiter, the caller must
4333                  * issue the i/o in a context where it's OK to block.
4334                  */
4335                 if (zio->io_waiter == NULL) {
4336                         zio_t *pio = zio_unique_parent(zio);
4337 
4338                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4339 
4340                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4341                             buf->b_data, zio->io_size, arc_read_done, buf,
4342                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4343                 }
4344         }
4345 
4346         kmem_free(cb, sizeof (l2arc_read_callback_t));
4347 }
4348 
4349 /*
      * Map an index in the range 0..3 onto one of the ARC lists that the L2ARC
      * searches for buffers to cache.  Callers loop over the indices to visit
      * the lists in priority order, and that order can have a significant
      * effect on cache performance.
      *
      * Metadata lists come first (MFU, then MRU), followed by the data lists
      * (again MFU, then MRU).  The mutex protecting the selected list is stored
      * through 'lock' and is held when the list is returned.
4357  */
4358 static list_t *
4359 l2arc_list_locked(int list_num, kmutex_t **lock)
4360 {
             list_t *picked = NULL;

4364         ASSERT(list_num >= 0 && list_num <= 3);

             if (list_num == 0) {
                     /* MFU metadata */
                     picked = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
                     *lock = &arc_mfu->arcs_mtx;
             } else if (list_num == 1) {
                     /* MRU metadata */
                     picked = &arc_mru->arcs_list[ARC_BUFC_METADATA];
                     *lock = &arc_mru->arcs_mtx;
             } else if (list_num == 2) {
                     /* MFU data */
                     picked = &arc_mfu->arcs_list[ARC_BUFC_DATA];
                     *lock = &arc_mfu->arcs_mtx;
             } else if (list_num == 3) {
                     /* MRU data */
                     picked = &arc_mru->arcs_list[ARC_BUFC_DATA];
                     *lock = &arc_mru->arcs_mtx;
             }

4385         ASSERT(!(MUTEX_HELD(*lock)));
4386         mutex_enter(*lock);
             return (picked);
4388 }
4389 
4390 /*
4391  * Evict buffers from the device write hand to the distance specified in
4392  * bytes.  This distance may span populated buffers, it may span nothing.
4393  * This is clearing a region on the L2ARC device ready for writing.
4394  * If the 'all' boolean is set, every buffer is evicted.
      *
      * The device's buflist is walked from its tail under l2arc_buflist_mtx.
      * Each header is handled under its hash lock; if that lock cannot be
      * taken without blocking, the whole walk is restarted.
4395  */
4396 static void
4397 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4398 {
4399         list_t *buflist;
4400         l2arc_buf_hdr_t *abl2;
4401         arc_buf_hdr_t *ab, *ab_prev;
4402         kmutex_t *hash_lock;
4403         uint64_t taddr;
4404 
4405         buflist = dev->l2ad_buflist;
4406 
4407         if (buflist == NULL)
4408                 return;
4409 
4410         if (!all && dev->l2ad_first) {
4411                 /*
4412                  * This is the first sweep through the device.  There is
4413                  * nothing to evict.
4414                  */
4415                 return;
4416         }
4417 
             /* taddr is the device address up to which buffers are evicted */
4418         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4419                 /*
4420                  * When nearing the end of the device, evict to the end
4421                  * before the device write hand jumps to the start.
4422                  */
4423                 taddr = dev->l2ad_end;
4424         } else {
4425                 taddr = dev->l2ad_hand + distance;
4426         }
4427         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4428             uint64_t, taddr, boolean_t, all);
4429 
4430 top:
4431         mutex_enter(&l2arc_buflist_mtx);
4432         for (ab = list_tail(buflist); ab; ab = ab_prev) {
                     /* fetch the predecessor first; 'ab' may be unlinked below */
4433                 ab_prev = list_prev(buflist, ab);
4434 
4435                 hash_lock = HDR_LOCK(ab);
4436                 if (!mutex_tryenter(hash_lock)) {
4437                         /*
4438                          * Missed the hash lock.  Retry.
                              * Drop the buflist lock, block on the hash lock
                              * until its holder is done, release it again and
                              * restart the walk from the tail.
4439                          */
4440                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4441                         mutex_exit(&l2arc_buflist_mtx);
4442                         mutex_enter(hash_lock);
4443                         mutex_exit(hash_lock);
4444                         goto top;
4445                 }
4446 
4447                 if (HDR_L2_WRITE_HEAD(ab)) {
4448                         /*
4449                          * We hit a write head node.  Leave it for
4450                          * l2arc_write_done().
                              *
                              * NOTE(review): the node is unlinked from the
                              * buflist here, and l2arc_write_done() also
                              * calls list_remove() on its head marker;
                              * confirm the two cannot both run for one node.
4451                          */
4452                         list_remove(buflist, ab);
4453                         mutex_exit(hash_lock);
4454                         continue;
4455                 }
4456 
4457                 if (!all && ab->b_l2hdr != NULL &&
4458                     (ab->b_l2hdr->b_daddr > taddr ||
4459                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4460                         /*
4461                          * We've evicted to the target address,
4462                          * or the end of the device.
4463                          */
4464                         mutex_exit(hash_lock);
4465                         break;
4466                 }
4467 
4468                 if (HDR_FREE_IN_PROGRESS(ab)) {
4469                         /*
4470                          * Already on the path to destruction.
4471                          */
4472                         mutex_exit(hash_lock);
4473                         continue;
4474                 }
4475 
4476                 if (ab->b_state == arc_l2c_only) {
4477                         ASSERT(!HDR_L2_READING(ab));
4478                         /*
4479                          * This doesn't exist in the ARC.  Destroy.
4480                          * arc_hdr_destroy() will call list_remove()
4481                          * and decrement arcstat_l2_size.
4482                          */
4483                         arc_change_state(arc_anon, ab, hash_lock);
4484                         arc_hdr_destroy(ab);
4485                 } else {
4486                         /*
4487                          * Invalidate issued or about to be issued
4488                          * reads, since we may be about to write
4489                          * over this location.
4490                          */
4491                         if (HDR_L2_READING(ab)) {
4492                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4493                                 ab->b_flags |= ARC_L2_EVICTED;
4494                         }
4495 
4496                         /*
4497                          * Tell ARC this no longer exists in L2ARC.
4498                          */
4499                         if (ab->b_l2hdr != NULL) {
4500                                 abl2 = ab->b_l2hdr;
4501                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4502                                 ab->b_l2hdr = NULL;
4503                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4504                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4505                         }
4506                         list_remove(buflist, ab);
4507 
4508                         /*
4509                          * This may have been leftover after a
4510                          * failed write.
4511                          */
4512                         ab->b_flags &= ~ARC_L2_WRITING;
4513                 }
4514                 mutex_exit(hash_lock);
4515         }
4516         mutex_exit(&l2arc_buflist_mtx);
4517 
             /*
              * Pass the span between the previous eviction point and taddr to
              * vdev_space_update() as a negative delta, then record taddr as
              * the new eviction point.
              */
4518         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4519         dev->l2ad_evict = taddr;
4520 }
4521 
4522 /*
4523  * Find and write ARC buffers to the L2ARC device.
4524  *
4525  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4526  * for reading until they have completed writing.
4527  * The headroom_boost is an in-out parameter used to maintain headroom boost
4528  * state between calls to this function.
4529  *
4530  * Returns the number of bytes actually written (which may be smaller than
4531  * the delta by which the device hand has changed due to alignment).
4532  */
4533 static uint64_t
4534 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4535     boolean_t *headroom_boost)
4536 {
4537         arc_buf_hdr_t *ab, *ab_prev, *head;
4538         list_t *list;
4539         uint64_t write_asize, write_psize, write_sz, headroom,
4540             buf_compress_minsz;
4541         void *buf_data;
4542         kmutex_t *list_lock;
4543         boolean_t full;
4544         l2arc_write_callback_t *cb;
4545         zio_t *pio, *wzio;
4546         uint64_t guid = spa_load_guid(spa);
4547         const boolean_t do_headroom_boost = *headroom_boost;
4548 
4549         ASSERT(dev->l2ad_vdev != NULL);
4550 
4551         /* Lower the flag now, we might want to raise it again later. */
4552         *headroom_boost = B_FALSE;
4553 
4554         pio = NULL;
4555         write_sz = write_asize = write_psize = 0;
4556         full = B_FALSE;
4557         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4558         head->b_flags |= ARC_L2_WRITE_HEAD;
4559 
4560         /*
4561          * We will want to try to compress buffers that are at least 2x the
4562          * device sector size.
4563          */
4564         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4565 
4566         /*
4567          * Copy buffers for L2ARC writing.
4568          */
4569         mutex_enter(&l2arc_buflist_mtx);
4570         for (int try = 0; try <= 3; try++) {
4571                 uint64_t passed_sz = 0;
4572 
4573                 list = l2arc_list_locked(try, &list_lock);
4574 
4575                 /*
4576                  * L2ARC fast warmup.
4577                  *
4578                  * Until the ARC is warm and starts to evict, read from the
4579                  * head of the ARC lists rather than the tail.
4580                  */
4581                 if (arc_warm == B_FALSE)
4582                         ab = list_head(list);
4583                 else
4584                         ab = list_tail(list);
4585 
4586                 headroom = target_sz * l2arc_headroom;
4587                 if (do_headroom_boost)
4588                         headroom = (headroom * l2arc_headroom_boost) / 100;
4589 
4590                 for (; ab; ab = ab_prev) {
4591                         l2arc_buf_hdr_t *l2hdr;
4592                         kmutex_t *hash_lock;
4593                         uint64_t buf_sz;
4594 
4595                         if (arc_warm == B_FALSE)
4596                                 ab_prev = list_next(list, ab);
4597                         else
4598                                 ab_prev = list_prev(list, ab);
4599 
4600                         hash_lock = HDR_LOCK(ab);
4601                         if (!mutex_tryenter(hash_lock)) {
4602                                 /*
4603                                  * Skip this buffer rather than waiting.
4604                                  */
4605                                 continue;
4606                         }
4607 
4608                         passed_sz += ab->b_size;
4609                         if (passed_sz > headroom) {
4610                                 /*
4611                                  * Searched too far.
4612                                  */
4613                                 mutex_exit(hash_lock);
4614                                 break;
4615                         }
4616 
4617                         if (!l2arc_write_eligible(guid, ab)) {
4618                                 mutex_exit(hash_lock);
4619                                 continue;
4620                         }
4621 
4622                         if ((write_sz + ab->b_size) > target_sz) {
4623                                 full = B_TRUE;
4624                                 mutex_exit(hash_lock);
4625                                 break;
4626                         }
4627 
4628                         if (pio == NULL) {
4629                                 /*
4630                                  * Insert a dummy header on the buflist so
4631                                  * l2arc_write_done() can find where the
4632                                  * write buffers begin without searching.
4633                                  */
4634                                 list_insert_head(dev->l2ad_buflist, head);
4635 
4636                                 cb = kmem_alloc(
4637                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4638                                 cb->l2wcb_dev = dev;
4639                                 cb->l2wcb_head = head;
4640                                 pio = zio_root(spa, l2arc_write_done, cb,
4641                                     ZIO_FLAG_CANFAIL);
4642                         }
4643 
4644                         /*
4645                          * Create and add a new L2ARC header.
4646                          */
4647                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4648                         l2hdr->b_dev = dev;
4649                         ab->b_flags |= ARC_L2_WRITING;
4650 
4651                         /*
4652                          * Temporarily stash the data buffer in b_tmp_cdata.
4653                          * The subsequent write step will pick it up from
4654                          * there. This is because can't access ab->b_buf
4655                          * without holding the hash_lock, which we in turn
4656                          * can't access without holding the ARC list locks
4657                          * (which we want to avoid during compression/writing).
4658                          */
4659                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4660                         l2hdr->b_asize = ab->b_size;
4661                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4662 
4663                         buf_sz = ab->b_size;
4664                         ab->b_l2hdr = l2hdr;
4665 
4666                         list_insert_head(dev->l2ad_buflist, ab);
4667 
4668                         /*
4669                          * Compute and store the buffer cksum before
4670                          * writing.  On debug the cksum is verified first.
4671                          */
4672                         arc_cksum_verify(ab->b_buf);
4673                         arc_cksum_compute(ab->b_buf, B_TRUE);
4674 
4675                         mutex_exit(hash_lock);
4676 
4677                         write_sz += buf_sz;
4678                 }
4679 
4680                 mutex_exit(list_lock);
4681 
4682                 if (full == B_TRUE)
4683                         break;
4684         }
4685 
4686         /* No buffers selected for writing? */
4687         if (pio == NULL) {
4688                 ASSERT0(write_sz);
4689                 mutex_exit(&l2arc_buflist_mtx);
4690                 kmem_cache_free(hdr_cache, head);
4691                 return (0);
4692         }
4693 
4694         /*
4695          * Now start writing the buffers. We're starting at the write head
4696          * and work backwards, retracing the course of the buffer selector
4697          * loop above.
4698          */
4699         for (ab = list_prev(dev->l2ad_buflist, head); ab;
4700             ab = list_prev(dev->l2ad_buflist, ab)) {
4701                 l2arc_buf_hdr_t *l2hdr;
4702                 uint64_t buf_sz;
4703 
4704                 /*
4705                  * We shouldn't need to lock the buffer here, since we flagged
4706                  * it as ARC_L2_WRITING in the previous step, but we must take
4707                  * care to only access its L2 cache parameters. In particular,
4708                  * ab->b_buf may be invalid by now due to ARC eviction.
4709                  */
4710                 l2hdr = ab->b_l2hdr;
4711                 l2hdr->b_daddr = dev->l2ad_hand;
4712 
4713                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4714                     l2hdr->b_asize >= buf_compress_minsz) {
4715                         if (l2arc_compress_buf(l2hdr)) {
4716                                 /*
4717                                  * If compression succeeded, enable headroom
4718                                  * boost on the next scan cycle.
4719                                  */
4720                                 *headroom_boost = B_TRUE;
4721                         }
4722                 }
4723 
4724                 /*
4725                  * Pick up the buffer data we had previously stashed away
4726                  * (and now potentially also compressed).
4727                  */
4728                 buf_data = l2hdr->b_tmp_cdata;
4729                 buf_sz = l2hdr->b_asize;
4730 
4731                 /* Compression may have squashed the buffer to zero length. */
4732                 if (buf_sz != 0) {
4733                         uint64_t buf_p_sz;
4734 
4735                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4736                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4737                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4738                             ZIO_FLAG_CANFAIL, B_FALSE);
4739 
4740                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4741                             zio_t *, wzio);
4742                         (void) zio_nowait(wzio);
4743 
4744                         write_asize += buf_sz;
4745                         /*
4746                          * Keep the clock hand suitably device-aligned.
4747                          */
4748                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4749                         write_psize += buf_p_sz;
4750                         dev->l2ad_hand += buf_p_sz;
4751                 }
4752         }
4753 
4754         mutex_exit(&l2arc_buflist_mtx);
4755 
4756         ASSERT3U(write_asize, <=, target_sz);
4757         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4758         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4759         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4760         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4761         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4762 
4763         /*
4764          * Bump device hand to the device start if it is approaching the end.
4765          * l2arc_evict() will already have evicted ahead for this case.
4766          */
4767         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4768                 vdev_space_update(dev->l2ad_vdev,
4769                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4770                 dev->l2ad_hand = dev->l2ad_start;
4771                 dev->l2ad_evict = dev->l2ad_start;
4772                 dev->l2ad_first = B_FALSE;
4773         }
4774 
4775         dev->l2ad_writing = B_TRUE;
4776         (void) zio_wait(pio);
4777         dev->l2ad_writing = B_FALSE;
4778 
4779         return (write_asize);
4780 }
4781 
4782 /*
4783  * Compresses an L2ARC buffer.
4784  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4785  * size in l2hdr->b_asize. This routine tries to compress the data and
4786  * depending on the compression result there are three possible outcomes:
4787  * *) The buffer was incompressible. The original l2hdr contents were left
4788  *    untouched and are ready for writing to an L2 device.
4789  * *) The buffer was all-zeros, so there is no need to write it to an L2
4790  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4791  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4792  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4793  *    data buffer which holds the compressed data to be written, and b_asize
4794  *    tells us how much data there is. b_compress is set to the appropriate
4795  *    compression algorithm. Once writing is done, invoke
4796  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4797  *
4798  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4799  * buffer was incompressible).
4800  */
4801 static boolean_t
4802 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4803 {
4804         void *cdata;
4805         size_t csize, len;
4806 
4807         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4808         ASSERT(l2hdr->b_tmp_cdata != NULL);
4809 
4810         len = l2hdr->b_asize;
4811         cdata = zio_data_buf_alloc(len);
4812         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4813             cdata, l2hdr->b_asize);
4814 
4815         if (csize == 0) {
4816                 /* zero block, indicate that there's nothing to write */
4817                 zio_data_buf_free(cdata, len);
4818                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4819                 l2hdr->b_asize = 0;
4820                 l2hdr->b_tmp_cdata = NULL;
4821                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4822                 return (B_TRUE);
4823         } else if (csize > 0 && csize < len) {
4824                 /*
4825                  * Compression succeeded, we'll keep the cdata around for
4826                  * writing and release it afterwards.
4827                  */
4828                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4829                 l2hdr->b_asize = csize;
4830                 l2hdr->b_tmp_cdata = cdata;
4831                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4832                 return (B_TRUE);
4833         } else {
4834                 /*
4835                  * Compression failed, release the compressed buffer.
4836                  * l2hdr will be left unmodified.
4837                  */
4838                 zio_data_buf_free(cdata, len);
4839                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4840                 return (B_FALSE);
4841         }
4842 }
4843 
4844 /*
4845  * Decompresses a zio read back from an l2arc device. On success, the
4846  * underlying zio's io_data buffer is overwritten by the uncompressed
4847  * version. On decompression error (corrupt compressed stream), the
4848  * zio->io_error value is set to signal an I/O error.
4849  *
4850  * Please note that the compressed data stream is not checksummed, so
4851  * if the underlying device is experiencing data corruption, we may feed
4852  * corrupt data to the decompressor, so the decompressor needs to be
4853  * able to handle this situation (LZ4 does).
4854  */
4855 static void
4856 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4857 {
4858         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4859 
4860         if (zio->io_error != 0) {
4861                 /*
4862                  * An io error has occured, just restore the original io
4863                  * size in preparation for a main pool read.
4864                  */
4865                 zio->io_orig_size = zio->io_size = hdr->b_size;
4866                 return;
4867         }
4868 
4869         if (c == ZIO_COMPRESS_EMPTY) {
4870                 /*
4871                  * An empty buffer results in a null zio, which means we
4872                  * need to fill its io_data after we're done restoring the
4873                  * buffer's contents.
4874                  */
4875                 ASSERT(hdr->b_buf != NULL);
4876                 bzero(hdr->b_buf->b_data, hdr->b_size);
4877                 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4878         } else {
4879                 ASSERT(zio->io_data != NULL);
4880                 /*
4881                  * We copy the compressed data from the start of the arc buffer
4882                  * (the zio_read will have pulled in only what we need, the
4883                  * rest is garbage which we will overwrite at decompression)
4884                  * and then decompress back to the ARC data buffer. This way we
4885                  * can minimize copying by simply decompressing back over the
4886                  * original compressed data (rather than decompressing to an
4887                  * aux buffer and then copying back the uncompressed buffer,
4888                  * which is likely to be much larger).
4889                  */
4890                 uint64_t csize;
4891                 void *cdata;
4892 
4893                 csize = zio->io_size;
4894                 cdata = zio_data_buf_alloc(csize);
4895                 bcopy(zio->io_data, cdata, csize);
4896                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4897                     hdr->b_size) != 0)
4898                         zio->io_error = EIO;
4899                 zio_data_buf_free(cdata, csize);
4900         }
4901 
4902         /* Restore the expected uncompressed IO size. */
4903         zio->io_orig_size = zio->io_size = hdr->b_size;
4904 }
4905 
4906 /*
4907  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4908  * This buffer serves as a temporary holder of compressed data while
4909  * the buffer entry is being written to an l2arc device. Once that is
4910  * done, we can dispose of it.
4911  */
4912 static void
4913 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4914 {
4915         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4916 
4917         if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4918                 /*
4919                  * If the data was compressed, then we've allocated a
4920                  * temporary buffer for it, so now we need to release it.
4921                  */
4922                 ASSERT(l2hdr->b_tmp_cdata != NULL);
4923                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4924         }
4925         l2hdr->b_tmp_cdata = NULL;
4926 }
4927 
4928 /*
4929  * This thread feeds the L2ARC at regular intervals.  This is the beating
4930  * heart of the L2ARC.
4931  */
4932 static void
4933 l2arc_feed_thread(void)
4934 {
4935         callb_cpr_t cpr;
4936         l2arc_dev_t *dev;
4937         spa_t *spa;
4938         uint64_t size, wrote;
4939         clock_t begin, next = ddi_get_lbolt();
4940         boolean_t headroom_boost = B_FALSE;
4941 
4942         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4943 
4944         mutex_enter(&l2arc_feed_thr_lock);
4945 
4946         while (l2arc_thread_exit == 0) {
4947                 CALLB_CPR_SAFE_BEGIN(&cpr);
4948                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4949                     next);
4950                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4951                 next = ddi_get_lbolt() + hz;
4952 
4953                 /*
4954                  * Quick check for L2ARC devices.
4955                  */
4956                 mutex_enter(&l2arc_dev_mtx);
4957                 if (l2arc_ndev == 0) {
4958                         mutex_exit(&l2arc_dev_mtx);
4959                         continue;
4960                 }
4961                 mutex_exit(&l2arc_dev_mtx);
4962                 begin = ddi_get_lbolt();
4963 
4964                 /*
4965                  * This selects the next l2arc device to write to, and in
4966                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4967                  * will return NULL if there are now no l2arc devices or if
4968                  * they are all faulted.
4969                  *
4970                  * If a device is returned, its spa's config lock is also
4971                  * held to prevent device removal.  l2arc_dev_get_next()
4972                  * will grab and release l2arc_dev_mtx.
4973                  */
4974                 if ((dev = l2arc_dev_get_next()) == NULL)
4975                         continue;
4976 
4977                 spa = dev->l2ad_spa;
4978                 ASSERT(spa != NULL);
4979 
4980                 /*
4981                  * If the pool is read-only then force the feed thread to
4982                  * sleep a little longer.
4983                  */
4984                 if (!spa_writeable(spa)) {
4985                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4986                         spa_config_exit(spa, SCL_L2ARC, dev);
4987                         continue;
4988                 }
4989 
4990                 /*
4991                  * Avoid contributing to memory pressure.
4992                  */
4993                 if (arc_reclaim_needed()) {
4994                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4995                         spa_config_exit(spa, SCL_L2ARC, dev);
4996                         continue;
4997                 }
4998 
4999                 ARCSTAT_BUMP(arcstat_l2_feeds);
5000 
5001                 size = l2arc_write_size();
5002 
5003                 /*
5004                  * Evict L2ARC buffers that will be overwritten.
5005                  */
5006                 l2arc_evict(dev, size, B_FALSE);
5007 
5008                 /*
5009                  * Write ARC buffers.
5010                  */
5011                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5012 
5013                 /*
5014                  * Calculate interval between writes.
5015                  */
5016                 next = l2arc_write_interval(begin, size, wrote);
5017                 spa_config_exit(spa, SCL_L2ARC, dev);
5018         }
5019 
5020         l2arc_thread_exit = 0;
5021         cv_broadcast(&l2arc_feed_thr_cv);
5022         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
5023         thread_exit();
5024 }
5025 
5026 boolean_t
5027 l2arc_vdev_present(vdev_t *vd)
5028 {
5029         l2arc_dev_t *dev;
5030 
5031         mutex_enter(&l2arc_dev_mtx);
5032         for (dev = list_head(l2arc_dev_list); dev != NULL;
5033             dev = list_next(l2arc_dev_list, dev)) {
5034                 if (dev->l2ad_vdev == vd)
5035                         break;
5036         }
5037         mutex_exit(&l2arc_dev_mtx);
5038 
5039         return (dev != NULL);
5040 }
5041 
5042 /*
5043  * Add a vdev for use by the L2ARC.  By this point the spa has already
5044  * validated the vdev and opened it.
5045  */
5046 void
5047 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5048 {
5049         l2arc_dev_t *adddev;
5050 
5051         ASSERT(!l2arc_vdev_present(vd));
5052 
5053         /*
5054          * Create a new l2arc device entry.
5055          */
5056         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5057         adddev->l2ad_spa = spa;
5058         adddev->l2ad_vdev = vd;
5059         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5060         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5061         adddev->l2ad_hand = adddev->l2ad_start;
5062         adddev->l2ad_evict = adddev->l2ad_start;
5063         adddev->l2ad_first = B_TRUE;
5064         adddev->l2ad_writing = B_FALSE;
5065 
5066         /*
5067          * This is a list of all ARC buffers that are still valid on the
5068          * device.
5069          */
5070         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5071         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5072             offsetof(arc_buf_hdr_t, b_l2node));
5073 
5074         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5075 
5076         /*
5077          * Add device to global list
5078          */
5079         mutex_enter(&l2arc_dev_mtx);
5080         list_insert_head(l2arc_dev_list, adddev);
5081         atomic_inc_64(&l2arc_ndev);
5082         mutex_exit(&l2arc_dev_mtx);
5083 }
5084 
5085 /*
5086  * Remove a vdev from the L2ARC.
5087  */
5088 void
5089 l2arc_remove_vdev(vdev_t *vd)
5090 {
5091         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5092 
5093         /*
5094          * Find the device by vdev
5095          */
5096         mutex_enter(&l2arc_dev_mtx);
5097         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5098                 nextdev = list_next(l2arc_dev_list, dev);
5099                 if (vd == dev->l2ad_vdev) {
5100                         remdev = dev;
5101                         break;
5102                 }
5103         }
5104         ASSERT(remdev != NULL);
5105 
5106         /*
5107          * Remove device from global list
5108          */
5109         list_remove(l2arc_dev_list, remdev);
5110         l2arc_dev_last = NULL;          /* may have been invalidated */
5111         atomic_dec_64(&l2arc_ndev);
5112         mutex_exit(&l2arc_dev_mtx);
5113 
5114         /*
5115          * Clear all buflists and ARC references.  L2ARC device flush.
5116          */
5117         l2arc_evict(remdev, 0, B_TRUE);
5118         list_destroy(remdev->l2ad_buflist);
5119         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5120         kmem_free(remdev, sizeof (l2arc_dev_t));
5121 }
5122 
5123 void
5124 l2arc_init(void)
5125 {
5126         l2arc_thread_exit = 0;
5127         l2arc_ndev = 0;
5128         l2arc_writes_sent = 0;
5129         l2arc_writes_done = 0;
5130 
5131         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5132         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5133         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5134         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5135         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5136 
5137         l2arc_dev_list = &L2ARC_dev_list;
5138         l2arc_free_on_write = &L2ARC_free_on_write;
5139         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5140             offsetof(l2arc_dev_t, l2ad_node));
5141         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5142             offsetof(l2arc_data_free_t, l2df_list_node));
5143 }
5144 
5145 void
5146 l2arc_fini(void)
5147 {
5148         /*
5149          * This is called from dmu_fini(), which is called from spa_fini();
5150          * Because of this, we can assume that all l2arc devices have
5151          * already been removed when the pools themselves were removed.
5152          */
5153 
5154         l2arc_do_free_on_write();
5155 
5156         mutex_destroy(&l2arc_feed_thr_lock);
5157         cv_destroy(&l2arc_feed_thr_cv);
5158         mutex_destroy(&l2arc_dev_mtx);
5159         mutex_destroy(&l2arc_buflist_mtx);
5160         mutex_destroy(&l2arc_free_on_write_mtx);
5161 
5162         list_destroy(l2arc_dev_list);
5163         list_destroy(l2arc_free_on_write);
5164 }
5165 
5166 void
5167 l2arc_start(void)
5168 {
5169         if (!(spa_mode_global & FWRITE))
5170                 return;
5171 
5172         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5173             TS_RUN, minclsyspri);
5174 }
5175 
5176 void
5177 l2arc_stop(void)
5178 {
5179         if (!(spa_mode_global & FWRITE))
5180                 return;
5181 
5182         mutex_enter(&l2arc_feed_thr_lock);
5183         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5184         l2arc_thread_exit = 1;
5185         while (l2arc_thread_exit != 0)
5186                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5187         mutex_exit(&l2arc_feed_thr_lock);
5188 }