1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 /*
  28  * DVA-based Adjustable Replacement Cache
  29  *
  30  * While much of the theory of operation used here is
  31  * based on the self-tuning, low overhead replacement cache
  32  * presented by Megiddo and Modha at FAST 2003, there are some
  33  * significant differences:
  34  *
  35  * 1. The Megiddo and Modha model assumes any page is evictable.
  36  * Pages in its cache cannot be "locked" into memory.  This makes
  37  * the eviction algorithm simple: evict the last page in the list.
  38  * This also makes the performance characteristics easy to reason
  39  * about.  Our cache is not so simple.  At any given moment, some
  40  * subset of the blocks in the cache is un-evictable because we
  41  * have handed out a reference to them.  Blocks are only evictable
  42  * when there are no external references active.  This makes
  43  * eviction far more problematic:  we choose to evict the evictable
  44  * blocks that are the "lowest" in the list.
  45  *
  46  * There are times when it is not possible to evict the requested
  47  * space.  In these circumstances we are unable to adjust the cache
  48  * size.  To prevent the cache from growing unbounded at these times, we
  49  * implement a "cache throttle" that slows the flow of new data
  50  * into the cache until we can make space available.
  51  *
  52  * 2. The Megiddo and Modha model assumes a fixed cache size.
  53  * Pages are evicted when the cache is full and there is a cache
  54  * miss.  Our model has a variable sized cache.  It grows with
  55  * high use, but also tries to react to memory pressure from the
  56  * operating system: decreasing its size when system memory is
  57  * tight.
  58  *
  59  * 3. The Megiddo and Modha model assumes a fixed page size. All
  60  * elements of the cache are therefore exactly the same size.  So
  61  * when adjusting the cache size following a cache miss, it's simply
  62  * a matter of choosing a single page to evict.  In our model, we
  63  * have variable sized cache blocks (ranging from 512 bytes to
  64  * 128K bytes).  We therefore choose a set of blocks to evict to make
  65  * space for a cache miss that approximates as closely as possible
  66  * the space used by the new block.
  67  *
  68  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69  * by N. Megiddo & D. Modha, FAST 2003
  70  */
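
/*
 * Illustrative sketch, not compiled: point 3 above boils down to walking
 * an evictable list from its "lowest" end and releasing blocks until at
 * least the requested number of bytes has been recovered.  The helper name
 * below is hypothetical and greatly simplified (arc_buf_hdr_t and the list
 * routines are declared further down in this file); the ARC's real
 * eviction code must also honor the locking and reference rules described
 * next.
 */
#if 0
static uint64_t
arc_evict_bytes_sketch(list_t *evictable, uint64_t bytes_needed)
{
        uint64_t freed = 0;
        arc_buf_hdr_t *ab;

        while (freed < bytes_needed &&
            (ab = list_tail(evictable)) != NULL) {
                list_remove(evictable, ab);
                freed += ab->b_size;    /* blocks range from 512 bytes to 128K */
                /* ...the block's data is released here in the real code... */
        }
        return (freed);
}
#endif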
  71 
  72 /*
  73  * The locking model:
  74  *
  75  * A new reference to a cache buffer can be obtained in two
  76  * ways: 1) via a hash table lookup using the DVA as a key,
  77  * or 2) via one of the ARC lists.  The arc_read() interface
  78  * uses method 1, while the internal arc algorithms for
  79  * adjusting the cache use method 2.  We therefore provide two
  80  * types of locks: 1) the hash table lock array, and 2) the
  81  * arc list locks.
  82  *
  83  * Buffers do not have their own mutexes, rather they rely on the
  84  * hash table mutexes for the bulk of their protection (i.e. most
  85  * fields in the arc_buf_hdr_t are protected by these mutexes).
  86  *
  87  * buf_hash_find() returns the appropriate mutex (held) when it
  88  * locates the requested buffer in the hash table.  It returns
  89  * NULL for the mutex if the buffer was not in the table.
  90  *
  91  * buf_hash_remove() expects the appropriate hash mutex to be
  92  * already held before it is invoked.
  93  *
  94  * Each arc state also has a mutex which is used to protect the
  95  * buffer list associated with the state.  When attempting to
  96  * obtain a hash table lock while holding an arc list lock, you
  97  * must use mutex_tryenter() to avoid deadlock.  Also note that
  98  * the active state mutex must be held before the ghost state mutex.
  99  *
 100  * Arc buffers may have an associated eviction callback function.
 101  * This function will be invoked prior to removing the buffer (e.g.
 102  * in arc_do_user_evicts()).  Note however that the data associated
 103  * with the buffer may be evicted prior to the callback.  The callback
 104  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 105  * the users of callbacks must ensure that their private data is
 106  * protected from simultaneous callbacks from arc_buf_evict()
 107  * and arc_do_user_evicts().
 108  *
 109  * Note that the majority of the performance stats are manipulated
 110  * with atomic operations.
 111  *
 112  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 113  *
 114  *      - L2ARC buflist creation
 115  *      - L2ARC buflist eviction
 116  *      - L2ARC write completion, which walks L2ARC buflists
 117  *      - ARC header destruction, as it removes from L2ARC buflists
 118  *      - ARC header release, as it removes from L2ARC buflists
 119  */
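
/*
 * Illustrative sketch, not compiled: the lock-order rule above means that
 * code already holding an arc list lock may only *try* for a hash lock,
 * and must skip the buffer when the attempt fails (such skips are counted
 * by arcstat_mutex_miss).  The helper name is hypothetical; HDR_LOCK() and
 * ARCSTAT_BUMP() are defined later in this file.
 */
#if 0
static boolean_t
arc_try_hash_lock_sketch(arc_buf_hdr_t *ab)
{
        kmutex_t *hash_lock = HDR_LOCK(ab);

        /* blocking here while an arc list lock is held could deadlock */
        if (!mutex_tryenter(hash_lock)) {
                ARCSTAT_BUMP(arcstat_mutex_miss);
                return (B_FALSE);       /* caller skips this buffer */
        }
        /* ...examine and possibly evict the buffer... */
        mutex_exit(hash_lock);
        return (B_TRUE);
}
#endif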
 120 
 121 #include <sys/spa.h>
 122 #include <sys/zio.h>
 123 #include <sys/zfs_context.h>
 124 #include <sys/arc.h>
 125 #include <sys/refcount.h>
 126 #include <sys/vdev.h>
 127 #include <sys/vdev_impl.h>
 128 #ifdef _KERNEL
 129 #include <sys/vmsystm.h>
 130 #include <vm/anon.h>
 131 #include <sys/fs/swapnode.h>
 132 #include <sys/dnlc.h>
 133 #endif
 134 #include <sys/callb.h>
 135 #include <sys/kstat.h>
 136 #include <zfs_fletcher.h>
 137 
 138 #ifndef _KERNEL
 139 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 140 boolean_t arc_watch = B_FALSE;
 141 int arc_procfd;
 142 #endif
 143 
 144 static kmutex_t         arc_reclaim_thr_lock;
 145 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 146 static uint8_t          arc_thread_exit;
 147 
 148 extern int zfs_write_limit_shift;
 149 extern uint64_t zfs_write_limit_max;
 150 extern kmutex_t zfs_write_limit_lock;
 151 
 152 #define ARC_REDUCE_DNLC_PERCENT 3
 153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154 
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159 
 160 /* number of seconds before growing cache again */
 161 static int              arc_grow_retry = 60;
 162 
 163 /* shift of arc_c for calculating both min and max arc_p */
 164 static int              arc_p_min_shift = 4;
 165 
 166 /* log2(fraction of arc to reclaim) */
 167 static int              arc_shrink_shift = 5;
 168 
 169 /*
 170  * minimum lifespan of a prefetch block in clock ticks
 171  * (initialized in arc_init())
 172  */
 173 static int              arc_min_prefetch_lifespan;
 174 
 175 static int arc_dead;
 176 
 177 /*
 178  * The arc has filled available memory and has now warmed up.
 179  */
 180 static boolean_t arc_warm;
 181 
 182 /*
 183  * These tunables are for performance analysis.
 184  */
 185 uint64_t zfs_arc_max;
 186 uint64_t zfs_arc_min;
 187 uint64_t zfs_arc_meta_limit = 0;
 188 int zfs_arc_grow_retry = 0;
 189 int zfs_arc_shrink_shift = 0;
 190 int zfs_arc_p_min_shift = 0;
 191 int zfs_disable_dup_eviction = 0;
 192 
 193 /*
 194  * Note that buffers can be in one of 6 states:
 195  *      ARC_anon        - anonymous (discussed below)
 196  *      ARC_mru         - recently used, currently cached
 197  *      ARC_mru_ghost   - recently used, no longer in cache
 198  *      ARC_mfu         - frequently used, currently cached
 199  *      ARC_mfu_ghost   - frequently used, no longer in cache
 200  *      ARC_l2c_only    - exists in L2ARC but not other states
 201  * When there are no active references to the buffer, they are
 202  * linked onto a list in one of these arc states.  These are
 203  * the only buffers that can be evicted or deleted.  Within each
 204  * state there are multiple lists, one for meta-data and one for
 205  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 206  * etc.) is tracked separately so that it can be managed more
 207  * explicitly: favored over data, limited explicitly.
 208  *
 209  * Anonymous buffers are buffers that are not associated with
 210  * a DVA.  These are buffers that hold dirty block copies
 211  * before they are written to stable storage.  By definition,
 212  * they are "ref'd" and are considered part of arc_mru
 213  * that cannot be freed.  Generally, they will acquire a DVA
 214  * as they are written and migrate onto the arc_mru list.
 215  *
 216  * The ARC_l2c_only state is for buffers that are in the second
 217  * level ARC but no longer in any of the ARC_m* lists.  The second
 218  * level ARC itself may also contain buffers that are in any of
 219  * the ARC_m* states - meaning that a buffer can exist in two
 220  * places.  The reason for the ARC_l2c_only state is to keep the
 221  * buffer header in the hash table, so that reads that hit the
 222  * second level ARC benefit from these fast lookups.
 223  */
 224 
 225 typedef struct arc_state {
 226         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 227         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 228         uint64_t arcs_size;     /* total amount of data in this state */
 229         kmutex_t arcs_mtx;
 230 } arc_state_t;
 231 
 232 /* The 6 states: */
 233 static arc_state_t ARC_anon;
 234 static arc_state_t ARC_mru;
 235 static arc_state_t ARC_mru_ghost;
 236 static arc_state_t ARC_mfu;
 237 static arc_state_t ARC_mfu_ghost;
 238 static arc_state_t ARC_l2c_only;
 239 
 240 typedef struct arc_stats {
 241         kstat_named_t arcstat_hits;
 242         kstat_named_t arcstat_misses;
 243         kstat_named_t arcstat_demand_data_hits;
 244         kstat_named_t arcstat_demand_data_misses;
 245         kstat_named_t arcstat_demand_metadata_hits;
 246         kstat_named_t arcstat_demand_metadata_misses;
 247         kstat_named_t arcstat_prefetch_data_hits;
 248         kstat_named_t arcstat_prefetch_data_misses;
 249         kstat_named_t arcstat_prefetch_metadata_hits;
 250         kstat_named_t arcstat_prefetch_metadata_misses;
 251         kstat_named_t arcstat_mru_hits;
 252         kstat_named_t arcstat_mru_ghost_hits;
 253         kstat_named_t arcstat_mfu_hits;
 254         kstat_named_t arcstat_mfu_ghost_hits;
 255         kstat_named_t arcstat_deleted;
 256         kstat_named_t arcstat_recycle_miss;
 257         /*
 258          * Number of buffers that could not be evicted because the hash lock
 259          * was held by another thread.  The lock may not necessarily be held
 260          * by something using the same buffer, since hash locks are shared
 261          * by multiple buffers.
 262          */
 263         kstat_named_t arcstat_mutex_miss;
 264         /*
 265          * Number of buffers skipped because they have I/O in progress, are
 266          * indirect prefetch buffers that have not lived long enough, or are
 267          * not from the spa we're trying to evict from.
 268          */
 269         kstat_named_t arcstat_evict_skip;
 270         kstat_named_t arcstat_evict_l2_cached;
 271         kstat_named_t arcstat_evict_l2_eligible;
 272         kstat_named_t arcstat_evict_l2_ineligible;
 273         kstat_named_t arcstat_hash_elements;
 274         kstat_named_t arcstat_hash_elements_max;
 275         kstat_named_t arcstat_hash_collisions;
 276         kstat_named_t arcstat_hash_chains;
 277         kstat_named_t arcstat_hash_chain_max;
 278         kstat_named_t arcstat_p;
 279         kstat_named_t arcstat_c;
 280         kstat_named_t arcstat_c_min;
 281         kstat_named_t arcstat_c_max;
 282         kstat_named_t arcstat_size;
 283         kstat_named_t arcstat_hdr_size;
 284         kstat_named_t arcstat_data_size;
 285         kstat_named_t arcstat_other_size;
 286         kstat_named_t arcstat_l2_hits;
 287         kstat_named_t arcstat_l2_misses;
 288         kstat_named_t arcstat_l2_feeds;
 289         kstat_named_t arcstat_l2_rw_clash;
 290         kstat_named_t arcstat_l2_read_bytes;
 291         kstat_named_t arcstat_l2_write_bytes;
 292         kstat_named_t arcstat_l2_writes_sent;
 293         kstat_named_t arcstat_l2_writes_done;
 294         kstat_named_t arcstat_l2_writes_error;
 295         kstat_named_t arcstat_l2_writes_hdr_miss;
 296         kstat_named_t arcstat_l2_evict_lock_retry;
 297         kstat_named_t arcstat_l2_evict_reading;
 298         kstat_named_t arcstat_l2_free_on_write;
 299         kstat_named_t arcstat_l2_abort_lowmem;
 300         kstat_named_t arcstat_l2_cksum_bad;
 301         kstat_named_t arcstat_l2_io_error;
 302         kstat_named_t arcstat_l2_size;
 303         kstat_named_t arcstat_l2_hdr_size;
 304         kstat_named_t arcstat_memory_throttle_count;
 305         kstat_named_t arcstat_duplicate_buffers;
 306         kstat_named_t arcstat_duplicate_buffers_size;
 307         kstat_named_t arcstat_duplicate_reads;
 308         kstat_named_t arcstat_meta_used;
 309         kstat_named_t arcstat_meta_limit;
 310         kstat_named_t arcstat_meta_max;
 311 } arc_stats_t;
 312 
 313 static arc_stats_t arc_stats = {
 314         { "hits",                       KSTAT_DATA_UINT64 },
 315         { "misses",                     KSTAT_DATA_UINT64 },
 316         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 317         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 318         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 319         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 320         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 321         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 322         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 323         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 324         { "mru_hits",                   KSTAT_DATA_UINT64 },
 325         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 326         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 327         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 328         { "deleted",                    KSTAT_DATA_UINT64 },
 329         { "recycle_miss",               KSTAT_DATA_UINT64 },
 330         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 331         { "evict_skip",                 KSTAT_DATA_UINT64 },
 332         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 333         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 334         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 335         { "hash_elements",              KSTAT_DATA_UINT64 },
 336         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 337         { "hash_collisions",            KSTAT_DATA_UINT64 },
 338         { "hash_chains",                KSTAT_DATA_UINT64 },
 339         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 340         { "p",                          KSTAT_DATA_UINT64 },
 341         { "c",                          KSTAT_DATA_UINT64 },
 342         { "c_min",                      KSTAT_DATA_UINT64 },
 343         { "c_max",                      KSTAT_DATA_UINT64 },
 344         { "size",                       KSTAT_DATA_UINT64 },
 345         { "hdr_size",                   KSTAT_DATA_UINT64 },
 346         { "data_size",                  KSTAT_DATA_UINT64 },
 347         { "other_size",                 KSTAT_DATA_UINT64 },
 348         { "l2_hits",                    KSTAT_DATA_UINT64 },
 349         { "l2_misses",                  KSTAT_DATA_UINT64 },
 350         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 351         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 352         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 353         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 354         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 355         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 356         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 357         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 358         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 359         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 360         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 361         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 362         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 363         { "l2_io_error",                KSTAT_DATA_UINT64 },
 364         { "l2_size",                    KSTAT_DATA_UINT64 },
 365         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 366         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 367         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 368         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 369         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 370         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 371         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 372         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 373 };
 374 
 375 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 376 
 377 #define ARCSTAT_INCR(stat, val) \
 378         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 379 
 380 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 381 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 382 
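/*
 * Atomically raise a statistic to 'val' when 'val' exceeds its current
 * value; the compare-and-swap is retried if another thread updates the
 * stat concurrently.
 */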
 383 #define ARCSTAT_MAX(stat, val) {                                        \
 384         uint64_t m;                                                     \
 385         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 386             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 387                 continue;                                               \
 388 }
 389 
 390 #define ARCSTAT_MAXSTAT(stat) \
 391         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 392 
 393 /*
 394  * We define a macro to allow ARC hits/misses to be easily broken down by
 395  * two separate conditions, giving a total of four different subtypes for
 396  * each of hits and misses (so eight statistics total).
 397  */
 398 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 399         if (cond1) {                                                    \
 400                 if (cond2) {                                            \
 401                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 402                 } else {                                                \
 403                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 404                 }                                                       \
 405         } else {                                                        \
 406                 if (cond2) {                                            \
 407                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 408                 } else {                                                \
 409                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 410                 }                                                       \
 411         }
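
/*
 * Usage sketch, not compiled (it mirrors the call in arc_buf_add_ref()
 * later in this file): the invocation below bumps exactly one of
 * arcstat_{demand,prefetch}_{data,metadata}_hits.
 */
#if 0
        ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
            demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
            data, metadata, hits);
#endif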
 412 
 413 kstat_t                 *arc_ksp;
 414 static arc_state_t      *arc_anon;
 415 static arc_state_t      *arc_mru;
 416 static arc_state_t      *arc_mru_ghost;
 417 static arc_state_t      *arc_mfu;
 418 static arc_state_t      *arc_mfu_ghost;
 419 static arc_state_t      *arc_l2c_only;
 420 
 421 /*
 422  * There are several ARC variables that are critical to export as kstats --
 423  * but we don't want to have to grovel around in the kstat whenever we wish to
 424  * manipulate them.  For these variables, we therefore define them to be in
 425  * terms of the statistic variable.  This assures that we are not introducing
 426  * the possibility of inconsistency by having shadow copies of the variables,
 427  * while still allowing the code to be readable.
 428  */
 429 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 430 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 431 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 432 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 433 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 434 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 435 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 436 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 437 
 438 static int              arc_no_grow;    /* Don't try to grow cache size */
 439 static uint64_t         arc_tempreserve;
 440 static uint64_t         arc_loaned_bytes;
 441 
 442 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 443 
 444 typedef struct arc_callback arc_callback_t;
 445 
 446 struct arc_callback {
 447         void                    *acb_private;
 448         arc_done_func_t         *acb_done;
 449         arc_buf_t               *acb_buf;
 450         zio_t                   *acb_zio_dummy;
 451         arc_callback_t          *acb_next;
 452 };
 453 
 454 typedef struct arc_write_callback arc_write_callback_t;
 455 
 456 struct arc_write_callback {
 457         void            *awcb_private;
 458         arc_done_func_t *awcb_ready;
 459         arc_done_func_t *awcb_done;
 460         arc_buf_t       *awcb_buf;
 461 };
 462 
 463 struct arc_buf_hdr {
 464         /* protected by hash lock */
 465         dva_t                   b_dva;
 466         uint64_t                b_birth;
 467         uint64_t                b_cksum0;
 468 
 469         kmutex_t                b_freeze_lock;
 470         zio_cksum_t             *b_freeze_cksum;
 471         void                    *b_thawed;
 472 
 473         arc_buf_hdr_t           *b_hash_next;
 474         arc_buf_t               *b_buf;
 475         uint32_t                b_flags;
 476         uint32_t                b_datacnt;
 477 
 478         arc_callback_t          *b_acb;
 479         kcondvar_t              b_cv;
 480 
 481         /* immutable */
 482         arc_buf_contents_t      b_type;
 483         uint64_t                b_size;
 484         uint64_t                b_spa;
 485 
 486         /* protected by arc state mutex */
 487         arc_state_t             *b_state;
 488         list_node_t             b_arc_node;
 489 
 490         /* updated atomically */
 491         clock_t                 b_arc_access;
 492 
 493         /* self protecting */
 494         refcount_t              b_refcnt;
 495 
 496         l2arc_buf_hdr_t         *b_l2hdr;
 497         list_node_t             b_l2node;
 498 };
 499 
 500 static arc_buf_t *arc_eviction_list;
 501 static kmutex_t arc_eviction_mtx;
 502 static arc_buf_hdr_t arc_eviction_hdr;
 503 static void arc_get_data_buf(arc_buf_t *buf);
 504 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 505 static int arc_evict_needed(arc_buf_contents_t type);
 506 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 507 static void arc_buf_watch(arc_buf_t *buf);
 508 
 509 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 510 
 511 #define GHOST_STATE(state)      \
 512         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 513         (state) == arc_l2c_only)
 514 
 515 /*
 516  * Private ARC flags.  These flags are private, ARC-only flags that will show up
 517  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 518  * be passed in as arc_flags in things like arc_read.  However, these flags
 519  * should never be passed and should only be set by ARC code.  When adding new
 520  * public flags, make sure not to smash the private ones.
 521  */
 522 
 523 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
 524 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
 525 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
 526 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
 527 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
 528 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
 529 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
 530 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
 531 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
 532 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
 533 
 534 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 535 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 536 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 537 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 538 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 539 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 540 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 541 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 542 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
 543                                     (hdr)->b_l2hdr != NULL)
 544 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 545 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 546 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 547 
 548 /*
 549  * Other sizes
 550  */
 551 
 552 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 553 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 554 
 555 /*
 556  * Hash table routines
 557  */
 558 
 559 #define HT_LOCK_PAD     64
 560 
 561 struct ht_lock {
 562         kmutex_t        ht_lock;
 563 #ifdef _KERNEL
 564         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 565 #endif
 566 };
 567 
 568 #define BUF_LOCKS 256
 569 typedef struct buf_hash_table {
 570         uint64_t ht_mask;
 571         arc_buf_hdr_t **ht_table;
 572         struct ht_lock ht_locks[BUF_LOCKS];
 573 } buf_hash_table_t;
 574 
 575 static buf_hash_table_t buf_hash_table;
 576 
 577 #define BUF_HASH_INDEX(spa, dva, birth) \
 578         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 579 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 580 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 581 #define HDR_LOCK(hdr) \
 582         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 583 
 584 uint64_t zfs_crc64_table[256];
 585 
 586 /*
 587  * Level 2 ARC
 588  */
 589 
 590 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 591 #define L2ARC_HEADROOM          2               /* num of writes */
 592 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 593 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 594 
 595 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 596 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 597 
 598 /* L2ARC Performance Tunables */
 599 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 600 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 601 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 602 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 603 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 604 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 605 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 606 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 607 
 608 /*
 609  * L2ARC Internals
 610  */
 611 typedef struct l2arc_dev {
 612         vdev_t                  *l2ad_vdev;     /* vdev */
 613         spa_t                   *l2ad_spa;      /* spa */
 614         uint64_t                l2ad_hand;      /* next write location */
 615         uint64_t                l2ad_write;     /* desired write size, bytes */
 616         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 617         uint64_t                l2ad_start;     /* first addr on device */
 618         uint64_t                l2ad_end;       /* last addr on device */
 619         uint64_t                l2ad_evict;     /* last addr eviction reached */
 620         boolean_t               l2ad_first;     /* first sweep through */
 621         boolean_t               l2ad_writing;   /* currently writing */
 622         list_t                  *l2ad_buflist;  /* buffer list */
 623         list_node_t             l2ad_node;      /* device list node */
 624 } l2arc_dev_t;
 625 
 626 static list_t L2ARC_dev_list;                   /* device list */
 627 static list_t *l2arc_dev_list;                  /* device list pointer */
 628 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 629 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 630 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 631 static list_t L2ARC_free_on_write;              /* free after write buf list */
 632 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 633 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 634 static uint64_t l2arc_ndev;                     /* number of devices */
 635 
 636 typedef struct l2arc_read_callback {
 637         arc_buf_t       *l2rcb_buf;             /* read buffer */
 638         spa_t           *l2rcb_spa;             /* spa */
 639         blkptr_t        l2rcb_bp;               /* original blkptr */
 640         zbookmark_t     l2rcb_zb;               /* original bookmark */
 641         int             l2rcb_flags;            /* original flags */
 642 } l2arc_read_callback_t;
 643 
 644 typedef struct l2arc_write_callback {
 645         l2arc_dev_t     *l2wcb_dev;             /* device info */
 646         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 647 } l2arc_write_callback_t;
 648 
 649 struct l2arc_buf_hdr {
 650         /* protected by arc_buf_hdr mutex */
 651         l2arc_dev_t     *b_dev;                 /* L2ARC device */
 652         uint64_t        b_daddr;                /* disk address, offset byte */
 653 };
 654 
 655 typedef struct l2arc_data_free {
 656         /* protected by l2arc_free_on_write_mtx */
 657         void            *l2df_data;
 658         size_t          l2df_size;
 659         void            (*l2df_func)(void *, size_t);
 660         list_node_t     l2df_list_node;
 661 } l2arc_data_free_t;
 662 
 663 static kmutex_t l2arc_feed_thr_lock;
 664 static kcondvar_t l2arc_feed_thr_cv;
 665 static uint8_t l2arc_thread_exit;
 666 
 667 static void l2arc_read_done(zio_t *zio);
 668 static void l2arc_hdr_stat_add(void);
 669 static void l2arc_hdr_stat_remove(void);
 670 
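/*
 * Hash a buffer's identity (spa load guid, DVA, birth txg) for the buf
 * hash table: a CRC-64 of the DVA folded with the spa and birth values.
 * BUF_HASH_INDEX() masks the result down to a table index.
 */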
 671 static uint64_t
 672 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 673 {
 674         uint8_t *vdva = (uint8_t *)dva;
 675         uint64_t crc = -1ULL;
 676         int i;
 677 
 678         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 679 
 680         for (i = 0; i < sizeof (dva_t); i++)
 681                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 682 
 683         crc ^= (spa>>8) ^ birth;
 684 
 685         return (crc);
 686 }
 687 
 688 #define BUF_EMPTY(buf)                                          \
 689         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 690         (buf)->b_dva.dva_word[1] == 0 &&                     \
 691         (buf)->b_birth == 0)
 692 
 693 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 694         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 695         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 696         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 697 
 698 static void
 699 buf_discard_identity(arc_buf_hdr_t *hdr)
 700 {
 701         hdr->b_dva.dva_word[0] = 0;
 702         hdr->b_dva.dva_word[1] = 0;
 703         hdr->b_birth = 0;
 704         hdr->b_cksum0 = 0;
 705 }
 706 
 707 static arc_buf_hdr_t *
 708 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 709 {
 710         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 711         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 712         arc_buf_hdr_t *buf;
 713 
 714         mutex_enter(hash_lock);
 715         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 716             buf = buf->b_hash_next) {
 717                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 718                         *lockp = hash_lock;
 719                         return (buf);
 720                 }
 721         }
 722         mutex_exit(hash_lock);
 723         *lockp = NULL;
 724         return (NULL);
 725 }
 726 
 727 /*
 728  * Insert an entry into the hash table.  If there is already an element
 729  * equal to the new one in the hash table, then the existing element
 730  * will be returned and the new element will not be inserted.
 731  * Otherwise returns NULL; in both cases the hash lock is returned held.
 732  */
 733 static arc_buf_hdr_t *
 734 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 735 {
 736         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 737         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 738         arc_buf_hdr_t *fbuf;
 739         uint32_t i;
 740 
 741         ASSERT(!HDR_IN_HASH_TABLE(buf));
 742         *lockp = hash_lock;
 743         mutex_enter(hash_lock);
 744         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 745             fbuf = fbuf->b_hash_next, i++) {
 746                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 747                         return (fbuf);
 748         }
 749 
 750         buf->b_hash_next = buf_hash_table.ht_table[idx];
 751         buf_hash_table.ht_table[idx] = buf;
 752         buf->b_flags |= ARC_IN_HASH_TABLE;
 753 
 754         /* collect some hash table performance data */
 755         if (i > 0) {
 756                 ARCSTAT_BUMP(arcstat_hash_collisions);
 757                 if (i == 1)
 758                         ARCSTAT_BUMP(arcstat_hash_chains);
 759 
 760                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 761         }
 762 
 763         ARCSTAT_BUMP(arcstat_hash_elements);
 764         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 765 
 766         return (NULL);
 767 }
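
/*
 * Usage sketch, not compiled ('hdr' and the surrounding code are
 * hypothetical): the caller owns the returned hash lock in either case
 * and must eventually drop it.
 */
#if 0
        kmutex_t *hash_lock;
        arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);

        if (exists != NULL) {
                /* an equal header was already present; use 'exists' */
        } else {
                /* 'hdr' is now in the table */
        }
        mutex_exit(hash_lock);
#endif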
 768 
 769 static void
 770 buf_hash_remove(arc_buf_hdr_t *buf)
 771 {
 772         arc_buf_hdr_t *fbuf, **bufp;
 773         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 774 
 775         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 776         ASSERT(HDR_IN_HASH_TABLE(buf));
 777 
 778         bufp = &buf_hash_table.ht_table[idx];
 779         while ((fbuf = *bufp) != buf) {
 780                 ASSERT(fbuf != NULL);
 781                 bufp = &fbuf->b_hash_next;
 782         }
 783         *bufp = buf->b_hash_next;
 784         buf->b_hash_next = NULL;
 785         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 786 
 787         /* collect some hash table performance data */
 788         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 789 
 790         if (buf_hash_table.ht_table[idx] &&
 791             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 792                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 793 }
 794 
 795 /*
 796  * Global data structures and functions for the buf kmem cache.
 797  */
 798 static kmem_cache_t *hdr_cache;
 799 static kmem_cache_t *buf_cache;
 800 
 801 static void
 802 buf_fini(void)
 803 {
 804         int i;
 805 
 806         kmem_free(buf_hash_table.ht_table,
 807             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 808         for (i = 0; i < BUF_LOCKS; i++)
 809                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 810         kmem_cache_destroy(hdr_cache);
 811         kmem_cache_destroy(buf_cache);
 812 }
 813 
 814 /*
 815  * Constructor callback - called when the cache is empty
 816  * and a new buf is requested.
 817  */
 818 /* ARGSUSED */
 819 static int
 820 hdr_cons(void *vbuf, void *unused, int kmflag)
 821 {
 822         arc_buf_hdr_t *buf = vbuf;
 823 
 824         bzero(buf, sizeof (arc_buf_hdr_t));
 825         refcount_create(&buf->b_refcnt);
 826         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 827         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 828         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 829 
 830         return (0);
 831 }
 832 
 833 /* ARGSUSED */
 834 static int
 835 buf_cons(void *vbuf, void *unused, int kmflag)
 836 {
 837         arc_buf_t *buf = vbuf;
 838 
 839         bzero(buf, sizeof (arc_buf_t));
 840         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 841         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 842 
 843         return (0);
 844 }
 845 
 846 /*
 847  * Destructor callback - called when a cached buf is
 848  * no longer required.
 849  */
 850 /* ARGSUSED */
 851 static void
 852 hdr_dest(void *vbuf, void *unused)
 853 {
 854         arc_buf_hdr_t *buf = vbuf;
 855 
 856         ASSERT(BUF_EMPTY(buf));
 857         refcount_destroy(&buf->b_refcnt);
 858         cv_destroy(&buf->b_cv);
 859         mutex_destroy(&buf->b_freeze_lock);
 860         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 861 }
 862 
 863 /* ARGSUSED */
 864 static void
 865 buf_dest(void *vbuf, void *unused)
 866 {
 867         arc_buf_t *buf = vbuf;
 868 
 869         mutex_destroy(&buf->b_evict_lock);
 870         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 871 }
 872 
 873 /*
 874  * Reclaim callback -- invoked when memory is low.
 875  */
 876 /* ARGSUSED */
 877 static void
 878 hdr_recl(void *unused)
 879 {
 880         dprintf("hdr_recl called\n");
 881         /*
 882          * umem calls the reclaim func when we destroy the buf cache,
 883          * which is after we do arc_fini().
 884          */
 885         if (!arc_dead)
 886                 cv_signal(&arc_reclaim_thr_cv);
 887 }
 888 
 889 static void
 890 buf_init(void)
 891 {
 892         uint64_t *ct;
 893         uint64_t hsize = 1ULL << 12;
 894         int i, j;
 895 
 896         /*
 897          * The hash table is sized to hold an entry for every block if
 898          * physical memory were filled with 64K blocks.  The table will
 899          * take up totalmem*sizeof(void*)/64K (e.g. 128KB/GB, 8-byte pointers).
 900          */
 901         while (hsize * 65536 < physmem * PAGESIZE)
 902                 hsize <<= 1;
 903 retry:
 904         buf_hash_table.ht_mask = hsize - 1;
 905         buf_hash_table.ht_table =
 906             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 907         if (buf_hash_table.ht_table == NULL) {
 908                 ASSERT(hsize > (1ULL << 8));
 909                 hsize >>= 1;
 910                 goto retry;
 911         }
 912 
 913         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 914             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 915         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 916             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 917 
 918         for (i = 0; i < 256; i++)
 919                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 920                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 921 
 922         for (i = 0; i < BUF_LOCKS; i++) {
 923                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 924                     NULL, MUTEX_DEFAULT, NULL);
 925         }
 926 }
 927 
 928 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 929 
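/*
 * With ZFS_DEBUG_MODIFY set, verify that a frozen buffer's contents still
 * match the checksum recorded when it was frozen; panic if the buffer has
 * been modified.
 */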
 930 static void
 931 arc_cksum_verify(arc_buf_t *buf)
 932 {
 933         zio_cksum_t zc;
 934 
 935         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 936                 return;
 937 
 938         mutex_enter(&buf->b_hdr->b_freeze_lock);
 939         if (buf->b_hdr->b_freeze_cksum == NULL ||
 940             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 941                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 942                 return;
 943         }
 944         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 945         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 946                 panic("buffer modified while frozen!");
 947         mutex_exit(&buf->b_hdr->b_freeze_lock);
 948 }
 949 
 950 static int
 951 arc_cksum_equal(arc_buf_t *buf)
 952 {
 953         zio_cksum_t zc;
 954         int equal;
 955 
 956         mutex_enter(&buf->b_hdr->b_freeze_lock);
 957         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 958         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 959         mutex_exit(&buf->b_hdr->b_freeze_lock);
 960 
 961         return (equal);
 962 }
 963 
 964 static void
 965 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 966 {
 967         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 968                 return;
 969 
 970         mutex_enter(&buf->b_hdr->b_freeze_lock);
 971         if (buf->b_hdr->b_freeze_cksum != NULL) {
 972                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 973                 return;
 974         }
 975         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 976         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 977             buf->b_hdr->b_freeze_cksum);
 978         mutex_exit(&buf->b_hdr->b_freeze_lock);
 979         arc_buf_watch(buf);
 980 }
 981 
 982 #ifndef _KERNEL
 983 typedef struct procctl {
 984         long cmd;
 985         prwatch_t prwatch;
 986 } procctl_t;
 987 #endif
 988 
 989 /* ARGSUSED */
 990 static void
 991 arc_buf_unwatch(arc_buf_t *buf)
 992 {
 993 #ifndef _KERNEL
 994         if (arc_watch) {
 995                 int result;
 996                 procctl_t ctl;
 997                 ctl.cmd = PCWATCH;
 998                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 999                 ctl.prwatch.pr_size = 0;
1000                 ctl.prwatch.pr_wflags = 0;
1001                 result = write(arc_procfd, &ctl, sizeof (ctl));
1002                 ASSERT3U(result, ==, sizeof (ctl));
1003         }
1004 #endif
1005 }
1006 
1007 /* ARGSUSED */
1008 static void
1009 arc_buf_watch(arc_buf_t *buf)
1010 {
1011 #ifndef _KERNEL
1012         if (arc_watch) {
1013                 int result;
1014                 procctl_t ctl;
1015                 ctl.cmd = PCWATCH;
1016                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1017                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1018                 ctl.prwatch.pr_wflags = WA_WRITE;
1019                 result = write(arc_procfd, &ctl, sizeof (ctl));
1020                 ASSERT3U(result, ==, sizeof (ctl));
1021         }
1022 #endif
1023 }
1024 
1025 void
1026 arc_buf_thaw(arc_buf_t *buf)
1027 {
1028         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1029                 if (buf->b_hdr->b_state != arc_anon)
1030                         panic("modifying non-anon buffer!");
1031                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1032                         panic("modifying buffer while i/o in progress!");
1033                 arc_cksum_verify(buf);
1034         }
1035 
1036         mutex_enter(&buf->b_hdr->b_freeze_lock);
1037         if (buf->b_hdr->b_freeze_cksum != NULL) {
1038                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1039                 buf->b_hdr->b_freeze_cksum = NULL;
1040         }
1041 
1042         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1043                 if (buf->b_hdr->b_thawed)
1044                         kmem_free(buf->b_hdr->b_thawed, 1);
1045                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1046         }
1047 
1048         mutex_exit(&buf->b_hdr->b_freeze_lock);
1049 
1050         arc_buf_unwatch(buf);
1051 }
1052 
1053 void
1054 arc_buf_freeze(arc_buf_t *buf)
1055 {
1056         kmutex_t *hash_lock;
1057 
1058         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1059                 return;
1060 
1061         hash_lock = HDR_LOCK(buf->b_hdr);
1062         mutex_enter(hash_lock);
1063 
1064         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1065             buf->b_hdr->b_state == arc_anon);
1066         arc_cksum_compute(buf, B_FALSE);
1067         mutex_exit(hash_lock);
1068 
1069 }
1070 
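/*
 * Take a reference on a buffer header.  If this is the first reference and
 * the header is not anonymous, remove it from its state's list of
 * evictable buffers (it can no longer be evicted).
 */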
1071 static void
1072 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1073 {
1074         ASSERT(MUTEX_HELD(hash_lock));
1075 
1076         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1077             (ab->b_state != arc_anon)) {
1078                 uint64_t delta = ab->b_size * ab->b_datacnt;
1079                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1080                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1081 
1082                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1083                 mutex_enter(&ab->b_state->arcs_mtx);
1084                 ASSERT(list_link_active(&ab->b_arc_node));
1085                 list_remove(list, ab);
1086                 if (GHOST_STATE(ab->b_state)) {
1087                         ASSERT0(ab->b_datacnt);
1088                         ASSERT3P(ab->b_buf, ==, NULL);
1089                         delta = ab->b_size;
1090                 }
1091                 ASSERT(delta > 0);
1092                 ASSERT3U(*size, >=, delta);
1093                 atomic_add_64(size, -delta);
1094                 mutex_exit(&ab->b_state->arcs_mtx);
1095                 /* remove the prefetch flag if we get a reference */
1096                 if (ab->b_flags & ARC_PREFETCH)
1097                         ab->b_flags &= ~ARC_PREFETCH;
1098         }
1099 }
1100 
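/*
 * Release a reference on a buffer header and return the remaining count.
 * If the count drops to zero and the header is not anonymous, put it back
 * on its state's list of evictable buffers.
 */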
1101 static int
1102 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1103 {
1104         int cnt;
1105         arc_state_t *state = ab->b_state;
1106 
1107         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1108         ASSERT(!GHOST_STATE(state));
1109 
1110         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1111             (state != arc_anon)) {
1112                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1113 
1114                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1115                 mutex_enter(&state->arcs_mtx);
1116                 ASSERT(!list_link_active(&ab->b_arc_node));
1117                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1118                 ASSERT(ab->b_datacnt > 0);
1119                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1120                 mutex_exit(&state->arcs_mtx);
1121         }
1122         return (cnt);
1123 }
1124 
1125 /*
1126  * Move the supplied buffer to the indicated state.  The mutex
1127  * for the buffer must be held by the caller.
1128  */
1129 static void
1130 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1131 {
1132         arc_state_t *old_state = ab->b_state;
1133         int64_t refcnt = refcount_count(&ab->b_refcnt);
1134         uint64_t from_delta, to_delta;
1135 
1136         ASSERT(MUTEX_HELD(hash_lock));
1137         ASSERT(new_state != old_state);
1138         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1139         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1140         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1141 
1142         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1143 
1144         /*
1145          * If this buffer is evictable, transfer it from the
1146          * old state list to the new state list.
1147          */
1148         if (refcnt == 0) {
1149                 if (old_state != arc_anon) {
1150                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1151                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1152 
1153                         if (use_mutex)
1154                                 mutex_enter(&old_state->arcs_mtx);
1155 
1156                         ASSERT(list_link_active(&ab->b_arc_node));
1157                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1158 
1159                         /*
1160                          * If prefetching out of the ghost cache,
1161                          * we will have a non-zero datacnt.
1162                          */
1163                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1164                                 /* ghost elements have a ghost size */
1165                                 ASSERT(ab->b_buf == NULL);
1166                                 from_delta = ab->b_size;
1167                         }
1168                         ASSERT3U(*size, >=, from_delta);
1169                         atomic_add_64(size, -from_delta);
1170 
1171                         if (use_mutex)
1172                                 mutex_exit(&old_state->arcs_mtx);
1173                 }
1174                 if (new_state != arc_anon) {
1175                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1176                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1177 
1178                         if (use_mutex)
1179                                 mutex_enter(&new_state->arcs_mtx);
1180 
1181                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1182 
1183                         /* ghost elements have a ghost size */
1184                         if (GHOST_STATE(new_state)) {
1185                                 ASSERT(ab->b_datacnt == 0);
1186                                 ASSERT(ab->b_buf == NULL);
1187                                 to_delta = ab->b_size;
1188                         }
1189                         atomic_add_64(size, to_delta);
1190 
1191                         if (use_mutex)
1192                                 mutex_exit(&new_state->arcs_mtx);
1193                 }
1194         }
1195 
1196         ASSERT(!BUF_EMPTY(ab));
1197         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1198                 buf_hash_remove(ab);
1199 
1200         /* adjust state sizes */
1201         if (to_delta)
1202                 atomic_add_64(&new_state->arcs_size, to_delta);
1203         if (from_delta) {
1204                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1205                 atomic_add_64(&old_state->arcs_size, -from_delta);
1206         }
1207         ab->b_state = new_state;
1208 
1209         /* adjust l2arc hdr stats */
1210         if (new_state == arc_l2c_only)
1211                 l2arc_hdr_stat_add();
1212         else if (old_state == arc_l2c_only)
1213                 l2arc_hdr_stat_remove();
1214 }
1215 
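/*
 * Account for 'space' bytes of the given type entering the ARC: bump the
 * matching size kstat, arc_meta_used and arc_size.
 */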
1216 void
1217 arc_space_consume(uint64_t space, arc_space_type_t type)
1218 {
1219         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1220 
1221         switch (type) {
1222         case ARC_SPACE_DATA:
1223                 ARCSTAT_INCR(arcstat_data_size, space);
1224                 break;
1225         case ARC_SPACE_OTHER:
1226                 ARCSTAT_INCR(arcstat_other_size, space);
1227                 break;
1228         case ARC_SPACE_HDRS:
1229                 ARCSTAT_INCR(arcstat_hdr_size, space);
1230                 break;
1231         case ARC_SPACE_L2HDRS:
1232                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1233                 break;
1234         }
1235 
1236         ARCSTAT_INCR(arcstat_meta_used, space);
1237         atomic_add_64(&arc_size, space);
1238 }
1239 
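/*
 * Account for 'space' bytes of the given type leaving the ARC: decrement
 * the matching size kstat, arc_meta_used and arc_size, recording a new
 * arc_meta_max high-water mark if needed.
 */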
1240 void
1241 arc_space_return(uint64_t space, arc_space_type_t type)
1242 {
1243         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1244 
1245         switch (type) {
1246         case ARC_SPACE_DATA:
1247                 ARCSTAT_INCR(arcstat_data_size, -space);
1248                 break;
1249         case ARC_SPACE_OTHER:
1250                 ARCSTAT_INCR(arcstat_other_size, -space);
1251                 break;
1252         case ARC_SPACE_HDRS:
1253                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1254                 break;
1255         case ARC_SPACE_L2HDRS:
1256                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1257                 break;
1258         }
1259 
1260         ASSERT(arc_meta_used >= space);
1261         if (arc_meta_max < arc_meta_used)
1262                 arc_meta_max = arc_meta_used;
1263         ARCSTAT_INCR(arcstat_meta_used, -space);
1264         ASSERT(arc_size >= space);
1265         atomic_add_64(&arc_size, -space);
1266 }
1267 
1268 void *
1269 arc_data_buf_alloc(uint64_t size)
1270 {
1271         if (arc_evict_needed(ARC_BUFC_DATA))
1272                 cv_signal(&arc_reclaim_thr_cv);
1273         atomic_add_64(&arc_size, size);
1274         return (zio_data_buf_alloc(size));
1275 }
1276 
1277 void
1278 arc_data_buf_free(void *buf, uint64_t size)
1279 {
1280         zio_data_buf_free(buf, size);
1281         ASSERT(arc_size >= size);
1282         atomic_add_64(&arc_size, -size);
1283 }
1284 
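/*
 * Allocate an anonymous ARC buffer of 'size' bytes for 'spa'; the buffer
 * is returned with a single reference held by 'tag'.
 */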
1285 arc_buf_t *
1286 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1287 {
1288         arc_buf_hdr_t *hdr;
1289         arc_buf_t *buf;
1290 
1291         ASSERT3U(size, >, 0);
1292         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1293         ASSERT(BUF_EMPTY(hdr));
1294         hdr->b_size = size;
1295         hdr->b_type = type;
1296         hdr->b_spa = spa_load_guid(spa);
1297         hdr->b_state = arc_anon;
1298         hdr->b_arc_access = 0;
1299         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1300         buf->b_hdr = hdr;
1301         buf->b_data = NULL;
1302         buf->b_efunc = NULL;
1303         buf->b_private = NULL;
1304         buf->b_next = NULL;
1305         hdr->b_buf = buf;
1306         arc_get_data_buf(buf);
1307         hdr->b_datacnt = 1;
1308         hdr->b_flags = 0;
1309         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1310         (void) refcount_add(&hdr->b_refcnt, tag);
1311 
1312         return (buf);
1313 }
1314 
1315 static char *arc_onloan_tag = "onloan";
1316 
1317 /*
1318  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1319  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1320  * buffers must be returned to the arc before they can be used by the DMU or
1321  * freed.
1322  */
1323 arc_buf_t *
1324 arc_loan_buf(spa_t *spa, int size)
1325 {
1326         arc_buf_t *buf;
1327 
1328         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1329 
1330         atomic_add_64(&arc_loaned_bytes, size);
1331         return (buf);
1332 }
1333 
1334 /*
1335  * Return a loaned arc buffer to the arc.
1336  */
1337 void
1338 arc_return_buf(arc_buf_t *buf, void *tag)
1339 {
1340         arc_buf_hdr_t *hdr = buf->b_hdr;
1341 
1342         ASSERT(buf->b_data != NULL);
1343         (void) refcount_add(&hdr->b_refcnt, tag);
1344         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1345 
1346         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1347 }
1348 
1349 /* Detach an arc_buf from a dbuf (tag) */
1350 void
1351 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1352 {
1353         arc_buf_hdr_t *hdr;
1354 
1355         ASSERT(buf->b_data != NULL);
1356         hdr = buf->b_hdr;
1357         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1358         (void) refcount_remove(&hdr->b_refcnt, tag);
1359         buf->b_efunc = NULL;
1360         buf->b_private = NULL;
1361 
1362         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1363 }
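
/*
 * Usage sketch, not compiled ('spa', 'size' and 'tag' are hypothetical
 * placeholders): a borrower fills in a loaned buffer and hands it back
 * under its own tag before the DMU may use or free it.
 */
#if 0
        arc_buf_t *abuf = arc_loan_buf(spa, size);

        /* ...fill abuf->b_data with 'size' bytes... */
        arc_return_buf(abuf, tag);
#endif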
1364 
1365 static arc_buf_t *
1366 arc_buf_clone(arc_buf_t *from)
1367 {
1368         arc_buf_t *buf;
1369         arc_buf_hdr_t *hdr = from->b_hdr;
1370         uint64_t size = hdr->b_size;
1371 
1372         ASSERT(hdr->b_state != arc_anon);
1373 
1374         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1375         buf->b_hdr = hdr;
1376         buf->b_data = NULL;
1377         buf->b_efunc = NULL;
1378         buf->b_private = NULL;
1379         buf->b_next = hdr->b_buf;
1380         hdr->b_buf = buf;
1381         arc_get_data_buf(buf);
1382         bcopy(from->b_data, buf->b_data, size);
1383 
1384         /*
1385          * This buffer already exists in the arc so create a duplicate
1386          * copy for the caller.  If the buffer is associated with user data
1387          * then track the size and number of duplicates.  These stats will be
1388          * updated as duplicate buffers are created and destroyed.
1389          */
1390         if (hdr->b_type == ARC_BUFC_DATA) {
1391                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1392                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1393         }
1394         hdr->b_datacnt += 1;
1395         return (buf);
1396 }
1397 
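/*
 * Take an additional reference on a buffer that is already resident in
 * the MRU or MFU state (a cache hit) and record the access.
 */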
1398 void
1399 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1400 {
1401         arc_buf_hdr_t *hdr;
1402         kmutex_t *hash_lock;
1403 
1404         /*
1405          * Check to see if this buffer is evicted.  Callers
1406          * must verify b_data != NULL to know if the add_ref
1407          * was successful.
1408          */
1409         mutex_enter(&buf->b_evict_lock);
1410         if (buf->b_data == NULL) {
1411                 mutex_exit(&buf->b_evict_lock);
1412                 return;
1413         }
1414         hash_lock = HDR_LOCK(buf->b_hdr);
1415         mutex_enter(hash_lock);
1416         hdr = buf->b_hdr;
1417         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1418         mutex_exit(&buf->b_evict_lock);
1419 
1420         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1421         add_reference(hdr, hash_lock, tag);
1422         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1423         arc_access(hdr, hash_lock);
1424         mutex_exit(hash_lock);
1425         ARCSTAT_BUMP(arcstat_hits);
1426         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1427             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1428             data, metadata, hits);
1429 }
1430 
1431 /*
1432  * Free the arc data buffer.  If it is an l2arc write in progress,
1433  * the buffer is placed on l2arc_free_on_write to be freed later.
1434  */
1435 static void
1436 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1437 {
1438         arc_buf_hdr_t *hdr = buf->b_hdr;
1439 
1440         if (HDR_L2_WRITING(hdr)) {
1441                 l2arc_data_free_t *df;
1442                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1443                 df->l2df_data = buf->b_data;
1444                 df->l2df_size = hdr->b_size;
1445                 df->l2df_func = free_func;
1446                 mutex_enter(&l2arc_free_on_write_mtx);
1447                 list_insert_head(l2arc_free_on_write, df);
1448                 mutex_exit(&l2arc_free_on_write_mtx);
1449                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1450         } else {
1451                 free_func(buf->b_data, hdr->b_size);
1452         }
1453 }
1454 
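/*
 * Tear down an arc_buf_t.  If `recycle' is set the data block is being
 * handed to another buffer, so only the accounting is adjusted; otherwise
 * the data is freed (possibly deferred onto l2arc_free_on_write).  If
 * `all' is set the arc_buf_t itself is unlinked from its header and
 * returned to buf_cache.
 */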
1455 static void
1456 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1457 {
1458         arc_buf_t **bufp;
1459 
1460         /* free up data associated with the buf */
1461         if (buf->b_data) {
1462                 arc_state_t *state = buf->b_hdr->b_state;
1463                 uint64_t size = buf->b_hdr->b_size;
1464                 arc_buf_contents_t type = buf->b_hdr->b_type;
1465 
1466                 arc_cksum_verify(buf);
1467                 arc_buf_unwatch(buf);
1468 
1469                 if (!recycle) {
1470                         if (type == ARC_BUFC_METADATA) {
1471                                 arc_buf_data_free(buf, zio_buf_free);
1472                                 arc_space_return(size, ARC_SPACE_DATA);
1473                         } else {
1474                                 ASSERT(type == ARC_BUFC_DATA);
1475                                 arc_buf_data_free(buf, zio_data_buf_free);
1476                                 ARCSTAT_INCR(arcstat_data_size, -size);
1477                                 atomic_add_64(&arc_size, -size);
1478                         }
1479                 }
1480                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1481                         uint64_t *cnt = &state->arcs_lsize[type];
1482 
1483                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1484                         ASSERT(state != arc_anon);
1485 
1486                         ASSERT3U(*cnt, >=, size);
1487                         atomic_add_64(cnt, -size);
1488                 }
1489                 ASSERT3U(state->arcs_size, >=, size);
1490                 atomic_add_64(&state->arcs_size, -size);
1491                 buf->b_data = NULL;
1492 
1493                 /*
1494                  * If we're destroying a duplicate buffer, make sure
1495                  * that the appropriate statistics are updated.
1496                  */
1497                 if (buf->b_hdr->b_datacnt > 1 &&
1498                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1499                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1500                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1501                 }
1502                 ASSERT(buf->b_hdr->b_datacnt > 0);
1503                 buf->b_hdr->b_datacnt -= 1;
1504         }
1505 
1506         /* only remove the buf if requested */
1507         if (!all)
1508                 return;
1509 
1510         /* remove the buf from the hdr list */
1511         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1512                 continue;
1513         *bufp = buf->b_next;
1514         buf->b_next = NULL;
1515 
1516         ASSERT(buf->b_efunc == NULL);
1517 
1518         /* clean up the buf */
1519         buf->b_hdr = NULL;
1520         kmem_cache_free(buf_cache, buf);
1521 }
1522 
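/*
 * Free an anonymous, unreferenced header and any remaining arc_buf_t's.
 * Buffers with pending eviction callbacks are parked on arc_eviction_list
 * rather than freed outright; any associated l2arc header and freeze
 * checksum are released as well.
 */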
1523 static void
1524 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1525 {
1526         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1527         ASSERT3P(hdr->b_state, ==, arc_anon);
1528         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1529         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1530 
1531         if (l2hdr != NULL) {
1532                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1533                 /*
1534                  * To prevent arc_free() and l2arc_evict() from
1535                  * attempting to free the same buffer at the same time,
1536                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1537                  * give it priority.  l2arc_evict() can't destroy this
1538                  * header while we are waiting on l2arc_buflist_mtx.
1539                  *
1540                  * The hdr may be removed from l2ad_buflist before we
1541                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1542                  */
1543                 if (!buflist_held) {
1544                         mutex_enter(&l2arc_buflist_mtx);
1545                         l2hdr = hdr->b_l2hdr;
1546                 }
1547 
1548                 if (l2hdr != NULL) {
1549                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1550                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1551                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1552                         if (hdr->b_state == arc_l2c_only)
1553                                 l2arc_hdr_stat_remove();
1554                         hdr->b_l2hdr = NULL;
1555                 }
1556 
1557                 if (!buflist_held)
1558                         mutex_exit(&l2arc_buflist_mtx);
1559         }
1560 
1561         if (!BUF_EMPTY(hdr)) {
1562                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1563                 buf_discard_identity(hdr);
1564         }
1565         while (hdr->b_buf) {
1566                 arc_buf_t *buf = hdr->b_buf;
1567 
1568                 if (buf->b_efunc) {
1569                         mutex_enter(&arc_eviction_mtx);
1570                         mutex_enter(&buf->b_evict_lock);
1571                         ASSERT(buf->b_hdr != NULL);
1572                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1573                         hdr->b_buf = buf->b_next;
1574                         buf->b_hdr = &arc_eviction_hdr;
1575                         buf->b_next = arc_eviction_list;
1576                         arc_eviction_list = buf;
1577                         mutex_exit(&buf->b_evict_lock);
1578                         mutex_exit(&arc_eviction_mtx);
1579                 } else {
1580                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1581                 }
1582         }
1583         if (hdr->b_freeze_cksum != NULL) {
1584                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1585                 hdr->b_freeze_cksum = NULL;
1586         }
1587         if (hdr->b_thawed) {
1588                 kmem_free(hdr->b_thawed, 1);
1589                 hdr->b_thawed = NULL;
1590         }
1591 
1592         ASSERT(!list_link_active(&hdr->b_arc_node));
1593         ASSERT3P(hdr->b_hash_next, ==, NULL);
1594         ASSERT3P(hdr->b_acb, ==, NULL);
1595         kmem_cache_free(hdr_cache, hdr);
1596 }
1597 
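/*
 * Drop the caller's reference on `buf' and free it.  The buffer must not
 * have an eviction callback (b_efunc) registered.  For buffers in the
 * hash table, either this copy is destroyed (if duplicates remain) or the
 * last copy is marked ARC_BUF_AVAILABLE; anonymous buffers also tear down
 * the header once any in-progress write completes and no references
 * remain.
 */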
1598 void
1599 arc_buf_free(arc_buf_t *buf, void *tag)
1600 {
1601         arc_buf_hdr_t *hdr = buf->b_hdr;
1602         int hashed = hdr->b_state != arc_anon;
1603 
1604         ASSERT(buf->b_efunc == NULL);
1605         ASSERT(buf->b_data != NULL);
1606 
1607         if (hashed) {
1608                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1609 
1610                 mutex_enter(hash_lock);
1611                 hdr = buf->b_hdr;
1612                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1613 
1614                 (void) remove_reference(hdr, hash_lock, tag);
1615                 if (hdr->b_datacnt > 1) {
1616                         arc_buf_destroy(buf, FALSE, TRUE);
1617                 } else {
1618                         ASSERT(buf == hdr->b_buf);
1619                         ASSERT(buf->b_efunc == NULL);
1620                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1621                 }
1622                 mutex_exit(hash_lock);
1623         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1624                 int destroy_hdr;
1625                 /*
1626                  * We are in the middle of an async write.  Don't destroy
1627                  * this buffer unless the write completes before we finish
1628                  * decrementing the reference count.
1629                  */
1630                 mutex_enter(&arc_eviction_mtx);
1631                 (void) remove_reference(hdr, NULL, tag);
1632                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1633                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1634                 mutex_exit(&arc_eviction_mtx);
1635                 if (destroy_hdr)
1636                         arc_hdr_destroy(hdr);
1637         } else {
1638                 if (remove_reference(hdr, NULL, tag) > 0)
1639                         arc_buf_destroy(buf, FALSE, TRUE);
1640                 else
1641                         arc_hdr_destroy(hdr);
1642         }
1643 }
1644 
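/*
 * Drop a reference on `buf'.  Unlike arc_buf_free(), the buffer may carry
 * an eviction callback; the buffer is only destroyed or marked
 * ARC_BUF_AVAILABLE here when no callback is registered.  Returns B_TRUE
 * if there was no callback (i.e. the buffer was released), B_FALSE
 * otherwise.
 */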
1645 boolean_t
1646 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1647 {
1648         arc_buf_hdr_t *hdr = buf->b_hdr;
1649         kmutex_t *hash_lock = HDR_LOCK(hdr);
1650         boolean_t no_callback = (buf->b_efunc == NULL);
1651 
1652         if (hdr->b_state == arc_anon) {
1653                 ASSERT(hdr->b_datacnt == 1);
1654                 arc_buf_free(buf, tag);
1655                 return (no_callback);
1656         }
1657 
1658         mutex_enter(hash_lock);
1659         hdr = buf->b_hdr;
1660         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1661         ASSERT(hdr->b_state != arc_anon);
1662         ASSERT(buf->b_data != NULL);
1663 
1664         (void) remove_reference(hdr, hash_lock, tag);
1665         if (hdr->b_datacnt > 1) {
1666                 if (no_callback)
1667                         arc_buf_destroy(buf, FALSE, TRUE);
1668         } else if (no_callback) {
1669                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1670                 ASSERT(buf->b_efunc == NULL);
1671                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1672         }
1673         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1674             refcount_is_zero(&hdr->b_refcnt));
1675         mutex_exit(hash_lock);
1676         return (no_callback);
1677 }
1678 
1679 int
1680 arc_buf_size(arc_buf_t *buf)
1681 {
1682         return (buf->b_hdr->b_size);
1683 }
1684 
1685 /*
1686  * Called from the DMU to determine if the current buffer should be
1687  * evicted. In order to ensure proper locking, the eviction must be initiated
1688  * from the DMU. Return true if the buffer is associated with user data and
1689  * duplicate buffers still exist.
1690  */
1691 boolean_t
1692 arc_buf_eviction_needed(arc_buf_t *buf)
1693 {
1694         arc_buf_hdr_t *hdr;
1695         boolean_t evict_needed = B_FALSE;
1696 
1697         if (zfs_disable_dup_eviction)
1698                 return (B_FALSE);
1699 
1700         mutex_enter(&buf->b_evict_lock);
1701         hdr = buf->b_hdr;
1702         if (hdr == NULL) {
1703                 /*
1704                  * We are in arc_do_user_evicts(); let that function
1705                  * perform the eviction.
1706                  */
1707                 ASSERT(buf->b_data == NULL);
1708                 mutex_exit(&buf->b_evict_lock);
1709                 return (B_FALSE);
1710         } else if (buf->b_data == NULL) {
1711                 /*
1712                  * We have already been added to the arc eviction list;
1713                  * recommend eviction.
1714                  */
1715                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1716                 mutex_exit(&buf->b_evict_lock);
1717                 return (B_TRUE);
1718         }
1719 
1720         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1721                 evict_needed = B_TRUE;
1722 
1723         mutex_exit(&buf->b_evict_lock);
1724         return (evict_needed);
1725 }
1726 
1727 /*
1728  * Evict buffers from list until we've removed the specified number of
1729  * bytes.  Move the removed buffers to the appropriate evict state.
1730  * If the recycle flag is set, then attempt to "recycle" a buffer:
1731  * - look for a buffer to evict that is `bytes' long.
1732  * - return the data block from this buffer rather than freeing it.
1733  * This flag is used by callers that are trying to make space for a
1734  * new buffer in a full arc cache.
1735  *
1736  * This function makes a "best effort".  It skips over any buffers
1737  * it can't get a hash_lock on, and so may not catch all candidates.
1738  * It may also return without evicting as much space as requested.
1739  */
1740 static void *
1741 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1742     arc_buf_contents_t type)
1743 {
1744         arc_state_t *evicted_state;
1745         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1746         arc_buf_hdr_t *ab, *ab_prev = NULL;
1747         list_t *list = &state->arcs_list[type];
1748         kmutex_t *hash_lock;
1749         boolean_t have_lock;
1750         void *stolen = NULL;
1751 
1752         ASSERT(state == arc_mru || state == arc_mfu);
1753 
1754         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1755 
1756         mutex_enter(&state->arcs_mtx);
1757         mutex_enter(&evicted_state->arcs_mtx);
1758 
1759         for (ab = list_tail(list); ab; ab = ab_prev) {
1760                 ab_prev = list_prev(list, ab);
1761                 /* prefetch buffers have a minimum lifespan */
1762                 if (HDR_IO_IN_PROGRESS(ab) ||
1763                     (spa && ab->b_spa != spa) ||
1764                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1765                     ddi_get_lbolt() - ab->b_arc_access <
1766                     arc_min_prefetch_lifespan)) {
1767                         skipped++;
1768                         continue;
1769                 }
1770                 /* "lookahead" for better eviction candidate */
1771                 if (recycle && ab->b_size != bytes &&
1772                     ab_prev && ab_prev->b_size == bytes)
1773                         continue;
1774                 hash_lock = HDR_LOCK(ab);
1775                 have_lock = MUTEX_HELD(hash_lock);
1776                 if (have_lock || mutex_tryenter(hash_lock)) {
1777                         ASSERT0(refcount_count(&ab->b_refcnt));
1778                         ASSERT(ab->b_datacnt > 0);
1779                         while (ab->b_buf) {
1780                                 arc_buf_t *buf = ab->b_buf;
1781                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1782                                         missed += 1;
1783                                         break;
1784                                 }
1785                                 if (buf->b_data) {
1786                                         bytes_evicted += ab->b_size;
1787                                         if (recycle && ab->b_type == type &&
1788                                             ab->b_size == bytes &&
1789                                             !HDR_L2_WRITING(ab)) {
1790                                                 stolen = buf->b_data;
1791                                                 recycle = FALSE;
1792                                         }
1793                                 }
1794                                 if (buf->b_efunc) {
1795                                         mutex_enter(&arc_eviction_mtx);
1796                                         arc_buf_destroy(buf,
1797                                             buf->b_data == stolen, FALSE);
1798                                         ab->b_buf = buf->b_next;
1799                                         buf->b_hdr = &arc_eviction_hdr;
1800                                         buf->b_next = arc_eviction_list;
1801                                         arc_eviction_list = buf;
1802                                         mutex_exit(&arc_eviction_mtx);
1803                                         mutex_exit(&buf->b_evict_lock);
1804                                 } else {
1805                                         mutex_exit(&buf->b_evict_lock);
1806                                         arc_buf_destroy(buf,
1807                                             buf->b_data == stolen, TRUE);
1808                                 }
1809                         }
1810 
1811                         if (ab->b_l2hdr) {
1812                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1813                                     ab->b_size);
1814                         } else {
1815                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1816                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1817                                             ab->b_size);
1818                                 } else {
1819                                         ARCSTAT_INCR(
1820                                             arcstat_evict_l2_ineligible,
1821                                             ab->b_size);
1822                                 }
1823                         }
1824 
1825                         if (ab->b_datacnt == 0) {
1826                                 arc_change_state(evicted_state, ab, hash_lock);
1827                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1828                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1829                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1830                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1831                         }
1832                         if (!have_lock)
1833                                 mutex_exit(hash_lock);
1834                         if (bytes >= 0 && bytes_evicted >= bytes)
1835                                 break;
1836                 } else {
1837                         missed += 1;
1838                 }
1839         }
1840 
1841         mutex_exit(&evicted_state->arcs_mtx);
1842         mutex_exit(&state->arcs_mtx);
1843 
1844         if (bytes_evicted < bytes)
1845                 dprintf("only evicted %lld bytes from %x",
1846                     (longlong_t)bytes_evicted, state);
1847 
1848         if (skipped)
1849                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1850 
1851         if (missed)
1852                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1853 
1854         /*
1855          * We have just evicted some data into the ghost state, make
1856          * sure we also adjust the ghost state size if necessary.
1857          */
1858         if (arc_no_grow &&
1859             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1860                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1861                     arc_mru_ghost->arcs_size - arc_c;
1862 
1863                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1864                         int64_t todelete =
1865                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1866                         arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1867                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1868                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1869                             arc_mru_ghost->arcs_size +
1870                             arc_mfu_ghost->arcs_size - arc_c);
1871                         arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1872                 }
1873         }
1874 
1875         return (stolen);
1876 }
1877 
1878 /*
1879  * Remove buffers from list until we've removed the specified number of
1880  * bytes.  Destroy the buffers that are removed.
1881  */
1882 static void
1883 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1884 {
1885         arc_buf_hdr_t *ab, *ab_prev;
1886         arc_buf_hdr_t marker = { 0 };
1887         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1888         kmutex_t *hash_lock;
1889         uint64_t bytes_deleted = 0;
1890         uint64_t bufs_skipped = 0;
1891 
1892         ASSERT(GHOST_STATE(state));
1893 top:
1894         mutex_enter(&state->arcs_mtx);
1895         for (ab = list_tail(list); ab; ab = ab_prev) {
1896                 ab_prev = list_prev(list, ab);
1897                 if (spa && ab->b_spa != spa)
1898                         continue;
1899 
1900                 /* ignore markers */
1901                 if (ab->b_spa == 0)
1902                         continue;
1903 
1904                 hash_lock = HDR_LOCK(ab);
1905                 /* caller may be trying to modify this buffer, skip it */
1906                 if (MUTEX_HELD(hash_lock))
1907                         continue;
1908                 if (mutex_tryenter(hash_lock)) {
1909                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1910                         ASSERT(ab->b_buf == NULL);
1911                         ARCSTAT_BUMP(arcstat_deleted);
1912                         bytes_deleted += ab->b_size;
1913 
1914                         if (ab->b_l2hdr != NULL) {
1915                                 /*
1916                                  * This buffer is cached on the 2nd Level ARC;
1917                                  * don't destroy the header.
1918                                  */
1919                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1920                                 mutex_exit(hash_lock);
1921                         } else {
1922                                 arc_change_state(arc_anon, ab, hash_lock);
1923                                 mutex_exit(hash_lock);
1924                                 arc_hdr_destroy(ab);
1925                         }
1926 
1927                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1928                         if (bytes >= 0 && bytes_deleted >= bytes)
1929                                 break;
1930                 } else if (bytes < 0) {
1931                         /*
1932                          * Insert a list marker and then wait for the
1933                          * hash lock to become available. Once it's
1934                          * available, restart from where we left off.
1935                          */
1936                         list_insert_after(list, ab, &marker);
1937                         mutex_exit(&state->arcs_mtx);
1938                         mutex_enter(hash_lock);
1939                         mutex_exit(hash_lock);
1940                         mutex_enter(&state->arcs_mtx);
1941                         ab_prev = list_prev(list, &marker);
1942                         list_remove(list, &marker);
1943                 } else
1944                         bufs_skipped += 1;
1945         }
1946         mutex_exit(&state->arcs_mtx);
1947 
1948         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1949             (bytes < 0 || bytes_deleted < bytes)) {
1950                 list = &state->arcs_list[ARC_BUFC_METADATA];
1951                 goto top;
1952         }
1953 
1954         if (bufs_skipped) {
1955                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1956                 ASSERT(bytes >= 0);
1957         }
1958 
1959         if (bytes_deleted < bytes)
1960                 dprintf("only deleted %lld bytes from %p",
1961                     (longlong_t)bytes_deleted, state);
1962 }
1963 
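/*
 * Bring the cache back within its targets: evict from the MRU side until
 * anon + MRU (+ metadata in use) no longer exceed arc_p, then from the
 * MFU side until the total size fits within arc_c, and finally trim the
 * ghost lists so that MRU + MRU-ghost and MRU-ghost + MFU-ghost each stay
 * within arc_c.
 */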
1964 static void
1965 arc_adjust(void)
1966 {
1967         int64_t adjustment, delta;
1968 
1969         /*
1970          * Adjust MRU size
1971          */
1972 
1973         adjustment = MIN((int64_t)(arc_size - arc_c),
1974             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1975             arc_p));
1976 
1977         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1978                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1979                 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
1980                 adjustment -= delta;
1981         }
1982 
1983         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1984                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1985                 (void) arc_evict(arc_mru, NULL, delta, FALSE,
1986                     ARC_BUFC_METADATA);
1987         }
1988 
1989         /*
1990          * Adjust MFU size
1991          */
1992 
1993         adjustment = arc_size - arc_c;
1994 
1995         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1996                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1997                 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
1998                 adjustment -= delta;
1999         }
2000 
2001         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2002                 int64_t delta = MIN(adjustment,
2003                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2004                 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2005                     ARC_BUFC_METADATA);
2006         }
2007 
2008         /*
2009          * Adjust ghost lists
2010          */
2011 
2012         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2013 
2014         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2015                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2016                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2017         }
2018 
2019         adjustment =
2020             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2021 
2022         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2023                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2024                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2025         }
2026 }
2027 
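/*
 * Run the eviction callbacks (b_efunc) for buffers that were placed on
 * arc_eviction_list by arc_evict() or arc_hdr_destroy().  The eviction
 * mutex is dropped around each callback so callbacks may re-enter the arc.
 */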
2028 static void
2029 arc_do_user_evicts(void)
2030 {
2031         mutex_enter(&arc_eviction_mtx);
2032         while (arc_eviction_list != NULL) {
2033                 arc_buf_t *buf = arc_eviction_list;
2034                 arc_eviction_list = buf->b_next;
2035                 mutex_enter(&buf->b_evict_lock);
2036                 buf->b_hdr = NULL;
2037                 mutex_exit(&buf->b_evict_lock);
2038                 mutex_exit(&arc_eviction_mtx);
2039 
2040                 if (buf->b_efunc != NULL)
2041                         VERIFY(buf->b_efunc(buf) == 0);
2042 
2043                 buf->b_efunc = NULL;
2044                 buf->b_private = NULL;
2045                 kmem_cache_free(buf_cache, buf);
2046                 mutex_enter(&arc_eviction_mtx);
2047         }
2048         mutex_exit(&arc_eviction_mtx);
2049 }
2050 
2051 /*
2052  * Flush all *evictable* data from the cache for the given spa (all pools if NULL).
2053  * NOTE: this will not touch "active" (i.e. referenced) data.
2054  */
2055 void
2056 arc_flush(spa_t *spa)
2057 {
2058         uint64_t guid = 0;
2059 
2060         if (spa)
2061                 guid = spa_load_guid(spa);
2062 
2063         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2064                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2065                 if (spa)
2066                         break;
2067         }
2068         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2069                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2070                 if (spa)
2071                         break;
2072         }
2073         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2074                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2075                 if (spa)
2076                         break;
2077         }
2078         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2079                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2080                 if (spa)
2081                         break;
2082         }
2083 
2084         arc_evict_ghost(arc_mru_ghost, guid, -1);
2085         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2086 
2087         mutex_enter(&arc_reclaim_thr_lock);
2088         arc_do_user_evicts();
2089         mutex_exit(&arc_reclaim_thr_lock);
2090         ASSERT(spa || arc_eviction_list == NULL);
2091 }
2092 
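/*
 * Reduce the target cache size (arc_c) by 1/2^arc_shrink_shift (in the
 * kernel, by at least the amount the pageout scanner needs), never
 * dropping below arc_c_min.  arc_p is scaled down as well, and
 * arc_adjust() is called if the cache now exceeds the new target.
 */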
2093 void
2094 arc_shrink(void)
2095 {
2096         if (arc_c > arc_c_min) {
2097                 uint64_t to_free;
2098 
2099 #ifdef _KERNEL
2100                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2101 #else
2102                 to_free = arc_c >> arc_shrink_shift;
2103 #endif
2104                 if (arc_c > arc_c_min + to_free)
2105                         atomic_add_64(&arc_c, -to_free);
2106                 else
2107                         arc_c = arc_c_min;
2108 
2109                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2110                 if (arc_c > arc_size)
2111                         arc_c = MAX(arc_size, arc_c_min);
2112                 if (arc_p > arc_c)
2113                         arc_p = (arc_c >> 1);
2114                 ASSERT(arc_c >= arc_c_min);
2115                 ASSERT((int64_t)arc_p >= 0);
2116         }
2117 
2118         if (arc_size > arc_c)
2119                 arc_adjust();
2120 }
2121 
2122 /*
2123  * Determine if the system is under memory pressure and is asking
2124  * to reclaim memory. A return value of 1 indicates that the system
2125  * is under memory pressure and that the arc should adjust accordingly.
2126  */
2127 static int
2128 arc_reclaim_needed(void)
2129 {
2130         uint64_t extra;
2131 
2132 #ifdef _KERNEL
2133 
2134         if (needfree)
2135                 return (1);
2136 
2137         /*
2138          * take 'desfree' extra pages, so we reclaim sooner, rather than later
2139          */
2140         extra = desfree;
2141 
2142         /*
2143          * check that we're out of range of the pageout scanner.  It starts to
2144          * schedule paging if freemem is less than lotsfree and needfree.
2145          * lotsfree is the high-water mark for pageout, and needfree is the
2146          * number of needed free pages.  We add extra pages here to make sure
2147          * the scanner doesn't start up while we're freeing memory.
2148          */
2149         if (freemem < lotsfree + needfree + extra)
2150                 return (1);
2151 
2152         /*
2153          * check to make sure that swapfs has enough space so that anon
2154          * reservations can still succeed. anon_resvmem() checks that the
2155          * availrmem is greater than swapfs_minfree, and the number of reserved
2156          * swap pages.  We also add a bit of extra here just to prevent
2157          * circumstances from getting really dire.
2158          */
2159         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2160                 return (1);
2161 
2162 #if defined(__i386)
2163         /*
2164          * If we're on an i386 platform, it's possible that we'll exhaust the
2165          * kernel heap space before we ever run out of available physical
2166          * memory.  Most checks of the size of the heap_area compare against
2167          * tune.t_minarmem, which is the minimum available real memory that we
2168          * can have in the system.  However, this is generally fixed at 25 pages
2169          * which is so low that it's useless.  In this comparison, we seek to
2170          * calculate the total heap-size, and reclaim if more than 3/4ths of the
2171          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2172          * free)
2173          */
2174         if (vmem_size(heap_arena, VMEM_FREE) <
2175             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2176                 return (1);
2177 #endif
2178 
2179         /*
2180          * If zio data pages are being allocated out of a separate heap segment,
2181          * then enforce that the size of available vmem for this arena remains
2182          * above about 1/16th free.
2183          *
2184          * Note: The 1/16th arena free requirement was put in place
2185          * to aggressively evict memory from the arc in order to avoid
2186          * memory fragmentation issues.
2187          */
2188         if (zio_arena != NULL &&
2189             vmem_size(zio_arena, VMEM_FREE) <
2190             (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2191                 return (1);
2192 #else
2193         if (spa_get_random(100) == 0)
2194                 return (1);
2195 #endif
2196         return (0);
2197 }
2198 
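/*
 * Reap unused memory from the zio buffer caches and the arc's own kmem
 * caches.  When the metadata limit has been exceeded, DNLC entries are
 * purged first to release holds on metadata; an aggressive reclaim also
 * shrinks the cache targets and reaps the zio arena's quantum caches.
 */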
2199 static void
2200 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2201 {
2202         size_t                  i;
2203         kmem_cache_t            *prev_cache = NULL;
2204         kmem_cache_t            *prev_data_cache = NULL;
2205         extern kmem_cache_t     *zio_buf_cache[];
2206         extern kmem_cache_t     *zio_data_buf_cache[];
2207 
2208 #ifdef _KERNEL
2209         if (arc_meta_used >= arc_meta_limit) {
2210                 /*
2211                  * We are exceeding our meta-data cache limit.
2212                  * Purge some DNLC entries to release holds on meta-data.
2213                  */
2214                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2215         }
2216 #if defined(__i386)
2217         /*
2218          * Reclaim unused memory from all kmem caches.
2219          */
2220         kmem_reap();
2221 #endif
2222 #endif
2223 
2224         /*
2225          * An aggressive reclamation will shrink the cache size as well as
2226          * reap free buffers from the arc kmem caches.
2227          */
2228         if (strat == ARC_RECLAIM_AGGR)
2229                 arc_shrink();
2230 
2231         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2232                 if (zio_buf_cache[i] != prev_cache) {
2233                         prev_cache = zio_buf_cache[i];
2234                         kmem_cache_reap_now(zio_buf_cache[i]);
2235                 }
2236                 if (zio_data_buf_cache[i] != prev_data_cache) {
2237                         prev_data_cache = zio_data_buf_cache[i];
2238                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2239                 }
2240         }
2241         kmem_cache_reap_now(buf_cache);
2242         kmem_cache_reap_now(hdr_cache);
2243 
2244         /*
2245          * Ask the vmem arena to reclaim unused memory from its
2246          * quantum caches.
2247          */
2248         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2249                 vmem_qcache_reap(zio_arena);
2250 }
2251 
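/*
 * The reclaim thread wakes at least once a second (or when signalled).
 * When memory is tight it alternates between conservative and aggressive
 * reaping, suppresses cache growth for arc_grow_retry seconds after a
 * reclaim, rebalances the lists via arc_adjust(), and runs any pending
 * user eviction callbacks.
 */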
2252 static void
2253 arc_reclaim_thread(void)
2254 {
2255         clock_t                 growtime = 0;
2256         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2257         callb_cpr_t             cpr;
2258 
2259         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2260 
2261         mutex_enter(&arc_reclaim_thr_lock);
2262         while (arc_thread_exit == 0) {
2263                 if (arc_reclaim_needed()) {
2264 
2265                         if (arc_no_grow) {
2266                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2267                                         last_reclaim = ARC_RECLAIM_AGGR;
2268                                 } else {
2269                                         last_reclaim = ARC_RECLAIM_CONS;
2270                                 }
2271                         } else {
2272                                 arc_no_grow = TRUE;
2273                                 last_reclaim = ARC_RECLAIM_AGGR;
2274                                 membar_producer();
2275                         }
2276 
2277                         /* reset the growth delay for every reclaim */
2278                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2279 
2280                         arc_kmem_reap_now(last_reclaim);
2281                         arc_warm = B_TRUE;
2282 
2283                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2284                         arc_no_grow = FALSE;
2285                 }
2286 
2287                 arc_adjust();
2288 
2289                 if (arc_eviction_list != NULL)
2290                         arc_do_user_evicts();
2291 
2292                 /* block until needed, or one second, whichever is shorter */
2293                 CALLB_CPR_SAFE_BEGIN(&cpr);
2294                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2295                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2296                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2297         }
2298 
2299         arc_thread_exit = 0;
2300         cv_broadcast(&arc_reclaim_thr_cv);
2301         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2302         thread_exit();
2303 }
2304 
2305 /*
2306  * Adapt arc info given the number of bytes we are trying to add and
2307  * the state that we are coming from.  This function is only called
2308  * when we are adding new content to the cache.
2309  */
2310 static void
2311 arc_adapt(int bytes, arc_state_t *state)
2312 {
2313         int mult;
2314         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2315 
2316         if (state == arc_l2c_only)
2317                 return;
2318 
2319         ASSERT(bytes > 0);
2320         /*
2321          * Adapt the target size of the MRU list:
2322          *      - if we just hit in the MRU ghost list, then increase
2323          *        the target size of the MRU list.
2324          *      - if we just hit in the MFU ghost list, then increase
2325          *        the target size of the MFU list by decreasing the
2326          *        target size of the MRU list.
2327          */
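        /*
         * For example (illustrative numbers only): on an MRU ghost hit
         * with the MFU ghost list three times the size of the MRU ghost
         * list, mult is 3, so arc_p grows by 3 * bytes, capped so that at
         * least arc_c >> arc_p_min_shift remains for the MFU side.  An
         * MFU ghost hit shrinks arc_p symmetrically.
         */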
2328         if (state == arc_mru_ghost) {
2329                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2330                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2331                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2332 
2333                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2334         } else if (state == arc_mfu_ghost) {
2335                 uint64_t delta;
2336 
2337                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2338                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2339                 mult = MIN(mult, 10);
2340 
2341                 delta = MIN(bytes * mult, arc_p);
2342                 arc_p = MAX(arc_p_min, arc_p - delta);
2343         }
2344         ASSERT((int64_t)arc_p >= 0);
2345 
2346         if (arc_reclaim_needed()) {
2347                 cv_signal(&arc_reclaim_thr_cv);
2348                 return;
2349         }
2350 
2351         if (arc_no_grow)
2352                 return;
2353 
2354         if (arc_c >= arc_c_max)
2355                 return;
2356 
2357         /*
2358          * If we're within (2 * maxblocksize) bytes of the target
2359          * cache size, increment the target cache size
2360          */
2361         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2362                 atomic_add_64(&arc_c, (int64_t)bytes);
2363                 if (arc_c > arc_c_max)
2364                         arc_c = arc_c_max;
2365                 else if (state == arc_anon)
2366                         atomic_add_64(&arc_p, (int64_t)bytes);
2367                 if (arc_p > arc_c)
2368                         arc_p = arc_c;
2369         }
2370         ASSERT((int64_t)arc_p >= 0);
2371 }
2372 
2373 /*
2374  * Check if the cache has reached its limits and eviction is required
2375  * prior to insert.
2376  */
2377 static int
2378 arc_evict_needed(arc_buf_contents_t type)
2379 {
2380         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2381                 return (1);
2382 
2383         if (arc_reclaim_needed())
2384                 return (1);
2385 
2386         return (arc_size > arc_c);
2387 }
2388 
2389 /*
2390  * The buffer, supplied as the first argument, needs a data block.
2391  * So, if we are at cache max, determine which cache should be victimized.
2392  * We have the following cases:
2393  *
2394  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2395  * In this situation if we're out of space, but the resident size of the MFU is
2396  * under the limit, victimize the MFU cache to satisfy this insertion request.
2397  *
2398  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2399  * Here, we've used up all of the available space for the MRU, so we need to
2400  * evict from our own cache instead.  Evict from the set of resident MRU
2401  * entries.
2402  *
2403  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2404  * c minus p represents the MFU space in the cache, since p is the size of the
2405  * cache that is dedicated to the MRU.  In this situation there's still space on
2406  * the MFU side, so the MRU side needs to be victimized.
2407  *
2408  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2409  * MFU's resident set is consuming more space than it has been allotted.  In
2410  * this situation, we must victimize our own cache, the MFU, for this insertion.
2411  */
2412 static void
2413 arc_get_data_buf(arc_buf_t *buf)
2414 {
2415         arc_state_t             *state = buf->b_hdr->b_state;
2416         uint64_t                size = buf->b_hdr->b_size;
2417         arc_buf_contents_t      type = buf->b_hdr->b_type;
2418 
2419         arc_adapt(size, state);
2420 
2421         /*
2422          * We have not yet reached cache maximum size;
2423          * just allocate a new buffer.
2424          */
2425         if (!arc_evict_needed(type)) {
2426                 if (type == ARC_BUFC_METADATA) {
2427                         buf->b_data = zio_buf_alloc(size);
2428                         arc_space_consume(size, ARC_SPACE_DATA);
2429                 } else {
2430                         ASSERT(type == ARC_BUFC_DATA);
2431                         buf->b_data = zio_data_buf_alloc(size);
2432                         ARCSTAT_INCR(arcstat_data_size, size);
2433                         atomic_add_64(&arc_size, size);
2434                 }
2435                 goto out;
2436         }
2437 
2438         /*
2439          * If we are prefetching from the mfu ghost list, this buffer
2440          * will end up on the mru list; so steal space from there.
2441          */
2442         if (state == arc_mfu_ghost)
2443                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2444         else if (state == arc_mru_ghost)
2445                 state = arc_mru;
2446 
2447         if (state == arc_mru || state == arc_anon) {
2448                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2449                 state = (arc_mfu->arcs_lsize[type] >= size &&
2450                     arc_p > mru_used) ? arc_mfu : arc_mru;
2451         } else {
2452                 /* MFU cases */
2453                 uint64_t mfu_space = arc_c - arc_p;
2454                 state =  (arc_mru->arcs_lsize[type] >= size &&
2455                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2456         }
2457         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2458                 if (type == ARC_BUFC_METADATA) {
2459                         buf->b_data = zio_buf_alloc(size);
2460                         arc_space_consume(size, ARC_SPACE_DATA);
2461                 } else {
2462                         ASSERT(type == ARC_BUFC_DATA);
2463                         buf->b_data = zio_data_buf_alloc(size);
2464                         ARCSTAT_INCR(arcstat_data_size, size);
2465                         atomic_add_64(&arc_size, size);
2466                 }
2467                 ARCSTAT_BUMP(arcstat_recycle_miss);
2468         }
2469         ASSERT(buf->b_data != NULL);
2470 out:
2471         /*
2472          * Update the state size.  Note that ghost states have a
2473          * "ghost size" and so don't need to be updated.
2474          */
2475         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2476                 arc_buf_hdr_t *hdr = buf->b_hdr;
2477 
2478                 atomic_add_64(&hdr->b_state->arcs_size, size);
2479                 if (list_link_active(&hdr->b_arc_node)) {
2480                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2481                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2482                 }
2483                 /*
2484                  * If we are growing the cache, and we are adding anonymous
2485                  * data, and we have outgrown arc_p, update arc_p
2486                  */
2487                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2488                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2489                         arc_p = MIN(arc_c, arc_p + size);
2490         }
2491 }
2492 
2493 /*
2494  * This routine is called whenever a buffer is accessed.
2495  * NOTE: the hash lock is dropped in this function.
2496  */
2497 static void
2498 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2499 {
2500         clock_t now;
2501 
2502         ASSERT(MUTEX_HELD(hash_lock));
2503 
2504         if (buf->b_state == arc_anon) {
2505                 /*
2506                  * This buffer is not in the cache, and does not
2507                  * appear in our "ghost" list.  Add the new buffer
2508                  * to the MRU state.
2509                  */
2510 
2511                 ASSERT(buf->b_arc_access == 0);
2512                 buf->b_arc_access = ddi_get_lbolt();
2513                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2514                 arc_change_state(arc_mru, buf, hash_lock);
2515 
2516         } else if (buf->b_state == arc_mru) {
2517                 now = ddi_get_lbolt();
2518 
2519                 /*
2520                  * If this buffer is here because of a prefetch, then either:
2521                  * - clear the flag if this is a "referencing" read
2522                  *   (any subsequent access will bump this into the MFU state).
2523                  * or
2524                  * - move the buffer to the head of the list if this is
2525                  *   another prefetch (to make it less likely to be evicted).
2526                  */
2527                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2528                         if (refcount_count(&buf->b_refcnt) == 0) {
2529                                 ASSERT(list_link_active(&buf->b_arc_node));
2530                         } else {
2531                                 buf->b_flags &= ~ARC_PREFETCH;
2532                                 ARCSTAT_BUMP(arcstat_mru_hits);
2533                         }
2534                         buf->b_arc_access = now;
2535                         return;
2536                 }
2537 
2538                 /*
2539                  * This buffer has been "accessed" only once so far,
2540                  * but it is still in the cache. Move it to the MFU
2541                  * state.
2542                  */
2543                 if (now > buf->b_arc_access + ARC_MINTIME) {
2544                         /*
2545                          * More than ARC_MINTIME has passed since we
2546                          * instantiated this buffer.  Move it to the
2547                          * most frequently used state.
2548                          */
2549                         buf->b_arc_access = now;
2550                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2551                         arc_change_state(arc_mfu, buf, hash_lock);
2552                 }
2553                 ARCSTAT_BUMP(arcstat_mru_hits);
2554         } else if (buf->b_state == arc_mru_ghost) {
2555                 arc_state_t     *new_state;
2556                 /*
2557                  * This buffer has been "accessed" recently, but
2558                  * was evicted from the cache.  Move it to the
2559                  * MFU state.
2560                  */
2561 
2562                 if (buf->b_flags & ARC_PREFETCH) {
2563                         new_state = arc_mru;
2564                         if (refcount_count(&buf->b_refcnt) > 0)
2565                                 buf->b_flags &= ~ARC_PREFETCH;
2566                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2567                 } else {
2568                         new_state = arc_mfu;
2569                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2570                 }
2571 
2572                 buf->b_arc_access = ddi_get_lbolt();
2573                 arc_change_state(new_state, buf, hash_lock);
2574 
2575                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2576         } else if (buf->b_state == arc_mfu) {
2577                 /*
2578                  * This buffer has been accessed more than once and is
2579                  * still in the cache.  Keep it in the MFU state.
2580                  *
2581                  * NOTE: an add_reference() that occurred when we did
2582                  * the arc_read() will have kicked this off the list.
2583                  * If it was a prefetch, we will explicitly move it to
2584                  * the head of the list now.
2585                  */
2586                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2587                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2588                         ASSERT(list_link_active(&buf->b_arc_node));
2589                 }
2590                 ARCSTAT_BUMP(arcstat_mfu_hits);
2591                 buf->b_arc_access = ddi_get_lbolt();
2592         } else if (buf->b_state == arc_mfu_ghost) {
2593                 arc_state_t     *new_state = arc_mfu;
2594                 /*
2595                  * This buffer has been accessed more than once but has
2596                  * been evicted from the cache.  Move it back to the
2597                  * MFU state.
2598                  */
2599 
2600                 if (buf->b_flags & ARC_PREFETCH) {
2601                         /*
2602                          * This is a prefetch access...
2603                          * move this block back to the MRU state.
2604                          */
2605                         ASSERT0(refcount_count(&buf->b_refcnt));
2606                         new_state = arc_mru;
2607                 }
2608 
2609                 buf->b_arc_access = ddi_get_lbolt();
2610                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2611                 arc_change_state(new_state, buf, hash_lock);
2612 
2613                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2614         } else if (buf->b_state == arc_l2c_only) {
2615                 /*
2616                  * This buffer is on the 2nd Level ARC.
2617                  */
2618 
2619                 buf->b_arc_access = ddi_get_lbolt();
2620                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2621                 arc_change_state(arc_mfu, buf, hash_lock);
2622         } else {
2623                 ASSERT(!"invalid arc state");
2624         }
2625 }
2626 
2627 /* a generic arc_done_func_t which you can use */
2628 /* ARGSUSED */
2629 void
2630 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2631 {
2632         if (zio == NULL || zio->io_error == 0)
2633                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2634         VERIFY(arc_buf_remove_ref(buf, arg));
2635 }
2636 
2637 /* a generic arc_done_func_t */
2638 void
2639 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2640 {
2641         arc_buf_t **bufp = arg;
2642         if (zio && zio->io_error) {
2643                 VERIFY(arc_buf_remove_ref(buf, arg));
2644                 *bufp = NULL;
2645         } else {
2646                 *bufp = buf;
2647                 ASSERT(buf->b_data);
2648         }
2649 }
2650 
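/*
 * zio completion callback for arc reads: byteswap and checksum the data,
 * hand a buffer to each registered callback (cloning it when there is
 * more than one consumer), and clear the header's I/O-in-progress state.
 */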
2651 static void
2652 arc_read_done(zio_t *zio)
2653 {
2654         arc_buf_hdr_t   *hdr, *found;
2655         arc_buf_t       *buf;
2656         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2657         kmutex_t        *hash_lock;
2658         arc_callback_t  *callback_list, *acb;
2659         int             freeable = FALSE;
2660 
2661         buf = zio->io_private;
2662         hdr = buf->b_hdr;
2663 
2664         /*
2665          * The hdr was inserted into hash-table and removed from lists
2666          * prior to starting I/O.  We should find this header, since
2667          * it's in the hash table, and it should be legit since it's
2668          * not possible to evict it during the I/O.  The only possible
2669          * reason for it not to be found is if we were freed during the
2670          * read.
2671          */
2672         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2673             &hash_lock);
2674 
2675         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2676             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2677             (found == hdr && HDR_L2_READING(hdr)));
2678 
2679         hdr->b_flags &= ~ARC_L2_EVICTED;
2680         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2681                 hdr->b_flags &= ~ARC_L2CACHE;
2682 
2683         /* byteswap if necessary */
2684         callback_list = hdr->b_acb;
2685         ASSERT(callback_list != NULL);
2686         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2687                 dmu_object_byteswap_t bswap =
2688                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2689                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2690                     byteswap_uint64_array :
2691                     dmu_ot_byteswap[bswap].ob_func;
2692                 func(buf->b_data, hdr->b_size);
2693         }
2694 
2695         arc_cksum_compute(buf, B_FALSE);
2696         arc_buf_watch(buf);
2697 
2698         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2699                 /*
2700                  * Only call arc_access on anonymous buffers.  This is because
2701                  * if we've issued an I/O for an evicted buffer, we've already
2702                  * called arc_access (to prevent any simultaneous readers from
2703                  * getting confused).
2704                  */
2705                 arc_access(hdr, hash_lock);
2706         }
2707 
2708         /* create copies of the data buffer for the callers */
2709         abuf = buf;
2710         for (acb = callback_list; acb; acb = acb->acb_next) {
2711                 if (acb->acb_done) {
2712                         if (abuf == NULL) {
2713                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2714                                 abuf = arc_buf_clone(buf);
2715                         }
2716                         acb->acb_buf = abuf;
2717                         abuf = NULL;
2718                 }
2719         }
2720         hdr->b_acb = NULL;
2721         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2722         ASSERT(!HDR_BUF_AVAILABLE(hdr));
2723         if (abuf == buf) {
2724                 ASSERT(buf->b_efunc == NULL);
2725                 ASSERT(hdr->b_datacnt == 1);
2726                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2727         }
2728 
2729         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2730 
2731         if (zio->io_error != 0) {
2732                 hdr->b_flags |= ARC_IO_ERROR;
2733                 if (hdr->b_state != arc_anon)
2734                         arc_change_state(arc_anon, hdr, hash_lock);
2735                 if (HDR_IN_HASH_TABLE(hdr))
2736                         buf_hash_remove(hdr);
2737                 freeable = refcount_is_zero(&hdr->b_refcnt);
2738         }
2739 
2740         /*
2741          * Broadcast before we drop the hash_lock to avoid the possibility
2742          * that the hdr (and hence the cv) might be freed before we get to
2743          * the cv_broadcast().
2744          */
2745         cv_broadcast(&hdr->b_cv);
2746 
2747         if (hash_lock) {
2748                 mutex_exit(hash_lock);
2749         } else {
2750                 /*
2751                  * This block was freed while we waited for the read to
2752                  * complete.  It has been removed from the hash table and
2753                  * moved to the anonymous state (so that it won't show up
2754                  * in the cache).
2755                  */
2756                 ASSERT3P(hdr->b_state, ==, arc_anon);
2757                 freeable = refcount_is_zero(&hdr->b_refcnt);
2758         }
2759 
2760         /* execute each callback and free its structure */
2761         while ((acb = callback_list) != NULL) {
2762                 if (acb->acb_done)
2763                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2764 
2765                 if (acb->acb_zio_dummy != NULL) {
2766                         acb->acb_zio_dummy->io_error = zio->io_error;
2767                         zio_nowait(acb->acb_zio_dummy);
2768                 }
2769 
2770                 callback_list = acb->acb_next;
2771                 kmem_free(acb, sizeof (arc_callback_t));
2772         }
2773 
2774         if (freeable)
2775                 arc_hdr_destroy(hdr);
2776 }
2777 
2778 /*
2779  * "Read" the block at the specified DVA (in bp) via the
2780  * cache.  If the block is found in the cache, invoke the provided
2781  * callback immediately and return.  Note that the `zio' parameter
2782  * in the callback will be NULL in this case, since no IO was
2783  * required.  If the block is not in the cache, pass the read request
2784  * on to the spa with a substitute callback function, so that the
2785  * requested block will be added to the cache.
2786  *
2787  * If a read request arrives for a block that already has a read in
2788  * progress, one of three things happens: with ARC_WAIT we block until
2789  * the in-progress read completes and return its results; with ARC_NOWAIT
2790  * and a "done" func we attach a record that invokes "done" when the read
2791  * completes, then return; otherwise we just return.
2792  *
2793  * arc_read_done() will invoke all the requested "done" functions
2794  * for readers of this block.
2795  */
2796 int
2797 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2798     void *private, int priority, int zio_flags, uint32_t *arc_flags,
2799     const zbookmark_t *zb)
2800 {
2801         arc_buf_hdr_t *hdr;
2802         arc_buf_t *buf = NULL;
2803         kmutex_t *hash_lock;
2804         zio_t *rzio;
2805         uint64_t guid = spa_load_guid(spa);
2806 
2807 top:
2808         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2809             &hash_lock);
2810         if (hdr && hdr->b_datacnt > 0) {
2811 
2812                 *arc_flags |= ARC_CACHED;
2813 
2814                 if (HDR_IO_IN_PROGRESS(hdr)) {
2815 
2816                         if (*arc_flags & ARC_WAIT) {
2817                                 cv_wait(&hdr->b_cv, hash_lock);
2818                                 mutex_exit(hash_lock);
2819                                 goto top;
2820                         }
2821                         ASSERT(*arc_flags & ARC_NOWAIT);
2822 
2823                         if (done) {
2824                                 arc_callback_t  *acb = NULL;
2825 
2826                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2827                                     KM_SLEEP);
2828                                 acb->acb_done = done;
2829                                 acb->acb_private = private;
2830                                 if (pio != NULL)
2831                                         acb->acb_zio_dummy = zio_null(pio,
2832                                             spa, NULL, NULL, NULL, zio_flags);
2833 
2834                                 ASSERT(acb->acb_done != NULL);
2835                                 acb->acb_next = hdr->b_acb;
2836                                 hdr->b_acb = acb;
2837                                 add_reference(hdr, hash_lock, private);
2838                                 mutex_exit(hash_lock);
2839                                 return (0);
2840                         }
2841                         mutex_exit(hash_lock);
2842                         return (0);
2843                 }
2844 
2845                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2846 
2847                 if (done) {
2848                         add_reference(hdr, hash_lock, private);
2849                         /*
2850                          * If this block is already in use, create a new
2851                          * copy of the data so that we will be guaranteed
2852                          * that arc_release() will always succeed.
2853                          */
2854                         buf = hdr->b_buf;
2855                         ASSERT(buf);
2856                         ASSERT(buf->b_data);
2857                         if (HDR_BUF_AVAILABLE(hdr)) {
2858                                 ASSERT(buf->b_efunc == NULL);
2859                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2860                         } else {
2861                                 buf = arc_buf_clone(buf);
2862                         }
2863 
2864                 } else if (*arc_flags & ARC_PREFETCH &&
2865                     refcount_count(&hdr->b_refcnt) == 0) {
2866                         hdr->b_flags |= ARC_PREFETCH;
2867                 }
2868                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2869                 arc_access(hdr, hash_lock);
2870                 if (*arc_flags & ARC_L2CACHE)
2871                         hdr->b_flags |= ARC_L2CACHE;
2872                 mutex_exit(hash_lock);
2873                 ARCSTAT_BUMP(arcstat_hits);
2874                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2875                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2876                     data, metadata, hits);
2877 
2878                 if (done)
2879                         done(NULL, buf, private);
2880         } else {
2881                 uint64_t size = BP_GET_LSIZE(bp);
2882                 arc_callback_t  *acb;
2883                 vdev_t *vd = NULL;
2884                 uint64_t addr = 0;
2885                 boolean_t devw = B_FALSE;
2886 
2887                 if (hdr == NULL) {
2888                         /* this block is not in the cache */
2889                         arc_buf_hdr_t   *exists;
2890                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2891                         buf = arc_buf_alloc(spa, size, private, type);
2892                         hdr = buf->b_hdr;
2893                         hdr->b_dva = *BP_IDENTITY(bp);
2894                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2895                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2896                         exists = buf_hash_insert(hdr, &hash_lock);
2897                         if (exists) {
2898                                 /* somebody beat us to the hash insert */
2899                                 mutex_exit(hash_lock);
2900                                 buf_discard_identity(hdr);
2901                                 (void) arc_buf_remove_ref(buf, private);
2902                                 goto top; /* restart the IO request */
2903                         }
2904                         /* if this is a prefetch, we don't have a reference */
2905                         if (*arc_flags & ARC_PREFETCH) {
2906                                 (void) remove_reference(hdr, hash_lock,
2907                                     private);
2908                                 hdr->b_flags |= ARC_PREFETCH;
2909                         }
2910                         if (*arc_flags & ARC_L2CACHE)
2911                                 hdr->b_flags |= ARC_L2CACHE;
2912                         if (BP_GET_LEVEL(bp) > 0)
2913                                 hdr->b_flags |= ARC_INDIRECT;
2914                 } else {
2915                         /* this block is in the ghost cache */
2916                         ASSERT(GHOST_STATE(hdr->b_state));
2917                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2918                         ASSERT0(refcount_count(&hdr->b_refcnt));
2919                         ASSERT(hdr->b_buf == NULL);
2920 
2921                         /* if this is a prefetch, we don't have a reference */
2922                         if (*arc_flags & ARC_PREFETCH)
2923                                 hdr->b_flags |= ARC_PREFETCH;
2924                         else
2925                                 add_reference(hdr, hash_lock, private);
2926                         if (*arc_flags & ARC_L2CACHE)
2927                                 hdr->b_flags |= ARC_L2CACHE;
2928                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2929                         buf->b_hdr = hdr;
2930                         buf->b_data = NULL;
2931                         buf->b_efunc = NULL;
2932                         buf->b_private = NULL;
2933                         buf->b_next = NULL;
2934                         hdr->b_buf = buf;
2935                         ASSERT(hdr->b_datacnt == 0);
2936                         hdr->b_datacnt = 1;
2937                         arc_get_data_buf(buf);
2938                         arc_access(hdr, hash_lock);
2939                 }
2940 
2941                 ASSERT(!GHOST_STATE(hdr->b_state));
2942 
2943                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2944                 acb->acb_done = done;
2945                 acb->acb_private = private;
2946 
2947                 ASSERT(hdr->b_acb == NULL);
2948                 hdr->b_acb = acb;
2949                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2950 
2951                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2952                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2953                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2954                         addr = hdr->b_l2hdr->b_daddr;
2955                         /*
2956                          * Lock out device removal.
2957                          */
2958                         if (vdev_is_dead(vd) ||
2959                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2960                                 vd = NULL;
2961                 }
2962 
2963                 mutex_exit(hash_lock);
2964 
2965                 /*
2966                  * At this point, we have a level 1 cache miss.  Try again in
2967                  * L2ARC if possible.
2968                  */
2969                 ASSERT3U(hdr->b_size, ==, size);
2970                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2971                     uint64_t, size, zbookmark_t *, zb);
2972                 ARCSTAT_BUMP(arcstat_misses);
2973                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2974                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2975                     data, metadata, misses);
2976 
2977                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2978                         /*
2979                          * Read from the L2ARC if the following are true:
2980                          * 1. The L2ARC vdev was previously cached.
2981                          * 2. This buffer still has L2ARC metadata.
2982                          * 3. This buffer isn't currently writing to the L2ARC.
2983                          * 4. The L2ARC entry wasn't evicted, which may
2984                          *    also have invalidated the vdev.
2985                          * 5. This isn't a prefetch or l2arc_noprefetch is 0.
2986                          */
2987                         if (hdr->b_l2hdr != NULL &&
2988                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2989                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2990                                 l2arc_read_callback_t *cb;
2991 
2992                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2993                                 ARCSTAT_BUMP(arcstat_l2_hits);
2994 
2995                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2996                                     KM_SLEEP);
2997                                 cb->l2rcb_buf = buf;
2998                                 cb->l2rcb_spa = spa;
2999                                 cb->l2rcb_bp = *bp;
3000                                 cb->l2rcb_zb = *zb;
3001                                 cb->l2rcb_flags = zio_flags;
3002 
3003                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3004                                     addr + size < vd->vdev_psize -
3005                                     VDEV_LABEL_END_SIZE);
3006 
3007                                 /*
3008                                  * l2arc read.  The SCL_L2ARC lock will be
3009                                  * released by l2arc_read_done().
3010                                  */
3011                                 rzio = zio_read_phys(pio, vd, addr, size,
3012                                     buf->b_data, ZIO_CHECKSUM_OFF,
3013                                     l2arc_read_done, cb, priority, zio_flags |
3014                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3015                                     ZIO_FLAG_DONT_PROPAGATE |
3016                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
3017                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3018                                     zio_t *, rzio);
3019                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3020 
3021                                 if (*arc_flags & ARC_NOWAIT) {
3022                                         zio_nowait(rzio);
3023                                         return (0);
3024                                 }
3025 
3026                                 ASSERT(*arc_flags & ARC_WAIT);
3027                                 if (zio_wait(rzio) == 0)
3028                                         return (0);
3029 
3030                                 /* l2arc read error; goto zio_read() */
3031                         } else {
3032                                 DTRACE_PROBE1(l2arc__miss,
3033                                     arc_buf_hdr_t *, hdr);
3034                                 ARCSTAT_BUMP(arcstat_l2_misses);
3035                                 if (HDR_L2_WRITING(hdr))
3036                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3037                                 spa_config_exit(spa, SCL_L2ARC, vd);
3038                         }
3039                 } else {
3040                         if (vd != NULL)
3041                                 spa_config_exit(spa, SCL_L2ARC, vd);
3042                         if (l2arc_ndev != 0) {
3043                                 DTRACE_PROBE1(l2arc__miss,
3044                                     arc_buf_hdr_t *, hdr);
3045                                 ARCSTAT_BUMP(arcstat_l2_misses);
3046                         }
3047                 }
3048 
3049                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3050                     arc_read_done, buf, priority, zio_flags, zb);
3051 
3052                 if (*arc_flags & ARC_WAIT)
3053                         return (zio_wait(rzio));
3054 
3055                 ASSERT(*arc_flags & ARC_NOWAIT);
3056                 zio_nowait(rzio);
3057         }
3058         return (0);
3059 }
3060 
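/*
 * Illustrative sketch (guarded out; not built as part of this file): an
 * asynchronous, prefetch-style use of arc_read().  With done == NULL and
 * ARC_NOWAIT the call simply primes the cache and returns; ARC_PREFETCH
 * marks the header so that an unused prefetch can be evicted early.  The
 * function name is hypothetical and the bookmark is assumed to come from
 * the caller's context.
 */
#if 0
static void
example_arc_prefetch(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb)
{
        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

        (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
}
#endif
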
3061 void
3062 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3063 {
3064         ASSERT(buf->b_hdr != NULL);
3065         ASSERT(buf->b_hdr->b_state != arc_anon);
3066         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3067         ASSERT(buf->b_efunc == NULL);
3068         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3069 
3070         buf->b_efunc = func;
3071         buf->b_private = private;
3072 }
3073 
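/*
 * Illustrative sketch (guarded out; not built as part of this file): a
 * client registering an eviction callback.  As arc_buf_evict() below shows,
 * the efunc receives the arc_buf_t itself and can recover its private state
 * from buf->b_private.  The example_* names are hypothetical and the
 * arc_evict_func_t signature (int (*)(void *)) is assumed from arc.h.
 */
#if 0
static int
example_evict_func(void *varg)
{
        arc_buf_t *buf = varg;
        example_client_t *client = buf->b_private;

        /* drop the client's pointer to this buffer */
        example_client_invalidate(client);
        return (0);
}

static void
example_register_evict(arc_buf_t *buf, example_client_t *client)
{
        arc_set_callback(buf, example_evict_func, client);
}
#endif
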
3074 /*
3075  * This is used by the DMU to let the ARC know that a buffer is
3076  * being evicted, so the ARC should clean up.  If this arc buf
3077  * is not yet in the evicted state, it will be put there.
3078  */
3079 int
3080 arc_buf_evict(arc_buf_t *buf)
3081 {
3082         arc_buf_hdr_t *hdr;
3083         kmutex_t *hash_lock;
3084         arc_buf_t **bufp;
3085 
3086         mutex_enter(&buf->b_evict_lock);
3087         hdr = buf->b_hdr;
3088         if (hdr == NULL) {
3089                 /*
3090                  * We are in arc_do_user_evicts().
3091                  */
3092                 ASSERT(buf->b_data == NULL);
3093                 mutex_exit(&buf->b_evict_lock);
3094                 return (0);
3095         } else if (buf->b_data == NULL) {
3096                 arc_buf_t copy = *buf; /* structure assignment */
3097                 /*
3098                  * We are on the eviction list; process this buffer now
3099                  * but let arc_do_user_evicts() do the reaping.
3100                  */
3101                 buf->b_efunc = NULL;
3102                 mutex_exit(&buf->b_evict_lock);
3103                 VERIFY(copy.b_efunc(&copy) == 0);
3104                 return (1);
3105         }
3106         hash_lock = HDR_LOCK(hdr);
3107         mutex_enter(hash_lock);
3108         hdr = buf->b_hdr;
3109         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3110 
3111         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3112         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3113 
3114         /*
3115          * Pull this buffer off of the hdr
3116          */
3117         bufp = &hdr->b_buf;
3118         while (*bufp != buf)
3119                 bufp = &(*bufp)->b_next;
3120         *bufp = buf->b_next;
3121 
3122         ASSERT(buf->b_data != NULL);
3123         arc_buf_destroy(buf, FALSE, FALSE);
3124 
3125         if (hdr->b_datacnt == 0) {
3126                 arc_state_t *old_state = hdr->b_state;
3127                 arc_state_t *evicted_state;
3128 
3129                 ASSERT(hdr->b_buf == NULL);
3130                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3131 
3132                 evicted_state =
3133                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3134 
3135                 mutex_enter(&old_state->arcs_mtx);
3136                 mutex_enter(&evicted_state->arcs_mtx);
3137 
3138                 arc_change_state(evicted_state, hdr, hash_lock);
3139                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3140                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3141                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3142 
3143                 mutex_exit(&evicted_state->arcs_mtx);
3144                 mutex_exit(&old_state->arcs_mtx);
3145         }
3146         mutex_exit(hash_lock);
3147         mutex_exit(&buf->b_evict_lock);
3148 
3149         VERIFY(buf->b_efunc(buf) == 0);
3150         buf->b_efunc = NULL;
3151         buf->b_private = NULL;
3152         buf->b_hdr = NULL;
3153         buf->b_next = NULL;
3154         kmem_cache_free(buf_cache, buf);
3155         return (1);
3156 }
3157 
3158 /*
3159  * Release this buffer from the cache, making it an anonymous buffer.  This
3160  * must be done after a read and prior to modifying the buffer contents.
3161  * If the buffer has more than one reference, we must make
3162  * a new hdr for the buffer.
3163  */
3164 void
3165 arc_release(arc_buf_t *buf, void *tag)
3166 {
3167         arc_buf_hdr_t *hdr;
3168         kmutex_t *hash_lock = NULL;
3169         l2arc_buf_hdr_t *l2hdr;
3170         uint64_t buf_size;
3171 
3172         /*
3173          * It would be nice to assert that if it's DMU metadata (level >
3174          * 0 || it's the dnode file), then it must be syncing context.
3175          * But we don't know that information at this level.
3176          */
3177 
3178         mutex_enter(&buf->b_evict_lock);
3179         hdr = buf->b_hdr;
3180 
3181         /* this buffer is not on any list */
3182         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3183 
3184         if (hdr->b_state == arc_anon) {
3185                 /* this buffer is already released */
3186                 ASSERT(buf->b_efunc == NULL);
3187         } else {
3188                 hash_lock = HDR_LOCK(hdr);
3189                 mutex_enter(hash_lock);
3190                 hdr = buf->b_hdr;
3191                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3192         }
3193 
3194         l2hdr = hdr->b_l2hdr;
3195         if (l2hdr) {
3196                 mutex_enter(&l2arc_buflist_mtx);
3197                 hdr->b_l2hdr = NULL;
3198         }
3199         buf_size = hdr->b_size;
3200 
3201         /*
3202          * Do we have more than one buf?
3203          */
3204         if (hdr->b_datacnt > 1) {
3205                 arc_buf_hdr_t *nhdr;
3206                 arc_buf_t **bufp;
3207                 uint64_t blksz = hdr->b_size;
3208                 uint64_t spa = hdr->b_spa;
3209                 arc_buf_contents_t type = hdr->b_type;
3210                 uint32_t flags = hdr->b_flags;
3211 
3212                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3213                 /*
3214                  * Pull the data off of this hdr and attach it to
3215                  * a new anonymous hdr.
3216                  */
3217                 (void) remove_reference(hdr, hash_lock, tag);
3218                 bufp = &hdr->b_buf;
3219                 while (*bufp != buf)
3220                         bufp = &(*bufp)->b_next;
3221                 *bufp = buf->b_next;
3222                 buf->b_next = NULL;
3223 
3224                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3225                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3226                 if (refcount_is_zero(&hdr->b_refcnt)) {
3227                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3228                         ASSERT3U(*size, >=, hdr->b_size);
3229                         atomic_add_64(size, -hdr->b_size);
3230                 }
3231 
3232                 /*
3233                  * We're releasing a duplicate user data buffer, so update
3234                  * our statistics accordingly.
3235                  */
3236                 if (hdr->b_type == ARC_BUFC_DATA) {
3237                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3238                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3239                             -hdr->b_size);
3240                 }
3241                 hdr->b_datacnt -= 1;
3242                 arc_cksum_verify(buf);
3243                 arc_buf_unwatch(buf);
3244 
3245                 mutex_exit(hash_lock);
3246 
3247                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3248                 nhdr->b_size = blksz;
3249                 nhdr->b_spa = spa;
3250                 nhdr->b_type = type;
3251                 nhdr->b_buf = buf;
3252                 nhdr->b_state = arc_anon;
3253                 nhdr->b_arc_access = 0;
3254                 nhdr->b_flags = flags & ARC_L2_WRITING;
3255                 nhdr->b_l2hdr = NULL;
3256                 nhdr->b_datacnt = 1;
3257                 nhdr->b_freeze_cksum = NULL;
3258                 (void) refcount_add(&nhdr->b_refcnt, tag);
3259                 buf->b_hdr = nhdr;
3260                 mutex_exit(&buf->b_evict_lock);
3261                 atomic_add_64(&arc_anon->arcs_size, blksz);
3262         } else {
3263                 mutex_exit(&buf->b_evict_lock);
3264                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3265                 ASSERT(!list_link_active(&hdr->b_arc_node));
3266                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3267                 if (hdr->b_state != arc_anon)
3268                         arc_change_state(arc_anon, hdr, hash_lock);
3269                 hdr->b_arc_access = 0;
3270                 if (hash_lock)
3271                         mutex_exit(hash_lock);
3272 
3273                 buf_discard_identity(hdr);
3274                 arc_buf_thaw(buf);
3275         }
3276         buf->b_efunc = NULL;
3277         buf->b_private = NULL;
3278 
3279         if (l2hdr) {
3280                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3281                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3282                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3283                 mutex_exit(&l2arc_buflist_mtx);
3284         }
3285 }
3286 
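/*
 * Illustrative sketch (guarded out; not built as part of this file): the
 * release-before-modify pattern described above.  A caller that intends to
 * overwrite a cached buffer first makes it anonymous so that no other
 * reader can observe the modification; fill_new_contents() and the tag are
 * hypothetical.
 */
#if 0
static void
example_modify_cached_buf(arc_buf_t *buf, void *tag)
{
        if (!arc_released(buf))
                arc_release(buf, tag);
        /* buf is now anonymous; safe to overwrite its contents */
        fill_new_contents(buf->b_data, buf->b_hdr->b_size);
        /* the dirty buffer is later written out via arc_write() */
}
#endif
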
3287 int
3288 arc_released(arc_buf_t *buf)
3289 {
3290         int released;
3291 
3292         mutex_enter(&buf->b_evict_lock);
3293         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3294         mutex_exit(&buf->b_evict_lock);
3295         return (released);
3296 }
3297 
3298 int
3299 arc_has_callback(arc_buf_t *buf)
3300 {
3301         int callback;
3302 
3303         mutex_enter(&buf->b_evict_lock);
3304         callback = (buf->b_efunc != NULL);
3305         mutex_exit(&buf->b_evict_lock);
3306         return (callback);
3307 }
3308 
3309 #ifdef ZFS_DEBUG
3310 int
3311 arc_referenced(arc_buf_t *buf)
3312 {
3313         int referenced;
3314 
3315         mutex_enter(&buf->b_evict_lock);
3316         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3317         mutex_exit(&buf->b_evict_lock);
3318         return (referenced);
3319 }
3320 #endif
3321 
3322 static void
3323 arc_write_ready(zio_t *zio)
3324 {
3325         arc_write_callback_t *callback = zio->io_private;
3326         arc_buf_t *buf = callback->awcb_buf;
3327         arc_buf_hdr_t *hdr = buf->b_hdr;
3328 
3329         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3330         callback->awcb_ready(zio, buf, callback->awcb_private);
3331 
3332         /*
3333          * If the IO is already in progress, then this is a re-write
3334          * attempt, so we need to thaw and re-compute the cksum.
3335          * It is the responsibility of the callback to handle the
3336          * accounting for any re-write attempt.
3337          */
3338         if (HDR_IO_IN_PROGRESS(hdr)) {
3339                 mutex_enter(&hdr->b_freeze_lock);
3340                 if (hdr->b_freeze_cksum != NULL) {
3341                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3342                         hdr->b_freeze_cksum = NULL;
3343                 }
3344                 mutex_exit(&hdr->b_freeze_lock);
3345         }
3346         arc_cksum_compute(buf, B_FALSE);
3347         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3348 }
3349 
3350 static void
3351 arc_write_done(zio_t *zio)
3352 {
3353         arc_write_callback_t *callback = zio->io_private;
3354         arc_buf_t *buf = callback->awcb_buf;
3355         arc_buf_hdr_t *hdr = buf->b_hdr;
3356 
3357         ASSERT(hdr->b_acb == NULL);
3358 
3359         if (zio->io_error == 0) {
3360                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3361                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3362                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3363         } else {
3364                 ASSERT(BUF_EMPTY(hdr));
3365         }
3366 
3367         /*
3368          * If the block to be written was all-zero, we may have
3369          * compressed it away.  In this case no write was performed
3370          * so there will be no dva/birth/checksum.  The buffer must
3371          * therefore remain anonymous (and uncached).
3372          */
3373         if (!BUF_EMPTY(hdr)) {
3374                 arc_buf_hdr_t *exists;
3375                 kmutex_t *hash_lock;
3376 
3377                 ASSERT(zio->io_error == 0);
3378 
3379                 arc_cksum_verify(buf);
3380 
3381                 exists = buf_hash_insert(hdr, &hash_lock);
3382                 if (exists) {
3383                         /*
3384                          * This can only happen if we overwrite for
3385                          * sync-to-convergence, because we remove
3386                          * buffers from the hash table when we arc_free().
3387                          */
3388                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3389                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3390                                         panic("bad overwrite, hdr=%p exists=%p",
3391                                             (void *)hdr, (void *)exists);
3392                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3393                                 arc_change_state(arc_anon, exists, hash_lock);
3394                                 mutex_exit(hash_lock);
3395                                 arc_hdr_destroy(exists);
3396                                 exists = buf_hash_insert(hdr, &hash_lock);
3397                                 ASSERT3P(exists, ==, NULL);
3398                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3399                                 /* nopwrite */
3400                                 ASSERT(zio->io_prop.zp_nopwrite);
3401                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3402                                         panic("bad nopwrite, hdr=%p exists=%p",
3403                                             (void *)hdr, (void *)exists);
3404                         } else {
3405                                 /* Dedup */
3406                                 ASSERT(hdr->b_datacnt == 1);
3407                                 ASSERT(hdr->b_state == arc_anon);
3408                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3409                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3410                         }
3411                 }
3412                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3413                 /* if it's not anon, we are doing a scrub */
3414                 if (!exists && hdr->b_state == arc_anon)
3415                         arc_access(hdr, hash_lock);
3416                 mutex_exit(hash_lock);
3417         } else {
3418                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3419         }
3420 
3421         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3422         callback->awcb_done(zio, buf, callback->awcb_private);
3423 
3424         kmem_free(callback, sizeof (arc_write_callback_t));
3425 }
3426 
3427 zio_t *
3428 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3429     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3430     arc_done_func_t *ready, arc_done_func_t *done, void *private,
3431     int priority, int zio_flags, const zbookmark_t *zb)
3432 {
3433         arc_buf_hdr_t *hdr = buf->b_hdr;
3434         arc_write_callback_t *callback;
3435         zio_t *zio;
3436 
3437         ASSERT(ready != NULL);
3438         ASSERT(done != NULL);
3439         ASSERT(!HDR_IO_ERROR(hdr));
3440         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3441         ASSERT(hdr->b_acb == NULL);
3442         if (l2arc)
3443                 hdr->b_flags |= ARC_L2CACHE;
3444         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3445         callback->awcb_ready = ready;
3446         callback->awcb_done = done;
3447         callback->awcb_private = private;
3448         callback->awcb_buf = buf;
3449 
3450         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3451             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3452 
3453         return (zio);
3454 }
3455 
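/*
 * Illustrative sketch (guarded out; not built as part of this file): issuing
 * a write of a released (anonymous) buffer.  The ready callback fires once
 * the block pointer has been filled in; the done callback fires when the
 * write completes.  The example_* names are hypothetical, and zp/zb/txg are
 * assumed to come from the caller's write policy and syncing context.
 */
#if 0
static void
example_write_ready(zio_t *zio, arc_buf_t *buf, void *arg)
{
        /* e.g. record zio->io_bp in the caller's metadata */
}

static void
example_write_done(zio_t *zio, arc_buf_t *buf, void *arg)
{
        /* e.g. drop the caller's hold on buf */
}

static int
example_arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, const zio_prop_t *zp, const zbookmark_t *zb, void *arg)
{
        zio_t *zio;

        ASSERT(arc_released(buf));
        zio = arc_write(pio, spa, txg, bp, buf, B_FALSE /* no l2arc */, zp,
            example_write_ready, example_write_done, arg,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, zb);
        return (zio_wait(zio));
}
#endif
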
3456 static int
3457 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3458 {
3459 #ifdef _KERNEL
3460         uint64_t available_memory = ptob(freemem);
3461         static uint64_t page_load = 0;
3462         static uint64_t last_txg = 0;
3463 
3464 #if defined(__i386)
3465         available_memory =
3466             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3467 #endif
3468         if (available_memory >= zfs_write_limit_max)
3469                 return (0);
3470 
3471         if (txg > last_txg) {
3472                 last_txg = txg;
3473                 page_load = 0;
3474         }
3475         /*
3476          * If we are in pageout, we know that memory is already tight
3477          * and the ARC is already going to be evicting, so we just want
3478          * to continue to let page writes occur as quickly as possible.
3479          */
3480         if (curproc == proc_pageout) {
3481                 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3482                         return (SET_ERROR(ERESTART));
3483                 /* Note: reserve is inflated, so we deflate */
3484                 page_load += reserve / 8;
3485                 return (0);
3486         } else if (page_load > 0 && arc_reclaim_needed()) {
3487                 /* memory is low, delay before restarting */
3488                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3489                 return (SET_ERROR(EAGAIN));
3490         }
3491         page_load = 0;
3492 
3493         if (arc_size > arc_c_min) {
3494                 uint64_t evictable_memory =
3495                     arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3496                     arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3497                     arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3498                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3499                 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3500         }
3501 
3502         if (inflight_data > available_memory / 4) {
3503                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3504                 return (SET_ERROR(ERESTART));
3505         }
3506 #endif
3507         return (0);
3508 }
3509 
3510 void
3511 arc_tempreserve_clear(uint64_t reserve)
3512 {
3513         atomic_add_64(&arc_tempreserve, -reserve);
3514         ASSERT((int64_t)arc_tempreserve >= 0);
3515 }
3516 
3517 int
3518 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3519 {
3520         int error;
3521         uint64_t anon_size;
3522 
3523 #ifdef ZFS_DEBUG
3524         /*
3525          * Once in a while, fail for no reason.  Everything should cope.
3526          */
3527         if (spa_get_random(10000) == 0) {
3528                 dprintf("forcing random failure\n");
3529                 return (SET_ERROR(ERESTART));
3530         }
3531 #endif
3532         if (reserve > arc_c/4 && !arc_no_grow)
3533                 arc_c = MIN(arc_c_max, reserve * 4);
3534         if (reserve > arc_c)
3535                 return (SET_ERROR(ENOMEM));
3536 
3537         /*
3538          * Don't count loaned bufs as in flight dirty data to prevent long
3539          * network delays from blocking transactions that are ready to be
3540          * assigned to a txg.
3541          */
3542         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3543 
3544         /*
3545          * Writes will, almost always, require additional memory allocations
3546          * in order to compress/encrypt/etc the data.  We therefore need to
3547          * make sure that there is sufficient available memory for this.
3548          */
3549         if ((error = arc_memory_throttle(reserve, anon_size, txg)) != 0)
3550                 return (error);
3551 
3552         /*
3553          * Throttle writes when the amount of dirty data in the cache
3554          * gets too large.  We try to keep the cache less than half full
3555          * of dirty blocks so that our sync times don't grow too large.
3556          * Note: if two requests come in concurrently, we might let them
3557          * both succeed, when one of them should fail.  Not a huge deal.
3558          */
3559 
3560         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3561             anon_size > arc_c / 4) {
3562                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3563                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3564                     arc_tempreserve>>10,
3565                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3566                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3567                     reserve>>10, arc_c>>10);
3568                 return (SET_ERROR(ERESTART));
3569         }
3570         atomic_add_64(&arc_tempreserve, reserve);
3571         return (0);
3572 }
3573 
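/*
 * Illustrative sketch (guarded out; not built as part of this file): the
 * reserve/clear pattern a writer follows around arc_tempreserve_space().
 * On ERESTART or EAGAIN the caller is expected to back off and retry,
 * typically against a later txg.  The function name is hypothetical.
 */
#if 0
static int
example_reserve_dirty_space(uint64_t nbytes, uint64_t txg)
{
        int error;

        error = arc_tempreserve_space(nbytes, txg);
        if (error != 0)
                return (error);         /* back off and retry later */

        /* ... dirty up to nbytes of anonymous ARC data ... */

        /* clear once the reservation is no longer needed */
        arc_tempreserve_clear(nbytes);
        return (0);
}
#endif
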
3574 void
3575 arc_init(void)
3576 {
3577         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3578         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3579 
3580         /* Convert seconds to clock ticks */
3581         arc_min_prefetch_lifespan = 1 * hz;
3582 
3583         /* Start out with 1/8 of all memory */
3584         arc_c = physmem * PAGESIZE / 8;
3585 
3586 #ifdef _KERNEL
3587         /*
3588          * On architectures where the physical memory can be larger
3589          * than the addressable space (Intel in 32-bit mode), we may
3590          * need to limit the cache to 1/8 of VM size.
3591          */
3592         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3593 #endif
3594 
3595         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3596         arc_c_min = MAX(arc_c / 4, 64<<20);
3597         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3598         if (arc_c * 8 >= 1<<30)
3599                 arc_c_max = (arc_c * 8) - (1<<30);
3600         else
3601                 arc_c_max = arc_c_min;
3602         arc_c_max = MAX(arc_c * 6, arc_c_max);
3603 
3604         /*
3605          * Allow the tunables to override our calculations if they are
3606          * reasonable (i.e. over 64MB)
3607          */
3608         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3609                 arc_c_max = zfs_arc_max;
3610         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3611                 arc_c_min = zfs_arc_min;
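
        /*
         * For example (illustrative and platform-specific): on illumos a
         * 4GB cap and a 256MB floor are typically requested from /etc/system:
         *
         *      set zfs:zfs_arc_max = 0x100000000
         *      set zfs:zfs_arc_min = 0x10000000
         */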
3612 
3613         arc_c = arc_c_max;
3614         arc_p = (arc_c >> 1);
3615 
3616         /* limit meta-data to 1/4 of the arc capacity */
3617         arc_meta_limit = arc_c_max / 4;
3618 
3619         /* Allow the tunable to override if it is reasonable */
3620         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3621                 arc_meta_limit = zfs_arc_meta_limit;
3622 
3623         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3624                 arc_c_min = arc_meta_limit / 2;
3625 
3626         if (zfs_arc_grow_retry > 0)
3627                 arc_grow_retry = zfs_arc_grow_retry;
3628 
3629         if (zfs_arc_shrink_shift > 0)
3630                 arc_shrink_shift = zfs_arc_shrink_shift;
3631 
3632         if (zfs_arc_p_min_shift > 0)
3633                 arc_p_min_shift = zfs_arc_p_min_shift;
3634 
3635         /* if kmem_flags are set, let's try to use less memory */
3636         if (kmem_debugging())
3637                 arc_c = arc_c / 2;
3638         if (arc_c < arc_c_min)
3639                 arc_c = arc_c_min;
3640 
3641         arc_anon = &ARC_anon;
3642         arc_mru = &ARC_mru;
3643         arc_mru_ghost = &ARC_mru_ghost;
3644         arc_mfu = &ARC_mfu;
3645         arc_mfu_ghost = &ARC_mfu_ghost;
3646         arc_l2c_only = &ARC_l2c_only;
3647         arc_size = 0;
3648 
3649         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3650         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3651         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3652         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3653         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3654         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3655 
3656         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3657             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3658         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3659             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3660         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3661             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3662         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3663             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3664         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3665             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3666         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3667             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3668         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3669             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3670         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3671             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3672         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3673             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3674         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3675             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3676 
3677         buf_init();
3678 
3679         arc_thread_exit = 0;
3680         arc_eviction_list = NULL;
3681         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3682         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3683 
3684         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3685             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3686 
3687         if (arc_ksp != NULL) {
3688                 arc_ksp->ks_data = &arc_stats;
3689                 kstat_install(arc_ksp);
3690         }
3691 
3692         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3693             TS_RUN, minclsyspri);
3694 
3695         arc_dead = FALSE;
3696         arc_warm = B_FALSE;
3697 
3698         if (zfs_write_limit_max == 0)
3699                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3700         else
3701                 zfs_write_limit_shift = 0;
3702         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3703 }
3704 
3705 void
3706 arc_fini(void)
3707 {
3708         mutex_enter(&arc_reclaim_thr_lock);
3709         arc_thread_exit = 1;
3710         while (arc_thread_exit != 0)
3711                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3712         mutex_exit(&arc_reclaim_thr_lock);
3713 
3714         arc_flush(NULL);
3715 
3716         arc_dead = TRUE;
3717 
3718         if (arc_ksp != NULL) {
3719                 kstat_delete(arc_ksp);
3720                 arc_ksp = NULL;
3721         }
3722 
3723         mutex_destroy(&arc_eviction_mtx);
3724         mutex_destroy(&arc_reclaim_thr_lock);
3725         cv_destroy(&arc_reclaim_thr_cv);
3726 
3727         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3728         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3729         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3730         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3731         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3732         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3733         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3734         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3735 
3736         mutex_destroy(&arc_anon->arcs_mtx);
3737         mutex_destroy(&arc_mru->arcs_mtx);
3738         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3739         mutex_destroy(&arc_mfu->arcs_mtx);
3740         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3741         mutex_destroy(&arc_l2c_only->arcs_mtx);
3742 
3743         mutex_destroy(&zfs_write_limit_lock);
3744 
3745         buf_fini();
3746 
3747         ASSERT(arc_loaned_bytes == 0);
3748 }
3749 
3750 /*
3751  * Level 2 ARC
3752  *
3753  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3754  * It uses dedicated storage devices to hold cached data, which are populated
3755  * using large infrequent writes.  The main role of this cache is to boost
3756  * the performance of random read workloads.  The intended L2ARC devices
3757  * include short-stroked disks, solid state disks, and other media with
3758  * substantially faster read latency than disk.
3759  *
3760  *                 +-----------------------+
3761  *                 |         ARC           |
3762  *                 +-----------------------+
3763  *                    |         ^     ^
3764  *                    |         |     |
3765  *      l2arc_feed_thread()    arc_read()
3766  *                    |         |     |
3767  *                    |  l2arc read   |
3768  *                    V         |     |
3769  *               +---------------+    |
3770  *               |     L2ARC     |    |
3771  *               +---------------+    |
3772  *                   |    ^           |
3773  *          l2arc_write() |           |
3774  *                   |    |           |
3775  *                   V    |           |
3776  *                 +-------+      +-------+
3777  *                 | vdev  |      | vdev  |
3778  *                 | cache |      | cache |
3779  *                 +-------+      +-------+
3780  *                 +=========+     .-----.
3781  *                 :  L2ARC  :    |-_____-|
3782  *                 : devices :    | Disks |
3783  *                 +=========+    `-_____-'
3784  *
3785  * Read requests are satisfied from the following sources, in order:
3786  *
3787  *      1) ARC
3788  *      2) vdev cache of L2ARC devices
3789  *      3) L2ARC devices
3790  *      4) vdev cache of disks
3791  *      5) disks
3792  *
3793  * Some L2ARC device types exhibit extremely slow write performance.
3794  * To accommodate this, there are some significant differences between
3795  * the L2ARC and traditional cache design:
3796  *
3797  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3798  * the ARC behave as usual, freeing buffers and placing headers on ghost
3799  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3800  * this would add inflated write latencies for all ARC memory pressure.
3801  *
3802  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3803  * It does this by periodically scanning buffers from the eviction-end of
3804  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3805  * not already there.  It scans until a headroom of buffers is satisfied,
3806  * which itself is a buffer for ARC eviction.  The thread that does this is
3807  * l2arc_feed_thread(), illustrated below; example sizes are included to
3808  * provide a better sense of ratio than this diagram:
3809  *
3810  *             head -->                        tail
3811  *              +---------------------+----------+
3812  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3813  *              +---------------------+----------+   |   o L2ARC eligible
3814  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3815  *              +---------------------+----------+   |
3816  *                   15.9 Gbytes      ^ 32 Mbytes    |
3817  *                                 headroom          |
3818  *                                            l2arc_feed_thread()
3819  *                                                   |
3820  *                       l2arc write hand <--[oooo]--'
3821  *                               |           8 Mbyte
3822  *                               |          write max
3823  *                               V
3824  *                +==============================+
3825  *      L2ARC dev |####|#|###|###|    |####| ... |
3826  *                +==============================+
3827  *                           32 Gbytes
3828  *
3829  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3830  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3831  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3832  * safe to say that this is an uncommon case, since buffers at the end of
3833  * the ARC lists have moved there due to inactivity.
3834  *
3835  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3836  * then the L2ARC simply misses copying some buffers.  This serves as a
3837  * pressure valve to prevent heavy read workloads from both stalling the ARC
3838  * with waits and clogging the L2ARC with writes.  This also helps prevent
3839  * the potential for the L2ARC to churn if it attempts to cache content too
3840  * quickly, such as during backups of the entire pool.
3841  *
3842  * 5. After system boot and before the ARC has filled main memory, there are
3843  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3844  * lists can remain mostly static.  Instead of searching from the tail of
3845  * these lists as pictured, the l2arc_feed_thread() will search from the list
3846  * heads for eligible buffers, greatly increasing its chance of finding them.
3847  *
3848  * The L2ARC device write speed is also boosted during this time so that
3849  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3850  * there are no L2ARC reads, and no fear of degrading read performance
3851  * through increased writes.
3852  *
3853  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3854  * the vdev queue can aggregate them into larger and fewer writes.  Each
3855  * device is written to in a rotor fashion, sweeping writes through
3856  * available space then repeating.
3857  *
3858  * 7. The L2ARC does not store dirty content.  It never needs to flush
3859  * write buffers back to disk based storage.
3860  *
3861  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3862  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3863  *
3864  * The performance of the L2ARC can be tweaked by a number of tunables, which
3865  * may be necessary for different workloads:
3866  *
3867  *      l2arc_write_max         max write bytes per interval
3868  *      l2arc_write_boost       extra write bytes during device warmup
3869  *      l2arc_noprefetch        skip caching prefetched buffers
3870  *      l2arc_headroom          number of max device writes to precache
3871  *      l2arc_feed_secs         seconds between L2ARC writing
3872  *
3873  * Tunables may be removed or added as future performance improvements are
3874  * integrated, and also may become zpool properties.
3875  *
3876  * There are three key functions that control how the L2ARC warms up:
3877  *
3878  *      l2arc_write_eligible()  check if a buffer is eligible to cache
3879  *      l2arc_write_size()      calculate how much to write
3880  *      l2arc_write_interval()  calculate sleep delay between writes
3881  *
3882  * These three functions determine what to write, how much, and how quickly
3883  * to send writes.
3884  */
3885 
3886 static boolean_t
3887 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3888 {
3889         /*
3890          * A buffer is *not* eligible for the L2ARC if it:
3891          * 1. belongs to a different spa.
3892          * 2. is already cached on the L2ARC.
3893          * 3. has an I/O in progress (it may be an incomplete read).
3894          * 4. is flagged not eligible (zfs property).
3895          */
3896         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3897             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3898                 return (B_FALSE);
3899 
3900         return (B_TRUE);
3901 }
3902 
3903 static uint64_t
3904 l2arc_write_size(l2arc_dev_t *dev)
3905 {
3906         uint64_t size;
3907 
3908         size = dev->l2ad_write;
3909 
3910         if (arc_warm == B_FALSE)
3911                 size += dev->l2ad_boost;
3912 
3913         return (size);
3915 }
3916 
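/*
 * For example (illustrative, assuming the defaults defined earlier in this
 * file: l2arc_write_max and l2arc_write_boost of 8MB each): while arc_warm
 * is still B_FALSE the feed thread may write up to 16MB per interval, and
 * once the ARC has warmed up this drops back to 8MB.
 */
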
3917 static clock_t
3918 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3919 {
3920         clock_t interval, next, now;
3921 
3922         /*
3923          * If the ARC lists are busy, increase our write rate; if the
3924          * lists are stale, idle back.  This is achieved by checking
3925          * how much we previously wrote - if it was more than half of
3926          * what we wanted, schedule the next write much sooner.
3927          */
3928         if (l2arc_feed_again && wrote > (wanted / 2))
3929                 interval = (hz * l2arc_feed_min_ms) / 1000;
3930         else
3931                 interval = hz * l2arc_feed_secs;
3932 
3933         now = ddi_get_lbolt();
3934         next = MAX(now, MIN(now + interval, began + interval));
3935 
3936         return (next);
3937 }
3938 
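/*
 * For example (illustrative, assuming the defaults defined earlier in this
 * file: l2arc_feed_secs = 1, l2arc_feed_min_ms = 200 and l2arc_feed_again
 * set): if the previous pass wrote 6MB of an 8MB target, the next feed is
 * scheduled 200ms after the previous one began; if it wrote only 1MB, the
 * next feed waits the full second.
 */
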
3939 static void
3940 l2arc_hdr_stat_add(void)
3941 {
3942         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3943         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3944 }
3945 
3946 static void
3947 l2arc_hdr_stat_remove(void)
3948 {
3949         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3950         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3951 }
3952 
3953 /*
3954  * Cycle through L2ARC devices.  This is how L2ARC load balances.
3955  * If a device is returned, this also returns holding the spa config lock.
3956  */
3957 static l2arc_dev_t *
3958 l2arc_dev_get_next(void)
3959 {
3960         l2arc_dev_t *first, *next = NULL;
3961 
3962         /*
3963          * Lock out the removal of spas (spa_namespace_lock), then removal
3964          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3965          * both locks will be dropped and a spa config lock held instead.
3966          */
3967         mutex_enter(&spa_namespace_lock);
3968         mutex_enter(&l2arc_dev_mtx);
3969 
3970         /* if there are no vdevs, there is nothing to do */
3971         if (l2arc_ndev == 0)
3972                 goto out;
3973 
3974         first = NULL;
3975         next = l2arc_dev_last;
3976         do {
3977                 /* loop around the list looking for a non-faulted vdev */
3978                 if (next == NULL) {
3979                         next = list_head(l2arc_dev_list);
3980                 } else {
3981                         next = list_next(l2arc_dev_list, next);
3982                         if (next == NULL)
3983                                 next = list_head(l2arc_dev_list);
3984                 }
3985 
3986                 /* if we have come back to the start, bail out */
3987                 if (first == NULL)
3988                         first = next;
3989                 else if (next == first)
3990                         break;
3991 
3992         } while (vdev_is_dead(next->l2ad_vdev));
3993 
3994         /* if we were unable to find any usable vdevs, return NULL */
3995         if (vdev_is_dead(next->l2ad_vdev))
3996                 next = NULL;
3997 
3998         l2arc_dev_last = next;
3999 
4000 out:
4001         mutex_exit(&l2arc_dev_mtx);
4002 
4003         /*
4004          * Grab the config lock to prevent the 'next' device from being
4005          * removed while we are writing to it.
4006          */
4007         if (next != NULL)
4008                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4009         mutex_exit(&spa_namespace_lock);
4010 
4011         return (next);
4012 }
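
     /*
      * l2arc_dev_get_next() rotation, for illustration only: with
      * l2arc_dev_list ordered A, B, C and l2arc_dev_last == A, the
      * candidates are examined in the order B, C, A; if B's vdev is dead it
      * is skipped, C is selected, and l2arc_dev_last becomes C for the
      * following pass.
      */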
4013 
4014 /*
4015  * Free buffers that were tagged for destruction.
4016  */
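     /*
      * Entries land on the l2arc_free_on_write list when a buffer's data has
      * to be released while an L2ARC write of that buffer is still in flight
      * (the enqueueing side lives earlier in this file).  Freeing the data out
      * from under the in-flight write would be unsafe, so the free is deferred
      * to this function, which l2arc_write_done() calls once the I/O completes.
      */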
4017 static void
4018 l2arc_do_free_on_write(void)
4019 {
4020         list_t *buflist;
4021         l2arc_data_free_t *df, *df_prev;
4022 
4023         mutex_enter(&l2arc_free_on_write_mtx);
4024         buflist = l2arc_free_on_write;
4025 
4026         for (df = list_tail(buflist); df; df = df_prev) {
4027                 df_prev = list_prev(buflist, df);
4028                 ASSERT(df->l2df_data != NULL);
4029                 ASSERT(df->l2df_func != NULL);
4030                 df->l2df_func(df->l2df_data, df->l2df_size);
4031                 list_remove(buflist, df);
4032                 kmem_free(df, sizeof (l2arc_data_free_t));
4033         }
4034 
4035         mutex_exit(&l2arc_free_on_write_mtx);
4036 }
4037 
4038 /*
4039  * A write to a cache device has completed.  Update all headers to allow
4040  * reads from these buffers to begin.
4041  */
4042 static void
4043 l2arc_write_done(zio_t *zio)
4044 {
4045         l2arc_write_callback_t *cb;
4046         l2arc_dev_t *dev;
4047         list_t *buflist;
4048         arc_buf_hdr_t *head, *ab, *ab_prev;
4049         l2arc_buf_hdr_t *abl2;
4050         kmutex_t *hash_lock;
4051 
4052         cb = zio->io_private;
4053         ASSERT(cb != NULL);
4054         dev = cb->l2wcb_dev;
4055         ASSERT(dev != NULL);
4056         head = cb->l2wcb_head;
4057         ASSERT(head != NULL);
4058         buflist = dev->l2ad_buflist;
4059         ASSERT(buflist != NULL);
4060         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4061             l2arc_write_callback_t *, cb);
4062 
4063         if (zio->io_error != 0)
4064                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4065 
4066         mutex_enter(&l2arc_buflist_mtx);
4067 
4068         /*
4069          * All writes completed, or an error was hit.
4070          */
4071         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4072                 ab_prev = list_prev(buflist, ab);
4073 
4074                 hash_lock = HDR_LOCK(ab);
4075                 if (!mutex_tryenter(hash_lock)) {
4076                         /*
4077                          * This buffer misses out.  It may be in the process
4078                          * of being evicted.  Its ARC_L2_WRITING flag is left
4079                          * set, denying reads to this buffer.
4080                          */
4081                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4082                         continue;
4083                 }
4084 
4085                 if (zio->io_error != 0) {
4086                         /*
4087                          * Error - drop L2ARC entry.
4088                          */
4089                         list_remove(buflist, ab);
4090                         abl2 = ab->b_l2hdr;
4091                         ab->b_l2hdr = NULL;
4092                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4093                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4094                 }
4095 
4096                 /*
4097                  * Allow ARC to begin reads to this L2ARC entry.
4098                  */
4099                 ab->b_flags &= ~ARC_L2_WRITING;
4100 
4101                 mutex_exit(hash_lock);
4102         }
4103 
4104         atomic_inc_64(&l2arc_writes_done);
4105         list_remove(buflist, head);
4106         kmem_cache_free(hdr_cache, head);
4107         mutex_exit(&l2arc_buflist_mtx);
4108 
4109         l2arc_do_free_on_write();
4110 
4111         kmem_free(cb, sizeof (l2arc_write_callback_t));
4112 }
4113 
4114 /*
4115  * A read from a cache device has completed.  Validate buffer contents before
4116  * handing over to the regular ARC routines.
4117  */
4118 static void
4119 l2arc_read_done(zio_t *zio)
4120 {
4121         l2arc_read_callback_t *cb;
4122         arc_buf_hdr_t *hdr;
4123         arc_buf_t *buf;
4124         kmutex_t *hash_lock;
4125         int equal;
4126 
4127         ASSERT(zio->io_vd != NULL);
4128         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4129 
4130         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4131 
4132         cb = zio->io_private;
4133         ASSERT(cb != NULL);
4134         buf = cb->l2rcb_buf;
4135         ASSERT(buf != NULL);
4136 
4137         hash_lock = HDR_LOCK(buf->b_hdr);
4138         mutex_enter(hash_lock);
4139         hdr = buf->b_hdr;
4140         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4141 
4142         /*
4143          * Check whether this buffer survived the L2ARC journey.
4144          */
4145         equal = arc_cksum_equal(buf);
4146         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4147                 mutex_exit(hash_lock);
4148                 zio->io_private = buf;
4149                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4150                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4151                 arc_read_done(zio);
4152         } else {
4153                 mutex_exit(hash_lock);
4154                 /*
4155                  * Buffer didn't survive caching.  Increment stats and
4156                  * reissue to the original storage device.
4157                  */
4158                 if (zio->io_error != 0) {
4159                         ARCSTAT_BUMP(arcstat_l2_io_error);
4160                 } else {
4161                         zio->io_error = SET_ERROR(EIO);
4162                 }
4163                 if (!equal)
4164                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4165 
4166                 /*
4167                  * If there's no waiter, issue an async i/o to the primary
4168                  * storage now.  If there *is* a waiter, the caller must
4169                  * issue the i/o in a context where it's OK to block.
4170                  */
4171                 if (zio->io_waiter == NULL) {
4172                         zio_t *pio = zio_unique_parent(zio);
4173 
4174                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4175 
4176                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4177                             buf->b_data, zio->io_size, arc_read_done, buf,
4178                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4179                 }
4180         }
4181 
4182         kmem_free(cb, sizeof (l2arc_read_callback_t));
4183 }
4184 
4185 /*
4186  * This is the priority order in which the L2ARC searches the ARC lists for
4187  * buffers to cache.  It is used within loops (0..3) to cycle through the
4188  * lists in the desired order.  This order can have a significant effect on
4189  * cache performance.
4190  *
4191  * Currently the metadata lists are hit first, MFU then MRU, followed by
4192  * the data lists, again MFU then MRU.  This function returns the selected
4193  * list locked, along with a pointer to its lock.
4194  */
4195 static list_t *
4196 l2arc_list_locked(int list_num, kmutex_t **lock)
4197 {
4198         list_t *list = NULL;
4199 
4200         ASSERT(list_num >= 0 && list_num <= 3);
4201 
4202         switch (list_num) {
4203         case 0:
4204                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4205                 *lock = &arc_mfu->arcs_mtx;
4206                 break;
4207         case 1:
4208                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4209                 *lock = &arc_mru->arcs_mtx;
4210                 break;
4211         case 2:
4212                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4213                 *lock = &arc_mfu->arcs_mtx;
4214                 break;
4215         case 3:
4216                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4217                 *lock = &arc_mru->arcs_mtx;
4218                 break;
4219         }
4220 
4221         ASSERT(!(MUTEX_HELD(*lock)));
4222         mutex_enter(*lock);
4223         return (list);
4224 }
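
     /*
      * A minimal sketch of the calling pattern (see l2arc_write_buffers()):
      * the selected list is returned with *lock held, and the caller must
      * drop that lock when it has finished with the list:
      *
      *	for (int try = 0; try <= 3; try++) {
      *		list = l2arc_list_locked(try, &list_lock);
      *		(walk the list)
      *		mutex_exit(list_lock);
      *	}
      */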
4225 
4226 /*
4227  * Evict buffers from the device write hand forward to the distance specified
4228  * in bytes.  The distance may span populated buffers, or it may span nothing.
4229  * This clears a region of the L2ARC device in preparation for writing.
4230  * If the 'all' boolean is set, every buffer is evicted.
4231  */
4232 static void
4233 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4234 {
4235         list_t *buflist;
4236         l2arc_buf_hdr_t *abl2;
4237         arc_buf_hdr_t *ab, *ab_prev;
4238         kmutex_t *hash_lock;
4239         uint64_t taddr;
4240 
4241         buflist = dev->l2ad_buflist;
4242 
4243         if (buflist == NULL)
4244                 return;
4245 
4246         if (!all && dev->l2ad_first) {
4247                 /*
4248                  * This is the first sweep through the device.  There is
4249                  * nothing to evict.
4250                  */
4251                 return;
4252         }
4253 
4254         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4255                 /*
4256                  * When nearing the end of the device, evict to the end
4257                  * before the device write hand jumps to the start.
4258                  */
4259                 taddr = dev->l2ad_end;
4260         } else {
4261                 taddr = dev->l2ad_hand + distance;
4262         }
4263         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4264             uint64_t, taddr, boolean_t, all);
4265 
4266 top:
4267         mutex_enter(&l2arc_buflist_mtx);
4268         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4269                 ab_prev = list_prev(buflist, ab);
4270 
4271                 hash_lock = HDR_LOCK(ab);
4272                 if (!mutex_tryenter(hash_lock)) {
4273                         /*
4274                          * Missed the hash lock.  Retry.
4275                          */
4276                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4277                         mutex_exit(&l2arc_buflist_mtx);
4278                         mutex_enter(hash_lock);
4279                         mutex_exit(hash_lock);
4280                         goto top;
4281                 }
4282 
4283                 if (HDR_L2_WRITE_HEAD(ab)) {
4284                         /*
4285                          * We hit a write head node.  Leave it for
4286                          * l2arc_write_done().
4287                          */
4288                         list_remove(buflist, ab);
4289                         mutex_exit(hash_lock);
4290                         continue;
4291                 }
4292 
4293                 if (!all && ab->b_l2hdr != NULL &&
4294                     (ab->b_l2hdr->b_daddr > taddr ||
4295                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4296                         /*
4297                          * We've evicted to the target address,
4298                          * or the end of the device.
4299                          */
4300                         mutex_exit(hash_lock);
4301                         break;
4302                 }
4303 
4304                 if (HDR_FREE_IN_PROGRESS(ab)) {
4305                         /*
4306                          * Already on the path to destruction.
4307                          */
4308                         mutex_exit(hash_lock);
4309                         continue;
4310                 }
4311 
4312                 if (ab->b_state == arc_l2c_only) {
4313                         ASSERT(!HDR_L2_READING(ab));
4314                         /*
4315                          * This doesn't exist in the ARC.  Destroy.
4316                          * arc_hdr_destroy() will call list_remove()
4317                          * and decrement arcstat_l2_size.
4318                          */
4319                         arc_change_state(arc_anon, ab, hash_lock);
4320                         arc_hdr_destroy(ab);
4321                 } else {
4322                         /*
4323                          * Invalidate issued or about to be issued
4324                          * reads, since we may be about to write
4325                          * over this location.
4326                          */
4327                         if (HDR_L2_READING(ab)) {
4328                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4329                                 ab->b_flags |= ARC_L2_EVICTED;
4330                         }
4331 
4332                         /*
4333                          * Tell ARC this no longer exists in L2ARC.
4334                          */
4335                         if (ab->b_l2hdr != NULL) {
4336                                 abl2 = ab->b_l2hdr;
4337                                 ab->b_l2hdr = NULL;
4338                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4339                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4340                         }
4341                         list_remove(buflist, ab);
4342 
4343                         /*
4344                          * This may have been left over after a
4345                          * failed write.
4346                          */
4347                         ab->b_flags &= ~ARC_L2_WRITING;
4348                 }
4349                 mutex_exit(hash_lock);
4350         }
4351         mutex_exit(&l2arc_buflist_mtx);
4352 
4353         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4354         dev->l2ad_evict = taddr;
4355 }
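
     /*
      * l2arc_evict() target arithmetic, with hypothetical device geometry:
      * given l2ad_start = 4MB, l2ad_end = 1024MB, l2ad_hand = 900MB and a
      * 16MB distance, the hand is still more than 2 * 16MB from the end, so
      * taddr = 900MB + 16MB = 916MB.  Once the hand reaches 992MB or beyond,
      * everything out to l2ad_end is evicted instead, so that the hand can
      * wrap cleanly back to l2ad_start.
      */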
4356 
4357 /*
4358  * Find and write ARC buffers to the L2ARC device.
4359  *
4360  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4361  * for reading until they have completed writing.
4362  */
4363 static uint64_t
4364 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4365 {
4366         arc_buf_hdr_t *ab, *ab_prev, *head;
4367         l2arc_buf_hdr_t *hdrl2;
4368         list_t *list;
4369         uint64_t passed_sz, write_sz, buf_sz, headroom;
4370         void *buf_data;
4371         kmutex_t *hash_lock, *list_lock;
4372         boolean_t have_lock, full;
4373         l2arc_write_callback_t *cb;
4374         zio_t *pio, *wzio;
4375         uint64_t guid = spa_load_guid(spa);
4376 
4377         ASSERT(dev->l2ad_vdev != NULL);
4378 
4379         pio = NULL;
4380         write_sz = 0;
4381         full = B_FALSE;
4382         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4383         head->b_flags |= ARC_L2_WRITE_HEAD;
4384 
4385         /*
4386          * Copy buffers for L2ARC writing.
4387          */
4388         mutex_enter(&l2arc_buflist_mtx);
4389         for (int try = 0; try <= 3; try++) {
4390                 list = l2arc_list_locked(try, &list_lock);
4391                 passed_sz = 0;
4392 
4393                 /*
4394                  * L2ARC fast warmup.
4395                  *
4396                  * Until the ARC is warm and starts to evict, read from the
4397                  * head of the ARC lists rather than the tail.
4398                  */
4399                 headroom = target_sz * l2arc_headroom;
4400                 if (arc_warm == B_FALSE)
4401                         ab = list_head(list);
4402                 else
4403                         ab = list_tail(list);
4404 
4405                 for (; ab; ab = ab_prev) {
4406                         if (arc_warm == B_FALSE)
4407                                 ab_prev = list_next(list, ab);
4408                         else
4409                                 ab_prev = list_prev(list, ab);
4410 
4411                         hash_lock = HDR_LOCK(ab);
4412                         have_lock = MUTEX_HELD(hash_lock);
4413                         if (!have_lock && !mutex_tryenter(hash_lock)) {
4414                                 /*
4415                                  * Skip this buffer rather than waiting.
4416                                  */
4417                                 continue;
4418                         }
4419 
4420                         passed_sz += ab->b_size;
4421                         if (passed_sz > headroom) {
4422                                 /*
4423                                  * Searched too far.
4424                                  */
4425                                 mutex_exit(hash_lock);
4426                                 break;
4427                         }
4428 
4429                         if (!l2arc_write_eligible(guid, ab)) {
4430                                 mutex_exit(hash_lock);
4431                                 continue;
4432                         }
4433 
4434                         if ((write_sz + ab->b_size) > target_sz) {
4435                                 full = B_TRUE;
4436                                 mutex_exit(hash_lock);
4437                                 break;
4438                         }
4439 
4440                         if (pio == NULL) {
4441                                 /*
4442                                  * Insert a dummy header on the buflist so
4443                                  * l2arc_write_done() can find where the
4444                                  * write buffers begin without searching.
4445                                  */
4446                                 list_insert_head(dev->l2ad_buflist, head);
4447 
4448                                 cb = kmem_alloc(
4449                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4450                                 cb->l2wcb_dev = dev;
4451                                 cb->l2wcb_head = head;
4452                                 pio = zio_root(spa, l2arc_write_done, cb,
4453                                     ZIO_FLAG_CANFAIL);
4454                         }
4455 
4456                         /*
4457                          * Create and add a new L2ARC header.
4458                          */
4459                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4460                         hdrl2->b_dev = dev;
4461                         hdrl2->b_daddr = dev->l2ad_hand;
4462 
4463                         ab->b_flags |= ARC_L2_WRITING;
4464                         ab->b_l2hdr = hdrl2;
4465                         list_insert_head(dev->l2ad_buflist, ab);
4466                         buf_data = ab->b_buf->b_data;
4467                         buf_sz = ab->b_size;
4468 
4469                         /*
4470                          * Compute and store the buffer cksum before
4471                          * writing.  On debug the cksum is verified first.
4472                          */
4473                         arc_cksum_verify(ab->b_buf);
4474                         arc_cksum_compute(ab->b_buf, B_TRUE);
4475 
4476                         mutex_exit(hash_lock);
4477 
4478                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4479                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4480                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4481                             ZIO_FLAG_CANFAIL, B_FALSE);
4482 
4483                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4484                             zio_t *, wzio);
4485                         (void) zio_nowait(wzio);
4486 
4487                         /*
4488                          * Keep the clock hand suitably device-aligned.
4489                          */
4490                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4491 
4492                         write_sz += buf_sz;
4493                         dev->l2ad_hand += buf_sz;
4494                 }
4495 
4496                 mutex_exit(list_lock);
4497 
4498                 if (full == B_TRUE)
4499                         break;
4500         }
4501         mutex_exit(&l2arc_buflist_mtx);
4502 
4503         if (pio == NULL) {
4504                 ASSERT0(write_sz);
4505                 kmem_cache_free(hdr_cache, head);
4506                 return (0);
4507         }
4508 
4509         ASSERT3U(write_sz, <=, target_sz);
4510         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4511         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4512         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4513         vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4514 
4515         /*
4516          * Bump device hand to the device start if it is approaching the end.
4517          * l2arc_evict() will already have evicted ahead for this case.
4518          */
4519         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4520                 vdev_space_update(dev->l2ad_vdev,
4521                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4522                 dev->l2ad_hand = dev->l2ad_start;
4523                 dev->l2ad_evict = dev->l2ad_start;
4524                 dev->l2ad_first = B_FALSE;
4525         }
4526 
4527         dev->l2ad_writing = B_TRUE;
4528         (void) zio_wait(pio);
4529         dev->l2ad_writing = B_FALSE;
4530 
4531         return (write_sz);
4532 }
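
     /*
      * Scan-depth illustration for l2arc_write_buffers(), assuming the usual
      * default of l2arc_headroom = 2 (a tunable): with a 16MB target_sz, at
      * most roughly 32MB worth of buffers is examined on each of the four
      * lists before the search gives up on that list, so a single feed never
      * walks arbitrarily far into a busy ARC list.
      */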
4533 
4534 /*
4535  * This thread feeds the L2ARC at regular intervals.  This is the beating
4536  * heart of the L2ARC.
4537  */
4538 static void
4539 l2arc_feed_thread(void)
4540 {
4541         callb_cpr_t cpr;
4542         l2arc_dev_t *dev;
4543         spa_t *spa;
4544         uint64_t size, wrote;
4545         clock_t begin, next = ddi_get_lbolt();
4546 
4547         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4548 
4549         mutex_enter(&l2arc_feed_thr_lock);
4550 
4551         while (l2arc_thread_exit == 0) {
4552                 CALLB_CPR_SAFE_BEGIN(&cpr);
4553                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4554                     next);
4555                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4556                 next = ddi_get_lbolt() + hz;
4557 
4558                 /*
4559                  * Quick check for L2ARC devices.
4560                  */
4561                 mutex_enter(&l2arc_dev_mtx);
4562                 if (l2arc_ndev == 0) {
4563                         mutex_exit(&l2arc_dev_mtx);
4564                         continue;
4565                 }
4566                 mutex_exit(&l2arc_dev_mtx);
4567                 begin = ddi_get_lbolt();
4568 
4569                 /*
4570                  * This selects the next l2arc device to write to, and in
4571                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4572                  * will return NULL if there are now no l2arc devices or if
4573                  * they are all faulted.
4574                  *
4575                  * If a device is returned, its spa's config lock is also
4576                  * held to prevent device removal.  l2arc_dev_get_next()
4577                  * will grab and release l2arc_dev_mtx.
4578                  */
4579                 if ((dev = l2arc_dev_get_next()) == NULL)
4580                         continue;
4581 
4582                 spa = dev->l2ad_spa;
4583                 ASSERT(spa != NULL);
4584 
4585                 /*
4586                  * If the pool is read-only then force the feed thread to
4587                  * sleep a little longer.
4588                  */
4589                 if (!spa_writeable(spa)) {
4590                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4591                         spa_config_exit(spa, SCL_L2ARC, dev);
4592                         continue;
4593                 }
4594 
4595                 /*
4596                  * Avoid contributing to memory pressure.
4597                  */
4598                 if (arc_reclaim_needed()) {
4599                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4600                         spa_config_exit(spa, SCL_L2ARC, dev);
4601                         continue;
4602                 }
4603 
4604                 ARCSTAT_BUMP(arcstat_l2_feeds);
4605 
4606                 size = l2arc_write_size(dev);
4607 
4608                 /*
4609                  * Evict L2ARC buffers that will be overwritten.
4610                  */
4611                 l2arc_evict(dev, size, B_FALSE);
4612 
4613                 /*
4614                  * Write ARC buffers.
4615                  */
4616                 wrote = l2arc_write_buffers(spa, dev, size);
4617 
4618                 /*
4619                  * Calculate interval between writes.
4620                  */
4621                 next = l2arc_write_interval(begin, size, wrote);
4622                 spa_config_exit(spa, SCL_L2ARC, dev);
4623         }
4624 
4625         l2arc_thread_exit = 0;
4626         cv_broadcast(&l2arc_feed_thr_cv);
4627         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
4628         thread_exit();
4629 }
4630 
4631 boolean_t
4632 l2arc_vdev_present(vdev_t *vd)
4633 {
4634         l2arc_dev_t *dev;
4635 
4636         mutex_enter(&l2arc_dev_mtx);
4637         for (dev = list_head(l2arc_dev_list); dev != NULL;
4638             dev = list_next(l2arc_dev_list, dev)) {
4639                 if (dev->l2ad_vdev == vd)
4640                         break;
4641         }
4642         mutex_exit(&l2arc_dev_mtx);
4643 
4644         return (dev != NULL);
4645 }
4646 
4647 /*
4648  * Add a vdev for use by the L2ARC.  By this point the spa has already
4649  * validated the vdev and opened it.
4650  */
4651 void
4652 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4653 {
4654         l2arc_dev_t *adddev;
4655 
4656         ASSERT(!l2arc_vdev_present(vd));
4657 
4658         /*
4659          * Create a new l2arc device entry.
4660          */
4661         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4662         adddev->l2ad_spa = spa;
4663         adddev->l2ad_vdev = vd;
4664         adddev->l2ad_write = l2arc_write_max;
4665         adddev->l2ad_boost = l2arc_write_boost;
4666         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4667         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4668         adddev->l2ad_hand = adddev->l2ad_start;
4669         adddev->l2ad_evict = adddev->l2ad_start;
4670         adddev->l2ad_first = B_TRUE;
4671         adddev->l2ad_writing = B_FALSE;
4672         ASSERT3U(adddev->l2ad_write, >, 0);
4673 
4674         /*
4675          * This is a list of all ARC buffers that are still valid on the
4676          * device.
4677          */
4678         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4679         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4680             offsetof(arc_buf_hdr_t, b_l2node));
4681 
4682         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4683 
4684         /*
4685          * Add device to global list
4686          */
4687         mutex_enter(&l2arc_dev_mtx);
4688         list_insert_head(l2arc_dev_list, adddev);
4689         atomic_inc_64(&l2arc_ndev);
4690         mutex_exit(&l2arc_dev_mtx);
4691 }
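
     /*
      * Address-range example for l2arc_add_vdev(), with hypothetical numbers:
      * VDEV_LABEL_START_SIZE is reserved at the front of the device for the
      * two leading vdev labels and the boot area (4MB with the standard label
      * and boot block sizes), so a cache device with a minimum asize of 100GB
      * is written as roughly [l2ad_start, l2ad_end) = [4MB, 4MB + 100GB), with
      * both the write hand and the eviction hand starting at l2ad_start.
      */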
4692 
4693 /*
4694  * Remove a vdev from the L2ARC.
4695  */
4696 void
4697 l2arc_remove_vdev(vdev_t *vd)
4698 {
4699         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4700 
4701         /*
4702          * Find the device by vdev
4703          */
4704         mutex_enter(&l2arc_dev_mtx);
4705         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4706                 nextdev = list_next(l2arc_dev_list, dev);
4707                 if (vd == dev->l2ad_vdev) {
4708                         remdev = dev;
4709                         break;
4710                 }
4711         }
4712         ASSERT(remdev != NULL);
4713 
4714         /*
4715          * Remove device from global list
4716          */
4717         list_remove(l2arc_dev_list, remdev);
4718         l2arc_dev_last = NULL;          /* may have been invalidated */
4719         atomic_dec_64(&l2arc_ndev);
4720         mutex_exit(&l2arc_dev_mtx);
4721 
4722         /*
4723          * Clear all buflists and ARC references.  L2ARC device flush.
4724          */
4725         l2arc_evict(remdev, 0, B_TRUE);
4726         list_destroy(remdev->l2ad_buflist);
4727         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4728         kmem_free(remdev, sizeof (l2arc_dev_t));
4729 }
4730 
4731 void
4732 l2arc_init(void)
4733 {
4734         l2arc_thread_exit = 0;
4735         l2arc_ndev = 0;
4736         l2arc_writes_sent = 0;
4737         l2arc_writes_done = 0;
4738 
4739         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4740         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4741         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4742         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4743         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4744 
4745         l2arc_dev_list = &L2ARC_dev_list;
4746         l2arc_free_on_write = &L2ARC_free_on_write;
4747         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4748             offsetof(l2arc_dev_t, l2ad_node));
4749         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4750             offsetof(l2arc_data_free_t, l2df_list_node));
4751 }
4752 
4753 void
4754 l2arc_fini(void)
4755 {
4756         /*
4757          * This is called from dmu_fini(), which is called from spa_fini().
4758          * Because of this, we can assume that all l2arc devices have
4759          * already been removed when the pools themselves were removed.
4760          */
4761 
4762         l2arc_do_free_on_write();
4763 
4764         mutex_destroy(&l2arc_feed_thr_lock);
4765         cv_destroy(&l2arc_feed_thr_cv);
4766         mutex_destroy(&l2arc_dev_mtx);
4767         mutex_destroy(&l2arc_buflist_mtx);
4768         mutex_destroy(&l2arc_free_on_write_mtx);
4769 
4770         list_destroy(l2arc_dev_list);
4771         list_destroy(l2arc_free_on_write);
4772 }
4773 
4774 void
4775 l2arc_start(void)
4776 {
4777         if (!(spa_mode_global & FWRITE))
4778                 return;
4779 
4780         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4781             TS_RUN, minclsyspri);
4782 }
4783 
4784 void
4785 l2arc_stop(void)
4786 {
4787         if (!(spa_mode_global & FWRITE))
4788                 return;
4789 
4790         mutex_enter(&l2arc_feed_thr_lock);
4791         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
4792         l2arc_thread_exit = 1;
4793         while (l2arc_thread_exit != 0)
4794                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4795         mutex_exit(&l2arc_feed_thr_lock);
4796 }