illumos-3748.3 Old usr/src/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 /*
  28  * DVA-based Adjustable Replacement Cache
  29  *
  30  * While much of the theory of operation used here is
  31  * based on the self-tuning, low overhead replacement cache
  32  * presented by Megiddo and Modha at FAST 2003, there are some
  33  * significant differences:
  34  *
  35  * 1. The Megiddo and Modha model assumes any page is evictable.
  36  * Pages in its cache cannot be "locked" into memory.  This makes
  37  * the eviction algorithm simple: evict the last page in the list.
  38  * This also makes the performance characteristics easy to reason
  39  * about.  Our cache is not so simple.  At any given moment, some
  40  * subset of the blocks in the cache are un-evictable because we
  41  * have handed out a reference to them.  Blocks are only evictable
  42  * when there are no external references active.  This makes
  43  * eviction far more problematic:  we choose to evict the evictable
  44  * blocks that are the "lowest" in the list.
  45  *
  46  * There are times when it is not possible to evict the requested
  47  * space.  In these circumstances we are unable to adjust the cache
  48  * size.  To prevent the cache growing unbounded at these times we
  49  * implement a "cache throttle" that slows the flow of new data
  50  * into the cache until we can make space available.
  51  *
  52  * 2. The Megiddo and Modha model assumes a fixed cache size.
  53  * Pages are evicted when the cache is full and there is a cache
  54  * miss.  Our model has a variable sized cache.  It grows with
  55  * high use, but also tries to react to memory pressure from the
  56  * operating system: decreasing its size when system memory is
  57  * tight.
  58  *
  59  * 3. The Megiddo and Modha model assumes a fixed page size. All
  60  * elements of the cache are therefore exactly the same size.  So
  61  * when adjusting the cache size following a cache miss, it's simply
  62  * a matter of choosing a single page to evict.  In our model, we
  63  * have variable sized cache blocks (ranging from 512 bytes to
  64  * 128K bytes).  We therefore choose a set of blocks to evict to make
  65  * space for a cache miss that approximates as closely as possible
  66  * the space used by the new block.
  67  *
  68  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69  * by N. Megiddo & D. Modha, FAST 2003
  70  */
  71 
  72 /*
  73  * The locking model:
  74  *
  75  * A new reference to a cache buffer can be obtained in two
  76  * ways: 1) via a hash table lookup using the DVA as a key,
  77  * or 2) via one of the ARC lists.  The arc_read() interface
  78  * uses method 1, while the internal arc algorithms for
  79  * adjusting the cache use method 2.  We therefore provide two
  80  * types of locks: 1) the hash table lock array, and 2) the
  81  * arc list locks.
  82  *
  83  * Buffers do not have their own mutexes, rather they rely on the
  84  * hash table mutexes for the bulk of their protection (i.e. most
  85  * fields in the arc_buf_hdr_t are protected by these mutexes).
  86  *
  87  * buf_hash_find() returns the appropriate mutex (held) when it
  88  * locates the requested buffer in the hash table.  It returns
  89  * NULL for the mutex if the buffer was not in the table.
  90  *
  91  * buf_hash_remove() expects the appropriate hash mutex to be
  92  * already held before it is invoked.
  93  *
  94  * Each arc state also has a mutex which is used to protect the
  95  * buffer list associated with the state.  When attempting to
  96  * obtain a hash table lock while holding an arc list lock you
  97  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  98  * the active state mutex must be held before the ghost state mutex.
  99  *
 100  * Arc buffers may have an associated eviction callback function.
 101  * This function will be invoked prior to removing the buffer (e.g.
 102  * in arc_do_user_evicts()).  Note however that the data associated
 103  * with the buffer may be evicted prior to the callback.  The callback
 104  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 105  * the users of callbacks must ensure that their private data is
 106  * protected from simultaneous callbacks from arc_buf_evict()
 107  * and arc_do_user_evicts().
 108  *
 109  * Note that the majority of the performance stats are manipulated
 110  * with atomic operations.
 111  *
 112  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 113  *
 114  *      - L2ARC buflist creation
 115  *      - L2ARC buflist eviction
 116  *      - L2ARC write completion, which walks L2ARC buflists
 117  *      - ARC header destruction, as it removes from L2ARC buflists
 118  *      - ARC header release, as it removes from L2ARC buflists
 119  */
 120 
 121 #include <sys/spa.h>
 122 #include <sys/zio.h>
 123 #include <sys/zfs_context.h>
 124 #include <sys/arc.h>
 125 #include <sys/refcount.h>
 126 #include <sys/vdev.h>
 127 #include <sys/vdev_impl.h>
 128 #ifdef _KERNEL
 129 #include <sys/vmsystm.h>
 130 #include <vm/anon.h>
 131 #include <sys/fs/swapnode.h>
 132 #include <sys/dnlc.h>
 133 #endif
 134 #include <sys/callb.h>
 135 #include <sys/kstat.h>
 136 #include <zfs_fletcher.h>
 137 
 138 #ifndef _KERNEL
     /* Only compiled when _KERNEL is not defined. */
 139 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 140 boolean_t arc_watch = B_FALSE;
     /* NOTE(review): presumably the fd used to install those watchpoints -- confirm */
 141 int arc_procfd;
 142 #endif
 143 
     /*
      * Reclaim thread coordination.
      * NOTE(review): arc_reclaim_thr_lock is presumably the mutex paired with
      * arc_reclaim_thr_cv, and arc_thread_exit the shutdown handshake flag --
      * the thread itself is outside this part of the file; confirm there.
      */
 144 static kmutex_t         arc_reclaim_thr_lock;
 145 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 146 static uint8_t          arc_thread_exit;
 147 
     /*
      * Declared extern: these three objects are not defined in the visible
      * part of this file and are only referenced from here.
      */
 148 extern int zfs_write_limit_shift;
 149 extern uint64_t zfs_write_limit_max;
 150 extern kmutex_t zfs_write_limit_lock;
 151 
     /*
      * Compile-time default (3) and the variable that starts out holding it.
      * NOTE(review): presumably the percentage of the DNLC to drop during
      * reclaim -- confirm at the use site.
      */
 152 #define ARC_REDUCE_DNLC_PERCENT 3
 153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154 
     /* Selects between the two reclaim modes named by the enumerators below. */
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159 
 160 /* number of seconds before growing cache again */
 161 static int              arc_grow_retry = 60;
 162 
 163 /* shift of arc_c for calculating both min and max arc_p */
 164 static int              arc_p_min_shift = 4;
 165 
 166 /* log2(fraction of arc to reclaim); the default of 5 means 1/32 */
 167 static int              arc_shrink_shift = 5;
 168 
 169 /*
 170  * minimum lifespan of a prefetch block in clock ticks
 171  * (initialized in arc_init())
 172  */
 173 static int              arc_min_prefetch_lifespan;
 174 
     /*
      * NOTE(review): undocumented in the original; presumably set once the ARC
      * has been shut down -- confirm at the use sites.
      */
 175 static int arc_dead;
 176 
 177 /*
 178  * The arc has filled available memory and has now warmed up.
 179  */
 180 static boolean_t arc_warm;
 181 
 182 /*
 183  * These tunables are for performance analysis.
      *
      * All of them start out as 0 (explicitly, or implicitly as uninitialized
      * globals).  NOTE(review): zfs_arc_grow_retry, zfs_arc_shrink_shift and
      * zfs_arc_p_min_shift share their names with the static arc_grow_retry,
      * arc_shrink_shift and arc_p_min_shift defaults; presumably a non-zero
      * value here overrides the matching default -- confirm in arc_init().
 184  */
 185 uint64_t zfs_arc_max;
 186 uint64_t zfs_arc_min;
 187 uint64_t zfs_arc_meta_limit = 0;
 188 int zfs_arc_grow_retry = 0;
 189 int zfs_arc_shrink_shift = 0;
 190 int zfs_arc_p_min_shift = 0;
 191 int zfs_disable_dup_eviction = 0;
 192 
 193 /*
 194  * Note that buffers can be in one of 6 states:
 195  *      ARC_anon        - anonymous (discussed below)
 196  *      ARC_mru         - recently used, currently cached
 197  *      ARC_mru_ghost   - recently used, no longer in cache
 198  *      ARC_mfu         - frequently used, currently cached
 199  *      ARC_mfu_ghost   - frequently used, no longer in cache
 200  *      ARC_l2c_only    - exists in L2ARC but not other states
 201  * When there are no active references to the buffer, they are
 202  * linked onto a list in one of these arc states.  These are
 203  * the only buffers that can be evicted or deleted.  Within each
 204  * state there are multiple lists, one for meta-data and one for
 205  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 206  * etc.) is tracked separately so that it can be managed more
 207  * explicitly: favored over data, limited explicitly.
 208  *
 209  * Anonymous buffers are buffers that are not associated with
 210  * a DVA.  These are buffers that hold dirty block copies
 211  * before they are written to stable storage.  By definition,
 212  * they are "ref'd" and are considered part of arc_mru
 213  * that cannot be freed.  Generally, they will acquire a DVA
 214  * as they are written and migrate onto the arc_mru list.
 215  *
 216  * The ARC_l2c_only state is for buffers that are in the second
 217  * level ARC but no longer in any of the ARC_m* lists.  The second
 218  * level ARC itself may also contain buffers that are in any of
 219  * the ARC_m* states - meaning that a buffer can exist in two
 220  * places.  The reason for the ARC_l2c_only state is to keep the
 221  * buffer header in the hash table, so that reads that hit the
 222  * second level ARC benefit from these fast lookups.
 223  */
 224 
     /*
      * Bookkeeping for one ARC state.  arcs_list and arcs_lsize each hold
      * ARC_BUFC_NUMTYPES entries, i.e. one per buffer contents type.
      */
 225 typedef struct arc_state {
 226         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 227         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 228         uint64_t arcs_size;     /* total amount of data in this state */
 229         kmutex_t arcs_mtx;      /* protects this state's buffer lists */
 230 } arc_state_t;
 231 
 232 /*
      * The 6 states:
      * NOTE(review): these are the storage objects; the lowercase arc_anon,
      * arc_mru, ... pointers declared later are presumably pointed at them
      * during initialization (not visible here) -- confirm in arc_init().
      */
 233 static arc_state_t ARC_anon;
 234 static arc_state_t ARC_mru;
 235 static arc_state_t ARC_mru_ghost;
 236 static arc_state_t ARC_mfu;
 237 static arc_state_t ARC_mfu_ghost;
 238 static arc_state_t ARC_l2c_only;
 239 
     /*
      * The ARC's named kstat counters, one kstat_named_t per statistic.
      *
      * The arc_stats object is filled in by a positional initializer, so the
      * order of the fields here must stay in step with the order of the
      * entries in that initializer.
      */
 240 typedef struct arc_stats {
 241         kstat_named_t arcstat_hits;
 242         kstat_named_t arcstat_misses;
 243         kstat_named_t arcstat_demand_data_hits;
 244         kstat_named_t arcstat_demand_data_misses;
 245         kstat_named_t arcstat_demand_metadata_hits;
 246         kstat_named_t arcstat_demand_metadata_misses;
 247         kstat_named_t arcstat_prefetch_data_hits;
 248         kstat_named_t arcstat_prefetch_data_misses;
 249         kstat_named_t arcstat_prefetch_metadata_hits;
 250         kstat_named_t arcstat_prefetch_metadata_misses;
 251         kstat_named_t arcstat_mru_hits;
 252         kstat_named_t arcstat_mru_ghost_hits;
 253         kstat_named_t arcstat_mfu_hits;
 254         kstat_named_t arcstat_mfu_ghost_hits;
 255         kstat_named_t arcstat_deleted;
 256         kstat_named_t arcstat_recycle_miss;
 257         kstat_named_t arcstat_mutex_miss;
 258         kstat_named_t arcstat_evict_skip;
 259         kstat_named_t arcstat_evict_l2_cached;
 260         kstat_named_t arcstat_evict_l2_eligible;
 261         kstat_named_t arcstat_evict_l2_ineligible;
 262         kstat_named_t arcstat_hash_elements;
 263         kstat_named_t arcstat_hash_elements_max;
 264         kstat_named_t arcstat_hash_collisions;
 265         kstat_named_t arcstat_hash_chains;
 266         kstat_named_t arcstat_hash_chain_max;
 267         kstat_named_t arcstat_p;
 268         kstat_named_t arcstat_c;
 269         kstat_named_t arcstat_c_min;
 270         kstat_named_t arcstat_c_max;
 271         kstat_named_t arcstat_size;
 272         kstat_named_t arcstat_hdr_size;
 273         kstat_named_t arcstat_data_size;
 274         kstat_named_t arcstat_other_size;
 275         kstat_named_t arcstat_l2_hits;
 276         kstat_named_t arcstat_l2_misses;
 277         kstat_named_t arcstat_l2_feeds;
 278         kstat_named_t arcstat_l2_rw_clash;
 279         kstat_named_t arcstat_l2_read_bytes;
 280         kstat_named_t arcstat_l2_write_bytes;
 281         kstat_named_t arcstat_l2_writes_sent;
 282         kstat_named_t arcstat_l2_writes_done;
 283         kstat_named_t arcstat_l2_writes_error;
 284         kstat_named_t arcstat_l2_writes_hdr_miss;
 285         kstat_named_t arcstat_l2_evict_lock_retry;
 286         kstat_named_t arcstat_l2_evict_reading;
 287         kstat_named_t arcstat_l2_free_on_write;
 288         kstat_named_t arcstat_l2_abort_lowmem;
 289         kstat_named_t arcstat_l2_cksum_bad;
 290         kstat_named_t arcstat_l2_io_error;
 291         kstat_named_t arcstat_l2_size;
 292         kstat_named_t arcstat_l2_hdr_size;
 293         kstat_named_t arcstat_memory_throttle_count;
 294         kstat_named_t arcstat_duplicate_buffers;
 295         kstat_named_t arcstat_duplicate_buffers_size;
 296         kstat_named_t arcstat_duplicate_reads;
 297         kstat_named_t arcstat_meta_used;
 298         kstat_named_t arcstat_meta_limit;
 299         kstat_named_t arcstat_meta_max;
 300 } arc_stats_t;
 301 
     /*
      * The single arc_stats_t instance.  The initializer is positional: each
      * { name, type } pair fills the arc_stats_t field at the same position,
      * so entries must be added or reordered together with the struct.
      * Every counter is a KSTAT_DATA_UINT64.  The exported name is the field
      * name without its "arcstat_" prefix, except for the last three entries,
      * which are exported with an "arc_" prefix.
      */
 302 static arc_stats_t arc_stats = {
 303         { "hits",                       KSTAT_DATA_UINT64 },
 304         { "misses",                     KSTAT_DATA_UINT64 },
 305         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 306         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 307         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 308         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 309         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 310         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 311         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 312         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 313         { "mru_hits",                   KSTAT_DATA_UINT64 },
 314         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 315         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 316         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 317         { "deleted",                    KSTAT_DATA_UINT64 },
 318         { "recycle_miss",               KSTAT_DATA_UINT64 },
 319         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 320         { "evict_skip",                 KSTAT_DATA_UINT64 },
 321         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 322         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 323         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 324         { "hash_elements",              KSTAT_DATA_UINT64 },
 325         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 326         { "hash_collisions",            KSTAT_DATA_UINT64 },
 327         { "hash_chains",                KSTAT_DATA_UINT64 },
 328         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 329         { "p",                          KSTAT_DATA_UINT64 },
 330         { "c",                          KSTAT_DATA_UINT64 },
 331         { "c_min",                      KSTAT_DATA_UINT64 },
 332         { "c_max",                      KSTAT_DATA_UINT64 },
 333         { "size",                       KSTAT_DATA_UINT64 },
 334         { "hdr_size",                   KSTAT_DATA_UINT64 },
 335         { "data_size",                  KSTAT_DATA_UINT64 },
 336         { "other_size",                 KSTAT_DATA_UINT64 },
 337         { "l2_hits",                    KSTAT_DATA_UINT64 },
 338         { "l2_misses",                  KSTAT_DATA_UINT64 },
 339         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 340         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 341         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 342         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 343         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 344         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 345         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 346         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 347         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 348         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 349         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 350         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 351         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 352         { "l2_io_error",                KSTAT_DATA_UINT64 },
 353         { "l2_size",                    KSTAT_DATA_UINT64 },
 354         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 355         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 356         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 357         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 358         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 359         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 360         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 361         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 362 };
 363 
     /*
      * ARCSTAT(stat) names the 64-bit value of one arc_stats counter.  It
      * expands to an lvalue, so it can be read or assigned directly.
      */
 364 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 365 
     /*
      * Atomically add val to a counter.
      * NOTE(review): the expansion already ends in ';', so "ARCSTAT_INCR(s, v);"
      * leaves an extra empty statement behind.  That is harmless in a plain
      * statement list but breaks an unbraced "if (c) ARCSTAT_BUMP(s); else ...".
      * Always brace the surrounding if/else.
      */
 366 #define ARCSTAT_INCR(stat, val) \
 367         atomic_add_64(&arc_stats.stat.value.ui64, (val));
 368 
     /* Add 1 to, or subtract 1 from, a counter (both go through ARCSTAT_INCR). */
 369 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 370 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 371 
     /*
      * Raise a counter to val if val is larger.  The compare-and-swap is
      * retried for as long as val is still above the value just read and the
      * swap did not go through.  val is evaluated more than once, and the
      * expansion is a bare { } block rather than a single statement.
      */
 372 #define ARCSTAT_MAX(stat, val) {                                        \
 373         uint64_t m;                                                     \
 374         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 375             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 376                 continue;                                               \
 377 }
 378 
     /* Raise the counter named <stat>_max to the current value of <stat>. */
 379 #define ARCSTAT_MAXSTAT(stat) \
 380         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 381 
 382 /*
 383  * We define a macro to allow ARC hits/misses to be easily broken down by
 384  * two separate conditions, giving a total of four different subtypes for
 385  * each of hits and misses (so eight statistics total).
      *
      * The counter that gets bumped is arcstat_<a>_<b>_<stat>, where <a> is
      * stat1 when cond1 holds and notstat1 otherwise, and <b> is stat2 when
      * cond2 holds and notstat2 otherwise.
      *
      * NOTE(review): the expansion is a bare if/else statement, so using the
      * macro as the body of an unbraced if that has its own else would pair
      * that else with the wrong if.  Keep call sites braced.
 386  */
 387 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 388         if (cond1) {                                                    \
 389                 if (cond2) {                                            \
 390                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 391                 } else {                                                \
 392                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 393                 }                                                       \
 394         } else {                                                        \
 395                 if (cond2) {                                            \
 396                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 397                 } else {                                                \
 398                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 399                 }                                                       \
 400         }
 401 
     /*
      * The kstat handle and one pointer per ARC state.
      * NOTE(review): none of these is assigned in the visible part of the
      * file; presumably arc_init() points each arc_* pointer at the matching
      * ARC_* object and creates arc_ksp -- confirm there.
      */
 402 kstat_t                 *arc_ksp;
 403 static arc_state_t      *arc_anon;
 404 static arc_state_t      *arc_mru;
 405 static arc_state_t      *arc_mru_ghost;
 406 static arc_state_t      *arc_mfu;
 407 static arc_state_t      *arc_mfu_ghost;
 408 static arc_state_t      *arc_l2c_only;
 409 
 410 /*
 411  * There are several ARC variables that are critical to export as kstats --
 412  * but we don't want to have to grovel around in the kstat whenever we wish to
 413  * manipulate them.  For these variables, we therefore define them to be in
 414  * terms of the statistic variable.  This assures that we are not introducing
 415  * the possibility of inconsistency by having shadow copies of the variables,
 416  * while still allowing the code to be readable.
      *
      * Each name below expands to ARCSTAT(...), i.e. to the ui64 value of the
      * matching arc_stats field.  A plain read or assignment through one of
      * these names is therefore an ordinary, non-atomic access to that field.
 417  */
 418 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 419 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 420 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 421 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 422 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 423 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 424 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 425 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 426 
 427 static int              arc_no_grow;    /* Don't try to grow cache size */
     /*
      * NOTE(review): the next two are undocumented in the original.  Going by
      * the names they are byte counts for temporarily reserved space and for
      * buffers on loan -- confirm at the use sites.
      */
 428 static uint64_t         arc_tempreserve;
 429 static uint64_t         arc_loaned_bytes;
 430 
     /* Forward typedef; struct l2arc_buf_hdr is defined with the L2ARC internals. */
 431 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 432 
 433 typedef struct arc_callback arc_callback_t;
 434 
     /*
      * One callback record.  Records chain through acb_next, and
      * arc_buf_hdr.b_acb holds a pointer of this type.
      * NOTE(review): presumably one record per caller waiting on a read of
      * the header's block, with acb_done invoked on completion -- confirm.
      */
 435 struct arc_callback {
 436         void                    *acb_private;
 437         arc_done_func_t         *acb_done;
 438         arc_buf_t               *acb_buf;
 439         zio_t                   *acb_zio_dummy;
 440         arc_callback_t          *acb_next;
 441 };
 442 
 443 typedef struct arc_write_callback arc_write_callback_t;
 444 
     /*
      * Callback record for a write: two arc_done_func_t hooks (awcb_ready and
      * awcb_done), the caller's private pointer, and the buffer involved.
      * NOTE(review): presumably awcb_ready fires when the write is ready to
      * issue and awcb_done when it completes -- confirm in the write path.
      */
 445 struct arc_write_callback {
 446         void            *awcb_private;
 447         arc_done_func_t *awcb_ready;
 448         arc_done_func_t *awcb_done;
 449         arc_buf_t       *awcb_buf;
 450 };
 451 
     /*
      * Header describing one ARC buffer.
      *
      * (b_spa, b_dva, b_birth) is the identity that BUF_EQUAL() compares and
      * buf_hash() hashes.  b_hash_next chains the headers that share a hash
      * bucket; buf_hash_find() walks that chain.  The section comments below
      * give the protection rule for each group of fields.
      */
 452 struct arc_buf_hdr {
 453         /* protected by hash lock */
 454         dva_t                   b_dva;
 455         uint64_t                b_birth;
 456         uint64_t                b_cksum0;
 457 
 458         kmutex_t                b_freeze_lock;
 459         zio_cksum_t             *b_freeze_cksum;
 460         void                    *b_thawed;
 461 
 462         arc_buf_hdr_t           *b_hash_next;   /* next hdr in hash bucket */
 463         arc_buf_t               *b_buf;
 464         uint32_t                b_flags;        /* ARC_* flag bits */
 465         uint32_t                b_datacnt;
 466 
 467         arc_callback_t          *b_acb;
 468         kcondvar_t              b_cv;
 469 
 470         /* immutable */
 471         arc_buf_contents_t      b_type;
 472         uint64_t                b_size;
 473         uint64_t                b_spa;
 474 
 475         /* protected by arc state mutex */
 476         arc_state_t             *b_state;
 477         list_node_t             b_arc_node;
 478 
 479         /* updated atomically */
 480         clock_t                 b_arc_access;
 481 
 482         /* self protecting */
 483         refcount_t              b_refcnt;
 484 
 485         l2arc_buf_hdr_t         *b_l2hdr;
 486         list_node_t             b_l2node;
 487 };
 488 
     /*
      * Eviction list state, followed by forward declarations of static
      * functions that are defined later in the file.
      * NOTE(review): arc_eviction_mtx presumably protects arc_eviction_list --
      * confirm at the use sites.
      */
 489 static arc_buf_t *arc_eviction_list;
 490 static kmutex_t arc_eviction_mtx;
 491 static arc_buf_hdr_t arc_eviction_hdr;
 492 static void arc_get_data_buf(arc_buf_t *buf);
 493 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 494 static int arc_evict_needed(arc_buf_contents_t type);
 495 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 496 static void arc_buf_watch(arc_buf_t *buf);
 497 
 498 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 499 
     /*
      * True when state is arc_mru_ghost, arc_mfu_ghost or arc_l2c_only.
      * The argument is evaluated up to three times.
      */
 500 #define GHOST_STATE(state)      \
 501         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 502         (state) == arc_l2c_only)
 503 
 504 /*
 505  * Private ARC flags.  These flags are private ARC only flags that will show up
 506  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 507  * be passed in as arc_flags in things like arc_read.  However, these flags
 508  * should never be passed and should only be set by ARC code.  When adding new
 509  * public flags, make sure not to smash the private ones.
      *
      * The private flags occupy bits 9 through 18 of b_flags.
 510  */
 511 
 512 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
 513 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
 514 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
 515 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
 516 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
 517 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
 518 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
 519 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
 520 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
 521 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
 522 
     /*
      * Flag tests on a header.  Apart from HDR_L2_READING, each one yields the
      * masked bit itself (zero or the flag value), not 0/1, so use the result
      * only in a boolean context.  HDR_PREFETCH and HDR_L2CACHE test
      * ARC_PREFETCH and ARC_L2CACHE, which are not among the private bits
      * defined above.  HDR_L2_READING is true when I/O is in progress and the
      * header has an L2ARC header attached.
      */
 523 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 524 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 525 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 526 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 527 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 528 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 529 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 530 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 531 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
 532                                     (hdr)->b_l2hdr != NULL)
 533 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 534 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 535 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 536 
 537 /*
 538  * Other sizes
      *
      * The sizes of the two header structures, cast to a signed 64-bit type.
      * NOTE(review): presumably signed so that a negated size can be passed
      * as a statistics delta -- confirm at the use sites.
 539  */
 540 
 541 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 542 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 543 
 544 /*
 545  * Hash table routines
      *
      * ht_table is an array of hash chains indexed by (hash & ht_mask); see
      * BUF_HASH_INDEX.  The chains are covered by a fixed array of BUF_LOCKS
      * mutexes, and the lock for a chain is picked with
      * (index & (BUF_LOCKS - 1)), so BUF_LOCKS must stay a power of two and
      * several chains can share one lock.
      *
      * In _KERNEL builds each lock is padded out to HT_LOCK_PAD (64) bytes.
      * NOTE(review): presumably to give every lock its own cache line --
      * confirm; this requires sizeof (kmutex_t) <= HT_LOCK_PAD.
 546  */
 547 
 548 #define HT_LOCK_PAD     64
 549 
 550 struct ht_lock {
 551         kmutex_t        ht_lock;
 552 #ifdef _KERNEL
 553         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 554 #endif
 555 };
 556 
 557 #define BUF_LOCKS 256
 558 typedef struct buf_hash_table {
 559         uint64_t ht_mask;
 560         arc_buf_hdr_t **ht_table;
 561         struct ht_lock ht_locks[BUF_LOCKS];
 562 } buf_hash_table_t;
 563 
 564 static buf_hash_table_t buf_hash_table;
 565 
     /*
      * Hash index and hash lock lookup.
      *
      * BUF_HASH_INDEX(spa, dva, birth) hashes an identity and masks the
      *     result down to an index into buf_hash_table.ht_table.
      * BUF_HASH_LOCK_NTRY(idx) selects the ht_lock entry covering index idx.
      * BUF_HASH_LOCK(idx) yields a pointer to the kmutex_t inside that entry.
      * HDR_LOCK(hdr) yields the hash lock covering hdr's current identity.
      *
      * The idx and hdr arguments are parenthesized in the expansions so that
      * any expression (e.g. "a | b" or "*hdrp") can be passed safely.
      */
 566 #define BUF_HASH_INDEX(spa, dva, birth) \
 567         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 568 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
 569 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 570 #define HDR_LOCK(hdr) \
 571         (BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
                 (hdr)->b_birth)))
 572 
     /*
      * 256-entry lookup table used by buf_hash() for its CRC step.
      * buf_hash() asserts that entry 128 equals ZFS_CRC64_POLY, so the table
      * has to be filled in before the first hash is computed; that is not
      * done in the visible part of this file.
      */
 573 uint64_t zfs_crc64_table[256];
 574 
 575 /*
 576  * Level 2 ARC
      *
      * Compile-time defaults.  Each L2ARC_* constant below is the initial
      * value of one (or, for L2ARC_WRITE_SIZE, two) of the l2arc_* tunables.
 577  */
 578 
 579 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 580 #define L2ARC_HEADROOM          2               /* num of writes */
 581 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 582 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 583 
     /* Shorthand for two arc_stats counters, same scheme as arc_size et al. */
 584 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 585 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 586 
 587 /*
 588  * L2ARC Performance Tunables
      *
      * l2arc_write_max and l2arc_write_boost both start at L2ARC_WRITE_SIZE
      * (8MB); the three booleans all default to B_TRUE.
 589  */
 590 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 591 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 592 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 593 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 594 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 595 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 596 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 597 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 597 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 598 
 599 /*
 600  * L2ARC Internals
 601  */
 602 typedef struct l2arc_dev {
 603         vdev_t                  *l2ad_vdev;     /* vdev */
 604         spa_t                   *l2ad_spa;      /* spa */
 605         uint64_t                l2ad_hand;      /* next write location */
 606         uint64_t                l2ad_write;     /* desired write size, bytes */
 607         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 608         uint64_t                l2ad_start;     /* first addr on device */
 609         uint64_t                l2ad_end;       /* last addr on device */
 610         uint64_t                l2ad_evict;     /* last addr eviction reached */
 611         boolean_t               l2ad_first;     /* first sweep through */
 612         boolean_t               l2ad_writing;   /* currently writing */
 613         list_t                  *l2ad_buflist;  /* buffer list */
 614         list_node_t             l2ad_node;      /* device list node */
 615 } l2arc_dev_t;
 616 
     /*
      * L2ARC global state.
      * NOTE(review): each capitalized list_t object is presumably the storage
      * that the lowercase pointer of the same name is pointed at during
      * initialization (not visible here) -- confirm in l2arc_init().
      */
 617 static list_t L2ARC_dev_list;                   /* device list */
 618 static list_t *l2arc_dev_list;                  /* device list pointer */
 619 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 620 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 621 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 622 static list_t L2ARC_free_on_write;              /* free after write buf list */
 623 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 624 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 625 static uint64_t l2arc_ndev;                     /* number of devices */
 626 
     /*
      * Context saved for an L2ARC read: the buffer being filled plus the
      * original spa, block pointer, bookmark and flags of the request.
      * NOTE(review): presumably consumed by l2arc_read_done(), e.g. to
      * reissue the read from the main pool on failure -- confirm there.
      */
 627 typedef struct l2arc_read_callback {
 628         arc_buf_t       *l2rcb_buf;             /* read buffer */
 629         spa_t           *l2rcb_spa;             /* spa */
 630         blkptr_t        l2rcb_bp;               /* original blkptr */
 631         zbookmark_t     l2rcb_zb;               /* original bookmark */
 632         int             l2rcb_flags;            /* original flags */
 633 } l2arc_read_callback_t;
 634 
     /*
      * Context saved for an L2ARC write: the target device and the header
      * that marks the head of the write buflist.
      */
 635 typedef struct l2arc_write_callback {
 636         l2arc_dev_t     *l2wcb_dev;             /* device info */
 637         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 638 } l2arc_write_callback_t;
 639 
     /*
      * L2ARC part of a buffer header; arc_buf_hdr.b_l2hdr points at one of
      * these.  It records which device holds the buffer and where.
      */
 640 struct l2arc_buf_hdr {
 641         /* protected by arc_buf_hdr mutex */
 642         l2arc_dev_t     *b_dev;                 /* L2ARC device */
 643         uint64_t        b_daddr;                /* disk address, offset byte */
 644 };
 645 
     /*
      * One deferred free: a data pointer, its size, and the function to call
      * on them, linked onto a list through l2df_list_node.
      * NOTE(review): presumably queued on l2arc_free_on_write and released
      * once the in-flight L2ARC write has finished -- confirm.
      */
 646 typedef struct l2arc_data_free {
 647         /* protected by l2arc_free_on_write_mtx */
 648         void            *l2df_data;
 649         size_t          l2df_size;
 650         void            (*l2df_func)(void *, size_t);
 651         list_node_t     l2df_list_node;
 652 } l2arc_data_free_t;
 653 
     /*
      * Feed thread coordination, followed by forward declarations of static
      * L2ARC functions defined later in the file.
      * NOTE(review): the lock/cv/exit-flag trio mirrors the reclaim thread's
      * arc_reclaim_thr_lock, arc_reclaim_thr_cv and arc_thread_exit;
      * presumably used the same way -- confirm in the feed thread.
      */
 654 static kmutex_t l2arc_feed_thr_lock;
 655 static kcondvar_t l2arc_feed_thr_cv;
 656 static uint8_t l2arc_thread_exit;
 657 
 658 static void l2arc_read_done(zio_t *zio);
 659 static void l2arc_hdr_stat_add(void);
 660 static void l2arc_hdr_stat_remove(void);
 661 
     /*
      * Hash a buffer identity (spa, dva, birth) to a 64-bit value.
      *
      * Every byte of *dva is folded through the zfs_crc64_table CRC step,
      * starting from an all-ones seed; the result is then XORed with
      * (spa >> 8) and with birth.  Callers reduce the value to a table index
      * by masking it (see BUF_HASH_INDEX).
      *
      * The DVA is only read, so it is walked through a const byte pointer,
      * and the byte index is a size_t to match sizeof (dva_t).
      */
 662 static uint64_t
 663 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 664 {
 665         const uint8_t *vdva = (const uint8_t *)dva;
 666         uint64_t crc = -1ULL;
 667         size_t i;
 668 
     	/* The table must already hold the polynomial at entry 128. */
 669         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 670 
 671         for (i = 0; i < sizeof (dva_t); i++)
 672                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 673 
 674         crc ^= (spa>>8) ^ birth;
 675 
 676         return (crc);
 677 }
 678 
 679 #define BUF_EMPTY(buf)                                          \
 680         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 681         (buf)->b_dva.dva_word[1] == 0 &&                     \
 682         (buf)->b_birth == 0)
 683 
 684 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 685         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 686         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 687         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 688 
 689 static void
 690 buf_discard_identity(arc_buf_hdr_t *hdr)
 691 {
 692         hdr->b_dva.dva_word[0] = 0;
 693         hdr->b_dva.dva_word[1] = 0;
 694         hdr->b_birth = 0;
 695         hdr->b_cksum0 = 0;
 696 }
 697 
 698 static arc_buf_hdr_t *
 699 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 700 {
 701         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 702         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 703         arc_buf_hdr_t *buf;
 704 
 705         mutex_enter(hash_lock);
 706         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 707             buf = buf->b_hash_next) {
 708                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 709                         *lockp = hash_lock;
 710                         return (buf);
 711                 }
 712         }
 713         mutex_exit(hash_lock);
 714         *lockp = NULL;
 715         return (NULL);
 716 }
 717 
 718 /*
 719  * Insert an entry into the hash table.  If there is already an element
 720  * equal to elem in the hash table, then the already existing element
 721  * will be returned and the new element will not be inserted.
 722  * Otherwise returns NULL.
 723  */
 724 static arc_buf_hdr_t *
 725 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 726 {
 727         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 728         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 729         arc_buf_hdr_t *fbuf;
 730         uint32_t i;
 731 
 732         ASSERT(!HDR_IN_HASH_TABLE(buf));
 733         *lockp = hash_lock;
 734         mutex_enter(hash_lock);
 735         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 736             fbuf = fbuf->b_hash_next, i++) {
 737                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 738                         return (fbuf);
 739         }
 740 
 741         buf->b_hash_next = buf_hash_table.ht_table[idx];
 742         buf_hash_table.ht_table[idx] = buf;
 743         buf->b_flags |= ARC_IN_HASH_TABLE;
 744 
 745         /* collect some hash table performance data */
 746         if (i > 0) {
 747                 ARCSTAT_BUMP(arcstat_hash_collisions);
 748                 if (i == 1)
 749                         ARCSTAT_BUMP(arcstat_hash_chains);
 750 
 751                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 752         }
 753 
 754         ARCSTAT_BUMP(arcstat_hash_elements);
 755         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 756 
 757         return (NULL);
 758 }
 759 
 760 static void
 761 buf_hash_remove(arc_buf_hdr_t *buf)
 762 {
 763         arc_buf_hdr_t *fbuf, **bufp;
 764         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 765 
 766         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 767         ASSERT(HDR_IN_HASH_TABLE(buf));
 768 
 769         bufp = &buf_hash_table.ht_table[idx];
 770         while ((fbuf = *bufp) != buf) {
 771                 ASSERT(fbuf != NULL);
 772                 bufp = &fbuf->b_hash_next;
 773         }
 774         *bufp = buf->b_hash_next;
 775         buf->b_hash_next = NULL;
 776         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 777 
 778         /* collect some hash table performance data */
 779         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 780 
 781         if (buf_hash_table.ht_table[idx] &&
 782             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 783                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 784 }
 785 
 786 /*
 787  * Global data structures and functions for the buf kmem cache.
 788  */
 789 static kmem_cache_t *hdr_cache;
 790 static kmem_cache_t *buf_cache;
 791 
 792 static void
 793 buf_fini(void)
 794 {
 795         int i;
 796 
 797         kmem_free(buf_hash_table.ht_table,
 798             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 799         for (i = 0; i < BUF_LOCKS; i++)
 800                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 801         kmem_cache_destroy(hdr_cache);
 802         kmem_cache_destroy(buf_cache);
 803 }
 804 
 805 /*
 806  * Constructor callback - called when the cache is empty
 807  * and a new buf is requested.
 808  */
 809 /* ARGSUSED */
 810 static int
 811 hdr_cons(void *vbuf, void *unused, int kmflag)
 812 {
 813         arc_buf_hdr_t *buf = vbuf;
 814 
 815         bzero(buf, sizeof (arc_buf_hdr_t));
 816         refcount_create(&buf->b_refcnt);
 817         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 818         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 819         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 820 
 821         return (0);
 822 }
 823 
 824 /* ARGSUSED */
 825 static int
 826 buf_cons(void *vbuf, void *unused, int kmflag)
 827 {
 828         arc_buf_t *buf = vbuf;
 829 
 830         bzero(buf, sizeof (arc_buf_t));
 831         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 832         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 833 
 834         return (0);
 835 }
 836 
 837 /*
 838  * Destructor callback - called when a cached buf is
 839  * no longer required.
 840  */
 841 /* ARGSUSED */
 842 static void
 843 hdr_dest(void *vbuf, void *unused)
 844 {
 845         arc_buf_hdr_t *buf = vbuf;
 846 
 847         ASSERT(BUF_EMPTY(buf));
 848         refcount_destroy(&buf->b_refcnt);
 849         cv_destroy(&buf->b_cv);
 850         mutex_destroy(&buf->b_freeze_lock);
 851         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 852 }
 853 
 854 /* ARGSUSED */
 855 static void
 856 buf_dest(void *vbuf, void *unused)
 857 {
 858         arc_buf_t *buf = vbuf;
 859 
 860         mutex_destroy(&buf->b_evict_lock);
 861         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 862 }
 863 
 864 /*
 865  * Reclaim callback -- invoked when memory is low.
 866  */
 867 /* ARGSUSED */
 868 static void
 869 hdr_recl(void *unused)
 870 {
 871         dprintf("hdr_recl called\n");
 872         /*
 873          * umem calls the reclaim func when we destroy the buf cache,
 874          * which is after we do arc_fini().
 875          */
 876         if (!arc_dead)
 877                 cv_signal(&arc_reclaim_thr_cv);
 878 }
 879 
 880 static void
 881 buf_init(void)
 882 {
 883         uint64_t *ct;
 884         uint64_t hsize = 1ULL << 12;
 885         int i, j;
 886 
 887         /*
 888          * The hash table is big enough to fill all of physical memory
 889          * with an average 64K block size.  The table will take up
 890          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 891          */
 892         while (hsize * 65536 < physmem * PAGESIZE)
 893                 hsize <<= 1;
 894 retry:
 895         buf_hash_table.ht_mask = hsize - 1;
 896         buf_hash_table.ht_table =
 897             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 898         if (buf_hash_table.ht_table == NULL) {
 899                 ASSERT(hsize > (1ULL << 8));
 900                 hsize >>= 1;
 901                 goto retry;
 902         }
 903 
 904         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 905             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 906         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 907             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 908 
 909         for (i = 0; i < 256; i++)
 910                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 911                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 912 
 913         for (i = 0; i < BUF_LOCKS; i++) {
 914                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 915                     NULL, MUTEX_DEFAULT, NULL);
 916         }
 917 }
 918 
 919 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 920 
 921 static void
 922 arc_cksum_verify(arc_buf_t *buf)
 923 {
 924         zio_cksum_t zc;
 925 
 926         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 927                 return;
 928 
 929         mutex_enter(&buf->b_hdr->b_freeze_lock);
 930         if (buf->b_hdr->b_freeze_cksum == NULL ||
 931             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 932                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 933                 return;
 934         }
 935         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 936         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 937                 panic("buffer modified while frozen!");
 938         mutex_exit(&buf->b_hdr->b_freeze_lock);
 939 }
 940 
 941 static int
 942 arc_cksum_equal(arc_buf_t *buf)
 943 {
 944         zio_cksum_t zc;
 945         int equal;
 946 
 947         mutex_enter(&buf->b_hdr->b_freeze_lock);
 948         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 949         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 950         mutex_exit(&buf->b_hdr->b_freeze_lock);
 951 
 952         return (equal);
 953 }
 954 
 955 static void
 956 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 957 {
 958         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 959                 return;
 960 
 961         mutex_enter(&buf->b_hdr->b_freeze_lock);
 962         if (buf->b_hdr->b_freeze_cksum != NULL) {
 963                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 964                 return;
 965         }
 966         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 967         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 968             buf->b_hdr->b_freeze_cksum);
 969         mutex_exit(&buf->b_hdr->b_freeze_lock);
 970         arc_buf_watch(buf);
 971 }
 972 
 973 #ifndef _KERNEL
 974 typedef struct procctl {
 975         long cmd;
 976         prwatch_t prwatch;
 977 } procctl_t;
 978 #endif
 979 
 980 /* ARGSUSED */
 981 static void
 982 arc_buf_unwatch(arc_buf_t *buf)
 983 {
 984 #ifndef _KERNEL
 985         if (arc_watch) {
 986                 int result;
 987                 procctl_t ctl;
 988                 ctl.cmd = PCWATCH;
 989                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 990                 ctl.prwatch.pr_size = 0;
 991                 ctl.prwatch.pr_wflags = 0;
 992                 result = write(arc_procfd, &ctl, sizeof (ctl));
 993                 ASSERT3U(result, ==, sizeof (ctl));
 994         }
 995 #endif
 996 }
 997 
 998 /* ARGSUSED */
 999 static void
1000 arc_buf_watch(arc_buf_t *buf)
1001 {
1002 #ifndef _KERNEL
1003         if (arc_watch) {
1004                 int result;
1005                 procctl_t ctl;
1006                 ctl.cmd = PCWATCH;
1007                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1008                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1009                 ctl.prwatch.pr_wflags = WA_WRITE;
1010                 result = write(arc_procfd, &ctl, sizeof (ctl));
1011                 ASSERT3U(result, ==, sizeof (ctl));
1012         }
1013 #endif
1014 }
1015 
1016 void
1017 arc_buf_thaw(arc_buf_t *buf)
1018 {
1019         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1020                 if (buf->b_hdr->b_state != arc_anon)
1021                         panic("modifying non-anon buffer!");
1022                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1023                         panic("modifying buffer while i/o in progress!");
1024                 arc_cksum_verify(buf);
1025         }
1026 
1027         mutex_enter(&buf->b_hdr->b_freeze_lock);
1028         if (buf->b_hdr->b_freeze_cksum != NULL) {
1029                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1030                 buf->b_hdr->b_freeze_cksum = NULL;
1031         }
1032 
1033         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1034                 if (buf->b_hdr->b_thawed)
1035                         kmem_free(buf->b_hdr->b_thawed, 1);
1036                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1037         }
1038 
1039         mutex_exit(&buf->b_hdr->b_freeze_lock);
1040 
1041         arc_buf_unwatch(buf);
1042 }
1043 
1044 void
1045 arc_buf_freeze(arc_buf_t *buf)
1046 {
1047         kmutex_t *hash_lock;
1048 
1049         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1050                 return;
1051 
1052         hash_lock = HDR_LOCK(buf->b_hdr);
1053         mutex_enter(hash_lock);
1054 
1055         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1056             buf->b_hdr->b_state == arc_anon);
1057         arc_cksum_compute(buf, B_FALSE);
1058         mutex_exit(hash_lock);
1059 
1060 }
1061 
1062 static void
1063 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1064 {
1065         ASSERT(MUTEX_HELD(hash_lock));
1066 
1067         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1068             (ab->b_state != arc_anon)) {
1069                 uint64_t delta = ab->b_size * ab->b_datacnt;
1070                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1071                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1072 
1073                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1074                 mutex_enter(&ab->b_state->arcs_mtx);
1075                 ASSERT(list_link_active(&ab->b_arc_node));
1076                 list_remove(list, ab);
1077                 if (GHOST_STATE(ab->b_state)) {
1078                         ASSERT0(ab->b_datacnt);
1079                         ASSERT3P(ab->b_buf, ==, NULL);
1080                         delta = ab->b_size;
1081                 }
1082                 ASSERT(delta > 0);
1083                 ASSERT3U(*size, >=, delta);
1084                 atomic_add_64(size, -delta);
1085                 mutex_exit(&ab->b_state->arcs_mtx);
1086                 /* remove the prefetch flag if we get a reference */
1087                 if (ab->b_flags & ARC_PREFETCH)
1088                         ab->b_flags &= ~ARC_PREFETCH;
1089         }
1090 }
1091 
1092 static int
1093 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1094 {
1095         int cnt;
1096         arc_state_t *state = ab->b_state;
1097 
1098         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1099         ASSERT(!GHOST_STATE(state));
1100 
1101         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1102             (state != arc_anon)) {
1103                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1104 
1105                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1106                 mutex_enter(&state->arcs_mtx);
1107                 ASSERT(!list_link_active(&ab->b_arc_node));
1108                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1109                 ASSERT(ab->b_datacnt > 0);
1110                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1111                 mutex_exit(&state->arcs_mtx);
1112         }
1113         return (cnt);
1114 }
1115 
1116 /*
1117  * Move the supplied buffer to the indicated state.  The mutex
1118  * for the buffer must be held by the caller.
1119  */
1120 static void
1121 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1122 {
1123         arc_state_t *old_state = ab->b_state;
1124         int64_t refcnt = refcount_count(&ab->b_refcnt);
1125         uint64_t from_delta, to_delta;
1126 
1127         ASSERT(MUTEX_HELD(hash_lock));
1128         ASSERT(new_state != old_state);
1129         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1130         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1131         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1132 
1133         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1134 
1135         /*
1136          * If this buffer is evictable, transfer it from the
1137          * old state list to the new state list.
1138          */
1139         if (refcnt == 0) {
1140                 if (old_state != arc_anon) {
1141                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1142                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1143 
1144                         if (use_mutex)
1145                                 mutex_enter(&old_state->arcs_mtx);
1146 
1147                         ASSERT(list_link_active(&ab->b_arc_node));
1148                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1149 
1150                         /*
1151                          * If prefetching out of the ghost cache,
1152                          * we will have a non-zero datacnt.
1153                          */
1154                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1155                                 /* ghost elements have a ghost size */
1156                                 ASSERT(ab->b_buf == NULL);
1157                                 from_delta = ab->b_size;
1158                         }
1159                         ASSERT3U(*size, >=, from_delta);
1160                         atomic_add_64(size, -from_delta);
1161 
1162                         if (use_mutex)
1163                                 mutex_exit(&old_state->arcs_mtx);
1164                 }
1165                 if (new_state != arc_anon) {
1166                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1167                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1168 
1169                         if (use_mutex)
1170                                 mutex_enter(&new_state->arcs_mtx);
1171 
1172                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1173 
1174                         /* ghost elements have a ghost size */
1175                         if (GHOST_STATE(new_state)) {
1176                                 ASSERT(ab->b_datacnt == 0);
1177                                 ASSERT(ab->b_buf == NULL);
1178                                 to_delta = ab->b_size;
1179                         }
1180                         atomic_add_64(size, to_delta);
1181 
1182                         if (use_mutex)
1183                                 mutex_exit(&new_state->arcs_mtx);
1184                 }
1185         }
1186 
1187         ASSERT(!BUF_EMPTY(ab));
1188         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1189                 buf_hash_remove(ab);
1190 
1191         /* adjust state sizes */
1192         if (to_delta)
1193                 atomic_add_64(&new_state->arcs_size, to_delta);
1194         if (from_delta) {
1195                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1196                 atomic_add_64(&old_state->arcs_size, -from_delta);
1197         }
1198         ab->b_state = new_state;
1199 
1200         /* adjust l2arc hdr stats */
1201         if (new_state == arc_l2c_only)
1202                 l2arc_hdr_stat_add();
1203         else if (old_state == arc_l2c_only)
1204                 l2arc_hdr_stat_remove();
1205 }
1206 
1207 void
1208 arc_space_consume(uint64_t space, arc_space_type_t type)
1209 {
1210         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1211 
1212         switch (type) {
1213         case ARC_SPACE_DATA:
1214                 ARCSTAT_INCR(arcstat_data_size, space);
1215                 break;
1216         case ARC_SPACE_OTHER:
1217                 ARCSTAT_INCR(arcstat_other_size, space);
1218                 break;
1219         case ARC_SPACE_HDRS:
1220                 ARCSTAT_INCR(arcstat_hdr_size, space);
1221                 break;
1222         case ARC_SPACE_L2HDRS:
1223                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1224                 break;
1225         }
1226 
1227         ARCSTAT_INCR(arcstat_meta_used, space);
1228         atomic_add_64(&arc_size, space);
1229 }
1230 
1231 void
1232 arc_space_return(uint64_t space, arc_space_type_t type)
1233 {
1234         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1235 
1236         switch (type) {
1237         case ARC_SPACE_DATA:
1238                 ARCSTAT_INCR(arcstat_data_size, -space);
1239                 break;
1240         case ARC_SPACE_OTHER:
1241                 ARCSTAT_INCR(arcstat_other_size, -space);
1242                 break;
1243         case ARC_SPACE_HDRS:
1244                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1245                 break;
1246         case ARC_SPACE_L2HDRS:
1247                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1248                 break;
1249         }
1250 
1251         ASSERT(arc_meta_used >= space);
1252         if (arc_meta_max < arc_meta_used)
1253                 arc_meta_max = arc_meta_used;
1254         ARCSTAT_INCR(arcstat_meta_used, -space);
1255         ASSERT(arc_size >= space);
1256         atomic_add_64(&arc_size, -space);
1257 }
1258 
1259 void *
1260 arc_data_buf_alloc(uint64_t size)
1261 {
1262         if (arc_evict_needed(ARC_BUFC_DATA))
1263                 cv_signal(&arc_reclaim_thr_cv);
1264         atomic_add_64(&arc_size, size);
1265         return (zio_data_buf_alloc(size));
1266 }
1267 
1268 void
1269 arc_data_buf_free(void *buf, uint64_t size)
1270 {
1271         zio_data_buf_free(buf, size);
1272         ASSERT(arc_size >= size);
1273         atomic_add_64(&arc_size, -size);
1274 }
1275 
1276 arc_buf_t *
1277 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1278 {
1279         arc_buf_hdr_t *hdr;
1280         arc_buf_t *buf;
1281 
1282         ASSERT3U(size, >, 0);
1283         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1284         ASSERT(BUF_EMPTY(hdr));
1285         hdr->b_size = size;
1286         hdr->b_type = type;
1287         hdr->b_spa = spa_load_guid(spa);
1288         hdr->b_state = arc_anon;
1289         hdr->b_arc_access = 0;
1290         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1291         buf->b_hdr = hdr;
1292         buf->b_data = NULL;
1293         buf->b_efunc = NULL;
1294         buf->b_private = NULL;
1295         buf->b_next = NULL;
1296         hdr->b_buf = buf;
1297         arc_get_data_buf(buf);
1298         hdr->b_datacnt = 1;
1299         hdr->b_flags = 0;
1300         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1301         (void) refcount_add(&hdr->b_refcnt, tag);
1302 
1303         return (buf);
1304 }
1305 
1306 static char *arc_onloan_tag = "onloan";
1307 
1308 /*
1309  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1310  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1311  * buffers must be returned to the arc before they can be used by the DMU or
1312  * freed.
1313  */
1314 arc_buf_t *
1315 arc_loan_buf(spa_t *spa, int size)
1316 {
1317         arc_buf_t *buf;
1318 
1319         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1320 
1321         atomic_add_64(&arc_loaned_bytes, size);
1322         return (buf);
1323 }
1324 
1325 /*
1326  * Return a loaned arc buffer to the arc.
1327  */
1328 void
1329 arc_return_buf(arc_buf_t *buf, void *tag)
1330 {
1331         arc_buf_hdr_t *hdr = buf->b_hdr;
1332 
1333         ASSERT(buf->b_data != NULL);
1334         (void) refcount_add(&hdr->b_refcnt, tag);
1335         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1336 
1337         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1338 }
1339 
1340 /* Detach an arc_buf from a dbuf (tag) */
1341 void
1342 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1343 {
1344         arc_buf_hdr_t *hdr;
1345 
1346         ASSERT(buf->b_data != NULL);
1347         hdr = buf->b_hdr;
1348         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1349         (void) refcount_remove(&hdr->b_refcnt, tag);
1350         buf->b_efunc = NULL;
1351         buf->b_private = NULL;
1352 
1353         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1354 }
1355 
1356 static arc_buf_t *
1357 arc_buf_clone(arc_buf_t *from)
1358 {
1359         arc_buf_t *buf;
1360         arc_buf_hdr_t *hdr = from->b_hdr;
1361         uint64_t size = hdr->b_size;
1362 
1363         ASSERT(hdr->b_state != arc_anon);
1364 
1365         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1366         buf->b_hdr = hdr;
1367         buf->b_data = NULL;
1368         buf->b_efunc = NULL;
1369         buf->b_private = NULL;
1370         buf->b_next = hdr->b_buf;
1371         hdr->b_buf = buf;
1372         arc_get_data_buf(buf);
1373         bcopy(from->b_data, buf->b_data, size);
1374 
1375         /*
1376          * This buffer already exists in the arc so create a duplicate
1377          * copy for the caller.  If the buffer is associated with user data
1378          * then track the size and number of duplicates.  These stats will be
1379          * updated as duplicate buffers are created and destroyed.
1380          */
1381         if (hdr->b_type == ARC_BUFC_DATA) {
1382                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1383                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1384         }
1385         hdr->b_datacnt += 1;
1386         return (buf);
1387 }
1388 
1389 void
1390 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1391 {
1392         arc_buf_hdr_t *hdr;
1393         kmutex_t *hash_lock;
1394 
1395         /*
1396          * Check to see if this buffer is evicted.  Callers
1397          * must verify b_data != NULL to know if the add_ref
1398          * was successful.
1399          */
1400         mutex_enter(&buf->b_evict_lock);
1401         if (buf->b_data == NULL) {
1402                 mutex_exit(&buf->b_evict_lock);
1403                 return;
1404         }
1405         hash_lock = HDR_LOCK(buf->b_hdr);
1406         mutex_enter(hash_lock);
1407         hdr = buf->b_hdr;
1408         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1409         mutex_exit(&buf->b_evict_lock);
1410 
1411         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1412         add_reference(hdr, hash_lock, tag);
1413         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1414         arc_access(hdr, hash_lock);
1415         mutex_exit(hash_lock);
1416         ARCSTAT_BUMP(arcstat_hits);
1417         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1418             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1419             data, metadata, hits);
1420 }
1421 
1422 /*
1423  * Free the arc data buffer.  If it is an l2arc write in progress,
1424  * the buffer is placed on l2arc_free_on_write to be freed later.
1425  */
1426 static void
1427 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1428 {
1429         arc_buf_hdr_t *hdr = buf->b_hdr;
1430 
1431         if (HDR_L2_WRITING(hdr)) {
1432                 l2arc_data_free_t *df;
1433                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1434                 df->l2df_data = buf->b_data;
1435                 df->l2df_size = hdr->b_size;
1436                 df->l2df_func = free_func;
1437                 mutex_enter(&l2arc_free_on_write_mtx);
1438                 list_insert_head(l2arc_free_on_write, df);
1439                 mutex_exit(&l2arc_free_on_write_mtx);
1440                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1441         } else {
1442                 free_func(buf->b_data, hdr->b_size);
1443         }
1444 }
1445 
1446 static void
1447 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1448 {
1449         arc_buf_t **bufp;
1450 
1451         /* free up data associated with the buf */
1452         if (buf->b_data) {
1453                 arc_state_t *state = buf->b_hdr->b_state;
1454                 uint64_t size = buf->b_hdr->b_size;
1455                 arc_buf_contents_t type = buf->b_hdr->b_type;
1456 
1457                 arc_cksum_verify(buf);
1458                 arc_buf_unwatch(buf);
1459 
1460                 if (!recycle) {
1461                         if (type == ARC_BUFC_METADATA) {
1462                                 arc_buf_data_free(buf, zio_buf_free);
1463                                 arc_space_return(size, ARC_SPACE_DATA);
1464                         } else {
1465                                 ASSERT(type == ARC_BUFC_DATA);
1466                                 arc_buf_data_free(buf, zio_data_buf_free);
1467                                 ARCSTAT_INCR(arcstat_data_size, -size);
1468                                 atomic_add_64(&arc_size, -size);
1469                         }
1470                 }
1471                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1472                         uint64_t *cnt = &state->arcs_lsize[type];
1473 
1474                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1475                         ASSERT(state != arc_anon);
1476 
1477                         ASSERT3U(*cnt, >=, size);
1478                         atomic_add_64(cnt, -size);
1479                 }
1480                 ASSERT3U(state->arcs_size, >=, size);
1481                 atomic_add_64(&state->arcs_size, -size);
1482                 buf->b_data = NULL;
1483 
1484                 /*
1485                  * If we're destroying a duplicate buffer make sure
1486                  * that the appropriate statistics are updated.
1487                  */
1488                 if (buf->b_hdr->b_datacnt > 1 &&
1489                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1490                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1491                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1492                 }
1493                 ASSERT(buf->b_hdr->b_datacnt > 0);
1494                 buf->b_hdr->b_datacnt -= 1;
1495         }
1496 
1497         /* only remove the buf if requested */
1498         if (!all)
1499                 return;
1500 
1501         /* remove the buf from the hdr list */
1502         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1503                 continue;
1504         *bufp = buf->b_next;
1505         buf->b_next = NULL;
1506 
1507         ASSERT(buf->b_efunc == NULL);
1508 
1509         /* clean up the buf */
1510         buf->b_hdr = NULL;
1511         kmem_cache_free(buf_cache, buf);
1512 }
1513 
1514 static void
1515 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1516 {
1517         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1518         ASSERT3P(hdr->b_state, ==, arc_anon);
1519         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1520         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1521 
1522         if (l2hdr != NULL) {
1523                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1524                 /*
1525                  * To prevent arc_free() and l2arc_evict() from
1526                  * attempting to free the same buffer at the same time,
1527                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1528                  * give it priority.  l2arc_evict() can't destroy this
1529                  * header while we are waiting on l2arc_buflist_mtx.
1530                  *
1531                  * The hdr may be removed from l2ad_buflist before we
1532                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1533                  */
1534                 if (!buflist_held) {
1535                         mutex_enter(&l2arc_buflist_mtx);
1536                         l2hdr = hdr->b_l2hdr;
1537                 }
1538 
1539                 if (l2hdr != NULL) {
1540                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1541                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1542                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1543                         if (hdr->b_state == arc_l2c_only)
1544                                 l2arc_hdr_stat_remove();
1545                         hdr->b_l2hdr = NULL;
1546                 }
1547 
1548                 if (!buflist_held)
1549                         mutex_exit(&l2arc_buflist_mtx);
1550         }
1551 
1552         if (!BUF_EMPTY(hdr)) {
1553                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1554                 buf_discard_identity(hdr);
1555         }
1556         while (hdr->b_buf) {
1557                 arc_buf_t *buf = hdr->b_buf;
1558 
1559                 if (buf->b_efunc) {
1560                         mutex_enter(&arc_eviction_mtx);
1561                         mutex_enter(&buf->b_evict_lock);
1562                         ASSERT(buf->b_hdr != NULL);
1563                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1564                         hdr->b_buf = buf->b_next;
1565                         buf->b_hdr = &arc_eviction_hdr;
1566                         buf->b_next = arc_eviction_list;
1567                         arc_eviction_list = buf;
1568                         mutex_exit(&buf->b_evict_lock);
1569                         mutex_exit(&arc_eviction_mtx);
1570                 } else {
1571                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1572                 }
1573         }
1574         if (hdr->b_freeze_cksum != NULL) {
1575                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1576                 hdr->b_freeze_cksum = NULL;
1577         }
1578         if (hdr->b_thawed) {
1579                 kmem_free(hdr->b_thawed, 1);
1580                 hdr->b_thawed = NULL;
1581         }
1582 
1583         ASSERT(!list_link_active(&hdr->b_arc_node));
1584         ASSERT3P(hdr->b_hash_next, ==, NULL);
1585         ASSERT3P(hdr->b_acb, ==, NULL);
1586         kmem_cache_free(hdr_cache, hdr);
1587 }
1588 
1589 void
1590 arc_buf_free(arc_buf_t *buf, void *tag)
1591 {
1592         arc_buf_hdr_t *hdr = buf->b_hdr;
1593         int hashed = hdr->b_state != arc_anon;
1594 
1595         ASSERT(buf->b_efunc == NULL);
1596         ASSERT(buf->b_data != NULL);
1597 
1598         if (hashed) {
1599                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1600 
1601                 mutex_enter(hash_lock);
1602                 hdr = buf->b_hdr;
1603                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1604 
1605                 (void) remove_reference(hdr, hash_lock, tag);
1606                 if (hdr->b_datacnt > 1) {
1607                         arc_buf_destroy(buf, FALSE, TRUE);
1608                 } else {
1609                         ASSERT(buf == hdr->b_buf);
1610                         ASSERT(buf->b_efunc == NULL);
1611                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1612                 }
1613                 mutex_exit(hash_lock);
1614         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1615                 int destroy_hdr;
1616                 /*
1617                  * We are in the middle of an async write.  Don't destroy
1618                  * this buffer unless the write completes before we finish
1619                  * decrementing the reference count.
1620                  */
1621                 mutex_enter(&arc_eviction_mtx);
1622                 (void) remove_reference(hdr, NULL, tag);
1623                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1624                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1625                 mutex_exit(&arc_eviction_mtx);
1626                 if (destroy_hdr)
1627                         arc_hdr_destroy(hdr);
1628         } else {
1629                 if (remove_reference(hdr, NULL, tag) > 0)
1630                         arc_buf_destroy(buf, FALSE, TRUE);
1631                 else
1632                         arc_hdr_destroy(hdr);
1633         }
1634 }
1635 
1636 boolean_t
1637 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1638 {
1639         arc_buf_hdr_t *hdr = buf->b_hdr;
1640         kmutex_t *hash_lock = HDR_LOCK(hdr);
1641         boolean_t no_callback = (buf->b_efunc == NULL);
1642 
1643         if (hdr->b_state == arc_anon) {
1644                 ASSERT(hdr->b_datacnt == 1);
1645                 arc_buf_free(buf, tag);
1646                 return (no_callback);
1647         }
1648 
1649         mutex_enter(hash_lock);
1650         hdr = buf->b_hdr;
1651         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1652         ASSERT(hdr->b_state != arc_anon);
1653         ASSERT(buf->b_data != NULL);
1654 
1655         (void) remove_reference(hdr, hash_lock, tag);
1656         if (hdr->b_datacnt > 1) {
1657                 if (no_callback)
1658                         arc_buf_destroy(buf, FALSE, TRUE);
1659         } else if (no_callback) {
1660                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1661                 ASSERT(buf->b_efunc == NULL);
1662                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1663         }
1664         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1665             refcount_is_zero(&hdr->b_refcnt));
1666         mutex_exit(hash_lock);
1667         return (no_callback);
1668 }
1669 
1670 int
1671 arc_buf_size(arc_buf_t *buf)
1672 {
1673         return (buf->b_hdr->b_size);
1674 }
1675 
1676 /*
1677  * Called from the DMU to determine if the current buffer should be
1678  * evicted. In order to ensure proper locking, the eviction must be initiated
1679  * from the DMU. Return true if the buffer is associated with user data and
1680  * duplicate buffers still exist.
1681  */
1682 boolean_t
1683 arc_buf_eviction_needed(arc_buf_t *buf)
1684 {
1685         arc_buf_hdr_t *hdr;
1686         boolean_t evict_needed = B_FALSE;
1687 
1688         if (zfs_disable_dup_eviction)
1689                 return (B_FALSE);
1690 
1691         mutex_enter(&buf->b_evict_lock);
1692         hdr = buf->b_hdr;
1693         if (hdr == NULL) {
1694                 /*
1695                  * We are in arc_do_user_evicts(); let that function
1696                  * perform the eviction.
1697                  */
1698                 ASSERT(buf->b_data == NULL);
1699                 mutex_exit(&buf->b_evict_lock);
1700                 return (B_FALSE);
1701         } else if (buf->b_data == NULL) {
1702                 /*
1703                  * We have already been added to the arc eviction list;
1704                  * recommend eviction.
1705                  */
1706                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1707                 mutex_exit(&buf->b_evict_lock);
1708                 return (B_TRUE);
1709         }
1710 
1711         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1712                 evict_needed = B_TRUE;
1713 
1714         mutex_exit(&buf->b_evict_lock);
1715         return (evict_needed);
1716 }
1717 
1718 /*
1719  * Evict buffers from list until we've removed the specified number of
1720  * bytes.  Move the removed buffers to the appropriate evict state.
1721  * If the recycle flag is set, then attempt to "recycle" a buffer:
1722  * - look for a buffer to evict that is `bytes' long.
1723  * - return the data block from this buffer rather than freeing it.
1724  * This flag is used by callers that are trying to make space for a
1725  * new buffer in a full arc cache.
1726  *
1727  * This function makes a "best effort".  It skips over any buffers
1728  * it can't get a hash_lock on, and so may not catch all candidates.
1729  * It may also return without evicting as much space as requested.
1730  */
1731 static void *
1732 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1733     arc_buf_contents_t type)
1734 {
1735         arc_state_t *evicted_state;
1736         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1737         arc_buf_hdr_t *ab, *ab_prev = NULL;
1738         list_t *list = &state->arcs_list[type];
1739         kmutex_t *hash_lock;
1740         boolean_t have_lock;
1741         void *stolen = NULL;
1742 
1743         ASSERT(state == arc_mru || state == arc_mfu);
1744 
1745         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1746 
1747         mutex_enter(&state->arcs_mtx);
1748         mutex_enter(&evicted_state->arcs_mtx);
1749 
1750         for (ab = list_tail(list); ab; ab = ab_prev) {
1751                 ab_prev = list_prev(list, ab);
1752                 /* prefetch buffers have a minimum lifespan */
1753                 if (HDR_IO_IN_PROGRESS(ab) ||
1754                     (spa && ab->b_spa != spa) ||
1755                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1756                     ddi_get_lbolt() - ab->b_arc_access <
1757                     arc_min_prefetch_lifespan)) {
1758                         skipped++;
1759                         continue;
1760                 }
1761                 /* "lookahead" for better eviction candidate */
1762                 if (recycle && ab->b_size != bytes &&
1763                     ab_prev && ab_prev->b_size == bytes)
1764                         continue;
1765                 hash_lock = HDR_LOCK(ab);
1766                 have_lock = MUTEX_HELD(hash_lock);
1767                 if (have_lock || mutex_tryenter(hash_lock)) {
1768                         ASSERT0(refcount_count(&ab->b_refcnt));
1769                         ASSERT(ab->b_datacnt > 0);
1770                         while (ab->b_buf) {
1771                                 arc_buf_t *buf = ab->b_buf;
1772                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1773                                         missed += 1;
1774                                         break;
1775                                 }
1776                                 if (buf->b_data) {
1777                                         bytes_evicted += ab->b_size;
1778                                         if (recycle && ab->b_type == type &&
1779                                             ab->b_size == bytes &&
1780                                             !HDR_L2_WRITING(ab)) {
1781                                                 stolen = buf->b_data;
1782                                                 recycle = FALSE;
1783                                         }
1784                                 }
1785                                 if (buf->b_efunc) {
1786                                         mutex_enter(&arc_eviction_mtx);
1787                                         arc_buf_destroy(buf,
1788                                             buf->b_data == stolen, FALSE);
1789                                         ab->b_buf = buf->b_next;
1790                                         buf->b_hdr = &arc_eviction_hdr;
1791                                         buf->b_next = arc_eviction_list;
1792                                         arc_eviction_list = buf;
1793                                         mutex_exit(&arc_eviction_mtx);
1794                                         mutex_exit(&buf->b_evict_lock);
1795                                 } else {
1796                                         mutex_exit(&buf->b_evict_lock);
1797                                         arc_buf_destroy(buf,
1798                                             buf->b_data == stolen, TRUE);
1799                                 }
1800                         }
1801 
1802                         if (ab->b_l2hdr) {
1803                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1804                                     ab->b_size);
1805                         } else {
1806                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1807                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1808                                             ab->b_size);
1809                                 } else {
1810                                         ARCSTAT_INCR(
1811                                             arcstat_evict_l2_ineligible,
1812                                             ab->b_size);
1813                                 }
1814                         }
1815 
1816                         if (ab->b_datacnt == 0) {
1817                                 arc_change_state(evicted_state, ab, hash_lock);
1818                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1819                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1820                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1821                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1822                         }
1823                         if (!have_lock)
1824                                 mutex_exit(hash_lock);
1825                         if (bytes >= 0 && bytes_evicted >= bytes)
1826                                 break;
1827                 } else {
1828                         missed += 1;
1829                 }
1830         }
1831 
1832         mutex_exit(&evicted_state->arcs_mtx);
1833         mutex_exit(&state->arcs_mtx);
1834 
1835         if (bytes_evicted < bytes)
1836                 dprintf("only evicted %lld bytes from %x",
1837                     (longlong_t)bytes_evicted, state);
1838 
1839         if (skipped)
1840                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1841 
1842         if (missed)
1843                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1844 
1845         /*
1846          * We have just evicted some data into the ghost state, make
1847          * sure we also adjust the ghost state size if necessary.
1848          */
1849         if (arc_no_grow &&
1850             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1851                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1852                     arc_mru_ghost->arcs_size - arc_c;
1853 
1854                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1855                         int64_t todelete =
1856                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1857                         arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1858                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1859                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1860                             arc_mru_ghost->arcs_size +
1861                             arc_mfu_ghost->arcs_size - arc_c);
1862                         arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1863                 }
1864         }
1865 
1866         return (stolen);
1867 }
1868 
1869 /*
1870  * Remove buffers from list until we've removed the specified number of
1871  * bytes.  Destroy the buffers that are removed.
1872  */
1873 static void
1874 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1875 {
1876         arc_buf_hdr_t *ab, *ab_prev;
1877         arc_buf_hdr_t marker = { 0 };
1878         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1879         kmutex_t *hash_lock;
1880         uint64_t bytes_deleted = 0;
1881         uint64_t bufs_skipped = 0;
1882 
1883         ASSERT(GHOST_STATE(state));
1884 top:
1885         mutex_enter(&state->arcs_mtx);
1886         for (ab = list_tail(list); ab; ab = ab_prev) {
1887                 ab_prev = list_prev(list, ab);
1888                 if (spa && ab->b_spa != spa)
1889                         continue;
1890 
1891                 /* ignore markers */
1892                 if (ab->b_spa == 0)
1893                         continue;
1894 
1895                 hash_lock = HDR_LOCK(ab);
1896                 /* caller may be trying to modify this buffer, skip it */
1897                 if (MUTEX_HELD(hash_lock))
1898                         continue;
1899                 if (mutex_tryenter(hash_lock)) {
1900                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1901                         ASSERT(ab->b_buf == NULL);
1902                         ARCSTAT_BUMP(arcstat_deleted);
1903                         bytes_deleted += ab->b_size;
1904 
1905                         if (ab->b_l2hdr != NULL) {
1906                                 /*
1907                                  * This buffer is cached on the 2nd Level ARC;
1908                                  * don't destroy the header.
1909                                  */
1910                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1911                                 mutex_exit(hash_lock);
1912                         } else {
1913                                 arc_change_state(arc_anon, ab, hash_lock);
1914                                 mutex_exit(hash_lock);
1915                                 arc_hdr_destroy(ab);
1916                         }
1917 
1918                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1919                         if (bytes >= 0 && bytes_deleted >= bytes)
1920                                 break;
1921                 } else if (bytes < 0) {
1922                         /*
1923                          * Insert a list marker and then wait for the
1924                          * hash lock to become available. Once its
1925                          * available, restart from where we left off.
1926                          */
1927                         list_insert_after(list, ab, &marker);
1928                         mutex_exit(&state->arcs_mtx);
1929                         mutex_enter(hash_lock);
1930                         mutex_exit(hash_lock);
1931                         mutex_enter(&state->arcs_mtx);
1932                         ab_prev = list_prev(list, &marker);
1933                         list_remove(list, &marker);
1934                 } else
1935                         bufs_skipped += 1;
1936         }
1937         mutex_exit(&state->arcs_mtx);
1938 
1939         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1940             (bytes < 0 || bytes_deleted < bytes)) {
1941                 list = &state->arcs_list[ARC_BUFC_METADATA];
1942                 goto top;
1943         }
1944 
1945         if (bufs_skipped) {
1946                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1947                 ASSERT(bytes >= 0);
1948         }
1949 
1950         if (bytes_deleted < bytes)
1951                 dprintf("only deleted %lld bytes from %p",
1952                     (longlong_t)bytes_deleted, state);
1953 }
1954 
1955 static void
1956 arc_adjust(void)
1957 {
1958         int64_t adjustment, delta;
1959 
1960         /*
1961          * Adjust MRU size
1962          */
1963 
1964         adjustment = MIN((int64_t)(arc_size - arc_c),
1965             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1966             arc_p));
1967 
1968         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1969                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1970                 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
1971                 adjustment -= delta;
1972         }
1973 
1974         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1975                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1976                 (void) arc_evict(arc_mru, NULL, delta, FALSE,
1977                     ARC_BUFC_METADATA);
1978         }
1979 
1980         /*
1981          * Adjust MFU size
1982          */
1983 
1984         adjustment = arc_size - arc_c;
1985 
1986         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1987                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1988                 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
1989                 adjustment -= delta;
1990         }
1991 
1992         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1993                 int64_t delta = MIN(adjustment,
1994                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1995                 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
1996                     ARC_BUFC_METADATA);
1997         }
1998 
1999         /*
2000          * Adjust ghost lists
2001          */
2002 
2003         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2004 
2005         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2006                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2007                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2008         }
2009 
2010         adjustment =
2011             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2012 
2013         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2014                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2015                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2016         }
2017 }
2018 
2019 static void
2020 arc_do_user_evicts(void)
2021 {
2022         mutex_enter(&arc_eviction_mtx);
2023         while (arc_eviction_list != NULL) {
2024                 arc_buf_t *buf = arc_eviction_list;
2025                 arc_eviction_list = buf->b_next;
2026                 mutex_enter(&buf->b_evict_lock);
2027                 buf->b_hdr = NULL;
2028                 mutex_exit(&buf->b_evict_lock);
2029                 mutex_exit(&arc_eviction_mtx);
2030 
2031                 if (buf->b_efunc != NULL)
2032                         VERIFY(buf->b_efunc(buf) == 0);
2033 
2034                 buf->b_efunc = NULL;
2035                 buf->b_private = NULL;
2036                 kmem_cache_free(buf_cache, buf);
2037                 mutex_enter(&arc_eviction_mtx);
2038         }
2039         mutex_exit(&arc_eviction_mtx);
2040 }
2041 
2042 /*
2043  * Flush all *evictable* data from the cache for the given spa.
2044  * NOTE: this will not touch "active" (i.e. referenced) data.
2045  */
2046 void
2047 arc_flush(spa_t *spa)
2048 {
2049         uint64_t guid = 0;
2050 
2051         if (spa)
2052                 guid = spa_load_guid(spa);
2053 
2054         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2055                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2056                 if (spa)
2057                         break;
2058         }
2059         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2060                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2061                 if (spa)
2062                         break;
2063         }
2064         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2065                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2066                 if (spa)
2067                         break;
2068         }
2069         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2070                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2071                 if (spa)
2072                         break;
2073         }
2074 
2075         arc_evict_ghost(arc_mru_ghost, guid, -1);
2076         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2077 
2078         mutex_enter(&arc_reclaim_thr_lock);
2079         arc_do_user_evicts();
2080         mutex_exit(&arc_reclaim_thr_lock);
2081         ASSERT(spa || arc_eviction_list == NULL);
2082 }
2083 
2084 void
2085 arc_shrink(void)
2086 {
2087         if (arc_c > arc_c_min) {
2088                 uint64_t to_free;
2089 
2090 #ifdef _KERNEL
2091                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2092 #else
2093                 to_free = arc_c >> arc_shrink_shift;
2094 #endif
2095                 if (arc_c > arc_c_min + to_free)
2096                         atomic_add_64(&arc_c, -to_free);
2097                 else
2098                         arc_c = arc_c_min;
2099 
2100                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2101                 if (arc_c > arc_size)
2102                         arc_c = MAX(arc_size, arc_c_min);
2103                 if (arc_p > arc_c)
2104                         arc_p = (arc_c >> 1);
2105                 ASSERT(arc_c >= arc_c_min);
2106                 ASSERT((int64_t)arc_p >= 0);
2107         }
2108 
2109         if (arc_size > arc_c)
2110                 arc_adjust();
2111 }
2112 
/*
 * Decide whether the system is short of memory and wants some back from
 * the ARC.  Returns 1 when the arc should give memory up, 0 otherwise.
 * Outside the kernel the answer is simply 1 whenever spa_get_random(100)
 * yields 0.
 */
static int
arc_reclaim_needed(void)
{
#ifdef _KERNEL
        uint64_t slack;

        if (needfree)
                return (1);

        /*
         * Pad every threshold below by 'desfree' pages so that reclaim
         * kicks in sooner rather than later.
         */
        slack = desfree;

        /*
         * Stay clear of the pageout scanner.  It begins to schedule
         * paging once freemem falls below lotsfree (its high-water mark)
         * plus needfree (the number of free pages wanted).  The padding
         * keeps the scanner from starting while we are freeing memory.
         */
        if (freemem < lotsfree + needfree + slack)
                return (1);

        /*
         * Make sure swapfs keeps enough room for anon reservations to
         * succeed: anon_resvmem() requires availrmem to exceed
         * swapfs_minfree plus the reserved swap pages.  The padding is
         * added so that things never get truly dire.
         */
        if (availrmem < swapfs_minfree + swapfs_reserve + slack)
                return (1);

#if defined(__i386)
        /*
         * On i386 the kernel heap can be exhausted long before physical
         * memory is.  Most heap checks compare against tune.t_minarmem,
         * the minimum available real memory, but that is typically fixed
         * at 25 pages -- too low to be of use.  Here the whole heap size
         * is considered instead: reclaim once more than 3/4 of the heap
         * is allocated, i.e. once less than 1/4 of it is free.
         */
        if (vmem_size(heap_arena, VMEM_FREE) <
            (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
                return (1);
#endif

        /*
         * When zio data pages come from their own heap segment, require
         * the free vmem in that arena to stay above roughly 1/16th of
         * what is allocated.
         *
         * Note: this 1/16th requirement exists to evict aggressively
         * from the arc and so avoid memory fragmentation problems.
         */
        if (zio_arena != NULL &&
            vmem_size(zio_arena, VMEM_FREE) <
            (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
                return (1);
#else
        if (spa_get_random(100) == 0)
                return (1);
#endif
        return (0);
}
2189 
2190 static void
2191 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2192 {
2193         size_t                  i;
2194         kmem_cache_t            *prev_cache = NULL;
2195         kmem_cache_t            *prev_data_cache = NULL;
2196         extern kmem_cache_t     *zio_buf_cache[];
2197         extern kmem_cache_t     *zio_data_buf_cache[];
2198 
2199 #ifdef _KERNEL
2200         if (arc_meta_used >= arc_meta_limit) {
2201                 /*
2202                  * We are exceeding our meta-data cache limit.
2203                  * Purge some DNLC entries to release holds on meta-data.
2204                  */
2205                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2206         }
2207 #if defined(__i386)
2208         /*
2209          * Reclaim unused memory from all kmem caches.
2210          */
2211         kmem_reap();
2212 #endif
2213 #endif
2214 
2215         /*
2216          * An aggressive reclamation will shrink the cache size as well as
2217          * reap free buffers from the arc kmem caches.
2218          */
2219         if (strat == ARC_RECLAIM_AGGR)
2220                 arc_shrink();
2221 
2222         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2223                 if (zio_buf_cache[i] != prev_cache) {
2224                         prev_cache = zio_buf_cache[i];
2225                         kmem_cache_reap_now(zio_buf_cache[i]);
2226                 }
2227                 if (zio_data_buf_cache[i] != prev_data_cache) {
2228                         prev_data_cache = zio_data_buf_cache[i];
2229                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2230                 }
2231         }
2232         kmem_cache_reap_now(buf_cache);
2233         kmem_cache_reap_now(hdr_cache);
2234 
2235         /*
2236          * Ask the vmem areana to reclaim unused memory from its
2237          * quantum caches.
2238          */
2239         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2240                 vmem_qcache_reap(zio_arena);
2241 }
2242 
2243 static void
2244 arc_reclaim_thread(void)
2245 {
2246         clock_t                 growtime = 0;
2247         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2248         callb_cpr_t             cpr;
2249 
2250         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2251 
2252         mutex_enter(&arc_reclaim_thr_lock);
2253         while (arc_thread_exit == 0) {
2254                 if (arc_reclaim_needed()) {
2255 
2256                         if (arc_no_grow) {
2257                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2258                                         last_reclaim = ARC_RECLAIM_AGGR;
2259                                 } else {
2260                                         last_reclaim = ARC_RECLAIM_CONS;
2261                                 }
2262                         } else {
2263                                 arc_no_grow = TRUE;
2264                                 last_reclaim = ARC_RECLAIM_AGGR;
2265                                 membar_producer();
2266                         }
2267 
2268                         /* reset the growth delay for every reclaim */
2269                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2270 
2271                         arc_kmem_reap_now(last_reclaim);
2272                         arc_warm = B_TRUE;
2273 
2274                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2275                         arc_no_grow = FALSE;
2276                 }
2277 
2278                 arc_adjust();
2279 
2280                 if (arc_eviction_list != NULL)
2281                         arc_do_user_evicts();
2282 
2283                 /* block until needed, or one second, whichever is shorter */
2284                 CALLB_CPR_SAFE_BEGIN(&cpr);
2285                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2286                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2287                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2288         }
2289 
2290         arc_thread_exit = 0;
2291         cv_broadcast(&arc_reclaim_thr_cv);
2292         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2293         thread_exit();
2294 }
2295 
2296 /*
2297  * Adapt arc info given the number of bytes we are trying to add and
2298  * the state that we are coming from.  This function is only called
2299  * when we are adding new content to the cache.
2300  */
2301 static void
2302 arc_adapt(int bytes, arc_state_t *state)
2303 {
2304         int mult;
2305         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2306 
2307         if (state == arc_l2c_only)
2308                 return;
2309 
2310         ASSERT(bytes > 0);
2311         /*
2312          * Adapt the target size of the MRU list:
2313          *      - if we just hit in the MRU ghost list, then increase
2314          *        the target size of the MRU list.
2315          *      - if we just hit in the MFU ghost list, then increase
2316          *        the target size of the MFU list by decreasing the
2317          *        target size of the MRU list.
2318          */
2319         if (state == arc_mru_ghost) {
2320                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2321                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2322                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2323 
2324                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2325         } else if (state == arc_mfu_ghost) {
2326                 uint64_t delta;
2327 
2328                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2329                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2330                 mult = MIN(mult, 10);
2331 
2332                 delta = MIN(bytes * mult, arc_p);
2333                 arc_p = MAX(arc_p_min, arc_p - delta);
2334         }
2335         ASSERT((int64_t)arc_p >= 0);
2336 
2337         if (arc_reclaim_needed()) {
2338                 cv_signal(&arc_reclaim_thr_cv);
2339                 return;
2340         }
2341 
2342         if (arc_no_grow)
2343                 return;
2344 
2345         if (arc_c >= arc_c_max)
2346                 return;
2347 
2348         /*
2349          * If we're within (2 * maxblocksize) bytes of the target
2350          * cache size, increment the target cache size
2351          */
2352         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2353                 atomic_add_64(&arc_c, (int64_t)bytes);
2354                 if (arc_c > arc_c_max)
2355                         arc_c = arc_c_max;
2356                 else if (state == arc_anon)
2357                         atomic_add_64(&arc_p, (int64_t)bytes);
2358                 if (arc_p > arc_c)
2359                         arc_p = arc_c;
2360         }
2361         ASSERT((int64_t)arc_p >= 0);
2362 }
2363 
2364 /*
2365  * Check if the cache has reached its limits and eviction is required
2366  * prior to insert.
      *
      * type: contents type of the buffer about to be inserted.
      *
      * Returns non-zero when any of the following holds:
      *   - type is ARC_BUFC_METADATA and arc_meta_used >= arc_meta_limit;
      *   - arc_reclaim_needed() reports true;
      *   - arc_size exceeds the target size arc_c.
      * Returns 0 otherwise.
2367  */
2368 static int
2369 arc_evict_needed(arc_buf_contents_t type)
2370 {
             /* the metadata limit applies only to metadata inserts */
2371         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2372                 return (1);
2373 
2374         if (arc_reclaim_needed())
2375                 return (1);
2376 
2377         return (arc_size > arc_c);
2378 }
2379 
2380 /*
2381  * The buffer, supplied as the first argument, needs a data block.
2382  * So, if we are at cache max, determine which cache should be victimized.
2383  * We have the following cases:
2384  *
2385  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2386  * In this situation if we're out of space, but the resident size of the MFU is
2387  * under the limit, victimize the MFU cache to satisfy this insertion request.
2388  *
2389  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2390  * Here, we've used up all of the available space for the MRU, so we need to
2391  * evict from our own cache instead.  Evict from the set of resident MRU
2392  * entries.
2393  *
2394  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2395  * c minus p represents the MFU space in the cache, since p is the size of the
2396  * cache that is dedicated to the MRU.  In this situation there's still space on
2397  * the MFU side, so the MRU side needs to be victimized.
2398  *
2399  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2400  * MFU's resident set is consuming more space than it has been allotted.  In
2401  * this situation, we must victimize our own cache, the MFU, for this insertion.
      *
      * In code, the "other" list is only chosen as the victim when its
      * arcs_lsize[type] is at least the requested size; otherwise the
      * buffer's own list is used.
      *
      * Flow: arc_adapt() is called first.  If arc_evict_needed() is false,
      * a fresh buffer is allocated.  Otherwise arc_evict() is asked to
      * supply a buffer from the chosen state; if it returns NULL a fresh
      * buffer is allocated anyway and arcstat_recycle_miss is bumped.
      * Finally the size accounting of the header's (non-ghost) state is
      * updated.
2402  */
2403 static void
2404 arc_get_data_buf(arc_buf_t *buf)
2405 {
2406         arc_state_t             *state = buf->b_hdr->b_state;
2407         uint64_t                size = buf->b_hdr->b_size;
2408         arc_buf_contents_t      type = buf->b_hdr->b_type;
2409 
2410         arc_adapt(size, state);
2411 
2412         /*
2413          * We have not yet reached cache maximum size,
2414          * just allocate a new buffer.
              *
              * NOTE(review): the metadata branch accounts the space through
              * arc_space_consume(size, ARC_SPACE_DATA) and does not touch
              * arc_size directly, while the data branch updates
              * arcstat_data_size and arc_size itself; presumably
              * arc_space_consume() covers arc_size -- confirm.
2415          */
2416         if (!arc_evict_needed(type)) {
2417                 if (type == ARC_BUFC_METADATA) {
2418                         buf->b_data = zio_buf_alloc(size);
2419                         arc_space_consume(size, ARC_SPACE_DATA);
2420                 } else {
2421                         ASSERT(type == ARC_BUFC_DATA);
2422                         buf->b_data = zio_data_buf_alloc(size);
2423                         ARCSTAT_INCR(arcstat_data_size, size);
2424                         atomic_add_64(&arc_size, size);
2425                 }
2426                 goto out;
2427         }
2428 
2429         /*
2430          * If we are prefetching from the mfu ghost list, this buffer
2431          * will end up on the mru list; so steal space from there.
              * A non-prefetch mfu ghost hit is treated as an MFU insert, and
              * an mru ghost hit is treated as an MRU insert.
2432          */
2433         if (state == arc_mfu_ghost)
2434                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2435         else if (state == arc_mru_ghost)
2436                 state = arc_mru;
2437 
2438         if (state == arc_mru || state == arc_anon) {
                     /* MRU cases (1 and 2 in the header comment) */
2439                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2440                 state = (arc_mfu->arcs_lsize[type] >= size &&
2441                     arc_p > mru_used) ? arc_mfu : arc_mru;
2442         } else {
2443                 /* MFU cases */
2444                 uint64_t mfu_space = arc_c - arc_p;
2445                 state =  (arc_mru->arcs_lsize[type] >= size &&
2446                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2447         }
             /*
              * Ask arc_evict() for a buffer from the chosen state.  A NULL
              * return is handled by allocating a new buffer, with the same
              * accounting as the no-eviction path above, and counting a
              * recycle miss.
              */
2448         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2449                 if (type == ARC_BUFC_METADATA) {
2450                         buf->b_data = zio_buf_alloc(size);
2451                         arc_space_consume(size, ARC_SPACE_DATA);
2452                 } else {
2453                         ASSERT(type == ARC_BUFC_DATA);
2454                         buf->b_data = zio_data_buf_alloc(size);
2455                         ARCSTAT_INCR(arcstat_data_size, size);
2456                         atomic_add_64(&arc_size, size);
2457                 }
2458                 ARCSTAT_BUMP(arcstat_recycle_miss);
2459         }
2460         ASSERT(buf->b_data != NULL);
2461 out:
2462         /*
2463          * Update the state size.  Note that ghost states have a
2464          * "ghost size" and so don't need to be updated.
              * The state examined here is the header's current b_state, not
              * the local `state' variable, which may have been repointed at
              * the eviction victim above.
2465          */
2466         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2467                 arc_buf_hdr_t *hdr = buf->b_hdr;
2468 
2469                 atomic_add_64(&hdr->b_state->arcs_size, size);
                     /* arcs_lsize is only updated while the hdr is on a list */
2470                 if (list_link_active(&hdr->b_arc_node)) {
2471                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2472                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2473                 }
2474                 /*
2475                  * If we are growing the cache, and we are adding anonymous
2476                  * data, and we have outgrown arc_p, update arc_p
2477                  */
2478                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2479                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2480                         arc_p = MIN(arc_c, arc_p + size);
2481         }
2482 }
2483 
2484 /*
2485  * This routine is called whenever a buffer is accessed.
      *
      * buf:       the header being accessed (note: an arc_buf_hdr_t, despite
      *            the parameter name).
      * hash_lock: the hash lock for the header; asserted to be held on
      *            entry.
      *
      * Depending on the header's current state, the access time
      * (b_arc_access) is refreshed, the header may be moved to another
      * state through arc_change_state(), and the matching hit statistic is
      * bumped.
      *
2486  * NOTE(review): an earlier note here said the hash lock is dropped in
      * this function.  Nothing in this body releases hash_lock (it is only
      * asserted and passed on to arc_change_state()), and arc_read() calls
      * mutex_exit(hash_lock) after arc_access() returns, so that note
      * looks stale -- confirm against arc_change_state().
2487  */
2488 static void
2489 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2490 {
2491         clock_t now;
2492 
2493         ASSERT(MUTEX_HELD(hash_lock));
2494 
2495         if (buf->b_state == arc_anon) {
2496                 /*
2497                  * This buffer is not in the cache, and does not
2498                  * appear in our "ghost" list.  Add the new buffer
2499                  * to the MRU state.
2500                  */
2501 
2502                 ASSERT(buf->b_arc_access == 0);
2503                 buf->b_arc_access = ddi_get_lbolt();
2504                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2505                 arc_change_state(arc_mru, buf, hash_lock);
2506 
2507         } else if (buf->b_state == arc_mru) {
2508                 now = ddi_get_lbolt();
2509 
2510                 /*
2511                  * If this buffer is here because of a prefetch, then either:
2512                  * - clear the flag if this is a "referencing" read
2513                  *   (any subsequent access will bump this into the MFU state).
2514                  * or
2515                  * - move the buffer to the head of the list if this is
2516                  *   another prefetch (to make it less likely to be evicted).
                      *
                      * In both cases the access time is refreshed and the
                      * function returns without changing state; the MRU hit
                      * counter is bumped only for the referencing read.
2517                  */
2518                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2519                         if (refcount_count(&buf->b_refcnt) == 0) {
2520                                 ASSERT(list_link_active(&buf->b_arc_node));
2521                         } else {
2522                                 buf->b_flags &= ~ARC_PREFETCH;
2523                                 ARCSTAT_BUMP(arcstat_mru_hits);
2524                         }
2525                         buf->b_arc_access = now;
2526                         return;
2527                 }
2528 
2529                 /*
2530                  * This buffer has been "accessed" only once so far,
2531                  * but it is still in the cache. Move it to the MFU
2532                  * state.
                      * The move only happens once more than ARC_MINTIME ticks
                      * have elapsed since the recorded access time; otherwise
                      * the header stays in MRU and its access time is not
                      * updated.  The MRU hit counter is bumped either way.
2533                  */
2534                 if (now > buf->b_arc_access + ARC_MINTIME) {
2535                         /*
2536                          * More than 125ms have passed since we
2537                          * instantiated this buffer.  Move it to the
2538                          * most frequently used state.
2539                          */
2540                         buf->b_arc_access = now;
2541                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2542                         arc_change_state(arc_mfu, buf, hash_lock);
2543                 }
2544                 ARCSTAT_BUMP(arcstat_mru_hits);
2545         } else if (buf->b_state == arc_mru_ghost) {
2546                 arc_state_t     *new_state;
2547                 /*
2548                  * This buffer has been "accessed" recently, but
2549                  * was evicted from the cache.  Move it to the
2550                  * MFU state.
                      * The exception is a prefetch: then it goes back to the
                      * MRU state, and the prefetch flag is cleared if the
                      * header already has references.
2551                  */
2552 
2553                 if (buf->b_flags & ARC_PREFETCH) {
2554                         new_state = arc_mru;
2555                         if (refcount_count(&buf->b_refcnt) > 0)
2556                                 buf->b_flags &= ~ARC_PREFETCH;
2557                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2558                 } else {
2559                         new_state = arc_mfu;
2560                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2561                 }
2562 
2563                 buf->b_arc_access = ddi_get_lbolt();
2564                 arc_change_state(new_state, buf, hash_lock);
2565 
2566                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2567         } else if (buf->b_state == arc_mfu) {
2568                 /*
2569                  * This buffer has been accessed more than once and is
2570                  * still in the cache.  Keep it in the MFU state.
2571                  *
2572                  * NOTE: an add_reference() that occurred when we did
2573                  * the arc_read() will have kicked this off the list.
2574                  * If it was a prefetch, we will explicitly move it to
2575                  * the head of the list now.
2576                  */
2577                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2578                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2579                         ASSERT(list_link_active(&buf->b_arc_node));
2580                 }
2581                 ARCSTAT_BUMP(arcstat_mfu_hits);
2582                 buf->b_arc_access = ddi_get_lbolt();
2583         } else if (buf->b_state == arc_mfu_ghost) {
2584                 arc_state_t     *new_state = arc_mfu;
2585                 /*
2586                  * This buffer has been accessed more than once but has
2587                  * been evicted from the cache.  Move it back to the
2588                  * MFU state.
2589                  */
2590 
2591                 if (buf->b_flags & ARC_PREFETCH) {
2592                         /*
2593                          * This is a prefetch access...
2594                          * move this block back to the MRU state.
2595                          */
2596                         ASSERT0(refcount_count(&buf->b_refcnt));
2597                         new_state = arc_mru;
2598                 }
2599 
2600                 buf->b_arc_access = ddi_get_lbolt();
                     /*
                      * NOTE(review): the new_state__mfu probe fires here even
                      * when new_state was switched to arc_mru above.
                      */
2601                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2602                 arc_change_state(new_state, buf, hash_lock);
2603 
2604                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2605         } else if (buf->b_state == arc_l2c_only) {
2606                 /*
2607                  * This buffer is on the 2nd Level ARC.
                      * Bring it into the MFU state.
2608                  */
2609 
2610                 buf->b_arc_access = ddi_get_lbolt();
2611                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2612                 arc_change_state(arc_mfu, buf, hash_lock);
2613         } else {
2614                 ASSERT(!"invalid arc state");
2615         }
2616 }
2617 
2618 /* a generic arc_done_func_t which you can use */
     /*
      * Copies the buffer contents (buf->b_hdr->b_size bytes) into the memory
      * pointed to by arg, unless the zio reports an error, and then drops
      * the reference on buf.  A NULL zio is treated as success.
      *
      * arg is used both as the copy destination and as the tag passed to
      * arc_buf_remove_ref(); the VERIFY fails if that call returns zero.
      */
2619 /* ARGSUSED */
2620 void
2621 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2622 {
2623         if (zio == NULL || zio->io_error == 0)
2624                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2625         VERIFY(arc_buf_remove_ref(buf, arg));
2626 }
2627 
2628 /* a generic arc_done_func_t */
     /*
      * Hands the buffer back to the caller through arg, which is treated as
      * an arc_buf_t ** out-pointer.
      *
      * If zio is non-NULL and reports an error, the reference on buf is
      * dropped (using arg as the tag) and *arg is set to NULL.  Otherwise
      * *arg is set to buf, whose b_data is asserted to be non-NULL.
      */
2629 void
2630 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2631 {
2632         arc_buf_t **bufp = arg;
2633         if (zio && zio->io_error) {
2634                 VERIFY(arc_buf_remove_ref(buf, arg));
2635                 *bufp = NULL;
2636         } else {
2637                 *bufp = buf;
2638                 ASSERT(buf->b_data);
2639         }
2640 }
2641 
     /*
      * zio completion handler for a read; arc_read() passes this function
      * to zio_read() with the arc_buf_t as the private argument.
      *
      * In order, this function:
      *   - looks the header up in the hash table (taking its hash lock if
      *     found);
      *   - byteswaps the data if the block pointer requires it and the read
      *     succeeded, then computes the buffer checksum;
      *   - calls arc_access() for a successfully read, still-hashed,
      *     anonymous header;
      *   - assigns a buffer to every queued callback that has a "done"
      *     function: the first gets the buffer that was read, later ones
      *     get clones of it;
      *   - clears the callback list and the I/O-in-progress flag;
      *   - on error, flags the header, moves it to arc_anon and removes it
      *     from the hash table;
      *   - wakes waiters on the header's cv and drops the hash lock;
      *   - runs and frees the callbacks, and destroys the header if it was
      *     found to be unreferenced on the error or freed-in-read path.
      */
2642 static void
2643 arc_read_done(zio_t *zio)
2644 {
2645         arc_buf_hdr_t   *hdr, *found;
2646         arc_buf_t       *buf;
2647         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2648         kmutex_t        *hash_lock;
2649         arc_callback_t  *callback_list, *acb;
2650         int             freeable = FALSE;
2651 
2652         buf = zio->io_private;
2653         hdr = buf->b_hdr;
2654 
2655         /*
2656          * The hdr was inserted into hash-table and removed from lists
2657          * prior to starting I/O.  We should find this header, since
2658          * it's in the hash table, and it should be legit since it's
2659          * not possible to evict it during the I/O.  The only possible
2660          * reason for it not to be found is if we were freed during the
2661          * read.
2662          */
2663         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2664             &hash_lock);
2665 
2666         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2667             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2668             (found == hdr && HDR_L2_READING(hdr)));
2669 
2670         hdr->b_flags &= ~ARC_L2_EVICTED;
             /* with l2arc_noprefetch set, prefetched headers lose ARC_L2CACHE */
2671         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2672                 hdr->b_flags &= ~ARC_L2CACHE;
2673 
2674         /* byteswap if necessary */
2675         callback_list = hdr->b_acb;
2676         ASSERT(callback_list != NULL);
2677         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2678                 dmu_object_byteswap_t bswap =
2679                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
                     /* level > 0 blocks are swapped as uint64 arrays */
2680                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2681                     byteswap_uint64_array :
2682                     dmu_ot_byteswap[bswap].ob_func;
2683                 func(buf->b_data, hdr->b_size);
2684         }
2685 
2686         arc_cksum_compute(buf, B_FALSE);
2687         arc_buf_watch(buf);
2688 
2689         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2690                 /*
2691                  * Only call arc_access on anonymous buffers.  This is because
2692                  * if we've issued an I/O for an evicted buffer, we've already
2693                  * called arc_access (to prevent any simultaneous readers from
2694                  * getting confused).
2695                  */
2696                 arc_access(hdr, hash_lock);
2697         }
2698 
2699         /* create copies of the data buffer for the callers */
             /*
              * abuf starts out as the buffer that was read; the first
              * callback with a done function takes it, and each later one
              * gets a clone.  If abuf still equals buf after the loop, no
              * callback took the original buffer.
              */
2700         abuf = buf;
2701         for (acb = callback_list; acb; acb = acb->acb_next) {
2702                 if (acb->acb_done) {
2703                         if (abuf == NULL) {
2704                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2705                                 abuf = arc_buf_clone(buf);
2706                         }
2707                         acb->acb_buf = abuf;
2708                         abuf = NULL;
2709                 }
2710         }
2711         hdr->b_acb = NULL;
2712         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2713         ASSERT(!HDR_BUF_AVAILABLE(hdr));
             /* nobody claimed the original buffer: mark it available */
2714         if (abuf == buf) {
2715                 ASSERT(buf->b_efunc == NULL);
2716                 ASSERT(hdr->b_datacnt == 1);
2717                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2718         }
2719 
2720         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2721 
             /*
              * Failed read: flag the header, make it anonymous and take it
              * out of the hash table; it can be destroyed below if no
              * references remain.
              */
2722         if (zio->io_error != 0) {
2723                 hdr->b_flags |= ARC_IO_ERROR;
2724                 if (hdr->b_state != arc_anon)
2725                         arc_change_state(arc_anon, hdr, hash_lock);
2726                 if (HDR_IN_HASH_TABLE(hdr))
2727                         buf_hash_remove(hdr);
2728                 freeable = refcount_is_zero(&hdr->b_refcnt);
2729         }
2730 
2731         /*
2732          * Broadcast before we drop the hash_lock to avoid the possibility
2733          * that the hdr (and hence the cv) might be freed before we get to
2734          * the cv_broadcast().
2735          */
2736         cv_broadcast(&hdr->b_cv);
2737 
2738         if (hash_lock) {
2739                 mutex_exit(hash_lock);
2740         } else {
2741                 /*
2742                  * This block was freed while we waited for the read to
2743                  * complete.  It has been removed from the hash table and
2744                  * moved to the anonymous state (so that it won't show up
2745                  * in the cache).
2746                  */
2747                 ASSERT3P(hdr->b_state, ==, arc_anon);
2748                 freeable = refcount_is_zero(&hdr->b_refcnt);
2749         }
2750 
2751         /* execute each callback and free its structure */
2752         while ((acb = callback_list) != NULL) {
2753                 if (acb->acb_done)
2754                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2755 
                     /* propagate the read's error to the dummy zio, if any */
2756                 if (acb->acb_zio_dummy != NULL) {
2757                         acb->acb_zio_dummy->io_error = zio->io_error;
2758                         zio_nowait(acb->acb_zio_dummy);
2759                 }
2760 
2761                 callback_list = acb->acb_next;
2762                 kmem_free(acb, sizeof (arc_callback_t));
2763         }
2764 
2765         if (freeable)
2766                 arc_hdr_destroy(hdr);
2767 }
2768 
2769 /*
2770  * "Read" the block at the specified DVA (in bp) via the
2771  * cache.  If the block is found in the cache, invoke the provided
2772  * callback immediately and return.  Note that the `zio' parameter
2773  * in the callback will be NULL in this case, since no IO was
2774  * required.  If the block is not in the cache pass the read request
2775  * on to the spa with a substitute callback function, so that the
2776  * requested block will be added to the cache.
2777  *
2778  * If a read request arrives for a block that has a read in-progress,
2779  * either wait for the in-progress read to complete (and return the
2780  * results); or, if this is a read with a "done" func, add a record
2781  * to the read to invoke the "done" func when the read completes,
2782  * and return; or just return.
2783  *
2784  * arc_read_done() will invoke all the requested "done" functions
2785  * for readers of this block.
2786  */
2787 int
2788 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2789     void *private, int priority, int zio_flags, uint32_t *arc_flags,
2790     const zbookmark_t *zb)
2791 {
2792         arc_buf_hdr_t *hdr;
2793         arc_buf_t *buf = NULL;
2794         kmutex_t *hash_lock;
2795         zio_t *rzio;
2796         uint64_t guid = spa_load_guid(spa);
2797 
2798 top:
2799         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2800             &hash_lock);
2801         if (hdr && hdr->b_datacnt > 0) {
2802 
2803                 *arc_flags |= ARC_CACHED;
2804 
2805                 if (HDR_IO_IN_PROGRESS(hdr)) {
2806 
2807                         if (*arc_flags & ARC_WAIT) {
2808                                 cv_wait(&hdr->b_cv, hash_lock);
2809                                 mutex_exit(hash_lock);
2810                                 goto top;
2811                         }
2812                         ASSERT(*arc_flags & ARC_NOWAIT);
2813 
2814                         if (done) {
2815                                 arc_callback_t  *acb = NULL;
2816 
2817                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2818                                     KM_SLEEP);
2819                                 acb->acb_done = done;
2820                                 acb->acb_private = private;
2821                                 if (pio != NULL)
2822                                         acb->acb_zio_dummy = zio_null(pio,
2823                                             spa, NULL, NULL, NULL, zio_flags);
2824 
2825                                 ASSERT(acb->acb_done != NULL);
2826                                 acb->acb_next = hdr->b_acb;
2827                                 hdr->b_acb = acb;
2828                                 add_reference(hdr, hash_lock, private);
2829                                 mutex_exit(hash_lock);
2830                                 return (0);
2831                         }
2832                         mutex_exit(hash_lock);
2833                         return (0);
2834                 }
2835 
2836                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2837 
2838                 if (done) {
2839                         add_reference(hdr, hash_lock, private);
2840                         /*
2841                          * If this block is already in use, create a new
2842                          * copy of the data so that we will be guaranteed
2843                          * that arc_release() will always succeed.
2844                          */
2845                         buf = hdr->b_buf;
2846                         ASSERT(buf);
2847                         ASSERT(buf->b_data);
2848                         if (HDR_BUF_AVAILABLE(hdr)) {
2849                                 ASSERT(buf->b_efunc == NULL);
2850                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2851                         } else {
2852                                 buf = arc_buf_clone(buf);
2853                         }
2854 
2855                 } else if (*arc_flags & ARC_PREFETCH &&
2856                     refcount_count(&hdr->b_refcnt) == 0) {
2857                         hdr->b_flags |= ARC_PREFETCH;
2858                 }
2859                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2860                 arc_access(hdr, hash_lock);
2861                 if (*arc_flags & ARC_L2CACHE)
2862                         hdr->b_flags |= ARC_L2CACHE;
2863                 mutex_exit(hash_lock);
2864                 ARCSTAT_BUMP(arcstat_hits);
2865                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2866                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2867                     data, metadata, hits);
2868 
2869                 if (done)
2870                         done(NULL, buf, private);
2871         } else {
2872                 uint64_t size = BP_GET_LSIZE(bp);
2873                 arc_callback_t  *acb;
2874                 vdev_t *vd = NULL;
2875                 uint64_t addr = 0;
2876                 boolean_t devw = B_FALSE;
2877 
2878                 if (hdr == NULL) {
2879                         /* this block is not in the cache */
2880                         arc_buf_hdr_t   *exists;
2881                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2882                         buf = arc_buf_alloc(spa, size, private, type);
2883                         hdr = buf->b_hdr;
2884                         hdr->b_dva = *BP_IDENTITY(bp);
2885                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2886                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2887                         exists = buf_hash_insert(hdr, &hash_lock);
2888                         if (exists) {
2889                                 /* somebody beat us to the hash insert */
2890                                 mutex_exit(hash_lock);
2891                                 buf_discard_identity(hdr);
2892                                 (void) arc_buf_remove_ref(buf, private);
2893                                 goto top; /* restart the IO request */
2894                         }
2895                         /* if this is a prefetch, we don't have a reference */
2896                         if (*arc_flags & ARC_PREFETCH) {
2897                                 (void) remove_reference(hdr, hash_lock,
2898                                     private);
2899                                 hdr->b_flags |= ARC_PREFETCH;
2900                         }
2901                         if (*arc_flags & ARC_L2CACHE)
2902                                 hdr->b_flags |= ARC_L2CACHE;
2903                         if (BP_GET_LEVEL(bp) > 0)
2904                                 hdr->b_flags |= ARC_INDIRECT;
2905                 } else {
2906                         /* this block is in the ghost cache */
2907                         ASSERT(GHOST_STATE(hdr->b_state));
2908                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2909                         ASSERT0(refcount_count(&hdr->b_refcnt));
2910                         ASSERT(hdr->b_buf == NULL);
2911 
2912                         /* if this is a prefetch, we don't have a reference */
2913                         if (*arc_flags & ARC_PREFETCH)
2914                                 hdr->b_flags |= ARC_PREFETCH;
2915                         else
2916                                 add_reference(hdr, hash_lock, private);
2917                         if (*arc_flags & ARC_L2CACHE)
2918                                 hdr->b_flags |= ARC_L2CACHE;
2919                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2920                         buf->b_hdr = hdr;
2921                         buf->b_data = NULL;
2922                         buf->b_efunc = NULL;
2923                         buf->b_private = NULL;
2924                         buf->b_next = NULL;
2925                         hdr->b_buf = buf;
2926                         ASSERT(hdr->b_datacnt == 0);
2927                         hdr->b_datacnt = 1;
2928                         arc_get_data_buf(buf);
2929                         arc_access(hdr, hash_lock);
2930                 }
2931 
2932                 ASSERT(!GHOST_STATE(hdr->b_state));
2933 
2934                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2935                 acb->acb_done = done;
2936                 acb->acb_private = private;
2937 
2938                 ASSERT(hdr->b_acb == NULL);
2939                 hdr->b_acb = acb;
2940                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2941 
2942                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2943                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2944                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2945                         addr = hdr->b_l2hdr->b_daddr;
2946                         /*
2947                          * Lock out device removal.
2948                          */
2949                         if (vdev_is_dead(vd) ||
2950                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2951                                 vd = NULL;
2952                 }
2953 
2954                 mutex_exit(hash_lock);
2955 
2956                 ASSERT3U(hdr->b_size, ==, size);
2957                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2958                     uint64_t, size, zbookmark_t *, zb);
2959                 ARCSTAT_BUMP(arcstat_misses);
2960                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2961                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2962                     data, metadata, misses);
2963 
2964                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2965                         /*
2966                          * Read from the L2ARC if the following are true:
2967                          * 1. The L2ARC vdev was previously cached.
2968                          * 2. This buffer still has L2ARC metadata.
2969                          * 3. This buffer isn't currently writing to the L2ARC.
2970                          * 4. The L2ARC entry wasn't evicted, which may
2971                          *    also have invalidated the vdev.
2972                          * 5. This isn't prefetch and l2arc_noprefetch is set.
2973                          */
2974                         if (hdr->b_l2hdr != NULL &&
2975                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2976                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2977                                 l2arc_read_callback_t *cb;
2978 
2979                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2980                                 ARCSTAT_BUMP(arcstat_l2_hits);
2981 
2982                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2983                                     KM_SLEEP);
2984                                 cb->l2rcb_buf = buf;
2985                                 cb->l2rcb_spa = spa;
2986                                 cb->l2rcb_bp = *bp;
2987                                 cb->l2rcb_zb = *zb;
2988                                 cb->l2rcb_flags = zio_flags;
2989 
2990                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
2991                                     addr + size < vd->vdev_psize -
2992                                     VDEV_LABEL_END_SIZE);
2993 
2994                                 /*
2995                                  * l2arc read.  The SCL_L2ARC lock will be
2996                                  * released by l2arc_read_done().
2997                                  */
2998                                 rzio = zio_read_phys(pio, vd, addr, size,
2999                                     buf->b_data, ZIO_CHECKSUM_OFF,
3000                                     l2arc_read_done, cb, priority, zio_flags |
3001                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3002                                     ZIO_FLAG_DONT_PROPAGATE |
3003                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
3004                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3005                                     zio_t *, rzio);
3006                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3007 
3008                                 if (*arc_flags & ARC_NOWAIT) {
3009                                         zio_nowait(rzio);
3010                                         return (0);
3011                                 }
3012 
3013                                 ASSERT(*arc_flags & ARC_WAIT);
3014                                 if (zio_wait(rzio) == 0)
3015                                         return (0);
3016 
3017                                 /* l2arc read error; goto zio_read() */
3018                         } else {
3019                                 DTRACE_PROBE1(l2arc__miss,
3020                                     arc_buf_hdr_t *, hdr);
3021                                 ARCSTAT_BUMP(arcstat_l2_misses);
3022                                 if (HDR_L2_WRITING(hdr))
3023                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3024                                 spa_config_exit(spa, SCL_L2ARC, vd);
3025                         }
3026                 } else {
3027                         if (vd != NULL)
3028                                 spa_config_exit(spa, SCL_L2ARC, vd);
3029                         if (l2arc_ndev != 0) {
3030                                 DTRACE_PROBE1(l2arc__miss,
3031                                     arc_buf_hdr_t *, hdr);
3032                                 ARCSTAT_BUMP(arcstat_l2_misses);
3033                         }
3034                 }
3035 
3036                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3037                     arc_read_done, buf, priority, zio_flags, zb);
3038 
3039                 if (*arc_flags & ARC_WAIT)
3040                         return (zio_wait(rzio));
3041 
3042                 ASSERT(*arc_flags & ARC_NOWAIT);
3043                 zio_nowait(rzio);
3044         }
3045         return (0);
3046 }
3047 
3048 void
3049 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3050 {
3051         ASSERT(buf->b_hdr != NULL);
3052         ASSERT(buf->b_hdr->b_state != arc_anon);
3053         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3054         ASSERT(buf->b_efunc == NULL);
3055         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3056 
3057         buf->b_efunc = func;
3058         buf->b_private = private;
3059 }
3060 
3061 /*
3062  * This is used by the DMU to let the ARC know that a buffer is
3063  * being evicted, so the ARC should clean up.  If this arc buf
3064  * is not yet in the evicted state, it will be put there.
3065  */
3066 int
3067 arc_buf_evict(arc_buf_t *buf)
3068 {
3069         arc_buf_hdr_t *hdr;
3070         kmutex_t *hash_lock;
3071         arc_buf_t **bufp;
3072 
3073         mutex_enter(&buf->b_evict_lock);
3074         hdr = buf->b_hdr;
3075         if (hdr == NULL) {
3076                 /*
3077                  * We are in arc_do_user_evicts().
3078                  */
3079                 ASSERT(buf->b_data == NULL);
3080                 mutex_exit(&buf->b_evict_lock);
3081                 return (0);
3082         } else if (buf->b_data == NULL) {
3083                 arc_buf_t copy = *buf; /* structure assignment */
3084                 /*
3085                  * We are on the eviction list; process this buffer now
3086                  * but let arc_do_user_evicts() do the reaping.
3087                  */
3088                 buf->b_efunc = NULL;
3089                 mutex_exit(&buf->b_evict_lock);
3090                 VERIFY(copy.b_efunc(&copy) == 0);
3091                 return (1);
3092         }
3093         hash_lock = HDR_LOCK(hdr);
3094         mutex_enter(hash_lock);
3095         hdr = buf->b_hdr;
3096         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3097 
3098         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3099         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3100 
3101         /*
3102          * Pull this buffer off of the hdr
3103          */
3104         bufp = &hdr->b_buf;
3105         while (*bufp != buf)
3106                 bufp = &(*bufp)->b_next;
3107         *bufp = buf->b_next;
3108 
3109         ASSERT(buf->b_data != NULL);
3110         arc_buf_destroy(buf, FALSE, FALSE);
3111 
3112         if (hdr->b_datacnt == 0) {
3113                 arc_state_t *old_state = hdr->b_state;
3114                 arc_state_t *evicted_state;
3115 
3116                 ASSERT(hdr->b_buf == NULL);
3117                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3118 
3119                 evicted_state =
3120                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3121 
3122                 mutex_enter(&old_state->arcs_mtx);
3123                 mutex_enter(&evicted_state->arcs_mtx);
3124 
3125                 arc_change_state(evicted_state, hdr, hash_lock);
3126                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3127                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3128                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3129 
3130                 mutex_exit(&evicted_state->arcs_mtx);
3131                 mutex_exit(&old_state->arcs_mtx);
3132         }
3133         mutex_exit(hash_lock);
3134         mutex_exit(&buf->b_evict_lock);
3135 
3136         VERIFY(buf->b_efunc(buf) == 0);
3137         buf->b_efunc = NULL;
3138         buf->b_private = NULL;
3139         buf->b_hdr = NULL;
3140         buf->b_next = NULL;
3141         kmem_cache_free(buf_cache, buf);
3142         return (1);
3143 }
3144 
3145 /*
3146  * Release this buffer from the cache.  This must be done
3147  * after a read and prior to modifying the buffer contents.
3148  * If the buffer has more than one reference, we must make
3149  * a new hdr for the buffer.
3150  */
3151 void
3152 arc_release(arc_buf_t *buf, void *tag)
3153 {
3154         arc_buf_hdr_t *hdr;
3155         kmutex_t *hash_lock = NULL;
3156         l2arc_buf_hdr_t *l2hdr;
3157         uint64_t buf_size;
3158 
3159         /*
3160          * It would be nice to assert that if it's DMU metadata (level >
3161          * 0 || it's the dnode file), then it must be syncing context.
3162          * But we don't know that information at this level.
3163          */
3164 
3165         mutex_enter(&buf->b_evict_lock);
3166         hdr = buf->b_hdr;
3167 
3168         /* this buffer is not on any list */
3169         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3170 
3171         if (hdr->b_state == arc_anon) {
3172                 /* this buffer is already released */
3173                 ASSERT(buf->b_efunc == NULL);
3174         } else {
3175                 hash_lock = HDR_LOCK(hdr);
3176                 mutex_enter(hash_lock);
3177                 hdr = buf->b_hdr;
3178                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3179         }
3180 
3181         l2hdr = hdr->b_l2hdr;
3182         if (l2hdr) {
3183                 mutex_enter(&l2arc_buflist_mtx);
3184                 hdr->b_l2hdr = NULL;
3185         }
3186         buf_size = hdr->b_size;
3187 
3188         /*
3189          * Do we have more than one buf?
3190          */
3191         if (hdr->b_datacnt > 1) {
3192                 arc_buf_hdr_t *nhdr;
3193                 arc_buf_t **bufp;
3194                 uint64_t blksz = hdr->b_size;
3195                 uint64_t spa = hdr->b_spa;
3196                 arc_buf_contents_t type = hdr->b_type;
3197                 uint32_t flags = hdr->b_flags;
3198 
3199                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3200                 /*
3201                  * Pull the data off of this hdr and attach it to
3202                  * a new anonymous hdr.
3203                  */
3204                 (void) remove_reference(hdr, hash_lock, tag);
3205                 bufp = &hdr->b_buf;
3206                 while (*bufp != buf)
3207                         bufp = &(*bufp)->b_next;
3208                 *bufp = buf->b_next;
3209                 buf->b_next = NULL;
3210 
3211                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3212                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3213                 if (refcount_is_zero(&hdr->b_refcnt)) {
3214                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3215                         ASSERT3U(*size, >=, hdr->b_size);
3216                         atomic_add_64(size, -hdr->b_size);
3217                 }
3218 
3219                 /*
3220                  * We're releasing a duplicate user data buffer, update
3221                  * our statistics accordingly.
3222                  */
3223                 if (hdr->b_type == ARC_BUFC_DATA) {
3224                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3225                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3226                             -hdr->b_size);
3227                 }
3228                 hdr->b_datacnt -= 1;
3229                 arc_cksum_verify(buf);
3230                 arc_buf_unwatch(buf);
3231 
3232                 mutex_exit(hash_lock);
3233 
3234                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3235                 nhdr->b_size = blksz;
3236                 nhdr->b_spa = spa;
3237                 nhdr->b_type = type;
3238                 nhdr->b_buf = buf;
3239                 nhdr->b_state = arc_anon;
3240                 nhdr->b_arc_access = 0;
3241                 nhdr->b_flags = flags & ARC_L2_WRITING;
3242                 nhdr->b_l2hdr = NULL;
3243                 nhdr->b_datacnt = 1;
3244                 nhdr->b_freeze_cksum = NULL;
3245                 (void) refcount_add(&nhdr->b_refcnt, tag);
3246                 buf->b_hdr = nhdr;
3247                 mutex_exit(&buf->b_evict_lock);
3248                 atomic_add_64(&arc_anon->arcs_size, blksz);
3249         } else {
3250                 mutex_exit(&buf->b_evict_lock);
3251                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3252                 ASSERT(!list_link_active(&hdr->b_arc_node));
3253                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3254                 if (hdr->b_state != arc_anon)
3255                         arc_change_state(arc_anon, hdr, hash_lock);
3256                 hdr->b_arc_access = 0;
3257                 if (hash_lock)
3258                         mutex_exit(hash_lock);
3259 
3260                 buf_discard_identity(hdr);
3261                 arc_buf_thaw(buf);
3262         }
3263         buf->b_efunc = NULL;
3264         buf->b_private = NULL;
3265 
3266         if (l2hdr) {
3267                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3268                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3269                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3270                 mutex_exit(&l2arc_buflist_mtx);
3271         }
3272 }
3273 
3274 int
3275 arc_released(arc_buf_t *buf)
3276 {
3277         int released;
3278 
3279         mutex_enter(&buf->b_evict_lock);
3280         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3281         mutex_exit(&buf->b_evict_lock);
3282         return (released);
3283 }
3284 
3285 int
3286 arc_has_callback(arc_buf_t *buf)
3287 {
3288         int callback;
3289 
3290         mutex_enter(&buf->b_evict_lock);
3291         callback = (buf->b_efunc != NULL);
3292         mutex_exit(&buf->b_evict_lock);
3293         return (callback);
3294 }
3295 
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the reference count of the buffer's header,
 * sampled under the buffer's b_evict_lock.  Nonzero means the header is
 * referenced.
 */
int
arc_referenced(arc_buf_t *buf)
{
        int referenced;

        mutex_enter(&buf->b_evict_lock);
        referenced = refcount_count(&buf->b_hdr->b_refcnt);
        mutex_exit(&buf->b_evict_lock);

        return (referenced);
}
#endif
3308 
3309 static void
3310 arc_write_ready(zio_t *zio)
3311 {
3312         arc_write_callback_t *callback = zio->io_private;
3313         arc_buf_t *buf = callback->awcb_buf;
3314         arc_buf_hdr_t *hdr = buf->b_hdr;
3315 
3316         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3317         callback->awcb_ready(zio, buf, callback->awcb_private);
3318 
3319         /*
3320          * If the IO is already in progress, then this is a re-write
3321          * attempt, so we need to thaw and re-compute the cksum.
3322          * It is the responsibility of the callback to handle the
3323          * accounting for any re-write attempt.
3324          */
3325         if (HDR_IO_IN_PROGRESS(hdr)) {
3326                 mutex_enter(&hdr->b_freeze_lock);
3327                 if (hdr->b_freeze_cksum != NULL) {
3328                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3329                         hdr->b_freeze_cksum = NULL;
3330                 }
3331                 mutex_exit(&hdr->b_freeze_lock);
3332         }
3333         arc_cksum_compute(buf, B_FALSE);
3334         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3335 }
3336 
3337 static void
3338 arc_write_done(zio_t *zio)
3339 {
3340         arc_write_callback_t *callback = zio->io_private;
3341         arc_buf_t *buf = callback->awcb_buf;
3342         arc_buf_hdr_t *hdr = buf->b_hdr;
3343 
3344         ASSERT(hdr->b_acb == NULL);
3345 
3346         if (zio->io_error == 0) {
3347                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3348                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3349                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3350         } else {
3351                 ASSERT(BUF_EMPTY(hdr));
3352         }
3353 
3354         /*
3355          * If the block to be written was all-zero, we may have
3356          * compressed it away.  In this case no write was performed
3357          * so there will be no dva/birth/checksum.  The buffer must
3358          * therefore remain anonymous (and uncached).
3359          */
3360         if (!BUF_EMPTY(hdr)) {
3361                 arc_buf_hdr_t *exists;
3362                 kmutex_t *hash_lock;
3363 
3364                 ASSERT(zio->io_error == 0);
3365 
3366                 arc_cksum_verify(buf);
3367 
3368                 exists = buf_hash_insert(hdr, &hash_lock);
3369                 if (exists) {
3370                         /*
3371                          * This can only happen if we overwrite for
3372                          * sync-to-convergence, because we remove
3373                          * buffers from the hash table when we arc_free().
3374                          */
3375                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3376                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3377                                         panic("bad overwrite, hdr=%p exists=%p",
3378                                             (void *)hdr, (void *)exists);
3379                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3380                                 arc_change_state(arc_anon, exists, hash_lock);
3381                                 mutex_exit(hash_lock);
3382                                 arc_hdr_destroy(exists);
3383                                 exists = buf_hash_insert(hdr, &hash_lock);
3384                                 ASSERT3P(exists, ==, NULL);
3385                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3386                                 /* nopwrite */
3387                                 ASSERT(zio->io_prop.zp_nopwrite);
3388                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3389                                         panic("bad nopwrite, hdr=%p exists=%p",
3390                                             (void *)hdr, (void *)exists);
3391                         } else {
3392                                 /* Dedup */
3393                                 ASSERT(hdr->b_datacnt == 1);
3394                                 ASSERT(hdr->b_state == arc_anon);
3395                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3396                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3397                         }
3398                 }
3399                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3400                 /* if it's not anon, we are doing a scrub */
3401                 if (!exists && hdr->b_state == arc_anon)
3402                         arc_access(hdr, hash_lock);
3403                 mutex_exit(hash_lock);
3404         } else {
3405                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3406         }
3407 
3408         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3409         callback->awcb_done(zio, buf, callback->awcb_private);
3410 
3411         kmem_free(callback, sizeof (arc_write_callback_t));
3412 }
3413 
3414 zio_t *
3415 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3416     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3417     arc_done_func_t *ready, arc_done_func_t *done, void *private,
3418     int priority, int zio_flags, const zbookmark_t *zb)
3419 {
3420         arc_buf_hdr_t *hdr = buf->b_hdr;
3421         arc_write_callback_t *callback;
3422         zio_t *zio;
3423 
3424         ASSERT(ready != NULL);
3425         ASSERT(done != NULL);
3426         ASSERT(!HDR_IO_ERROR(hdr));
3427         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3428         ASSERT(hdr->b_acb == NULL);
3429         if (l2arc)
3430                 hdr->b_flags |= ARC_L2CACHE;
3431         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3432         callback->awcb_ready = ready;
3433         callback->awcb_done = done;
3434         callback->awcb_private = private;
3435         callback->awcb_buf = buf;
3436 
3437         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3438             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3439 
3440         return (zio);
3441 }
3442 
3443 static int
3444 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3445 {
3446 #ifdef _KERNEL
3447         uint64_t available_memory = ptob(freemem);
3448         static uint64_t page_load = 0;
3449         static uint64_t last_txg = 0;
3450 
3451 #if defined(__i386)
3452         available_memory =
3453             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3454 #endif
3455         if (available_memory >= zfs_write_limit_max)
3456                 return (0);
3457 
3458         if (txg > last_txg) {
3459                 last_txg = txg;
3460                 page_load = 0;
3461         }
3462         /*
3463          * If we are in pageout, we know that memory is already tight,
3464          * the arc is already going to be evicting, so we just want to
3465          * continue to let page writes occur as quickly as possible.
3466          */
3467         if (curproc == proc_pageout) {
3468                 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3469                         return (SET_ERROR(ERESTART));
3470                 /* Note: reserve is inflated, so we deflate */
3471                 page_load += reserve / 8;
3472                 return (0);
3473         } else if (page_load > 0 && arc_reclaim_needed()) {
3474                 /* memory is low, delay before restarting */
3475                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3476                 return (SET_ERROR(EAGAIN));
3477         }
3478         page_load = 0;
3479 
3480         if (arc_size > arc_c_min) {
3481                 uint64_t evictable_memory =
3482                     arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3483                     arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3484                     arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3485                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3486                 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3487         }
3488 
3489         if (inflight_data > available_memory / 4) {
3490                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3491                 return (SET_ERROR(ERESTART));
3492         }
3493 #endif
3494         return (0);
3495 }
3496 
3497 void
3498 arc_tempreserve_clear(uint64_t reserve)
3499 {
3500         atomic_add_64(&arc_tempreserve, -reserve);
3501         ASSERT((int64_t)arc_tempreserve >= 0);
3502 }
3503 
3504 int
3505 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3506 {
3507         int error;
3508         uint64_t anon_size;
3509 
3510 #ifdef ZFS_DEBUG
3511         /*
3512          * Once in a while, fail for no reason.  Everything should cope.
3513          */
3514         if (spa_get_random(10000) == 0) {
3515                 dprintf("forcing random failure\n");
3516                 return (SET_ERROR(ERESTART));
3517         }
3518 #endif
3519         if (reserve > arc_c/4 && !arc_no_grow)
3520                 arc_c = MIN(arc_c_max, reserve * 4);
3521         if (reserve > arc_c)
3522                 return (SET_ERROR(ENOMEM));
3523 
3524         /*
3525          * Don't count loaned bufs as in flight dirty data to prevent long
3526          * network delays from blocking transactions that are ready to be
3527          * assigned to a txg.
3528          */
3529         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3530 
3531         /*
3532          * Writes will, almost always, require additional memory allocations
3533          * in order to compress/encrypt/etc the data.  We therefor need to
3534          * make sure that there is sufficient available memory for this.
3535          */
3536         if (error = arc_memory_throttle(reserve, anon_size, txg))
3537                 return (error);
3538 
3539         /*
3540          * Throttle writes when the amount of dirty data in the cache
3541          * gets too large.  We try to keep the cache less than half full
3542          * of dirty blocks so that our sync times don't grow too large.
3543          * Note: if two requests come in concurrently, we might let them
3544          * both succeed, when one of them should fail.  Not a huge deal.
3545          */
3546 
3547         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3548             anon_size > arc_c / 4) {
3549                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3550                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3551                     arc_tempreserve>>10,
3552                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3553                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3554                     reserve>>10, arc_c>>10);
3555                 return (SET_ERROR(ERESTART));
3556         }
3557         atomic_add_64(&arc_tempreserve, reserve);
3558         return (0);
3559 }
3560 
3561 void
3562 arc_init(void)
3563 {
3564         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3565         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3566 
3567         /* Convert seconds to clock ticks */
3568         arc_min_prefetch_lifespan = 1 * hz;
3569 
3570         /* Start out with 1/8 of all memory */
3571         arc_c = physmem * PAGESIZE / 8;
3572 
3573 #ifdef _KERNEL
3574         /*
3575          * On architectures where the physical memory can be larger
3576          * than the addressable space (intel in 32-bit mode), we may
3577          * need to limit the cache to 1/8 of VM size.
3578          */
3579         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3580 #endif
3581 
3582         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3583         arc_c_min = MAX(arc_c / 4, 64<<20);
3584         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3585         if (arc_c * 8 >= 1<<30)
3586                 arc_c_max = (arc_c * 8) - (1<<30);
3587         else
3588                 arc_c_max = arc_c_min;
3589         arc_c_max = MAX(arc_c * 6, arc_c_max);
3590 
3591         /*
3592          * Allow the tunables to override our calculations if they are
3593          * reasonable (ie. over 64MB)
3594          */
3595         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3596                 arc_c_max = zfs_arc_max;
3597         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3598                 arc_c_min = zfs_arc_min;
3599 
3600         arc_c = arc_c_max;
3601         arc_p = (arc_c >> 1);
3602 
3603         /* limit meta-data to 1/4 of the arc capacity */
3604         arc_meta_limit = arc_c_max / 4;
3605 
3606         /* Allow the tunable to override if it is reasonable */
3607         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3608                 arc_meta_limit = zfs_arc_meta_limit;
3609 
3610         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3611                 arc_c_min = arc_meta_limit / 2;
3612 
3613         if (zfs_arc_grow_retry > 0)
3614                 arc_grow_retry = zfs_arc_grow_retry;
3615 
3616         if (zfs_arc_shrink_shift > 0)
3617                 arc_shrink_shift = zfs_arc_shrink_shift;
3618 
3619         if (zfs_arc_p_min_shift > 0)
3620                 arc_p_min_shift = zfs_arc_p_min_shift;
3621 
3622         /* if kmem_flags are set, lets try to use less memory */
3623         if (kmem_debugging())
3624                 arc_c = arc_c / 2;
3625         if (arc_c < arc_c_min)
3626                 arc_c = arc_c_min;
3627 
3628         arc_anon = &ARC_anon;
3629         arc_mru = &ARC_mru;
3630         arc_mru_ghost = &ARC_mru_ghost;
3631         arc_mfu = &ARC_mfu;
3632         arc_mfu_ghost = &ARC_mfu_ghost;
3633         arc_l2c_only = &ARC_l2c_only;
3634         arc_size = 0;
3635 
3636         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3637         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3638         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3639         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3640         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3641         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3642 
3643         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3644             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3645         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3646             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3647         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3648             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3649         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3650             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3651         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3652             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3653         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3654             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3655         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3656             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3657         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3658             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3659         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3660             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3661         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3662             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3663 
3664         buf_init();
3665 
3666         arc_thread_exit = 0;
3667         arc_eviction_list = NULL;
3668         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3669         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3670 
3671         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3672             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3673 
3674         if (arc_ksp != NULL) {
3675                 arc_ksp->ks_data = &arc_stats;
3676                 kstat_install(arc_ksp);
3677         }
3678 
3679         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3680             TS_RUN, minclsyspri);
3681 
3682         arc_dead = FALSE;
3683         arc_warm = B_FALSE;
3684 
3685         if (zfs_write_limit_max == 0)
3686                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3687         else
3688                 zfs_write_limit_shift = 0;
3689         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3690 }
3691 
3692 void
3693 arc_fini(void)
3694 {
3695         mutex_enter(&arc_reclaim_thr_lock);
3696         arc_thread_exit = 1;
3697         while (arc_thread_exit != 0)
3698                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3699         mutex_exit(&arc_reclaim_thr_lock);
3700 
3701         arc_flush(NULL);
3702 
3703         arc_dead = TRUE;
3704 
3705         if (arc_ksp != NULL) {
3706                 kstat_delete(arc_ksp);
3707                 arc_ksp = NULL;
3708         }
3709 
3710         mutex_destroy(&arc_eviction_mtx);
3711         mutex_destroy(&arc_reclaim_thr_lock);
3712         cv_destroy(&arc_reclaim_thr_cv);
3713 
3714         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3715         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3716         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3717         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3718         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3719         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3720         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3721         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3722 
3723         mutex_destroy(&arc_anon->arcs_mtx);
3724         mutex_destroy(&arc_mru->arcs_mtx);
3725         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3726         mutex_destroy(&arc_mfu->arcs_mtx);
3727         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3728         mutex_destroy(&arc_l2c_only->arcs_mtx);
3729 
3730         mutex_destroy(&zfs_write_limit_lock);
3731 
3732         buf_fini();
3733 
3734         ASSERT(arc_loaned_bytes == 0);
3735 }
3736 
3737 /*
3738  * Level 2 ARC
3739  *
3740  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3741  * It uses dedicated storage devices to hold cached data, which are populated
3742  * using large infrequent writes.  The main role of this cache is to boost
3743  * the performance of random read workloads.  The intended L2ARC devices
3744  * include short-stroked disks, solid state disks, and other media with
3745  * substantially faster read latency than disk.
3746  *
3747  *                 +-----------------------+
3748  *                 |         ARC           |
3749  *                 +-----------------------+
3750  *                    |         ^     ^
3751  *                    |         |     |
3752  *      l2arc_feed_thread()    arc_read()
3753  *                    |         |     |
3754  *                    |  l2arc read   |
3755  *                    V         |     |
3756  *               +---------------+    |
3757  *               |     L2ARC     |    |
3758  *               +---------------+    |
3759  *                   |    ^           |
3760  *          l2arc_write() |           |
3761  *                   |    |           |
3762  *                   V    |           |
3763  *                 +-------+      +-------+
3764  *                 | vdev  |      | vdev  |
3765  *                 | cache |      | cache |
3766  *                 +-------+      +-------+
3767  *                 +=========+     .-----.
3768  *                 :  L2ARC  :    |-_____-|
3769  *                 : devices :    | Disks |
3770  *                 +=========+    `-_____-'
3771  *
3772  * Read requests are satisfied from the following sources, in order:
3773  *
3774  *      1) ARC
3775  *      2) vdev cache of L2ARC devices
3776  *      3) L2ARC devices
3777  *      4) vdev cache of disks
3778  *      5) disks
3779  *
3780  * Some L2ARC device types exhibit extremely slow write performance.
3781  * To accommodate this there are some significant differences between
3782  * the L2ARC and traditional cache design:
3783  *
3784  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3785  * the ARC behave as usual, freeing buffers and placing headers on ghost
3786  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3787  * this would add inflated write latencies for all ARC memory pressure.
3788  *
3789  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3790  * It does this by periodically scanning buffers from the eviction-end of
3791  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3792  * not already there.  It scans until a headroom of buffers is satisfied,
3793  * which itself is a buffer for ARC eviction.  The thread that does this is
3794  * l2arc_feed_thread(), illustrated below; example sizes are included to
3795  * provide a better sense of ratio than this diagram:
3796  *
3797  *             head -->                        tail
3798  *              +---------------------+----------+
3799  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3800  *              +---------------------+----------+   |   o L2ARC eligible
3801  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3802  *              +---------------------+----------+   |
3803  *                   15.9 Gbytes      ^ 32 Mbytes    |
3804  *                                 headroom          |
3805  *                                            l2arc_feed_thread()
3806  *                                                   |
3807  *                       l2arc write hand <--[oooo]--'
3808  *                               |           8 Mbyte
3809  *                               |          write max
3810  *                               V
3811  *                +==============================+
3812  *      L2ARC dev |####|#|###|###|    |####| ... |
3813  *                +==============================+
3814  *                           32 Gbytes
3815  *
3816  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3817  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3818  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3819  * safe to say that this is an uncommon case, since buffers at the end of
3820  * the ARC lists have moved there due to inactivity.
3821  *
3822  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3823  * then the L2ARC simply misses copying some buffers.  This serves as a
3824  * pressure valve to prevent heavy read workloads from both stalling the ARC
3825  * with waits and clogging the L2ARC with writes.  This also helps prevent
3826  * the potential for the L2ARC to churn if it attempts to cache content too
3827  * quickly, such as during backups of the entire pool.
3828  *
3829  * 5. After system boot and before the ARC has filled main memory, there are
3830  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3831  * lists can remain mostly static.  Instead of searching from tail of these
3832  * lists as pictured, the l2arc_feed_thread() will search from the list heads
3833  * for eligible buffers, greatly increasing its chance of finding them.
3834  *
3835  * The L2ARC device write speed is also boosted during this time so that
3836  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3837  * there are no L2ARC reads, and no fear of degrading read performance
3838  * through increased writes.
3839  *
3840  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3841  * the vdev queue can aggregate them into larger and fewer writes.  Each
3842  * device is written to in a rotor fashion, sweeping writes through
3843  * available space then repeating.
3844  *
3845  * 7. The L2ARC does not store dirty content.  It never needs to flush
3846  * write buffers back to disk based storage.
3847  *
3848  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3849  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3850  *
3851  * The performance of the L2ARC can be tweaked by a number of tunables, which
3852  * may be necessary for different workloads:
3853  *
3854  *      l2arc_write_max         max write bytes per interval
3855  *      l2arc_write_boost       extra write bytes during device warmup
3856  *      l2arc_noprefetch        skip caching prefetched buffers
3857  *      l2arc_headroom          number of max device writes to precache
3858  *      l2arc_feed_secs         seconds between L2ARC writing
3859  *
3860  * Tunables may be removed or added as future performance improvements are
3861  * integrated, and also may become zpool properties.
3862  *
3863  * There are three key functions that control how the L2ARC warms up:
3864  *
3865  *      l2arc_write_eligible()  check if a buffer is eligible to cache
3866  *      l2arc_write_size()      calculate how much to write
3867  *      l2arc_write_interval()  calculate sleep delay between writes
3868  *
3869  * These three functions determine what to write, how much, and how quickly
3870  * to send writes.
3871  */
3872 
3873 static boolean_t
3874 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3875 {
3876         /*
3877          * A buffer is *not* eligible for the L2ARC if it:
3878          * 1. belongs to a different spa.
3879          * 2. is already cached on the L2ARC.
3880          * 3. has an I/O in progress (it may be an incomplete read).
3881          * 4. is flagged not eligible (zfs property).
3882          */
3883         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3884             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3885                 return (B_FALSE);
3886 
3887         return (B_TRUE);
3888 }
3889 
3890 static uint64_t
3891 l2arc_write_size(l2arc_dev_t *dev)
3892 {
3893         uint64_t size;
3894 
3895         size = dev->l2ad_write;
3896 
3897         if (arc_warm == B_FALSE)
3898                 size += dev->l2ad_boost;
3899 
3900         return (size);
3901 
3902 }
3903 
3904 static clock_t
3905 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3906 {
3907         clock_t interval, next, now;
3908 
3909         /*
3910          * If the ARC lists are busy, increase our write rate; if the
3911          * lists are stale, idle back.  This is achieved by checking
3912          * how much we previously wrote - if it was more than half of
3913          * what we wanted, schedule the next write much sooner.
3914          */
3915         if (l2arc_feed_again && wrote > (wanted / 2))
3916                 interval = (hz * l2arc_feed_min_ms) / 1000;
3917         else
3918                 interval = hz * l2arc_feed_secs;
3919 
3920         now = ddi_get_lbolt();
3921         next = MAX(now, MIN(now + interval, began + interval));
3922 
3923         return (next);
3924 }
3925 
3926 static void
3927 l2arc_hdr_stat_add(void)
3928 {
3929         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3930         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3931 }
3932 
3933 static void
3934 l2arc_hdr_stat_remove(void)
3935 {
3936         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3937         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3938 }
3939 
3940 /*
3941  * Cycle through L2ARC devices.  This is how L2ARC load balances.
3942  * If a device is returned, this also returns holding the spa config lock.
3943  */
3944 static l2arc_dev_t *
3945 l2arc_dev_get_next(void)
3946 {
3947         l2arc_dev_t *first, *next = NULL;
3948 
3949         /*
3950          * Lock out the removal of spas (spa_namespace_lock), then removal
3951          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3952          * both locks will be dropped and a spa config lock held instead.
3953          */
3954         mutex_enter(&spa_namespace_lock);
3955         mutex_enter(&l2arc_dev_mtx);
3956 
3957         /* if there are no vdevs, there is nothing to do */
3958         if (l2arc_ndev == 0)
3959                 goto out;
3960 
3961         first = NULL;
3962         next = l2arc_dev_last;
3963         do {
3964                 /* loop around the list looking for a non-faulted vdev */
3965                 if (next == NULL) {
3966                         next = list_head(l2arc_dev_list);
3967                 } else {
3968                         next = list_next(l2arc_dev_list, next);
3969                         if (next == NULL)
3970                                 next = list_head(l2arc_dev_list);
3971                 }
3972 
3973                 /* if we have come back to the start, bail out */
3974                 if (first == NULL)
3975                         first = next;
3976                 else if (next == first)
3977                         break;
3978 
3979         } while (vdev_is_dead(next->l2ad_vdev));
3980 
3981         /* if we were unable to find any usable vdevs, return NULL */
3982         if (vdev_is_dead(next->l2ad_vdev))
3983                 next = NULL;
3984 
3985         l2arc_dev_last = next;
3986 
3987 out:
3988         mutex_exit(&l2arc_dev_mtx);
3989 
3990         /*
3991          * Grab the config lock to prevent the 'next' device from being
3992          * removed while we are writing to it.
3993          */
3994         if (next != NULL)
3995                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3996         mutex_exit(&spa_namespace_lock);
3997 
3998         return (next);
3999 }
4000 
4001 /*
4002  * Free buffers that were tagged for destruction.
4003  */
4004 static void
4005 l2arc_do_free_on_write()
4006 {
4007         list_t *buflist;
4008         l2arc_data_free_t *df, *df_prev;
4009 
4010         mutex_enter(&l2arc_free_on_write_mtx);
4011         buflist = l2arc_free_on_write;
4012 
4013         for (df = list_tail(buflist); df; df = df_prev) {
4014                 df_prev = list_prev(buflist, df);
4015                 ASSERT(df->l2df_data != NULL);
4016                 ASSERT(df->l2df_func != NULL);
4017                 df->l2df_func(df->l2df_data, df->l2df_size);
4018                 list_remove(buflist, df);
4019                 kmem_free(df, sizeof (l2arc_data_free_t));
4020         }
4021 
4022         mutex_exit(&l2arc_free_on_write_mtx);
4023 }
4024 
4025 /*
4026  * A write to a cache device has completed.  Update all headers to allow
4027  * reads from these buffers to begin.
4028  */
4029 static void
4030 l2arc_write_done(zio_t *zio)
4031 {
4032         l2arc_write_callback_t *cb;
4033         l2arc_dev_t *dev;
4034         list_t *buflist;
4035         arc_buf_hdr_t *head, *ab, *ab_prev;
4036         l2arc_buf_hdr_t *abl2;
4037         kmutex_t *hash_lock;
4038 
4039         cb = zio->io_private;
4040         ASSERT(cb != NULL);
4041         dev = cb->l2wcb_dev;
4042         ASSERT(dev != NULL);
4043         head = cb->l2wcb_head;
4044         ASSERT(head != NULL);
4045         buflist = dev->l2ad_buflist;
4046         ASSERT(buflist != NULL);
4047         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4048             l2arc_write_callback_t *, cb);
4049 
4050         if (zio->io_error != 0)
4051                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4052 
4053         mutex_enter(&l2arc_buflist_mtx);
4054 
4055         /*
4056          * All writes completed, or an error was hit.
4057          */
4058         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4059                 ab_prev = list_prev(buflist, ab);
4060 
4061                 hash_lock = HDR_LOCK(ab);
4062                 if (!mutex_tryenter(hash_lock)) {
4063                         /*
4064                          * This buffer misses out.  It may be in a stage
4065                          * of eviction.  Its ARC_L2_WRITING flag will be
4066                          * left set, denying reads to this buffer.
4067                          */
4068                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4069                         continue;
4070                 }
4071 
4072                 if (zio->io_error != 0) {
4073                         /*
4074                          * Error - drop L2ARC entry.
4075                          */
4076                         list_remove(buflist, ab);
4077                         abl2 = ab->b_l2hdr;
4078                         ab->b_l2hdr = NULL;
4079                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4080                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4081                 }
4082 
4083                 /*
4084                  * Allow ARC to begin reads to this L2ARC entry.
4085                  */
4086                 ab->b_flags &= ~ARC_L2_WRITING;
4087 
4088                 mutex_exit(hash_lock);
4089         }
4090 
4091         atomic_inc_64(&l2arc_writes_done);
4092         list_remove(buflist, head);
4093         kmem_cache_free(hdr_cache, head);
4094         mutex_exit(&l2arc_buflist_mtx);
4095 
4096         l2arc_do_free_on_write();
4097 
4098         kmem_free(cb, sizeof (l2arc_write_callback_t));
4099 }
4100 
4101 /*
4102  * A read to a cache device completed.  Validate buffer contents before
4103  * handing over to the regular ARC routines.
4104  */
4105 static void
4106 l2arc_read_done(zio_t *zio)
4107 {
4108         l2arc_read_callback_t *cb;
4109         arc_buf_hdr_t *hdr;
4110         arc_buf_t *buf;
4111         kmutex_t *hash_lock;
4112         int equal;
4113 
4114         ASSERT(zio->io_vd != NULL);
4115         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4116 
4117         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4118 
4119         cb = zio->io_private;
4120         ASSERT(cb != NULL);
4121         buf = cb->l2rcb_buf;
4122         ASSERT(buf != NULL);
4123 
4124         hash_lock = HDR_LOCK(buf->b_hdr);
4125         mutex_enter(hash_lock);
4126         hdr = buf->b_hdr;
4127         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4128 
4129         /*
4130          * Check this survived the L2ARC journey.
4131          */
4132         equal = arc_cksum_equal(buf);
4133         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4134                 mutex_exit(hash_lock);
4135                 zio->io_private = buf;
4136                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4137                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4138                 arc_read_done(zio);
4139         } else {
4140                 mutex_exit(hash_lock);
4141                 /*
4142                  * Buffer didn't survive caching.  Increment stats and
4143                  * reissue to the original storage device.
4144                  */
4145                 if (zio->io_error != 0) {
4146                         ARCSTAT_BUMP(arcstat_l2_io_error);
4147                 } else {
4148                         zio->io_error = SET_ERROR(EIO);
4149                 }
4150                 if (!equal)
4151                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4152 
4153                 /*
4154                  * If there's no waiter, issue an async i/o to the primary
4155                  * storage now.  If there *is* a waiter, the caller must
4156                  * issue the i/o in a context where it's OK to block.
4157                  */
4158                 if (zio->io_waiter == NULL) {
4159                         zio_t *pio = zio_unique_parent(zio);
4160 
4161                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4162 
4163                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4164                             buf->b_data, zio->io_size, arc_read_done, buf,
4165                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4166                 }
4167         }
4168 
4169         kmem_free(cb, sizeof (l2arc_read_callback_t));
4170 }
4171 
4172 /*
4173  * This is the list priority from which the L2ARC will search for pages to
4174  * cache.  This is used within loops (0..3) to cycle through lists in the
4175  * desired order.  This order can have a significant effect on cache
4176  * performance.
4177  *
4178  * Currently the metadata lists are hit first, MFU then MRU, followed by
4179  * the data lists.  This function returns a locked list, and also returns
4180  * the lock pointer.
4181  */
4182 static list_t *
4183 l2arc_list_locked(int list_num, kmutex_t **lock)
4184 {
4185         list_t *list = NULL;
4186 
4187         ASSERT(list_num >= 0 && list_num <= 3);
4188 
4189         switch (list_num) {
4190         case 0:
4191                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4192                 *lock = &arc_mfu->arcs_mtx;
4193                 break;
4194         case 1:
4195                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4196                 *lock = &arc_mru->arcs_mtx;
4197                 break;
4198         case 2:
4199                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4200                 *lock = &arc_mfu->arcs_mtx;
4201                 break;
4202         case 3:
4203                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4204                 *lock = &arc_mru->arcs_mtx;
4205                 break;
4206         }
4207 
4208         ASSERT(!(MUTEX_HELD(*lock)));
4209         mutex_enter(*lock);
4210         return (list);
4211 }
4212 
4213 /*
4214  * Evict buffers from the device write hand to the distance specified in
4215  * bytes.  This distance may span populated buffers, it may span nothing.
4216  * This is clearing a region on the L2ARC device ready for writing.
4217  * If the 'all' boolean is set, every buffer is evicted.
4218  */
4219 static void
4220 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4221 {
4222         list_t *buflist;
4223         l2arc_buf_hdr_t *abl2;
4224         arc_buf_hdr_t *ab, *ab_prev;
4225         kmutex_t *hash_lock;
4226         uint64_t taddr;
4227 
4228         buflist = dev->l2ad_buflist;
4229 
4230         if (buflist == NULL)
4231                 return;
4232 
4233         if (!all && dev->l2ad_first) {
4234                 /*
4235                  * This is the first sweep through the device.  There is
4236                  * nothing to evict.
4237                  */
4238                 return;
4239         }
4240 
4241         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4242                 /*
4243                  * When nearing the end of the device, evict to the end
4244                  * before the device write hand jumps to the start.
4245                  */
4246                 taddr = dev->l2ad_end;
4247         } else {
4248                 taddr = dev->l2ad_hand + distance;
4249         }
4250         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4251             uint64_t, taddr, boolean_t, all);
4252 
4253 top:
4254         mutex_enter(&l2arc_buflist_mtx);
4255         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4256                 ab_prev = list_prev(buflist, ab);
4257 
4258                 hash_lock = HDR_LOCK(ab);
4259                 if (!mutex_tryenter(hash_lock)) {
4260                         /*
4261                          * Missed the hash lock.  Retry.
4262                          */
4263                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4264                         mutex_exit(&l2arc_buflist_mtx);
4265                         mutex_enter(hash_lock);
4266                         mutex_exit(hash_lock);
4267                         goto top;
4268                 }
4269 
4270                 if (HDR_L2_WRITE_HEAD(ab)) {
4271                         /*
4272                          * We hit a write head node.  Leave it for
4273                          * l2arc_write_done().
4274                          */
4275                         list_remove(buflist, ab);
4276                         mutex_exit(hash_lock);
4277                         continue;
4278                 }
4279 
4280                 if (!all && ab->b_l2hdr != NULL &&
4281                     (ab->b_l2hdr->b_daddr > taddr ||
4282                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4283                         /*
4284                          * We've evicted to the target address,
4285                          * or the end of the device.
4286                          */
4287                         mutex_exit(hash_lock);
4288                         break;
4289                 }
4290 
4291                 if (HDR_FREE_IN_PROGRESS(ab)) {
4292                         /*
4293                          * Already on the path to destruction.
4294                          */
4295                         mutex_exit(hash_lock);
4296                         continue;
4297                 }
4298 
4299                 if (ab->b_state == arc_l2c_only) {
4300                         ASSERT(!HDR_L2_READING(ab));
4301                         /*
4302                          * This doesn't exist in the ARC.  Destroy.
4303                          * arc_hdr_destroy() will call list_remove()
4304                          * and decrement arcstat_l2_size.
4305                          */
4306                         arc_change_state(arc_anon, ab, hash_lock);
4307                         arc_hdr_destroy(ab);
4308                 } else {
4309                         /*
4310                          * Invalidate issued or about to be issued
4311                          * reads, since we may be about to write
4312                          * over this location.
4313                          */
4314                         if (HDR_L2_READING(ab)) {
4315                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4316                                 ab->b_flags |= ARC_L2_EVICTED;
4317                         }
4318 
4319                         /*
4320                          * Tell ARC this no longer exists in L2ARC.
4321                          */
4322                         if (ab->b_l2hdr != NULL) {
4323                                 abl2 = ab->b_l2hdr;
4324                                 ab->b_l2hdr = NULL;
4325                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4326                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4327                         }
4328                         list_remove(buflist, ab);
4329 
4330                         /*
4331                          * This may have been leftover after a
4332                          * failed write.
4333                          */
4334                         ab->b_flags &= ~ARC_L2_WRITING;
4335                 }
4336                 mutex_exit(hash_lock);
4337         }
4338         mutex_exit(&l2arc_buflist_mtx);
4339 
4340         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4341         dev->l2ad_evict = taddr;
4342 }
4343 
4344 /*
4345  * Find and write ARC buffers to the L2ARC device.
4346  *
4347  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4348  * for reading until they have completed writing.
4349  */
4350 static uint64_t
4351 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4352 {
4353         arc_buf_hdr_t *ab, *ab_prev, *head;
4354         l2arc_buf_hdr_t *hdrl2;
4355         list_t *list;
4356         uint64_t passed_sz, write_sz, buf_sz, headroom;
4357         void *buf_data;
4358         kmutex_t *hash_lock, *list_lock;
4359         boolean_t have_lock, full;
4360         l2arc_write_callback_t *cb;
4361         zio_t *pio, *wzio;
4362         uint64_t guid = spa_load_guid(spa);
4363 
4364         ASSERT(dev->l2ad_vdev != NULL);
4365 
4366         pio = NULL;
4367         write_sz = 0;
4368         full = B_FALSE;
4369         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4370         head->b_flags |= ARC_L2_WRITE_HEAD;
4371 
4372         /*
4373          * Copy buffers for L2ARC writing.
4374          */
4375         mutex_enter(&l2arc_buflist_mtx);
4376         for (int try = 0; try <= 3; try++) {
4377                 list = l2arc_list_locked(try, &list_lock);
4378                 passed_sz = 0;
4379 
4380                 /*
4381                  * L2ARC fast warmup.
4382                  *
4383                  * Until the ARC is warm and starts to evict, read from the
4384                  * head of the ARC lists rather than the tail.
4385                  */
4386                 headroom = target_sz * l2arc_headroom;
4387                 if (arc_warm == B_FALSE)
4388                         ab = list_head(list);
4389                 else
4390                         ab = list_tail(list);
4391 
4392                 for (; ab; ab = ab_prev) {
4393                         if (arc_warm == B_FALSE)
4394                                 ab_prev = list_next(list, ab);
4395                         else
4396                                 ab_prev = list_prev(list, ab);
4397 
4398                         hash_lock = HDR_LOCK(ab);
4399                         have_lock = MUTEX_HELD(hash_lock);
4400                         if (!have_lock && !mutex_tryenter(hash_lock)) {
4401                                 /*
4402                                  * Skip this buffer rather than waiting.
4403                                  */
4404                                 continue;
4405                         }
4406 
4407                         passed_sz += ab->b_size;
4408                         if (passed_sz > headroom) {
4409                                 /*
4410                                  * Searched too far.
4411                                  */
4412                                 mutex_exit(hash_lock);
4413                                 break;
4414                         }
4415 
4416                         if (!l2arc_write_eligible(guid, ab)) {
4417                                 mutex_exit(hash_lock);
4418                                 continue;
4419                         }
4420 
4421                         if ((write_sz + ab->b_size) > target_sz) {
4422                                 full = B_TRUE;
4423                                 mutex_exit(hash_lock);
4424                                 break;
4425                         }
4426 
4427                         if (pio == NULL) {
4428                                 /*
4429                                  * Insert a dummy header on the buflist so
4430                                  * l2arc_write_done() can find where the
4431                                  * write buffers begin without searching.
4432                                  */
4433                                 list_insert_head(dev->l2ad_buflist, head);
4434 
4435                                 cb = kmem_alloc(
4436                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4437                                 cb->l2wcb_dev = dev;
4438                                 cb->l2wcb_head = head;
4439                                 pio = zio_root(spa, l2arc_write_done, cb,
4440                                     ZIO_FLAG_CANFAIL);
4441                         }
4442 
4443                         /*
4444                          * Create and add a new L2ARC header.
4445                          */
4446                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4447                         hdrl2->b_dev = dev;
4448                         hdrl2->b_daddr = dev->l2ad_hand;
4449 
4450                         ab->b_flags |= ARC_L2_WRITING;
4451                         ab->b_l2hdr = hdrl2;
4452                         list_insert_head(dev->l2ad_buflist, ab);
4453                         buf_data = ab->b_buf->b_data;
4454                         buf_sz = ab->b_size;
4455 
4456                         /*
4457                          * Compute and store the buffer cksum before
4458                          * writing.  On debug the cksum is verified first.
4459                          */
4460                         arc_cksum_verify(ab->b_buf);
4461                         arc_cksum_compute(ab->b_buf, B_TRUE);
4462 
4463                         mutex_exit(hash_lock);
4464 
4465                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4466                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4467                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4468                             ZIO_FLAG_CANFAIL, B_FALSE);
4469 
4470                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4471                             zio_t *, wzio);
4472                         (void) zio_nowait(wzio);
4473 
4474                         /*
4475                          * Keep the clock hand suitably device-aligned.
4476                          */
4477                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4478 
4479                         write_sz += buf_sz;
4480                         dev->l2ad_hand += buf_sz;
4481                 }
4482 
4483                 mutex_exit(list_lock);
4484 
4485                 if (full == B_TRUE)
4486                         break;
4487         }
4488         mutex_exit(&l2arc_buflist_mtx);
4489 
4490         if (pio == NULL) {
4491                 ASSERT0(write_sz);
4492                 kmem_cache_free(hdr_cache, head);
4493                 return (0);
4494         }
4495 
4496         ASSERT3U(write_sz, <=, target_sz);
4497         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4498         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4499         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4500         vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4501 
4502         /*
4503          * Bump device hand to the device start if it is approaching the end.
4504          * l2arc_evict() will already have evicted ahead for this case.
4505          */
4506         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4507                 vdev_space_update(dev->l2ad_vdev,
4508                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4509                 dev->l2ad_hand = dev->l2ad_start;
4510                 dev->l2ad_evict = dev->l2ad_start;
4511                 dev->l2ad_first = B_FALSE;
4512         }
4513 
4514         dev->l2ad_writing = B_TRUE;
4515         (void) zio_wait(pio);
4516         dev->l2ad_writing = B_FALSE;
4517 
4518         return (write_sz);
4519 }
4520 
4521 /*
4522  * This thread feeds the L2ARC at regular intervals.  This is the beating
4523  * heart of the L2ARC.
4524  */
4525 static void
4526 l2arc_feed_thread(void)
4527 {
4528         callb_cpr_t cpr;
4529         l2arc_dev_t *dev;
4530         spa_t *spa;
4531         uint64_t size, wrote;
4532         clock_t begin, next = ddi_get_lbolt();
4533 
4534         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4535 
4536         mutex_enter(&l2arc_feed_thr_lock);
4537 
4538         while (l2arc_thread_exit == 0) {
4539                 CALLB_CPR_SAFE_BEGIN(&cpr);
4540                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4541                     next);
4542                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4543                 next = ddi_get_lbolt() + hz;
4544 
4545                 /*
4546                  * Quick check for L2ARC devices.
4547                  */
4548                 mutex_enter(&l2arc_dev_mtx);
4549                 if (l2arc_ndev == 0) {
4550                         mutex_exit(&l2arc_dev_mtx);
4551                         continue;
4552                 }
4553                 mutex_exit(&l2arc_dev_mtx);
4554                 begin = ddi_get_lbolt();
4555 
4556                 /*
4557                  * This selects the next l2arc device to write to, and in
4558                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4559                  * will return NULL if there are now no l2arc devices or if
4560                  * they are all faulted.
4561                  *
4562                  * If a device is returned, its spa's config lock is also
4563                  * held to prevent device removal.  l2arc_dev_get_next()
4564                  * will grab and release l2arc_dev_mtx.
4565                  */
4566                 if ((dev = l2arc_dev_get_next()) == NULL)
4567                         continue;
4568 
4569                 spa = dev->l2ad_spa;
4570                 ASSERT(spa != NULL);
4571 
4572                 /*
4573                  * If the pool is read-only then force the feed thread to
4574                  * sleep a little longer.
4575                  */
4576                 if (!spa_writeable(spa)) {
4577                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4578                         spa_config_exit(spa, SCL_L2ARC, dev);
4579                         continue;
4580                 }
4581 
4582                 /*
4583                  * Avoid contributing to memory pressure.
4584                  */
4585                 if (arc_reclaim_needed()) {
4586                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4587                         spa_config_exit(spa, SCL_L2ARC, dev);
4588                         continue;
4589                 }
4590 
4591                 ARCSTAT_BUMP(arcstat_l2_feeds);
4592 
4593                 size = l2arc_write_size(dev);
4594 
4595                 /*
4596                  * Evict L2ARC buffers that will be overwritten.
4597                  */
4598                 l2arc_evict(dev, size, B_FALSE);
4599 
4600                 /*
4601                  * Write ARC buffers.
4602                  */
4603                 wrote = l2arc_write_buffers(spa, dev, size);
4604 
4605                 /*
4606                  * Calculate interval between writes.
4607                  */
4608                 next = l2arc_write_interval(begin, size, wrote);
4609                 spa_config_exit(spa, SCL_L2ARC, dev);
4610         }
4611 
4612         l2arc_thread_exit = 0;
4613         cv_broadcast(&l2arc_feed_thr_cv);
4614         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
4615         thread_exit();
4616 }
4617 
4618 boolean_t
4619 l2arc_vdev_present(vdev_t *vd)
4620 {
4621         l2arc_dev_t *dev;
4622 
4623         mutex_enter(&l2arc_dev_mtx);
4624         for (dev = list_head(l2arc_dev_list); dev != NULL;
4625             dev = list_next(l2arc_dev_list, dev)) {
4626                 if (dev->l2ad_vdev == vd)
4627                         break;
4628         }
4629         mutex_exit(&l2arc_dev_mtx);
4630 
4631         return (dev != NULL);
4632 }
4633 
4634 /*
4635  * Add a vdev for use by the L2ARC.  By this point the spa has already
4636  * validated the vdev and opened it.
4637  */
4638 void
4639 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4640 {
4641         l2arc_dev_t *adddev;
4642 
4643         ASSERT(!l2arc_vdev_present(vd));
4644 
4645         /*
4646          * Create a new l2arc device entry.
4647          */
4648         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4649         adddev->l2ad_spa = spa;
4650         adddev->l2ad_vdev = vd;
4651         adddev->l2ad_write = l2arc_write_max;
4652         adddev->l2ad_boost = l2arc_write_boost;
4653         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4654         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4655         adddev->l2ad_hand = adddev->l2ad_start;
4656         adddev->l2ad_evict = adddev->l2ad_start;
4657         adddev->l2ad_first = B_TRUE;
4658         adddev->l2ad_writing = B_FALSE;
4659         ASSERT3U(adddev->l2ad_write, >, 0);
4660 
4661         /*
4662          * This is a list of all ARC buffers that are still valid on the
4663          * device.
4664          */
4665         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4666         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4667             offsetof(arc_buf_hdr_t, b_l2node));
4668 
4669         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4670 
4671         /*
4672          * Add device to global list
4673          */
4674         mutex_enter(&l2arc_dev_mtx);
4675         list_insert_head(l2arc_dev_list, adddev);
4676         atomic_inc_64(&l2arc_ndev);
4677         mutex_exit(&l2arc_dev_mtx);
4678 }
4679 
4680 /*
4681  * Remove a vdev from the L2ARC.
4682  */
4683 void
4684 l2arc_remove_vdev(vdev_t *vd)
4685 {
4686         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4687 
4688         /*
4689          * Find the device by vdev
4690          */
4691         mutex_enter(&l2arc_dev_mtx);
4692         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4693                 nextdev = list_next(l2arc_dev_list, dev);
4694                 if (vd == dev->l2ad_vdev) {
4695                         remdev = dev;
4696                         break;
4697                 }
4698         }
4699         ASSERT(remdev != NULL);
4700 
4701         /*
4702          * Remove device from global list
4703          */
4704         list_remove(l2arc_dev_list, remdev);
4705         l2arc_dev_last = NULL;          /* may have been invalidated */
4706         atomic_dec_64(&l2arc_ndev);
4707         mutex_exit(&l2arc_dev_mtx);
4708 
4709         /*
4710          * Clear all buflists and ARC references.  L2ARC device flush.
4711          */
4712         l2arc_evict(remdev, 0, B_TRUE);
4713         list_destroy(remdev->l2ad_buflist);
4714         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4715         kmem_free(remdev, sizeof (l2arc_dev_t));
4716 }
4717 
4718 void
4719 l2arc_init(void)
4720 {
4721         l2arc_thread_exit = 0;
4722         l2arc_ndev = 0;
4723         l2arc_writes_sent = 0;
4724         l2arc_writes_done = 0;
4725 
4726         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4727         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4728         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4729         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4730         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4731 
4732         l2arc_dev_list = &L2ARC_dev_list;
4733         l2arc_free_on_write = &L2ARC_free_on_write;
4734         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4735             offsetof(l2arc_dev_t, l2ad_node));
4736         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4737             offsetof(l2arc_data_free_t, l2df_list_node));
4738 }
4739 
4740 void
4741 l2arc_fini(void)
4742 {
4743         /*
4744          * This is called from dmu_fini(), which is called from spa_fini();
4745          * Because of this, we can assume that all l2arc devices have
4746          * already been removed when the pools themselves were removed.
4747          */
4748 
4749         l2arc_do_free_on_write();
4750 
4751         mutex_destroy(&l2arc_feed_thr_lock);
4752         cv_destroy(&l2arc_feed_thr_cv);
4753         mutex_destroy(&l2arc_dev_mtx);
4754         mutex_destroy(&l2arc_buflist_mtx);
4755         mutex_destroy(&l2arc_free_on_write_mtx);
4756 
4757         list_destroy(l2arc_dev_list);
4758         list_destroy(l2arc_free_on_write);
4759 }
4760 
4761 void
4762 l2arc_start(void)
4763 {
4764         if (!(spa_mode_global & FWRITE))
4765                 return;
4766 
4767         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4768             TS_RUN, minclsyspri);
4769 }
4770 
4771 void
4772 l2arc_stop(void)
4773 {
4774         if (!(spa_mode_global & FWRITE))
4775                 return;
4776 
4777         mutex_enter(&l2arc_feed_thr_lock);
4778         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
4779         l2arc_thread_exit = 1;
4780         while (l2arc_thread_exit != 0)
4781                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4782         mutex_exit(&l2arc_feed_thr_lock);
4783 }