illumos-3741 Wdiff usr/src/uts/common/fs/zfs/arc.c

Print this page

3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * DVA-based Adjustable Replacement Cache
  29   29   *
  30   30   * While much of the theory of operation used here is
  31   31   * based on the self-tuning, low overhead replacement cache
  32   32   * presented by Megiddo and Modha at FAST 2003, there are some
  33   33   * significant differences:
  34   34   *
  35   35   * 1. The Megiddo and Modha model assumes any page is evictable.
  36   36   * Pages in its cache cannot be "locked" into memory.  This makes
  37   37   * the eviction algorithm simple: evict the last page in the list.
  38   38   * This also makes the performance characteristics easy to reason
  39   39   * about.  Our cache is not so simple.  At any given moment, some
  40   40   * subset of the blocks in the cache are un-evictable because we
  41   41   * have handed out a reference to them.  Blocks are only evictable
  42   42   * when there are no external references active.  This makes
  43   43   * eviction far more problematic:  we choose to evict the evictable
  44   44   * blocks that are the "lowest" in the list.
  45   45   *
  46   46   * There are times when it is not possible to evict the requested
  47   47   * space.  In these circumstances we are unable to adjust the cache
  48   48   * size.  To prevent the cache growing unbounded at these times we
  49   49   * implement a "cache throttle" that slows the flow of new data
  50   50   * into the cache until we can make space available.
  51   51   *
  52   52   * 2. The Megiddo and Modha model assumes a fixed cache size.
  53   53   * Pages are evicted when the cache is full and there is a cache
  54   54   * miss.  Our model has a variable sized cache.  It grows with
  55   55   * high use, but also tries to react to memory pressure from the
  56   56   * operating system: decreasing its size when system memory is
  57   57   * tight.
  58   58   *
  59   59   * 3. The Megiddo and Modha model assumes a fixed page size. All
  60   60   * elements of the cache are therefore exactly the same size.  So
  61   61   * when adjusting the cache size following a cache miss, it's simply
  62   62   * a matter of choosing a single page to evict.  In our model, we
  63   63   * have variable sized cache blocks (ranging from 512 bytes to
  64   64   * 128K bytes).  We therefore choose a set of blocks to evict to make
  65   65   * space for a cache miss that approximates as closely as possible
  66   66   * the space used by the new block.
  67   67   *
  68   68   * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69   69   * by N. Megiddo & D. Modha, FAST 2003
  70   70   */
  71   71  
  72   72  /*
  73   73   * The locking model:
  74   74   *
  75   75   * A new reference to a cache buffer can be obtained in two
  76   76   * ways: 1) via a hash table lookup using the DVA as a key,
  77   77   * or 2) via one of the ARC lists.  The arc_read() interface
  78   78   * uses method 1, while the internal arc algorithms for
  79   79   * adjusting the cache use method 2.  We therefore provide two
  80   80   * types of locks: 1) the hash table lock array, and 2) the
  81   81   * arc list locks.
  82   82   *
  83   83   * Buffers do not have their own mutexes, rather they rely on the
  84   84   * hash table mutexes for the bulk of their protection (i.e. most
  85   85   * fields in the arc_buf_hdr_t are protected by these mutexes).
  86   86   *
  87   87   * buf_hash_find() returns the appropriate mutex (held) when it
  88   88   * locates the requested buffer in the hash table.  It returns
  89   89   * NULL for the mutex if the buffer was not in the table.
  90   90   *
  91   91   * buf_hash_remove() expects the appropriate hash mutex to be
  92   92   * already held before it is invoked.
  93   93   *
  94   94   * Each arc state also has a mutex which is used to protect the
  95   95   * buffer list associated with the state.  When attempting to
  96   96   * obtain a hash table lock while holding an arc list lock you
  97   97   * must use: mutex_tryenter() to avoid deadlock.  Also note that
  98   98   * the active state mutex must be held before the ghost state mutex.
  99   99   *
 100  100   * Arc buffers may have an associated eviction callback function.
 101  101   * This function will be invoked prior to removing the buffer (e.g.
 102  102   * in arc_do_user_evicts()).  Note however that the data associated
 103  103   * with the buffer may be evicted prior to the callback.  The callback
 104  104   * must be made with *no locks held* (to prevent deadlock).  Additionally,
 105  105   * the users of callbacks must ensure that their private data is
 106  106   * protected from simultaneous callbacks from arc_buf_evict()
 107  107   * and arc_do_user_evicts().
 108  108   *
 109  109   * Note that the majority of the performance stats are manipulated
 110  110   * with atomic operations.
 111  111   *
 112  112   * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 113  113   *
 114  114   *      - L2ARC buflist creation
 115  115   *      - L2ARC buflist eviction
 116  116   *      - L2ARC write completion, which walks L2ARC buflists
 117  117   *      - ARC header destruction, as it removes from L2ARC buflists
 118  118   *      - ARC header release, as it removes from L2ARC buflists
 119  119   */
 120  120  
 121  121  #include <sys/spa.h>
 122  122  #include <sys/zio.h>
 123  123  #include <sys/zfs_context.h>
 124  124  #include <sys/arc.h>
 125  125  #include <sys/refcount.h>
 126  126  #include <sys/vdev.h>
 127  127  #include <sys/vdev_impl.h>
 128  128  #ifdef _KERNEL
 129  129  #include <sys/vmsystm.h>
 130  130  #include <vm/anon.h>
 131  131  #include <sys/fs/swapnode.h>
 132  132  #include <sys/dnlc.h>
 133  133  #endif
 134  134  #include <sys/callb.h>
 135  135  #include <sys/kstat.h>
 136  136  #include <zfs_fletcher.h>
 137  137  
 138  138  #ifndef _KERNEL
 139  139  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 140  140  boolean_t arc_watch = B_FALSE;
 141  141  int arc_procfd;
 142  142  #endif
 143  143  
 144  144  static kmutex_t         arc_reclaim_thr_lock;
 145  145  static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 146  146  static uint8_t          arc_thread_exit;
 147  147  
 148  148  extern int zfs_write_limit_shift;
 149  149  extern uint64_t zfs_write_limit_max;
 150  150  extern kmutex_t zfs_write_limit_lock;
 151  151  
 152  152  #define ARC_REDUCE_DNLC_PERCENT 3
 153  153  uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154  154  
           /*
            * Reclaim strategy selector.  Only the two values are declared here;
            * the code that consumes them is outside this excerpt.
            */
 155  155  typedef enum arc_reclaim_strategy {
 156  156          ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157  157          ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158  158  } arc_reclaim_strategy_t;
 159  159  
 160  160  /* number of seconds before growing cache again */
 161  161  static int              arc_grow_retry = 60;
 162  162  
 163  163  /* shift of arc_c for calculating both min and max arc_p */
 164  164  static int              arc_p_min_shift = 4;
 165  165  
 166  166  /* log2(fraction of arc to reclaim) */
 167  167  static int              arc_shrink_shift = 5;
 168  168  
 169  169  /*
 170  170   * minimum lifespan of a prefetch block in clock ticks
 171  171   * (initialized in arc_init())
 172  172   */
 173  173  static int              arc_min_prefetch_lifespan;
 174  174  
 175  175  static int arc_dead;
 176  176  
 177  177  /*
 178  178   * The arc has filled available memory and has now warmed up.
 179  179   */
 180  180  static boolean_t arc_warm;
 181  181  
 182  182  /*
 183  183   * These tunables are for performance analysis.
 184  184   */
 185  185  uint64_t zfs_arc_max;
 186  186  uint64_t zfs_arc_min;
 187  187  uint64_t zfs_arc_meta_limit = 0;
 188  188  int zfs_arc_grow_retry = 0;
 189  189  int zfs_arc_shrink_shift = 0;
 190  190  int zfs_arc_p_min_shift = 0;
 191  191  int zfs_disable_dup_eviction = 0;
 192  192  
 193  193  /*
 194  194   * Note that buffers can be in one of 6 states:
 195  195   *      ARC_anon        - anonymous (discussed below)
 196  196   *      ARC_mru         - recently used, currently cached
 197  197   *      ARC_mru_ghost   - recently used, no longer in cache
 198  198   *      ARC_mfu         - frequently used, currently cached
 199  199   *      ARC_mfu_ghost   - frequently used, no longer in cache
 200  200   *      ARC_l2c_only    - exists in L2ARC but not other states
 201  201   * When there are no active references to a buffer, it is
 202  202   * linked onto a list in one of these arc states.  These are
 203  203   * the only buffers that can be evicted or deleted.  Within each
 204  204   * state there are multiple lists, one for meta-data and one for
 205  205   * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 206  206   * etc.) is tracked separately so that it can be managed more
 207  207   * explicitly: favored over data, limited explicitly.
 208  208   *
 209  209   * Anonymous buffers are buffers that are not associated with
 210  210   * a DVA.  These are buffers that hold dirty block copies
 211  211   * before they are written to stable storage.  By definition,
 212  212   * they are "ref'd" and are considered part of arc_mru
 213  213   * that cannot be freed.  Generally, they will acquire a DVA
 214  214   * as they are written and migrate onto the arc_mru list.
 215  215   *
 216  216   * The ARC_l2c_only state is for buffers that are in the second
 217  217   * level ARC but no longer in any of the ARC_m* lists.  The second
 218  218   * level ARC itself may also contain buffers that are in any of
 219  219   * the ARC_m* states - meaning that a buffer can exist in two
 220  220   * places.  The reason for the ARC_l2c_only state is to keep the
 221  221   * buffer header in the hash table, so that reads that hit the
 222  222   * second level ARC benefit from these fast lookups.
 223  223   */
 224  224  
           /*
            * Bookkeeping for one ARC state.  The list and evictable-size arrays
            * hold ARC_BUFC_NUMTYPES entries each; per the state overview comment
            * in this file, meta-data and non-meta-data are tracked on separate
            * lists.
            */
 225  225  typedef struct arc_state {
 226  226          list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 227  227          uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 228  228          uint64_t arcs_size;     /* total amount of data in this state */
 229  229          kmutex_t arcs_mtx;      /* protects this state's buffer lists */
 230  230  } arc_state_t;
 231  231  
 232  232  /* The 6 states: */
 233  233  static arc_state_t ARC_anon;
 234  234  static arc_state_t ARC_mru;
 235  235  static arc_state_t ARC_mru_ghost;
 236  236  static arc_state_t ARC_mfu;
 237  237  static arc_state_t ARC_mfu_ghost;
 238  238  static arc_state_t ARC_l2c_only;
 239  239  
 240  240  typedef struct arc_stats {
 241  241          kstat_named_t arcstat_hits;
 242  242          kstat_named_t arcstat_misses;
 243  243          kstat_named_t arcstat_demand_data_hits;
 244  244          kstat_named_t arcstat_demand_data_misses;
 245  245          kstat_named_t arcstat_demand_metadata_hits;
 246  246          kstat_named_t arcstat_demand_metadata_misses;

↓ open down ↓

246 lines elided

↑ open up ↑

 247  247          kstat_named_t arcstat_prefetch_data_hits;
 248  248          kstat_named_t arcstat_prefetch_data_misses;
 249  249          kstat_named_t arcstat_prefetch_metadata_hits;
 250  250          kstat_named_t arcstat_prefetch_metadata_misses;
 251  251          kstat_named_t arcstat_mru_hits;
 252  252          kstat_named_t arcstat_mru_ghost_hits;
 253  253          kstat_named_t arcstat_mfu_hits;
 254  254          kstat_named_t arcstat_mfu_ghost_hits;
 255  255          kstat_named_t arcstat_deleted;
 256  256          kstat_named_t arcstat_recycle_miss;
      257 +        /*
      258 +         * Number of buffers that could not be evicted because the hash lock
      259 +         * was held by another thread.  The lock may not necessarily be held
      260 +         * by something using the same buffer, since hash locks are shared
      261 +         * by multiple buffers.
      262 +         */
 257  263          kstat_named_t arcstat_mutex_miss;
      264 +        /*
      265 +         * Number of buffers skipped because they have I/O in progress, are
      266 +         * indirect prefetch buffers that have not lived long enough, or are
      267 +         * not from the spa we're trying to evict from.
      268 +         */
 258  269          kstat_named_t arcstat_evict_skip;
 259  270          kstat_named_t arcstat_evict_l2_cached;
 260  271          kstat_named_t arcstat_evict_l2_eligible;
 261  272          kstat_named_t arcstat_evict_l2_ineligible;
 262  273          kstat_named_t arcstat_hash_elements;
 263  274          kstat_named_t arcstat_hash_elements_max;
 264  275          kstat_named_t arcstat_hash_collisions;
 265  276          kstat_named_t arcstat_hash_chains;
 266  277          kstat_named_t arcstat_hash_chain_max;
 267  278          kstat_named_t arcstat_p;

 268  279          kstat_named_t arcstat_c;
 269  280          kstat_named_t arcstat_c_min;
 270  281          kstat_named_t arcstat_c_max;
 271  282          kstat_named_t arcstat_size;
 272  283          kstat_named_t arcstat_hdr_size;
 273  284          kstat_named_t arcstat_data_size;
 274  285          kstat_named_t arcstat_other_size;
 275  286          kstat_named_t arcstat_l2_hits;
 276  287          kstat_named_t arcstat_l2_misses;
 277  288          kstat_named_t arcstat_l2_feeds;
 278  289          kstat_named_t arcstat_l2_rw_clash;
 279  290          kstat_named_t arcstat_l2_read_bytes;
 280  291          kstat_named_t arcstat_l2_write_bytes;
 281  292          kstat_named_t arcstat_l2_writes_sent;
 282  293          kstat_named_t arcstat_l2_writes_done;
 283  294          kstat_named_t arcstat_l2_writes_error;
 284  295          kstat_named_t arcstat_l2_writes_hdr_miss;
 285  296          kstat_named_t arcstat_l2_evict_lock_retry;
 286  297          kstat_named_t arcstat_l2_evict_reading;
 287  298          kstat_named_t arcstat_l2_free_on_write;
 288  299          kstat_named_t arcstat_l2_abort_lowmem;
 289  300          kstat_named_t arcstat_l2_cksum_bad;
 290  301          kstat_named_t arcstat_l2_io_error;
 291  302          kstat_named_t arcstat_l2_size;
 292  303          kstat_named_t arcstat_l2_hdr_size;
 293  304          kstat_named_t arcstat_memory_throttle_count;
 294  305          kstat_named_t arcstat_duplicate_buffers;
 295  306          kstat_named_t arcstat_duplicate_buffers_size;
 296  307          kstat_named_t arcstat_duplicate_reads;
 297  308          kstat_named_t arcstat_meta_used;
 298  309          kstat_named_t arcstat_meta_limit;
 299  310          kstat_named_t arcstat_meta_max;
 300  311  } arc_stats_t;
 301  312  
           /*
            * The single static instance of arc_stats_t.  The initializers are
            * positional (no designators), so each entry must stay in the same
            * order as the corresponding field of arc_stats_t.  Every counter is
            * declared as KSTAT_DATA_UINT64.
            */
 302  313  static arc_stats_t arc_stats = {
 303  314          { "hits",                       KSTAT_DATA_UINT64 },
 304  315          { "misses",                     KSTAT_DATA_UINT64 },
 305  316          { "demand_data_hits",           KSTAT_DATA_UINT64 },
 306  317          { "demand_data_misses",         KSTAT_DATA_UINT64 },
 307  318          { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 308  319          { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 309  320          { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 310  321          { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 311  322          { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 312  323          { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 313  324          { "mru_hits",                   KSTAT_DATA_UINT64 },
 314  325          { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 315  326          { "mfu_hits",                   KSTAT_DATA_UINT64 },
 316  327          { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 317  328          { "deleted",                    KSTAT_DATA_UINT64 },
 318  329          { "recycle_miss",               KSTAT_DATA_UINT64 },
 319  330          { "mutex_miss",                 KSTAT_DATA_UINT64 },
 320  331          { "evict_skip",                 KSTAT_DATA_UINT64 },
 321  332          { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 322  333          { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 323  334          { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 324  335          { "hash_elements",              KSTAT_DATA_UINT64 },
 325  336          { "hash_elements_max",          KSTAT_DATA_UINT64 },
 326  337          { "hash_collisions",            KSTAT_DATA_UINT64 },
 327  338          { "hash_chains",                KSTAT_DATA_UINT64 },
 328  339          { "hash_chain_max",             KSTAT_DATA_UINT64 },
 329  340          { "p",                          KSTAT_DATA_UINT64 },
 330  341          { "c",                          KSTAT_DATA_UINT64 },
 331  342          { "c_min",                      KSTAT_DATA_UINT64 },
 332  343          { "c_max",                      KSTAT_DATA_UINT64 },
 333  344          { "size",                       KSTAT_DATA_UINT64 },
 334  345          { "hdr_size",                   KSTAT_DATA_UINT64 },
 335  346          { "data_size",                  KSTAT_DATA_UINT64 },
 336  347          { "other_size",                 KSTAT_DATA_UINT64 },
 337  348          { "l2_hits",                    KSTAT_DATA_UINT64 },
 338  349          { "l2_misses",                  KSTAT_DATA_UINT64 },
 339  350          { "l2_feeds",                   KSTAT_DATA_UINT64 },
 340  351          { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 341  352          { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 342  353          { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 343  354          { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 344  355          { "l2_writes_done",             KSTAT_DATA_UINT64 },
 345  356          { "l2_writes_error",            KSTAT_DATA_UINT64 },
 346  357          { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 347  358          { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 348  359          { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 349  360          { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 350  361          { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 351  362          { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 352  363          { "l2_io_error",                KSTAT_DATA_UINT64 },
 353  364          { "l2_size",                    KSTAT_DATA_UINT64 },
 354  365          { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 355  366          { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 356  367          { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 357  368          { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 358  369          { "duplicate_reads",            KSTAT_DATA_UINT64 },
 359  370          { "arc_meta_used",              KSTAT_DATA_UINT64 },
 360  371          { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 361  372          { "arc_meta_max",               KSTAT_DATA_UINT64 }
 362  373  };
 363  374  
           /* Names the 64-bit value slot of one arc_stats counter. */
 364  375  #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 365  376  
           /*
            * Atomically add val to a counter via atomic_add_64().
            * NOTE(review): the expansion already ends in ';', so writing
            * "ARCSTAT_INCR(...);" produces an extra empty statement.  That
            * breaks an unbraced if body that is followed by else.
            */
 366  377  #define ARCSTAT_INCR(stat, val) \
 367  378          atomic_add_64(&arc_stats.stat.value.ui64, (val));
 368  379  
           /* Add one to / subtract one from a counter (both use ARCSTAT_INCR). */
 369  380  #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 370  381  #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 371  382  
           /*
            * Raise a counter to val when val is larger than the stored value.
            * The loop re-reads the counter and retries atomic_cas_64() for as
            * long as val is still larger and the compare-and-swap result differs
            * from the value just read.  Note that val is evaluated more than
            * once, and that the expansion is a brace block, not an expression.
            */
 372  383  #define ARCSTAT_MAX(stat, val) {                                        \
 373  384          uint64_t m;                                                     \
 374  385          while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 375  386              (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 376  387                  continue;                                               \
 377  388  }
 378  389  
           /*
            * Update the "<stat>_max" counter from the current value of <stat>,
            * e.g. hash_elements -> hash_elements_max.
            */
 379  390  #define ARCSTAT_MAXSTAT(stat) \
 380  391          ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 381  392  
 382  393  /*
 383  394   * We define a macro to allow ARC hits/misses to be easily broken down by
 384  395   * two separate conditions, giving a total of four different subtypes for
 385  396   * each of hits and misses (so eight statistics total).
           *
           * The counter that gets bumped is arcstat_<a>_<b>_<stat>, where <a> is
           * stat1 when cond1 is true and notstat1 otherwise, and <b> is stat2
           * when cond2 is true and notstat2 otherwise.  Exactly one counter is
           * bumped per use.
           *
           * NOTE(review): the expansion is a bare if/else statement (not wrapped
           * in do { } while (0)), so it should only be used where a full
           * statement is expected and not as the unbraced body of another if.
 386  397   */
 387  398  #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 388  399          if (cond1) {                                                    \
 389  400                  if (cond2) {                                            \
 390  401                          ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 391  402                  } else {                                                \
 392  403                          ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 393  404                  }                                                       \
 394  405          } else {                                                        \
 395  406                  if (cond2) {                                            \
 396  407                          ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 397  408                  } else {                                                \
 398  409                          ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 399  410                  }                                                       \
 400  411          }
 401  412  
 402  413  kstat_t                 *arc_ksp;
 403  414  static arc_state_t      *arc_anon;
 404  415  static arc_state_t      *arc_mru;
 405  416  static arc_state_t      *arc_mru_ghost;
 406  417  static arc_state_t      *arc_mfu;
 407  418  static arc_state_t      *arc_mfu_ghost;
 408  419  static arc_state_t      *arc_l2c_only;
 409  420  
 410  421  /*
 411  422   * There are several ARC variables that are critical to export as kstats --
 412  423   * but we don't want to have to grovel around in the kstat whenever we wish to
 413  424   * manipulate them.  For these variables, we therefore define them to be in
 414  425   * terms of the statistic variable.  This assures that we are not introducing
 415  426   * the possibility of inconsistency by having shadow copies of the variables,
 416  427   * while still allowing the code to be readable.
 417  428   */
 418  429  #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 419  430  #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 420  431  #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 421  432  #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 422  433  #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 423  434  #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 424  435  #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 425  436  #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 426  437  
 427  438  static int              arc_no_grow;    /* Don't try to grow cache size */
 428  439  static uint64_t         arc_tempreserve;
 429  440  static uint64_t         arc_loaned_bytes;
 430  441  
 431  442  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 432  443  
 433  444  typedef struct arc_callback arc_callback_t;
 434  445  
           /*
            * One callback registration.  Entries can be chained through
            * acb_next, and arc_buf_hdr.b_acb points at an arc_callback_t.
            * NOTE(review): presumably b_acb is the head of the list of callbacks
            * pending on an in-flight read -- confirm against the read path,
            * which is outside this excerpt.
            */
 435  446  struct arc_callback {
 436  447          void                    *acb_private;
 437  448          arc_done_func_t         *acb_done;
 438  449          arc_buf_t               *acb_buf;
 439  450          zio_t                   *acb_zio_dummy;
 440  451          arc_callback_t          *acb_next;
 441  452  };
 442  453  
 443  454  typedef struct arc_write_callback arc_write_callback_t;
 444  455  
           /*
            * Callback context for a write: a private pointer, two callbacks of
            * type arc_done_func_t ("ready" and "done"), and the buffer involved.
            * NOTE(review): when each callback fires is decided by the write
            * path, which is outside this excerpt.
            */
 445  456  struct arc_write_callback {
 446  457          void            *awcb_private;
 447  458          arc_done_func_t *awcb_ready;
 448  459          arc_done_func_t *awcb_done;
 449  460          arc_buf_t       *awcb_buf;
 450  461  };
 451  462  
           /*
            * Header describing one cached block.  The fields are grouped by the
            * synchronization rule that governs them; each group is introduced by
            * a one-line label below.  b_spa, b_dva and b_birth are the values
            * HDR_LOCK() feeds to the hash function to pick the hash lock.
            */
 452  463  struct arc_buf_hdr {
 453  464          /* protected by hash lock */
 454  465          dva_t                   b_dva;
 455  466          uint64_t                b_birth;
 456  467          uint64_t                b_cksum0;
 457  468  
 458  469          kmutex_t                b_freeze_lock;
 459  470          zio_cksum_t             *b_freeze_cksum;
 460  471          void                    *b_thawed;
 461  472  
                   /* b_hash_next: next header in the same hash chain */
 462  473          arc_buf_hdr_t           *b_hash_next;
 463  474          arc_buf_t               *b_buf;
 464  475          uint32_t                b_flags;        /* ARC_* flag bits */
 465  476          uint32_t                b_datacnt;
 466  477  
 467  478          arc_callback_t          *b_acb;
 468  479          kcondvar_t              b_cv;
 469  480  
 470  481          /* immutable */
 471  482          arc_buf_contents_t      b_type;
 472  483          uint64_t                b_size;
 473  484          uint64_t                b_spa;
 474  485  
 475  486          /* protected by arc state mutex */
 476  487          arc_state_t             *b_state;
 477  488          list_node_t             b_arc_node;
 478  489  
 479  490          /* updated atomically */
 480  491          clock_t                 b_arc_access;
 481  492  
 482  493          /* self protecting */
 483  494          refcount_t              b_refcnt;
 484  495  
                   /*
                    * NOTE(review): the two L2ARC fields below carry no label of
                    * their own.  The locking overview at the top of the file
                    * says L2ARC buflists are covered by l2arc_buflist_mtx --
                    * confirm that this is what protects them.
                    */
 485  496          l2arc_buf_hdr_t         *b_l2hdr;
 486  497          list_node_t             b_l2node;
 487  498  };
 488  499  
 489  500  static arc_buf_t *arc_eviction_list;
 490  501  static kmutex_t arc_eviction_mtx;
 491  502  static arc_buf_hdr_t arc_eviction_hdr;
 492  503  static void arc_get_data_buf(arc_buf_t *buf);
 493  504  static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 494  505  static int arc_evict_needed(arc_buf_contents_t type);
 495  506  static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 496  507  static void arc_buf_watch(arc_buf_t *buf);
 497  508  
 498  509  static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 499  510  
           /*
            * True when state is one of arc_mru_ghost, arc_mfu_ghost or
            * arc_l2c_only.  The argument is evaluated up to three times.
            */
 500  511  #define GHOST_STATE(state)      \
 501  512          ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 502  513          (state) == arc_l2c_only)
 503  514  
 504  515  /*
 505  516   * Private ARC flags.  These flags are private ARC only flags that will show up
 506  517   * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 507  518   * be passed in as arc_flags in things like arc_read.  However, these flags
 508  519   * should never be passed and should only be set by ARC code.  When adding new
 509  520   * public flags, make sure not to smash the private ones.
 510  521   */
 511  522  
 512  523  #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 513  524  #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 514  525  #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 515  526  #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 516  527  #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 517  528  #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 518  529  #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 519  530  #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 520  531  #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 521  532  #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 522  533  
           /*
            * Flag predicates on a header's b_flags.  Each one yields the masked
            * bit (zero or the flag value), not a normalized 0/1, except
            * HDR_L2_READING which is a logical AND.  ARC_PREFETCH and ARC_L2CACHE
            * are not among the private flags defined in this file section; they
            * are expected to come from the public flag declarations.
            */
 523  534  #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 524  535  #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 525  536  #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 526  537  #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 527  538  #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 528  539  #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 529  540  #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 530  541  #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
           /* I/O is in progress and the header has an L2ARC header attached. */
 531  542  #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
 532  543                                      (hdr)->b_l2hdr != NULL)
 533  544  #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 534  545  #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 535  546  #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 536  547  
 537  548  /*
 538  549   * Other sizes
 539  550   */
 540  551  
 541  552  #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 542  553  #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 543  554  
 544  555  /*
 545  556   * Hash table routines
 546  557   */
 547  558  
           /*
            * One hash-table mutex.  In kernel builds the mutex is followed by a
            * pad array of (HT_LOCK_PAD - sizeof (kmutex_t)) bytes, so the members
            * total HT_LOCK_PAD bytes; user-level builds carry no padding.
            * NOTE(review): presumably the padding keeps neighbouring locks on
            * separate cache lines -- confirm.
            */
 548  559  #define HT_LOCK_PAD     64
 549  560  
 550  561  struct ht_lock {
 551  562          kmutex_t        ht_lock;
 552  563  #ifdef _KERNEL
 553  564          unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 554  565  #endif
 555  566  };
 556  567  
           /*
            * The buffer hash table.  ht_mask is ANDed with the hash value to get
            * a table index (see BUF_HASH_INDEX), and ht_table holds pointers to
            * arc_buf_hdr_t.  There are only BUF_LOCKS mutexes; a lock is chosen
            * with (index & (BUF_LOCKS-1)), so one lock covers several table
            * slots, and that masking relies on BUF_LOCKS being a power of two.
            */
 557  568  #define BUF_LOCKS 256
 558  569  typedef struct buf_hash_table {
 559  570          uint64_t ht_mask;
 560  571          arc_buf_hdr_t **ht_table;
 561  572          struct ht_lock ht_locks[BUF_LOCKS];
 562  573  } buf_hash_table_t;
 563  574  
 564  575  static buf_hash_table_t buf_hash_table;
 565  576  
 566  577  #define BUF_HASH_INDEX(spa, dva, birth) \
 567  578          (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 568  579  #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 569  580  #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 570  581  #define HDR_LOCK(hdr) \
 571  582          (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 572  583  
           /*
            * CRC-64 lookup table.  It is filled in by buf_init() and consumed by
            * buf_hash().
            */
 573  584  uint64_t zfs_crc64_table[256];
 574  585  
 575  586  /*
 576  587   * Level 2 ARC
 577  588   */
 578  589  
           /* Compile-time defaults for the tunables declared below. */
 579  590  #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 580  591  #define L2ARC_HEADROOM          2               /* num of writes */
 581  592  #define L2ARC_FEED_SECS         1               /* caching interval secs */
 582  593  #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 583  594  
           /* Shorthand for the L2ARC write counters kept in the ARC statistics. */
 584  595  #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 585  596  #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 586  597  
 587  598  /*
 588  599   * L2ARC Performance Tunables
            *
            * These are non-static globals initialized from the defaults above.
 589  600   */
 590  601  uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 591  602  uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 592  603  uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 593  604  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 594  605  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 595  606  boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 596  607  boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 597  608  boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 598  609  
 599  610  /*
 600  611   * L2ARC Internals
 601  612   */
           /*
            * Per-device L2ARC state.  Instances are linked together through
            * l2ad_node.
            */
 602  613  typedef struct l2arc_dev {
 603  614          vdev_t                  *l2ad_vdev;     /* vdev */
 604  615          spa_t                   *l2ad_spa;      /* spa */
 605  616          uint64_t                l2ad_hand;      /* next write location */
 606  617          uint64_t                l2ad_write;     /* desired write size, bytes */
 607  618          uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 608  619          uint64_t                l2ad_start;     /* first addr on device */
 609  620          uint64_t                l2ad_end;       /* last addr on device */
 610  621          uint64_t                l2ad_evict;     /* last addr eviction reached */
 611  622          boolean_t               l2ad_first;     /* first sweep through */
 612  623          boolean_t               l2ad_writing;   /* currently writing */
 613  624          list_t                  *l2ad_buflist;  /* buffer list */
 614  625          list_node_t             l2ad_node;      /* device list node */
 615  626  } l2arc_dev_t;
 616  627  
           /*
            * File-scope L2ARC state.  The upper-case list_t objects are the list
            * storage; the lower-case pointers of the same name are declared
            * alongside them (presumably set to point at that storage during
            * L2ARC initialization -- confirm).
            */
 617  628  static list_t L2ARC_dev_list;                   /* device list */
 618  629  static list_t *l2arc_dev_list;                  /* device list pointer */
 619  630  static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 620  631  static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 621  632  static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 622  633  static list_t L2ARC_free_on_write;              /* free after write buf list */
 623  634  static list_t *l2arc_free_on_write;             /* free after write list ptr */
 624  635  static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 625  636  static uint64_t l2arc_ndev;                     /* number of devices */
 626  637  
           /* Context saved for an L2ARC read (see the l2arc_read_done() prototype). */
 627  638  typedef struct l2arc_read_callback {
 628  639          arc_buf_t       *l2rcb_buf;             /* read buffer */
 629  640          spa_t           *l2rcb_spa;             /* spa */
 630  641          blkptr_t        l2rcb_bp;               /* original blkptr */
 631  642          zbookmark_t     l2rcb_zb;               /* original bookmark */
 632  643          int             l2rcb_flags;            /* original flags */
 633  644  } l2arc_read_callback_t;
 634  645  
           /* Context saved for an L2ARC write. */
 635  646  typedef struct l2arc_write_callback {
 636  647          l2arc_dev_t     *l2wcb_dev;             /* device info */
 637  648          arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 638  649  } l2arc_write_callback_t;
 639  650  
           /* L2ARC portion of a buffer header: which device, and where on it. */
 640  651  struct l2arc_buf_hdr {
 641  652          /* protected by arc_buf_hdr mutex */
 642  653          l2arc_dev_t     *b_dev;                 /* L2ARC device */
 643  654          uint64_t        b_daddr;                /* disk address, offset byte */
 644  655  };
 645  656  
           /*
            * Entry describing a data buffer whose release is deferred: l2df_func
            * takes the data pointer and its size.
            */
 646  657  typedef struct l2arc_data_free {
 647  658          /* protected by l2arc_free_on_write_mtx */
 648  659          void            *l2df_data;
 649  660          size_t          l2df_size;
 650  661          void            (*l2df_func)(void *, size_t);
 651  662          list_node_t     l2df_list_node;
 652  663  } l2arc_data_free_t;
 653  664  
           /* Lock, condition variable and exit flag for the L2ARC feed thread. */
 654  665  static kmutex_t l2arc_feed_thr_lock;
 655  666  static kcondvar_t l2arc_feed_thr_cv;
 656  667  static uint8_t l2arc_thread_exit;
 657  668  
           /* Forward declarations for L2ARC helpers defined later in this file. */
 658  669  static void l2arc_read_done(zio_t *zio);
 659  670  static void l2arc_hdr_stat_add(void);
 660  671  static void l2arc_hdr_stat_remove(void);
 661  672  
 662  673  static uint64_t
 663  674  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 664  675  {
 665  676          uint8_t *vdva = (uint8_t *)dva;
 666  677          uint64_t crc = -1ULL;
 667  678          int i;
 668  679  
 669  680          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 670  681  
 671  682          for (i = 0; i < sizeof (dva_t); i++)
 672  683                  crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 673  684  
 674  685          crc ^= (spa>>8) ^ birth;
 675  686  
 676  687          return (crc);
 677  688  }
 678  689  
 679  690  #define BUF_EMPTY(buf)                                          \
 680  691          ((buf)->b_dva.dva_word[0] == 0 &&                       \
 681  692          (buf)->b_dva.dva_word[1] == 0 &&                        \
 682  693          (buf)->b_birth == 0)
 683  694  
 684  695  #define BUF_EQUAL(spa, dva, birth, buf)                         \
 685  696          ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 686  697          ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 687  698          ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 688  699  
 689  700  static void
 690  701  buf_discard_identity(arc_buf_hdr_t *hdr)
 691  702  {
 692  703          hdr->b_dva.dva_word[0] = 0;
 693  704          hdr->b_dva.dva_word[1] = 0;
 694  705          hdr->b_birth = 0;
 695  706          hdr->b_cksum0 = 0;
 696  707  }
 697  708  
 698  709  static arc_buf_hdr_t *
 699  710  buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 700  711  {
 701  712          uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 702  713          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 703  714          arc_buf_hdr_t *buf;
 704  715  
 705  716          mutex_enter(hash_lock);
 706  717          for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 707  718              buf = buf->b_hash_next) {
 708  719                  if (BUF_EQUAL(spa, dva, birth, buf)) {
 709  720                          *lockp = hash_lock;
 710  721                          return (buf);
 711  722                  }
 712  723          }
 713  724          mutex_exit(hash_lock);
 714  725          *lockp = NULL;
 715  726          return (NULL);
 716  727  }
 717  728  
 718  729  /*
 719  730   * Insert an entry into the hash table.  If there is already an element
 720  731   * equal to elem in the hash table, then the already existing element
 721  732   * will be returned and the new element will not be inserted.
 722  733   * Otherwise returns NULL.
            *
            * In both cases *lockp is set to the hash lock for the entry's chain,
            * and that lock is still held when this function returns.
 723  734   */
 724  735  static arc_buf_hdr_t *
 725  736  buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 726  737  {
 727  738          uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 728  739          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 729  740          arc_buf_hdr_t *fbuf;
 730  741          uint32_t i;
 731  742  
 732  743          ASSERT(!HDR_IN_HASH_TABLE(buf));
 733  744          *lockp = hash_lock;
 734  745          mutex_enter(hash_lock);
                   /* Walk the chain looking for a duplicate; i counts its entries. */
 735  746          for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 736  747              fbuf = fbuf->b_hash_next, i++) {
 737  748                  if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 738  749                          return (fbuf);
 739  750          }
 740  751  
                   /* No duplicate: push the new header onto the front of the chain. */
 741  752          buf->b_hash_next = buf_hash_table.ht_table[idx];
 742  753          buf_hash_table.ht_table[idx] = buf;
 743  754          buf->b_flags |= ARC_IN_HASH_TABLE;
 744  755  
 745  756          /* collect some hash table performance data */
 746  757          if (i > 0) {
 747  758                  ARCSTAT_BUMP(arcstat_hash_collisions);
                           /* exactly one prior entry: this bucket just became a chain */
 748  759                  if (i == 1)
 749  760                          ARCSTAT_BUMP(arcstat_hash_chains);
 750  761  
 751  762                  ARCSTAT_MAX(arcstat_hash_chain_max, i);
 752  763          }
 753  764  
 754  765          ARCSTAT_BUMP(arcstat_hash_elements);
 755  766          ARCSTAT_MAXSTAT(arcstat_hash_elements);
 756  767  
 757  768          return (NULL);
 758  769  }
 759  770  
           /*
            * Unlink a header from its hash chain and clear ARC_IN_HASH_TABLE.
            * The caller must hold the chain's hash lock and the header must
            * currently be in the table (both are asserted).
            */
 760  771  static void
 761  772  buf_hash_remove(arc_buf_hdr_t *buf)
 762  773  {
 763  774          arc_buf_hdr_t *fbuf, **bufp;
 764  775          uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 765  776  
 766  777          ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 767  778          ASSERT(HDR_IN_HASH_TABLE(buf));
 768  779  
                   /* Find the link (chain head or a b_hash_next) that points at buf. */
 769  780          bufp = &buf_hash_table.ht_table[idx];
 770  781          while ((fbuf = *bufp) != buf) {
 771  782                  ASSERT(fbuf != NULL);
 772  783                  bufp = &fbuf->b_hash_next;
 773  784          }
 774  785          *bufp = buf->b_hash_next;
 775  786          buf->b_hash_next = NULL;
 776  787          buf->b_flags &= ~ARC_IN_HASH_TABLE;
 777  788  
 778  789          /* collect some hash table performance data */
 779  790          ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 780  791  
                   /* If exactly one entry remains, the bucket is no longer a chain. */
 781  792          if (buf_hash_table.ht_table[idx] &&
 782  793              buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 783  794                  ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 784  795  }
 785  796  
 786  797  /*
 787  798   * Global data structures and functions for the buf kmem cache.
 788  799   */
 789  800  static kmem_cache_t *hdr_cache;
 790  801  static kmem_cache_t *buf_cache;
 791  802  
 792  803  static void
 793  804  buf_fini(void)
 794  805  {
 795  806          int i;
 796  807  
 797  808          kmem_free(buf_hash_table.ht_table,
 798  809              (buf_hash_table.ht_mask + 1) * sizeof (void *));
 799  810          for (i = 0; i < BUF_LOCKS; i++)
 800  811                  mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 801  812          kmem_cache_destroy(hdr_cache);
 802  813          kmem_cache_destroy(buf_cache);
 803  814  }
 804  815  
 805  816  /*
 806  817   * Constructor callback - called when the cache is empty
 807  818   * and a new buf is requested.
 808  819   */
 809  820  /* ARGSUSED */
 810  821  static int
 811  822  hdr_cons(void *vbuf, void *unused, int kmflag)
 812  823  {
 813  824          arc_buf_hdr_t *buf = vbuf;
 814  825  
 815  826          bzero(buf, sizeof (arc_buf_hdr_t));
 816  827          refcount_create(&buf->b_refcnt);
 817  828          cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 818  829          mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 819  830          arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 820  831  
 821  832          return (0);
 822  833  }
 823  834  
 824  835  /* ARGSUSED */
 825  836  static int
 826  837  buf_cons(void *vbuf, void *unused, int kmflag)
 827  838  {
 828  839          arc_buf_t *buf = vbuf;
 829  840  
 830  841          bzero(buf, sizeof (arc_buf_t));
 831  842          mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 832  843          arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 833  844  
 834  845          return (0);
 835  846  }
 836  847  
 837  848  /*
 838  849   * Destructor callback - called when a cached buf is
 839  850   * no longer required.
 840  851   */
 841  852  /* ARGSUSED */
 842  853  static void
 843  854  hdr_dest(void *vbuf, void *unused)
 844  855  {
 845  856          arc_buf_hdr_t *buf = vbuf;
 846  857  
 847  858          ASSERT(BUF_EMPTY(buf));
 848  859          refcount_destroy(&buf->b_refcnt);
 849  860          cv_destroy(&buf->b_cv);
 850  861          mutex_destroy(&buf->b_freeze_lock);
 851  862          arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 852  863  }
 853  864  
 854  865  /* ARGSUSED */
 855  866  static void
 856  867  buf_dest(void *vbuf, void *unused)
 857  868  {
 858  869          arc_buf_t *buf = vbuf;
 859  870  
 860  871          mutex_destroy(&buf->b_evict_lock);
 861  872          arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 862  873  }
 863  874  
 864  875  /*
 865  876   * Reclaim callback -- invoked when memory is low.
 866  877   */
 867  878  /* ARGSUSED */
 868  879  static void
 869  880  hdr_recl(void *unused)
 870  881  {
 871  882          dprintf("hdr_recl called\n");
 872  883          /*
 873  884           * umem calls the reclaim func when we destroy the buf cache,
 874  885           * which is after we do arc_fini().
 875  886           */
 876  887          if (!arc_dead)
 877  888                  cv_signal(&arc_reclaim_thr_cv);
 878  889  }
 879  890  
           /*
            * Set up the buffer hash table: size and allocate the chain array,
            * create the header and buffer kmem caches, generate the CRC-64 lookup
            * table used by buf_hash(), and initialize the hash locks.
            */
 880  891  static void
 881  892  buf_init(void)
 882  893  {
 883  894          uint64_t *ct;
 884  895          uint64_t hsize = 1ULL << 12;
 885  896          int i, j;
 886  897  
 887  898          /*
 888  899           * The hash table is big enough to fill all of physical memory
 889  900           * with an average 64K block size.  The table will take up
 890  901           * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 891  902           */
 892  903          while (hsize * 65536 < physmem * PAGESIZE)
 893  904                  hsize <<= 1;
 894  905  retry:
 895  906          buf_hash_table.ht_mask = hsize - 1;
 896  907          buf_hash_table.ht_table =
 897  908              kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
                   /* KM_NOSLEEP may fail: halve the table size and try again. */
 898  909          if (buf_hash_table.ht_table == NULL) {
 899  910                  ASSERT(hsize > (1ULL << 8));
 900  911                  hsize >>= 1;
 901  912                  goto retry;
 902  913          }
 903  914  
 904  915          hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 905  916              0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 906  917          buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 907  918              0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 908  919  
                   /*
                    * Generate the CRC-64 table: entry i starts as i and is run
                    * through eight shift steps, XORing in ZFS_CRC64_POLY whenever
                    * the bit shifted out is set.
                    */
 909  920          for (i = 0; i < 256; i++)
 910  921                  for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 911  922                          *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 912  923  
 913  924          for (i = 0; i < BUF_LOCKS; i++) {
 914  925                  mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 915  926                      NULL, MUTEX_DEFAULT, NULL);
 916  927          }
 917  928  }
 918  929  
           /* hz >> 4 is one sixteenth of hz, i.e. about 62 ms worth of ticks. */
 919  930  #define ARC_MINTIME     (hz>>4) /* 62 ms */
 920  931  
 921  932  static void
 922  933  arc_cksum_verify(arc_buf_t *buf)
 923  934  {
 924  935          zio_cksum_t zc;
 925  936  
 926  937          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 927  938                  return;
 928  939  
 929  940          mutex_enter(&buf->b_hdr->b_freeze_lock);
 930  941          if (buf->b_hdr->b_freeze_cksum == NULL ||
 931  942              (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 932  943                  mutex_exit(&buf->b_hdr->b_freeze_lock);
 933  944                  return;
 934  945          }
 935  946          fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 936  947          if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 937  948                  panic("buffer modified while frozen!");
 938  949          mutex_exit(&buf->b_hdr->b_freeze_lock);
 939  950  }
 940  951  
 941  952  static int
 942  953  arc_cksum_equal(arc_buf_t *buf)
 943  954  {
 944  955          zio_cksum_t zc;
 945  956          int equal;
 946  957  
 947  958          mutex_enter(&buf->b_hdr->b_freeze_lock);
 948  959          fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 949  960          equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 950  961          mutex_exit(&buf->b_hdr->b_freeze_lock);
 951  962  
 952  963          return (equal);
 953  964  }
 954  965  
 955  966  static void
 956  967  arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 957  968  {
 958  969          if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 959  970                  return;
 960  971  
 961  972          mutex_enter(&buf->b_hdr->b_freeze_lock);
 962  973          if (buf->b_hdr->b_freeze_cksum != NULL) {
 963  974                  mutex_exit(&buf->b_hdr->b_freeze_lock);
 964  975                  return;
 965  976          }
 966  977          buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 967  978          fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 968  979              buf->b_hdr->b_freeze_cksum);
 969  980          mutex_exit(&buf->b_hdr->b_freeze_lock);
 970  981          arc_buf_watch(buf);
 971  982  }
 972  983  
 973  984  #ifndef _KERNEL
 974  985  typedef struct procctl {
 975  986          long cmd;
 976  987          prwatch_t prwatch;
 977  988  } procctl_t;
 978  989  #endif
 979  990  
 980  991  /* ARGSUSED */
 981  992  static void
 982  993  arc_buf_unwatch(arc_buf_t *buf)
 983  994  {
 984  995  #ifndef _KERNEL
 985  996          if (arc_watch) {
 986  997                  int result;
 987  998                  procctl_t ctl;
 988  999                  ctl.cmd = PCWATCH;
 989 1000                  ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 990 1001                  ctl.prwatch.pr_size = 0;
 991 1002                  ctl.prwatch.pr_wflags = 0;
 992 1003                  result = write(arc_procfd, &ctl, sizeof (ctl));
 993 1004                  ASSERT3U(result, ==, sizeof (ctl));
 994 1005          }
 995 1006  #endif
 996 1007  }
 997 1008  
 998 1009  /* ARGSUSED */
 999 1010  static void
1000 1011  arc_buf_watch(arc_buf_t *buf)
1001 1012  {
1002 1013  #ifndef _KERNEL
1003 1014          if (arc_watch) {
1004 1015                  int result;
1005 1016                  procctl_t ctl;
1006 1017                  ctl.cmd = PCWATCH;
1007 1018                  ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1008 1019                  ctl.prwatch.pr_size = buf->b_hdr->b_size;
1009 1020                  ctl.prwatch.pr_wflags = WA_WRITE;
1010 1021                  result = write(arc_procfd, &ctl, sizeof (ctl));
1011 1022                  ASSERT3U(result, ==, sizeof (ctl));
1012 1023          }
1013 1024  #endif
1014 1025  }
1015 1026  
1016 1027  void
1017 1028  arc_buf_thaw(arc_buf_t *buf)
1018 1029  {
1019 1030          if (zfs_flags & ZFS_DEBUG_MODIFY) {
1020 1031                  if (buf->b_hdr->b_state != arc_anon)
1021 1032                          panic("modifying non-anon buffer!");
1022 1033                  if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1023 1034                          panic("modifying buffer while i/o in progress!");
1024 1035                  arc_cksum_verify(buf);
1025 1036          }
1026 1037  
1027 1038          mutex_enter(&buf->b_hdr->b_freeze_lock);
1028 1039          if (buf->b_hdr->b_freeze_cksum != NULL) {
1029 1040                  kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1030 1041                  buf->b_hdr->b_freeze_cksum = NULL;
1031 1042          }
1032 1043  
1033 1044          if (zfs_flags & ZFS_DEBUG_MODIFY) {
1034 1045                  if (buf->b_hdr->b_thawed)
1035 1046                          kmem_free(buf->b_hdr->b_thawed, 1);
1036 1047                  buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1037 1048          }
1038 1049  
1039 1050          mutex_exit(&buf->b_hdr->b_freeze_lock);
1040 1051  
1041 1052          arc_buf_unwatch(buf);
1042 1053  }
1043 1054  
1044 1055  void
1045 1056  arc_buf_freeze(arc_buf_t *buf)
1046 1057  {
1047 1058          kmutex_t *hash_lock;
1048 1059  
1049 1060          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1050 1061                  return;
1051 1062  
1052 1063          hash_lock = HDR_LOCK(buf->b_hdr);
1053 1064          mutex_enter(hash_lock);
1054 1065  
1055 1066          ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1056 1067              buf->b_hdr->b_state == arc_anon);
1057 1068          arc_cksum_compute(buf, B_FALSE);
1058 1069          mutex_exit(hash_lock);
1059 1070  
1060 1071  }
1061 1072  
           /*
            * Add a reference, identified by tag, to a header.  The caller must
            * hold hash_lock.
            *
            * When this is the header's first reference and the header is not
            * anonymous, it is removed from its state's list, its size is
            * subtracted from that list's arcs_lsize, and ARC_PREFETCH is cleared.
            */
 1062 1073  static void
 1063 1074  add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 1064 1075  {
 1065 1076          ASSERT(MUTEX_HELD(hash_lock));
 1066 1077  
 1067 1078          if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
 1068 1079              (ab->b_state != arc_anon)) {
 1069 1080                  uint64_t delta = ab->b_size * ab->b_datacnt;
 1070 1081                  list_t *list = &ab->b_state->arcs_list[ab->b_type];
 1071 1082                  uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
 1072 1083  
 1073 1084                  ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
 1074 1085                  mutex_enter(&ab->b_state->arcs_mtx);
 1075 1086                  ASSERT(list_link_active(&ab->b_arc_node));
 1076 1087                  list_remove(list, ab);
                            /* ghost headers hold no data; they are accounted as b_size */
 1077 1088                  if (GHOST_STATE(ab->b_state)) {
 1078 1089                          ASSERT0(ab->b_datacnt);
 1079 1090                          ASSERT3P(ab->b_buf, ==, NULL);
 1080 1091                          delta = ab->b_size;
 1081 1092                  }
 1082 1093                  ASSERT(delta > 0);
 1083 1094                  ASSERT3U(*size, >=, delta);
 1084 1095                  atomic_add_64(size, -delta);
 1085 1096                  mutex_exit(&ab->b_state->arcs_mtx);
 1086 1097                  /* remove the prefetch flag if we get a reference */
 1087 1098                  if (ab->b_flags & ARC_PREFETCH)
 1088 1099                          ab->b_flags &= ~ARC_PREFETCH;
 1089 1100          }
 1090 1101  }
1091 1102  
           /*
            * Drop the reference identified by tag and return the remaining
            * reference count.  Unless the header is anonymous the caller must
            * hold hash_lock; the header must not be in a ghost state.
            *
            * When the last reference goes away on a non-anonymous header, the
            * header is inserted at the head of its state's list and its data size
            * is added to that list's arcs_lsize.
            */
 1092 1103  static int
 1093 1104  remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 1094 1105  {
 1095 1106          int cnt;
 1096 1107          arc_state_t *state = ab->b_state;
 1097 1108  
 1098 1109          ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 1099 1110          ASSERT(!GHOST_STATE(state));
 1100 1111  
 1101 1112          if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
 1102 1113              (state != arc_anon)) {
 1103 1114                  uint64_t *size = &state->arcs_lsize[ab->b_type];
 1104 1115  
 1105 1116                  ASSERT(!MUTEX_HELD(&state->arcs_mtx));
 1106 1117                  mutex_enter(&state->arcs_mtx);
 1107 1118                  ASSERT(!list_link_active(&ab->b_arc_node));
 1108 1119                  list_insert_head(&state->arcs_list[ab->b_type], ab);
 1109 1120                  ASSERT(ab->b_datacnt > 0);
 1110 1121                  atomic_add_64(size, ab->b_size * ab->b_datacnt);
 1111 1122                  mutex_exit(&state->arcs_mtx);
 1112 1123          }
 1113 1124          return (cnt);
 1114 1125  }
1115 1126  
 1116 1127  /*
 1117 1128   * Move the supplied buffer to the indicated state.  The mutex
 1118 1129   * for the buffer must be held by the caller.
             *
             * A header with no references is also moved between the per-state
             * lists, with arcs_lsize adjusted to match.  A header moving to
             * arc_anon is removed from the hash table if it is in it.  Each
             * state's arcs_size is adjusted in every case.
 1119 1130   */
 1120 1131  static void
 1121 1132  arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 1122 1133  {
 1123 1134          arc_state_t *old_state = ab->b_state;
 1124 1135          int64_t refcnt = refcount_count(&ab->b_refcnt);
 1125 1136          uint64_t from_delta, to_delta;
 1126 1137  
 1127 1138          ASSERT(MUTEX_HELD(hash_lock));
 1128 1139          ASSERT(new_state != old_state);
 1129 1140          ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 1130 1141          ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
 1131 1142          ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 1132 1143  
                    /* Default accounting: total bytes of data attached to the header. */
 1133 1144          from_delta = to_delta = ab->b_datacnt * ab->b_size;
 1134 1145  
 1135 1146          /*
 1136 1147           * If this buffer is evictable, transfer it from the
 1137 1148           * old state list to the new state list.
 1138 1149           */
 1139 1150          if (refcnt == 0) {
 1140 1151                  if (old_state != arc_anon) {
                                    /* take the list mutex only if we don't hold it */
 1141 1152                          int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
 1142 1153                          uint64_t *size = &old_state->arcs_lsize[ab->b_type];
 1143 1154  
 1144 1155                          if (use_mutex)
 1145 1156                                  mutex_enter(&old_state->arcs_mtx);
 1146 1157  
 1147 1158                          ASSERT(list_link_active(&ab->b_arc_node));
 1148 1159                          list_remove(&old_state->arcs_list[ab->b_type], ab);
 1149 1160  
 1150 1161                          /*
 1151 1162                           * If prefetching out of the ghost cache,
 1152 1163                           * we will have a non-zero datacnt.
 1153 1164                           */
 1154 1165                          if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
 1155 1166                                  /* ghost elements have a ghost size */
 1156 1167                                  ASSERT(ab->b_buf == NULL);
 1157 1168                                  from_delta = ab->b_size;
 1158 1169                          }
 1159 1170                          ASSERT3U(*size, >=, from_delta);
 1160 1171                          atomic_add_64(size, -from_delta);
 1161 1172  
 1162 1173                          if (use_mutex)
 1163 1174                                  mutex_exit(&old_state->arcs_mtx);
 1164 1175                  }
 1165 1176                  if (new_state != arc_anon) {
                                    /* take the list mutex only if we don't hold it */
 1166 1177                          int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
 1167 1178                          uint64_t *size = &new_state->arcs_lsize[ab->b_type];
 1168 1179  
 1169 1180                          if (use_mutex)
 1170 1181                                  mutex_enter(&new_state->arcs_mtx);
 1171 1182  
 1172 1183                          list_insert_head(&new_state->arcs_list[ab->b_type], ab);
 1173 1184  
 1174 1185                          /* ghost elements have a ghost size */
 1175 1186                          if (GHOST_STATE(new_state)) {
 1176 1187                                  ASSERT(ab->b_datacnt == 0);
 1177 1188                                  ASSERT(ab->b_buf == NULL);
 1178 1189                                  to_delta = ab->b_size;
 1179 1190                          }
 1180 1191                          atomic_add_64(size, to_delta);
 1181 1192  
 1182 1193                          if (use_mutex)
 1183 1194                                  mutex_exit(&new_state->arcs_mtx);
 1184 1195                  }
 1185 1196          }
 1186 1197  
 1187 1198          ASSERT(!BUF_EMPTY(ab));
                    /* anonymous headers are not kept in the hash table */
 1188 1199          if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
 1189 1200                  buf_hash_remove(ab);
 1190 1201  
 1191 1202          /* adjust state sizes */
 1192 1203          if (to_delta)
 1193 1204                  atomic_add_64(&new_state->arcs_size, to_delta);
 1194 1205          if (from_delta) {
 1195 1206                  ASSERT3U(old_state->arcs_size, >=, from_delta);
 1196 1207                  atomic_add_64(&old_state->arcs_size, -from_delta);
 1197 1208          }
 1198 1209          ab->b_state = new_state;
 1199 1210  
 1200 1211          /* adjust l2arc hdr stats */
 1201 1212          if (new_state == arc_l2c_only)
 1202 1213                  l2arc_hdr_stat_add();
 1203 1214          else if (old_state == arc_l2c_only)
 1204 1215                  l2arc_hdr_stat_remove();
 1205 1216  }
1206 1217  
1207 1218  void
1208 1219  arc_space_consume(uint64_t space, arc_space_type_t type)
1209 1220  {
1210 1221          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1211 1222  
1212 1223          switch (type) {
1213 1224          case ARC_SPACE_DATA:
1214 1225                  ARCSTAT_INCR(arcstat_data_size, space);
1215 1226                  break;
1216 1227          case ARC_SPACE_OTHER:
1217 1228                  ARCSTAT_INCR(arcstat_other_size, space);
1218 1229                  break;
1219 1230          case ARC_SPACE_HDRS:
1220 1231                  ARCSTAT_INCR(arcstat_hdr_size, space);
1221 1232                  break;
1222 1233          case ARC_SPACE_L2HDRS:
1223 1234                  ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1224 1235                  break;
1225 1236          }
1226 1237  
1227 1238          ARCSTAT_INCR(arcstat_meta_used, space);
1228 1239          atomic_add_64(&arc_size, space);
1229 1240  }
1230 1241  
1231 1242  void
1232 1243  arc_space_return(uint64_t space, arc_space_type_t type)
1233 1244  {
1234 1245          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1235 1246  
1236 1247          switch (type) {
1237 1248          case ARC_SPACE_DATA:
1238 1249                  ARCSTAT_INCR(arcstat_data_size, -space);
1239 1250                  break;
1240 1251          case ARC_SPACE_OTHER:
1241 1252                  ARCSTAT_INCR(arcstat_other_size, -space);
1242 1253                  break;
1243 1254          case ARC_SPACE_HDRS:
1244 1255                  ARCSTAT_INCR(arcstat_hdr_size, -space);
1245 1256                  break;
1246 1257          case ARC_SPACE_L2HDRS:
1247 1258                  ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1248 1259                  break;
1249 1260          }
1250 1261  
1251 1262          ASSERT(arc_meta_used >= space);
1252 1263          if (arc_meta_max < arc_meta_used)
1253 1264                  arc_meta_max = arc_meta_used;
1254 1265          ARCSTAT_INCR(arcstat_meta_used, -space);
1255 1266          ASSERT(arc_size >= space);
1256 1267          atomic_add_64(&arc_size, -space);
1257 1268  }
1258 1269  
1259 1270  void *
1260 1271  arc_data_buf_alloc(uint64_t size)
1261 1272  {
1262 1273          if (arc_evict_needed(ARC_BUFC_DATA))
1263 1274                  cv_signal(&arc_reclaim_thr_cv);
1264 1275          atomic_add_64(&arc_size, size);
1265 1276          return (zio_data_buf_alloc(size));
1266 1277  }
1267 1278  
1268 1279  void
1269 1280  arc_data_buf_free(void *buf, uint64_t size)
1270 1281  {
1271 1282          zio_data_buf_free(buf, size);
1272 1283          ASSERT(arc_size >= size);
1273 1284          atomic_add_64(&arc_size, -size);
1274 1285  }
1275 1286  
           /*
            * Allocate a new anonymous ARC buffer of `size` bytes for the given
            * pool and content type.  A fresh header and buffer are taken from the
            * kmem caches, data is attached via arc_get_data_buf(), and a single
            * reference held by `tag` is added.  Returns the new buffer.
            */
 1276 1287  arc_buf_t *
 1277 1288  arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 1278 1289  {
 1279 1290          arc_buf_hdr_t *hdr;
 1280 1291          arc_buf_t *buf;
 1281 1292  
 1282 1293          ASSERT3U(size, >, 0);
 1283 1294          hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
                    /* a header coming out of the cache must carry no identity */
 1284 1295          ASSERT(BUF_EMPTY(hdr));
 1285 1296          hdr->b_size = size;
 1286 1297          hdr->b_type = type;
 1287 1298          hdr->b_spa = spa_load_guid(spa);
 1288 1299          hdr->b_state = arc_anon;
 1289 1300          hdr->b_arc_access = 0;
 1290 1301          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 1291 1302          buf->b_hdr = hdr;
 1292 1303          buf->b_data = NULL;
 1293 1304          buf->b_efunc = NULL;
 1294 1305          buf->b_private = NULL;
 1295 1306          buf->b_next = NULL;
 1296 1307          hdr->b_buf = buf;
                    /* header and buffer are linked before data is attached */
 1297 1308          arc_get_data_buf(buf);
 1298 1309          hdr->b_datacnt = 1;
 1299 1310          hdr->b_flags = 0;
 1300 1311          ASSERT(refcount_is_zero(&hdr->b_refcnt));
 1301 1312          (void) refcount_add(&hdr->b_refcnt, tag);
 1302 1313  
 1303 1314          return (buf);
 1304 1315  }
1305 1316  
1306 1317  static char *arc_onloan_tag = "onloan";
1307 1318  
1308 1319  /*
1309 1320   * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1310 1321   * flight data by arc_tempreserve_space() until they are "returned". Loaned
1311 1322   * buffers must be returned to the arc before they can be used by the DMU or
1312 1323   * freed.
            *
            * The buffer is allocated as ARC_BUFC_DATA with arc_onloan_tag as its
            * reference holder, and `size' is added to arc_loaned_bytes.
1313 1324   */
1314 1325  arc_buf_t *
1315 1326  arc_loan_buf(spa_t *spa, int size)
1316 1327  {
1317 1328          arc_buf_t *buf;
1318 1329  
1319 1330          buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1320 1331  
1321 1332          atomic_add_64(&arc_loaned_bytes, size);
1322 1333          return (buf);
1323 1334  }
1324 1335  
1325 1336  /*
1326 1337   * Return a loaned arc buffer to the arc.
            *
            * The arc_onloan_tag reference on the buffer's header is replaced by a
            * reference held by `tag', and the header's b_size is subtracted from
            * arc_loaned_bytes.  The buffer must still have its data attached.
1327 1338   */
1328 1339  void
1329 1340  arc_return_buf(arc_buf_t *buf, void *tag)
1330 1341  {
1331 1342          arc_buf_hdr_t *hdr = buf->b_hdr;
1332 1343  
1333 1344          ASSERT(buf->b_data != NULL);
                   /*
                    * The new reference is added before the loan reference is
                    * removed (presumably so the count never drops to zero in
                    * between).
                    */
1334 1345          (void) refcount_add(&hdr->b_refcnt, tag);
1335 1346          (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1336 1347  
1337 1348          atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1338 1349  }
1339 1350  
1340 1351  /*
            * Detach an arc_buf from a dbuf (tag) and turn it into a loaned buffer.
            *
            * The reference held by `tag' is replaced by an arc_onloan_tag
            * reference, the buffer's eviction callback and private pointer are
            * cleared, and the header's b_size is added to arc_loaned_bytes.
            */
1341 1352  void
1342 1353  arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1343 1354  {
1344 1355          arc_buf_hdr_t *hdr;
1345 1356  
1346 1357          ASSERT(buf->b_data != NULL);
1347 1358          hdr = buf->b_hdr;
1348 1359          (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1349 1360          (void) refcount_remove(&hdr->b_refcnt, tag);
1350 1361          buf->b_efunc = NULL;
1351 1362          buf->b_private = NULL;
1352 1363  
1353 1364          atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1354 1365  }
1355 1366  
           /*
            * Create another arc_buf_t on the header of `from', holding its own copy
            * of the data.
            *
            * The header must not be in the arc_anon state.  The new buf is linked
            * at the head of the header's buf list, its data block is obtained with
            * arc_get_data_buf() and filled by copying b_size bytes from `from', and
            * the header's b_datacnt is incremented.
            *
            * Returns the new arc_buf_t.
            */
1356 1367  static arc_buf_t *
1357 1368  arc_buf_clone(arc_buf_t *from)
1358 1369  {
1359 1370          arc_buf_t *buf;
1360 1371          arc_buf_hdr_t *hdr = from->b_hdr;
1361 1372          uint64_t size = hdr->b_size;
1362 1373  
1363 1374          ASSERT(hdr->b_state != arc_anon);
1364 1375  
1365 1376          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1366 1377          buf->b_hdr = hdr;
1367 1378          buf->b_data = NULL;
1368 1379          buf->b_efunc = NULL;
1369 1380          buf->b_private = NULL;
                   /* push the clone onto the front of the header's buf list */
1370 1381          buf->b_next = hdr->b_buf;
1371 1382          hdr->b_buf = buf;
1372 1383          arc_get_data_buf(buf);
1373 1384          bcopy(from->b_data, buf->b_data, size);
1374 1385  
1375 1386          /*
1376 1387           * This buffer already exists in the arc so create a duplicate
1377 1388           * copy for the caller.  If the buffer is associated with user data
1378 1389           * then track the size and number of duplicates.  These stats will be
1379 1390           * updated as duplicate buffers are created and destroyed.
1380 1391           */
1381 1392          if (hdr->b_type == ARC_BUFC_DATA) {
1382 1393                  ARCSTAT_BUMP(arcstat_duplicate_buffers);
1383 1394                  ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1384 1395          }
1385 1396          hdr->b_datacnt += 1;
1386 1397          return (buf);
1387 1398  }
1388 1399  
           /*
            * Add a reference, held by `tag', to the header of `buf' and account for
            * the access as a cache hit.
            *
            * If the buffer has already been evicted (b_data == NULL) nothing is
            * done; there is no return value, so the caller has to check b_data
            * itself afterwards.
            */
1389 1400  void
1390 1401  arc_buf_add_ref(arc_buf_t *buf, void* tag)
1391 1402  {
1392 1403          arc_buf_hdr_t *hdr;
1393 1404          kmutex_t *hash_lock;
1394 1405  
1395 1406          /*
1396 1407           * Check to see if this buffer is evicted.  Callers
1397 1408           * must verify b_data != NULL to know if the add_ref
1398 1409           * was successful.
1399 1410           */
1400 1411          mutex_enter(&buf->b_evict_lock);
1401 1412          if (buf->b_data == NULL) {
1402 1413                  mutex_exit(&buf->b_evict_lock);
1403 1414                  return;
1404 1415          }
                   /*
                    * b_evict_lock is held until the hash lock has been taken and
                    * buf->b_hdr has been re-read under it.
                    */
1405 1416          hash_lock = HDR_LOCK(buf->b_hdr);
1406 1417          mutex_enter(hash_lock);
1407 1418          hdr = buf->b_hdr;
1408 1419          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1409 1420          mutex_exit(&buf->b_evict_lock);
1410 1421  
1411 1422          ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1412 1423          add_reference(hdr, hash_lock, tag);
1413 1424          DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1414 1425          arc_access(hdr, hash_lock);
1415 1426          mutex_exit(hash_lock);
                   /* hit statistics are updated after the hash lock is dropped */
1416 1427          ARCSTAT_BUMP(arcstat_hits);
1417 1428          ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1418 1429              demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1419 1430              data, metadata, hits);
1420 1431  }
1421 1432  
1422 1433  /*
1423 1434   * Free the arc data buffer.  If it is an l2arc write in progress,
1424 1435   * the buffer is placed on l2arc_free_on_write to be freed later.
            *
            * `free_func' is the routine used to release the data; it is either
            * called right away with (b_data, b_size) or recorded together with
            * them in the deferred-free entry.  buf->b_data itself is not cleared
            * here.
1425 1436   */
1426 1437  static void
1427 1438  arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1428 1439  {
1429 1440          arc_buf_hdr_t *hdr = buf->b_hdr;
1430 1441  
1431 1442          if (HDR_L2_WRITING(hdr)) {
1432 1443                  l2arc_data_free_t *df;
1433 1444                  df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1434 1445                  df->l2df_data = buf->b_data;
1435 1446                  df->l2df_size = hdr->b_size;
1436 1447                  df->l2df_func = free_func;
1437 1448                  mutex_enter(&l2arc_free_on_write_mtx);
1438 1449                  list_insert_head(l2arc_free_on_write, df);
1439 1450                  mutex_exit(&l2arc_free_on_write_mtx);
1440 1451                  ARCSTAT_BUMP(arcstat_l2_free_on_write);
1441 1452          } else {
1442 1453                  free_func(buf->b_data, hdr->b_size);
1443 1454          }
1444 1455  }
1445 1456  
           /*
            * Release the data attached to `buf' and, optionally, the buf itself.
            *
            * If the buf has data, the size accounting of the header's state is
            * reduced, b_data is cleared and the header's b_datacnt is decremented.
            * When `recycle' is FALSE the data block is also freed and the arc size
            * accounting reduced; when it is TRUE the block is left allocated for
            * the caller to take over (arc_evict() does this for a buffer it has
            * "stolen").
            *
            * When `all' is TRUE the buf is additionally unlinked from its header's
            * buf list and returned to buf_cache; such a buf must not have an
            * eviction callback.  When `all' is FALSE the buf stays linked.
            */
1446 1457  static void
1447 1458  arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1448 1459  {
1449 1460          arc_buf_t **bufp;
1450 1461  
1451 1462          /* free up data associated with the buf */
1452 1463          if (buf->b_data) {
1453 1464                  arc_state_t *state = buf->b_hdr->b_state;
1454 1465                  uint64_t size = buf->b_hdr->b_size;
1455 1466                  arc_buf_contents_t type = buf->b_hdr->b_type;
1456 1467  
1457 1468                  arc_cksum_verify(buf);
1458 1469                  arc_buf_unwatch(buf);
1459 1470  
1460 1471                  if (!recycle) {
1461 1472                          if (type == ARC_BUFC_METADATA) {
1462 1473                                  arc_buf_data_free(buf, zio_buf_free);
1463 1474                                  arc_space_return(size, ARC_SPACE_DATA);
1464 1475                          } else {
1465 1476                                  ASSERT(type == ARC_BUFC_DATA);
1466 1477                                  arc_buf_data_free(buf, zio_data_buf_free);
1467 1478                                  ARCSTAT_INCR(arcstat_data_size, -size);
1468 1479                                  atomic_add_64(&arc_size, -size);
1469 1480                          }
1470 1481                  }
                           /*
                            * A header that is linked on its state's list also
                            * counts towards that list's arcs_lsize.
                            */
1471 1482                  if (list_link_active(&buf->b_hdr->b_arc_node)) {
1472 1483                          uint64_t *cnt = &state->arcs_lsize[type];
1473 1484  
1474 1485                          ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1475 1486                          ASSERT(state != arc_anon);
1476 1487  
1477 1488                          ASSERT3U(*cnt, >=, size);
1478 1489                          atomic_add_64(cnt, -size);
1479 1490                  }
1480 1491                  ASSERT3U(state->arcs_size, >=, size);
1481 1492                  atomic_add_64(&state->arcs_size, -size);
1482 1493                  buf->b_data = NULL;
1483 1494  
1484 1495                  /*
1485 1496                   * If we're destroying a duplicate buffer make sure
1486 1497                   * that the appropriate statistics are updated.
1487 1498                   */
1488 1499                  if (buf->b_hdr->b_datacnt > 1 &&
1489 1500                      buf->b_hdr->b_type == ARC_BUFC_DATA) {
1490 1501                          ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1491 1502                          ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1492 1503                  }
1493 1504                  ASSERT(buf->b_hdr->b_datacnt > 0);
1494 1505                  buf->b_hdr->b_datacnt -= 1;
1495 1506          }
1496 1507  
1497 1508          /* only remove the buf if requested */
1498 1509          if (!all)
1499 1510                  return;
1500 1511  
1501 1512          /* remove the buf from the hdr list */
1502 1513          for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1503 1514                  continue;
1504 1515          *bufp = buf->b_next;
1505 1516          buf->b_next = NULL;
1506 1517  
1507 1518          ASSERT(buf->b_efunc == NULL);
1508 1519  
1509 1520          /* clean up the buf */
1510 1521          buf->b_hdr = NULL;
1511 1522          kmem_cache_free(buf_cache, buf);
1512 1523  }
1513 1524  
           /*
            * Tear down a buffer header and return it to hdr_cache.
            *
            * The header must have no references, be in the arc_anon state and have
            * no I/O in progress.  Any attached L2ARC header is unlinked and freed,
            * the header's identity is discarded, and every buf still on the header
            * is released: bufs with an eviction callback are moved onto
            * arc_eviction_list, all others are destroyed outright.  The frozen
            * checksum and b_thawed allocations are freed as well.
            */
1514 1525  static void
1515 1526  arc_hdr_destroy(arc_buf_hdr_t *hdr)
1516 1527  {
1517 1528          ASSERT(refcount_is_zero(&hdr->b_refcnt));
1518 1529          ASSERT3P(hdr->b_state, ==, arc_anon);
1519 1530          ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1520 1531          l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1521 1532  
1522 1533          if (l2hdr != NULL) {
                           /* only take (and later drop) the mutex if not held */
1523 1534                  boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1524 1535                  /*
1525 1536                   * To prevent arc_free() and l2arc_evict() from
1526 1537                   * attempting to free the same buffer at the same time,
1527 1538                   * a FREE_IN_PROGRESS flag is given to arc_free() to
1528 1539                   * give it priority.  l2arc_evict() can't destroy this
1529 1540                   * header while we are waiting on l2arc_buflist_mtx.
1530 1541                   *
1531 1542                   * The hdr may be removed from l2ad_buflist before we
1532 1543                   * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1533 1544                   */
1534 1545                  if (!buflist_held) {
1535 1546                          mutex_enter(&l2arc_buflist_mtx);
1536 1547                          l2hdr = hdr->b_l2hdr;
1537 1548                  }
1538 1549  
1539 1550                  if (l2hdr != NULL) {
1540 1551                          list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1541 1552                          ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1542 1553                          kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1543 1554                          if (hdr->b_state == arc_l2c_only)
1544 1555                                  l2arc_hdr_stat_remove();
1545 1556                          hdr->b_l2hdr = NULL;
1546 1557                  }
1547 1558  
1548 1559                  if (!buflist_held)
1549 1560                          mutex_exit(&l2arc_buflist_mtx);
1550 1561          }
1551 1562  
1552 1563          if (!BUF_EMPTY(hdr)) {
1553 1564                  ASSERT(!HDR_IN_HASH_TABLE(hdr));
1554 1565                  buf_discard_identity(hdr);
1555 1566          }
1556 1567          while (hdr->b_buf) {
1557 1568                  arc_buf_t *buf = hdr->b_buf;
1558 1569  
1559 1570                  if (buf->b_efunc) {
                                   /*
                                    * The buf has an eviction callback: release
                                    * only its data, then hand the buf itself to
                                    * arc_eviction_list, re-pointed at
                                    * arc_eviction_hdr.
                                    */
1560 1571                          mutex_enter(&arc_eviction_mtx);
1561 1572                          mutex_enter(&buf->b_evict_lock);
1562 1573                          ASSERT(buf->b_hdr != NULL);
1563 1574                          arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1564 1575                          hdr->b_buf = buf->b_next;
1565 1576                          buf->b_hdr = &arc_eviction_hdr;
1566 1577                          buf->b_next = arc_eviction_list;
1567 1578                          arc_eviction_list = buf;
1568 1579                          mutex_exit(&buf->b_evict_lock);
1569 1580                          mutex_exit(&arc_eviction_mtx);
1570 1581                  } else {
1571 1582                          arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1572 1583                  }
1573 1584          }
1574 1585          if (hdr->b_freeze_cksum != NULL) {
1575 1586                  kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1576 1587                  hdr->b_freeze_cksum = NULL;
1577 1588          }
1578 1589          if (hdr->b_thawed) {
1579 1590                  kmem_free(hdr->b_thawed, 1);
1580 1591                  hdr->b_thawed = NULL;
1581 1592          }
1582 1593  
1583 1594          ASSERT(!list_link_active(&hdr->b_arc_node));
1584 1595          ASSERT3P(hdr->b_hash_next, ==, NULL);
1585 1596          ASSERT3P(hdr->b_acb, ==, NULL);
1586 1597          kmem_cache_free(hdr_cache, hdr);
1587 1598  }
1588 1599  
           /*
            * Drop the reference held by `tag' and free `buf'.
            *
            * The buf must have data attached and no eviction callback.  Three
            * cases are handled:
            *  - header not anonymous: the reference is removed under the hash
            *    lock; the buf is destroyed if the header has other data bufs,
            *    otherwise it stays attached and the header is flagged
            *    ARC_BUF_AVAILABLE.
            *  - anonymous header with I/O in progress: the reference is removed
            *    under arc_eviction_mtx and the header is destroyed only if the
            *    I/O is no longer in progress at that point.
            *  - any other anonymous header: the buf alone is destroyed if
            *    references remain, otherwise the whole header is destroyed.
            */
1589 1600  void
1590 1601  arc_buf_free(arc_buf_t *buf, void *tag)
1591 1602  {
1592 1603          arc_buf_hdr_t *hdr = buf->b_hdr;
1593 1604          int hashed = hdr->b_state != arc_anon;
1594 1605  
1595 1606          ASSERT(buf->b_efunc == NULL);
1596 1607          ASSERT(buf->b_data != NULL);
1597 1608  
1598 1609          if (hashed) {
1599 1610                  kmutex_t *hash_lock = HDR_LOCK(hdr);
1600 1611  
1601 1612                  mutex_enter(hash_lock);
                           /* re-read the header now that the hash lock is held */
1602 1613                  hdr = buf->b_hdr;
1603 1614                  ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1604 1615  
1605 1616                  (void) remove_reference(hdr, hash_lock, tag);
1606 1617                  if (hdr->b_datacnt > 1) {
1607 1618                          arc_buf_destroy(buf, FALSE, TRUE);
1608 1619                  } else {
1609 1620                          ASSERT(buf == hdr->b_buf);
1610 1621                          ASSERT(buf->b_efunc == NULL);
1611 1622                          hdr->b_flags |= ARC_BUF_AVAILABLE;
1612 1623                  }
1613 1624                  mutex_exit(hash_lock);
1614 1625          } else if (HDR_IO_IN_PROGRESS(hdr)) {
1615 1626                  int destroy_hdr;
1616 1627                  /*
1617 1628                   * We are in the middle of an async write.  Don't destroy
1618 1629                   * this buffer unless the write completes before we finish
1619 1630                   * decrementing the reference count.
1620 1631                   */
1621 1632                  mutex_enter(&arc_eviction_mtx);
1622 1633                  (void) remove_reference(hdr, NULL, tag);
1623 1634                  ASSERT(refcount_is_zero(&hdr->b_refcnt));
1624 1635                  destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1625 1636                  mutex_exit(&arc_eviction_mtx);
1626 1637                  if (destroy_hdr)
1627 1638                          arc_hdr_destroy(hdr);
1628 1639          } else {
1629 1640                  if (remove_reference(hdr, NULL, tag) > 0)
1630 1641                          arc_buf_destroy(buf, FALSE, TRUE);
1631 1642                  else
1632 1643                          arc_hdr_destroy(hdr);
1633 1644          }
1634 1645  }
1635 1646  
           /*
            * Drop the reference held by `tag' on the header of `buf'.
            *
            * An anonymous buffer is handed to arc_buf_free().  Otherwise the
            * reference is removed under the hash lock and, provided the buf has no
            * eviction callback, the buf is either destroyed (when the header has
            * other data bufs) or left attached with the header flagged
            * ARC_BUF_AVAILABLE.  A buf that has an eviction callback is left in
            * place.
            *
            * Returns B_TRUE if the buf had no eviction callback (b_efunc == NULL)
            * when the function was entered, B_FALSE otherwise.
            */
1636 1647  boolean_t
1637 1648  arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1638 1649  {
1639 1650          arc_buf_hdr_t *hdr = buf->b_hdr;
1640 1651          kmutex_t *hash_lock = HDR_LOCK(hdr);
                   /* sampled up front; buf may be freed below */
1641 1652          boolean_t no_callback = (buf->b_efunc == NULL);
1642 1653  
1643 1654          if (hdr->b_state == arc_anon) {
1644 1655                  ASSERT(hdr->b_datacnt == 1);
1645 1656                  arc_buf_free(buf, tag);
1646 1657                  return (no_callback);
1647 1658          }
1648 1659  
1649 1660          mutex_enter(hash_lock);
1650 1661          hdr = buf->b_hdr;
1651 1662          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1652 1663          ASSERT(hdr->b_state != arc_anon);
1653 1664          ASSERT(buf->b_data != NULL);
1654 1665  
1655 1666          (void) remove_reference(hdr, hash_lock, tag);
1656 1667          if (hdr->b_datacnt > 1) {
1657 1668                  if (no_callback)
1658 1669                          arc_buf_destroy(buf, FALSE, TRUE);
1659 1670          } else if (no_callback) {
1660 1671                  ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1661 1672                  ASSERT(buf->b_efunc == NULL);
1662 1673                  hdr->b_flags |= ARC_BUF_AVAILABLE;
1663 1674          }
1664 1675          ASSERT(no_callback || hdr->b_datacnt > 1 ||
1665 1676              refcount_is_zero(&hdr->b_refcnt));
1666 1677          mutex_exit(hash_lock);
1667 1678          return (no_callback);
1668 1679  }
1669 1680  
           /* Return the size recorded in the header of `buf' (b_size). */
1670 1681  int
1671 1682  arc_buf_size(arc_buf_t *buf)
1672 1683  {
1673 1684          return (buf->b_hdr->b_size);
1674 1685  }
1675 1686  
1676 1687  /*
1677 1688   * Called from the DMU to determine if the current buffer should be
1678 1689   * evicted. In order to ensure proper locking, the eviction must be initiated
1679 1690   * from the DMU. Return true if the buffer is associated with user data and
1680 1691   * duplicate buffers still exist.
            *
            * B_TRUE is also returned for a buf that has no data but is still
            * attached to arc_eviction_hdr.  B_FALSE is always returned when
            * zfs_disable_dup_eviction is set or when the buf no longer has a
            * header.  The buf's b_evict_lock is held while its state is examined.
1681 1692   */
1682 1693  boolean_t
1683 1694  arc_buf_eviction_needed(arc_buf_t *buf)
1684 1695  {
1685 1696          arc_buf_hdr_t *hdr;
1686 1697          boolean_t evict_needed = B_FALSE;
1687 1698  
1688 1699          if (zfs_disable_dup_eviction)
1689 1700                  return (B_FALSE);
1690 1701  
1691 1702          mutex_enter(&buf->b_evict_lock);
1692 1703          hdr = buf->b_hdr;
1693 1704          if (hdr == NULL) {
1694 1705                  /*
1695 1706                   * We are in arc_do_user_evicts(); let that function
1696 1707                   * perform the eviction.
1697 1708                   */
1698 1709                  ASSERT(buf->b_data == NULL);
1699 1710                  mutex_exit(&buf->b_evict_lock);
1700 1711                  return (B_FALSE);
1701 1712          } else if (buf->b_data == NULL) {
1702 1713                  /*
1703 1714                   * We have already been added to the arc eviction list;
1704 1715                   * recommend eviction.
1705 1716                   */
1706 1717                  ASSERT3P(hdr, ==, &arc_eviction_hdr);
1707 1718                  mutex_exit(&buf->b_evict_lock);
1708 1719                  return (B_TRUE);
1709 1720          }
1710 1721  
1711 1722          if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1712 1723                  evict_needed = B_TRUE;
1713 1724  
1714 1725          mutex_exit(&buf->b_evict_lock);
1715 1726          return (evict_needed);
1716 1727  }
1717 1728  
1718 1729  /*
1719 1730   * Evict buffers from list until we've removed the specified number of
1720 1731   * bytes.  Move the removed buffers to the appropriate evict state.
1721 1732   * If the recycle flag is set, then attempt to "recycle" a buffer:
1722 1733   * - look for a buffer to evict that is `bytes' long.
1723 1734   * - return the data block from this buffer rather than freeing it.
1724 1735   * This flag is used by callers that are trying to make space for a
1725 1736   * new buffer in a full arc cache.
1726 1737   *
1727 1738   * This function makes a "best effort".  It skips over any buffers
1728 1739   * it can't get a hash_lock on, and so may not catch all candidates.
1729 1740   * It may also return without evicting as much space as requested.
            *
            * `state' must be arc_mru or arc_mfu; headers left with no data bufs
            * are moved to the matching ghost state.  If `spa' is non-zero, only
            * headers whose b_spa equals it are considered.  A negative `bytes'
            * disables the early exit, so the whole list is walked.  `type' selects
            * which of the state's lists is scanned.
            *
            * Returns the data block taken for recycling, or NULL if none was.
1730 1741   */
1731 1742  static void *
1732 1743  arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1733 1744      arc_buf_contents_t type)
1734 1745  {
1735 1746          arc_state_t *evicted_state;
1736 1747          uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1737 1748          arc_buf_hdr_t *ab, *ab_prev = NULL;
1738 1749          list_t *list = &state->arcs_list[type];
1739 1750          kmutex_t *hash_lock;
1740 1751          boolean_t have_lock;
1741 1752          void *stolen = NULL;
1742 1753  
1743 1754          ASSERT(state == arc_mru || state == arc_mfu);
1744 1755  
1745 1756          evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1746 1757  
1747 1758          mutex_enter(&state->arcs_mtx);
1748 1759          mutex_enter(&evicted_state->arcs_mtx);
1749 1760  
                   /* walk from the tail of the list towards the head */
1750 1761          for (ab = list_tail(list); ab; ab = ab_prev) {
1751 1762                  ab_prev = list_prev(list, ab);
1752 1763                  /* prefetch buffers have a minimum lifespan */
1753 1764                  if (HDR_IO_IN_PROGRESS(ab) ||
1754 1765                      (spa && ab->b_spa != spa) ||
1755 1766                      (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1756 1767                      ddi_get_lbolt() - ab->b_arc_access <
1757 1768                      arc_min_prefetch_lifespan)) {
1758 1769                          skipped++;
1759 1770                          continue;
1760 1771                  }
1761 1772                  /* "lookahead" for better eviction candidate */
1762 1773                  if (recycle && ab->b_size != bytes &&
1763 1774                      ab_prev && ab_prev->b_size == bytes)
1764 1775                          continue;
1765 1776                  hash_lock = HDR_LOCK(ab);
                           /* the caller may already hold this header's hash lock */
1766 1777                  have_lock = MUTEX_HELD(hash_lock);
1767 1778                  if (have_lock || mutex_tryenter(hash_lock)) {
1768 1779                          ASSERT0(refcount_count(&ab->b_refcnt));
1769 1780                          ASSERT(ab->b_datacnt > 0);
1770 1781                          while (ab->b_buf) {
1771 1782                                  arc_buf_t *buf = ab->b_buf;
1772 1783                                  if (!mutex_tryenter(&buf->b_evict_lock)) {
1773 1784                                          missed += 1;
1774 1785                                          break;
1775 1786                                  }
1776 1787                                  if (buf->b_data) {
1777 1788                                          bytes_evicted += ab->b_size;
                                                   /*
                                                    * Take at most one block for
                                                    * recycling; clearing recycle
                                                    * stops further stealing.
                                                    */
1778 1789                                          if (recycle && ab->b_type == type &&
1779 1790                                              ab->b_size == bytes &&
1780 1791                                              !HDR_L2_WRITING(ab)) {
1781 1792                                                  stolen = buf->b_data;
1782 1793                                                  recycle = FALSE;
1783 1794                                          }
1784 1795                                  }
1785 1796                                  if (buf->b_efunc) {
                                                   /*
                                                    * Bufs with an eviction callback
                                                    * are queued on arc_eviction_list
                                                    * instead of being freed here.
                                                    */
1786 1797                                          mutex_enter(&arc_eviction_mtx);
1787 1798                                          arc_buf_destroy(buf,
1788 1799                                              buf->b_data == stolen, FALSE);
1789 1800                                          ab->b_buf = buf->b_next;
1790 1801                                          buf->b_hdr = &arc_eviction_hdr;
1791 1802                                          buf->b_next = arc_eviction_list;
1792 1803                                          arc_eviction_list = buf;
1793 1804                                          mutex_exit(&arc_eviction_mtx);
1794 1805                                          mutex_exit(&buf->b_evict_lock);
1795 1806                                  } else {
1796 1807                                          mutex_exit(&buf->b_evict_lock);
1797 1808                                          arc_buf_destroy(buf,
1798 1809                                              buf->b_data == stolen, TRUE);
1799 1810                                  }
1800 1811                          }
1801 1812  
1802 1813                          if (ab->b_l2hdr) {
1803 1814                                  ARCSTAT_INCR(arcstat_evict_l2_cached,
1804 1815                                      ab->b_size);
1805 1816                          } else {
1806 1817                                  if (l2arc_write_eligible(ab->b_spa, ab)) {
1807 1818                                          ARCSTAT_INCR(arcstat_evict_l2_eligible,
1808 1819                                              ab->b_size);
1809 1820                                  } else {
1810 1821                                          ARCSTAT_INCR(
1811 1822                                              arcstat_evict_l2_ineligible,
1812 1823                                              ab->b_size);
1813 1824                                  }
1814 1825                          }
1815 1826  
1816 1827                          if (ab->b_datacnt == 0) {
1817 1828                                  arc_change_state(evicted_state, ab, hash_lock);
1818 1829                                  ASSERT(HDR_IN_HASH_TABLE(ab));
1819 1830                                  ab->b_flags |= ARC_IN_HASH_TABLE;
1820 1831                                  ab->b_flags &= ~ARC_BUF_AVAILABLE;
1821 1832                                  DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1822 1833                          }
1823 1834                          if (!have_lock)
1824 1835                                  mutex_exit(hash_lock);
1825 1836                          if (bytes >= 0 && bytes_evicted >= bytes)
1826 1837                                  break;
1827 1838                  } else {
1828 1839                          missed += 1;
1829 1840                  }
1830 1841          }
1831 1842  
1832 1843          mutex_exit(&evicted_state->arcs_mtx);
1833 1844          mutex_exit(&state->arcs_mtx);
1834 1845  
                   /* `state' is a pointer, so it is printed with %p */
1835 1846          if (bytes_evicted < bytes)
1836 1847                  dprintf("only evicted %lld bytes from %p",
1837 1848                      (longlong_t)bytes_evicted, state);
1838 1849  
1839 1850          if (skipped)
1840 1851                  ARCSTAT_INCR(arcstat_evict_skip, skipped);
1841 1852  
1842 1853          if (missed)
1843 1854                  ARCSTAT_INCR(arcstat_mutex_miss, missed);
1844 1855  
1845 1856          /*
1846 1857           * We have just evicted some data into the ghost state, make
1847 1858           * sure we also adjust the ghost state size if necessary.
1848 1859           */
1849 1860          if (arc_no_grow &&
1850 1861              arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1851 1862                  int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1852 1863                      arc_mru_ghost->arcs_size - arc_c;
1853 1864  
                           /* a spa argument of 0 means "any pool" */
1854 1865                  if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1855 1866                          int64_t todelete =
1856 1867                              MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1857 1868                          arc_evict_ghost(arc_mru_ghost, 0, todelete);
1858 1869                  } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1859 1870                          int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1860 1871                              arc_mru_ghost->arcs_size +
1861 1872                              arc_mfu_ghost->arcs_size - arc_c);
1862 1873                          arc_evict_ghost(arc_mfu_ghost, 0, todelete);
1863 1874                  }
1864 1875          }
1865 1876  
1866 1877          return (stolen);
1867 1878  }
1868 1879  
1869 1880  /*
1870 1881   * Remove buffers from list until we've removed the specified number of
1871 1882   * bytes.  Destroy the buffers that are removed.
            *
            * `state' must be a ghost state.  The data list is scanned first and
            * the metadata list second if the target has not been met.  If `spa' is
            * non-zero, only headers whose b_spa equals it are considered.  A
            * header that still has an L2ARC header is moved to arc_l2c_only rather
            * than destroyed.
            *
            * With a negative `bytes' every eligible header is removed, and the
            * function waits for hash locks it cannot get immediately; otherwise
            * such headers are skipped and counted as mutex misses.
1872 1883   */
1873 1884  static void
1874 1885  arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1875 1886  {
1876 1887          arc_buf_hdr_t *ab, *ab_prev;
                   /* zeroed placeholder header; b_spa == 0 identifies it below */
1877 1888          arc_buf_hdr_t marker = { 0 };
1878 1889          list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1879 1890          kmutex_t *hash_lock;
1880 1891          uint64_t bytes_deleted = 0;
1881 1892          uint64_t bufs_skipped = 0;
1882 1893  
1883 1894          ASSERT(GHOST_STATE(state));
1884 1895  top:
1885 1896          mutex_enter(&state->arcs_mtx);
1886 1897          for (ab = list_tail(list); ab; ab = ab_prev) {
1887 1898                  ab_prev = list_prev(list, ab);
1888 1899                  if (spa && ab->b_spa != spa)
1889 1900                          continue;
1890 1901  
1891 1902                  /* ignore markers */
1892 1903                  if (ab->b_spa == 0)
1893 1904                          continue;
1894 1905  
1895 1906                  hash_lock = HDR_LOCK(ab);
1896 1907                  /* caller may be trying to modify this buffer, skip it */
1897 1908                  if (MUTEX_HELD(hash_lock))
1898 1909                          continue;
1899 1910                  if (mutex_tryenter(hash_lock)) {
1900 1911                          ASSERT(!HDR_IO_IN_PROGRESS(ab));
1901 1912                          ASSERT(ab->b_buf == NULL);
1902 1913                          ARCSTAT_BUMP(arcstat_deleted);
1903 1914                          bytes_deleted += ab->b_size;
1904 1915  
1905 1916                          if (ab->b_l2hdr != NULL) {
1906 1917                                  /*
1907 1918                                   * This buffer is cached on the 2nd Level ARC;
1908 1919                                   * don't destroy the header.
1909 1920                                   */
1910 1921                                  arc_change_state(arc_l2c_only, ab, hash_lock);
1911 1922                                  mutex_exit(hash_lock);
1912 1923                          } else {
1913 1924                                  arc_change_state(arc_anon, ab, hash_lock);
1914 1925                                  mutex_exit(hash_lock);
1915 1926                                  arc_hdr_destroy(ab);
1916 1927                          }
1917 1928  
1918 1929                          DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1919 1930                          if (bytes >= 0 && bytes_deleted >= bytes)
1920 1931                                  break;
1921 1932                  } else if (bytes < 0) {
1922 1933                          /*
1923 1934                           * Insert a list marker and then wait for the
1924 1935                           * hash lock to become available. Once its
1925 1936                           * available, restart from where we left off.
1926 1937                           */
1927 1938                          list_insert_after(list, ab, &marker);
1928 1939                          mutex_exit(&state->arcs_mtx);
1929 1940                          mutex_enter(hash_lock);
1930 1941                          mutex_exit(hash_lock);
1931 1942                          mutex_enter(&state->arcs_mtx);
1932 1943                          ab_prev = list_prev(list, &marker);
1933 1944                          list_remove(list, &marker);
1934 1945                  } else
1935 1946                          bufs_skipped += 1;
1936 1947          }
1937 1948          mutex_exit(&state->arcs_mtx);
1938 1949  
                   /* second pass: repeat the scan over the metadata list */
1939 1950          if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1940 1951              (bytes < 0 || bytes_deleted < bytes)) {
1941 1952                  list = &state->arcs_list[ARC_BUFC_METADATA];
1942 1953                  goto top;
1943 1954          }
1944 1955  
1945 1956          if (bufs_skipped) {
1946 1957                  ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1947 1958                  ASSERT(bytes >= 0);
1948 1959          }
1949 1960  
                   /*
                    * NOTE(review): bytes_deleted is unsigned and bytes is signed, so
                    * this comparison looks like it is also true whenever bytes is
                    * negative -- confirm that is intended.
                    */
1950 1961          if (bytes_deleted < bytes)
1951 1962                  dprintf("only deleted %lld bytes from %p",
1952 1963                      (longlong_t)bytes_deleted, state);
1953 1964  }
1954 1965  
           /*
            * Evict from the MRU and MFU lists and then trim the ghost lists, based
            * on how far the cache is over its targets (arc_c and arc_p).
            *
            * Within each of MRU and MFU, data buffers are evicted before metadata
            * buffers.  All evictions are made across every pool (spa argument of
            * 0) and without recycling.
            */
1955 1966  static void
1956 1967  arc_adjust(void)
1957 1968  {
1958 1969          int64_t adjustment, delta;
1959 1970  
1960 1971          /*
1961 1972           * Adjust MRU size
1962 1973           */
1963 1974  
1964 1975          adjustment = MIN((int64_t)(arc_size - arc_c),
1965 1976              (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1966 1977              arc_p));
1967 1978  
1968 1979          if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1969 1980                  delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1970 1981                  (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
1971 1982                  adjustment -= delta;
1972 1983          }
1973 1984  
1974 1985          if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1975 1986                  delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1976 1987                  (void) arc_evict(arc_mru, 0, delta, FALSE,
1977 1988                      ARC_BUFC_METADATA);
1978 1989          }
1979 1990  
1980 1991          /*
1981 1992           * Adjust MFU size
1982 1993           */
1983 1994  
1984 1995          adjustment = arc_size - arc_c;
1985 1996  
1986 1997          if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1987 1998                  delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1988 1999                  (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
1989 2000                  adjustment -= delta;
1990 2001          }
1991 2002  
1992 2003          if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
                           /* reuse the function-scope delta rather than shadowing it */
1993 2004                  delta = MIN(adjustment,
1994 2005                      arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1995 2006                  (void) arc_evict(arc_mfu, 0, delta, FALSE,
1996 2007                      ARC_BUFC_METADATA);
1997 2008          }
1998 2009  
1999 2010          /*
2000 2011           * Adjust ghost lists
2001 2012           */
2002 2013  
2003 2014          adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2004 2015  
2005 2016          if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2006 2017                  delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2007 2018                  arc_evict_ghost(arc_mru_ghost, 0, delta);
2008 2019          }
2009 2020  
2010 2021          adjustment =
2011 2022              arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2012 2023  
2013 2024          if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2014 2025                  delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2015 2026                  arc_evict_ghost(arc_mfu_ghost, 0, delta);
2016 2027          }
2017 2028  }
2018 2029  
           /*
            * Drain arc_eviction_list.
            *
            * Each buffer is unlinked from the head of the list and detached from
            * its header (b_hdr is cleared under b_evict_lock) while
            * arc_eviction_mtx is held.  The mutex is then dropped, so the
            * buffer's eviction callback (b_efunc), if one is set, runs without
            * arc_eviction_mtx; the callback is required to return 0.  Afterwards
            * the callback fields are cleared and the buffer goes back to
            * buf_cache.
            *
            * The callers visible nearby (arc_flush() and arc_reclaim_thread())
            * hold arc_reclaim_thr_lock across this call.
            */
2019 2030  static void
2020 2031  arc_do_user_evicts(void)
2021 2032  {
2022 2033          mutex_enter(&arc_eviction_mtx);
2023 2034          while (arc_eviction_list != NULL) {
2024 2035                  arc_buf_t *buf = arc_eviction_list;
2025 2036                  arc_eviction_list = buf->b_next;
2026 2037                  mutex_enter(&buf->b_evict_lock);
2027 2038                  buf->b_hdr = NULL;
2028 2039                  mutex_exit(&buf->b_evict_lock);
                           /* run the callback without holding arc_eviction_mtx */
2029 2040                  mutex_exit(&arc_eviction_mtx);
2030 2041  
2031 2042                  if (buf->b_efunc != NULL)
2032 2043                          VERIFY(buf->b_efunc(buf) == 0);
2033 2044  
2034 2045                  buf->b_efunc = NULL;
2035 2046                  buf->b_private = NULL;
2036 2047                  kmem_cache_free(buf_cache, buf);
                           /* retake the mutex before re-reading the list head */
2037 2048                  mutex_enter(&arc_eviction_mtx);
2038 2049          }
2039 2050          mutex_exit(&arc_eviction_mtx);
2040 2051  }
2041 2052  
2042 2053  /*
2043 2054   * Flush all *evictable* data from the cache for the given spa.
2044 2055   * NOTE: this will not touch "active" (i.e. referenced) data.
            *
            * If spa is NULL, a guid of 0 is passed to the eviction routines and
            * each MRU/MFU list is evicted repeatedly until list_head() reports it
            * empty.  If spa is non-NULL, its load guid is passed instead and each
            * list gets exactly one arc_evict() pass (the loop breaks after the
            * first call), since buffers that do not match would otherwise keep
            * the list non-empty.  The ghost lists are then evicted and any pending
            * user eviction callbacks are run.
2045 2056   */
2046 2057  void
2047 2058  arc_flush(spa_t *spa)
2048 2059  {
2049 2060          uint64_t guid = 0;
2050 2061  
2051 2062          if (spa)
2052 2063                  guid = spa_load_guid(spa);
2053 2064  
2054 2065          while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2055 2066                  (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2056 2067                  if (spa)
2057 2068                          break;
2058 2069          }
2059 2070          while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2060 2071                  (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2061 2072                  if (spa)
2062 2073                          break;
2063 2074          }
2064 2075          while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2065 2076                  (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2066 2077                  if (spa)
2067 2078                          break;
2068 2079          }
2069 2080          while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2070 2081                  (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2071 2082                  if (spa)
2072 2083                          break;
2073 2084          }
2074 2085  
                   /* a byte count of -1 requests eviction without a byte limit */
2075 2086          arc_evict_ghost(arc_mru_ghost, guid, -1);
2076 2087          arc_evict_ghost(arc_mfu_ghost, guid, -1);
2077 2088  
2078 2089          mutex_enter(&arc_reclaim_thr_lock);
2079 2090          arc_do_user_evicts();
2080 2091          mutex_exit(&arc_reclaim_thr_lock);
                   /* after a whole-cache flush, no eviction callbacks may remain */
2081 2092          ASSERT(spa || arc_eviction_list == NULL);
2082 2093  }
2083 2094  
           /*
            * Lower the cache targets and then evict down to them.
            *
            * While arc_c is above arc_c_min:
            *  - arc_c drops by (arc_c >> arc_shrink_shift), or in the kernel by
            *    ptob(needfree) if that is larger, but never below arc_c_min;
            *  - arc_p drops by (arc_p >> arc_shrink_shift);
            *  - if arc_c is still above arc_size it is pulled down to
            *    MAX(arc_size, arc_c_min);
            *  - if arc_p ends up above arc_c it is reset to half of arc_c.
            *
            * Finally, arc_adjust() is called if arc_size exceeds arc_c.
            */
2084 2095  void
2085 2096  arc_shrink(void)
2086 2097  {
2087 2098          if (arc_c > arc_c_min) {
2088 2099                  uint64_t to_free;
2089 2100  
2090 2101  #ifdef _KERNEL
2091 2102                  to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2092 2103  #else
2093 2104                  to_free = arc_c >> arc_shrink_shift;
2094 2105  #endif
                           /* compare before subtracting so arc_c cannot go below min */
2095 2106                  if (arc_c > arc_c_min + to_free)
2096 2107                          atomic_add_64(&arc_c, -to_free);
2097 2108                  else
2098 2109                          arc_c = arc_c_min;
2099 2110  
2100 2111                  atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2101 2112                  if (arc_c > arc_size)
2102 2113                          arc_c = MAX(arc_size, arc_c_min);
2103 2114                  if (arc_p > arc_c)
2104 2115                          arc_p = (arc_c >> 1);
2105 2116                  ASSERT(arc_c >= arc_c_min);
2106 2117                  ASSERT((int64_t)arc_p >= 0);
2107 2118          }
2108 2119  
2109 2120          if (arc_size > arc_c)
2110 2121                  arc_adjust();
2111 2122  }
2112 2123  
2113 2124  /*
2114 2125   * Determine if the system is under memory pressure and is asking
2115 2126   * to reclaim memory. A return value of 1 indicates that the system
2116 2127   * is under memory pressure and that the arc should adjust accordingly.
            * A return value of 0 means none of the checks below fired.
            *
            * In the kernel the checks are, in order: needfree is non-zero; freemem
            * is too close to the pageout thresholds; availrmem is too close to the
            * swapfs reservation; (i386 only) less than a quarter of the kernel heap
            * arena is free; and, when zio_arena exists, its free space has fallen
            * below 1/16th of its allocated space.
2117 2128   */
2118 2129  static int
2119 2130  arc_reclaim_needed(void)
2120 2131  {
2121 2132          uint64_t extra;
2122 2133  
2123 2134  #ifdef _KERNEL
2124 2135  
2125 2136          if (needfree)
2126 2137                  return (1);
2127 2138  
2128 2139          /*
2129 2140           * take 'desfree' extra pages, so we reclaim sooner, rather than later
2130 2141           */
2131 2142          extra = desfree;
2132 2143  
2133 2144          /*
2134 2145           * check that we're out of range of the pageout scanner.  It starts to
2135 2146           * schedule paging if freemem is less than lotsfree and needfree.
2136 2147           * lotsfree is the high-water mark for pageout, and needfree is the
2137 2148           * number of needed free pages.  We add extra pages here to make sure
2138 2149           * the scanner doesn't start up while we're freeing memory.
2139 2150           */
2140 2151          if (freemem < lotsfree + needfree + extra)
2141 2152                  return (1);
2142 2153  
2143 2154          /*
2144 2155           * check to make sure that swapfs has enough space so that anon
2145 2156           * reservations can still succeed. anon_resvmem() checks that the
2146 2157           * availrmem is greater than swapfs_minfree, and the number of reserved
2147 2158           * swap pages.  We also add a bit of extra here just to prevent
2148 2159           * circumstances from getting really dire.
2149 2160           */
2150 2161          if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2151 2162                  return (1);
2152 2163  
2153 2164  #if defined(__i386)
2154 2165          /*
2155 2166           * If we're on an i386 platform, it's possible that we'll exhaust the
2156 2167           * kernel heap space before we ever run out of available physical
2157 2168           * memory.  Most checks of the size of the heap_area compare against
2158 2169           * tune.t_minarmem, which is the minimum available real memory that we
2159 2170           * can have in the system.  However, this is generally fixed at 25 pages
2160 2171           * which is so low that it's useless.  In this comparison, we seek to
2161 2172           * calculate the total heap-size, and reclaim if more than 3/4ths of the
2162 2173           * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2163 2174           * free)
2164 2175           */
2165 2176          if (vmem_size(heap_arena, VMEM_FREE) <
2166 2177              (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2167 2178                  return (1);
2168 2179  #endif
2169 2180  
2170 2181          /*
2171 2182           * If zio data pages are being allocated out of a separate heap segment,
2172 2183           * then enforce that the size of available vmem for this arena remains
2173 2184           * above about 1/16th free.
2174 2185           *
2175 2186           * Note: The 1/16th arena free requirement was put in place
2176 2187           * to aggressively evict memory from the arc in order to avoid
2177 2188           * memory fragmentation issues.
                    *
                    * Note that the comparison is free space against 1/16th of the
                    * *allocated* space (VMEM_ALLOC), not of the arena total.
2178 2189           */
2179 2190          if (zio_arena != NULL &&
2180 2191              vmem_size(zio_arena, VMEM_FREE) <
2181 2192              (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2182 2193                  return (1);
2183 2194  #else
                   /*
                    * Non-kernel builds: report pressure whenever spa_get_random(100)
                    * returns 0 (presumably to exercise the reclaim paths at random
                    * -- confirm against spa_get_random()).
                    */
2184 2195          if (spa_get_random(100) == 0)
2185 2196                  return (1);
2186 2197  #endif
2187 2198          return (0);
2188 2199  }
2189 2200  
           /*
            * Return unused memory from the ARC-related kmem caches.
            *
            * strat selects how far to go: with ARC_RECLAIM_AGGR the cache targets
            * are also lowered via arc_shrink() and the zio_arena quantum caches
            * are reaped; with any other strategy only the kmem caches are reaped.
            * In the kernel, DNLC entries are purged first when metadata usage has
            * reached arc_meta_limit.
            */
2190 2201  static void
2191 2202  arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2192 2203  {
2193 2204          size_t                  i;
2194 2205          kmem_cache_t            *prev_cache = NULL;
2195 2206          kmem_cache_t            *prev_data_cache = NULL;
2196 2207          extern kmem_cache_t     *zio_buf_cache[];
2197 2208          extern kmem_cache_t     *zio_data_buf_cache[];
2198 2209  
2199 2210  #ifdef _KERNEL
2200 2211          if (arc_meta_used >= arc_meta_limit) {
2201 2212                  /*
2202 2213                   * We are exceeding our meta-data cache limit.
2203 2214                   * Purge some DNLC entries to release holds on meta-data.
2204 2215                   */
2205 2216                  dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2206 2217          }
2207 2218  #if defined(__i386)
2208 2219          /*
2209 2220           * Reclaim unused memory from all kmem caches.
2210 2221           */
2211 2222          kmem_reap();
2212 2223  #endif
2213 2224  #endif
2214 2225  
2215 2226          /*
2216 2227           * An aggressive reclamation will shrink the cache size as well as
2217 2228           * reap free buffers from the arc kmem caches.
2218 2229           */
2219 2230          if (strat == ARC_RECLAIM_AGGR)
2220 2231                  arc_shrink();
2221 2232  
                   /*
                    * Consecutive slots of the zio cache arrays may point at the same
                    * cache; remember the previous pointer so that each run of
                    * identical entries is reaped only once.
                    */
2222 2233          for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2223 2234                  if (zio_buf_cache[i] != prev_cache) {
2224 2235                          prev_cache = zio_buf_cache[i];
2225 2236                          kmem_cache_reap_now(zio_buf_cache[i]);
2226 2237                  }
2227 2238                  if (zio_data_buf_cache[i] != prev_data_cache) {
2228 2239                          prev_data_cache = zio_data_buf_cache[i];
2229 2240                          kmem_cache_reap_now(zio_data_buf_cache[i]);
2230 2241                  }
2231 2242          }
2232 2243          kmem_cache_reap_now(buf_cache);
2233 2244          kmem_cache_reap_now(hdr_cache);
2234 2245  
2235 2246          /*
2236 2247           * Ask the vmem arena to reclaim unused memory from its
2237 2248           * quantum caches.
2238 2249           */
2239 2250          if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2240 2251                  vmem_qcache_reap(zio_arena);
2241 2252  }
2242 2253  
           /*
            * Body of the ARC reclaim thread.
            *
            * Holding arc_reclaim_thr_lock, the thread loops until arc_thread_exit
            * becomes non-zero.  On each pass it:
            *  - checks arc_reclaim_needed(); if set, it disables cache growth
            *    (arc_no_grow), picks a reclaim strategy, pushes the growth
            *    re-enable time out by arc_grow_retry seconds and reaps the kmem
            *    caches;
            *  - otherwise re-enables growth once that time has passed;
            *  - calls arc_adjust() and runs any pending user eviction callbacks;
            *  - waits on arc_reclaim_thr_cv for up to hz ticks.
            *
            * The first reclaim after growth was allowed is aggressive; while
            * arc_no_grow stays set, subsequent reclaims alternate between
            * conservative and aggressive.
            *
            * On exit the thread clears arc_thread_exit and broadcasts on
            * arc_reclaim_thr_cv before terminating.
            */
2243 2254  static void
2244 2255  arc_reclaim_thread(void)
2245 2256  {
2246 2257          clock_t                 growtime = 0;
2247 2258          arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2248 2259          callb_cpr_t             cpr;
2249 2260  
2250 2261          CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2251 2262  
2252 2263          mutex_enter(&arc_reclaim_thr_lock);
2253 2264          while (arc_thread_exit == 0) {
2254 2265                  if (arc_reclaim_needed()) {
2255 2266  
2256 2267                          if (arc_no_grow) {
                                           /* already throttled: flip the strategy */
2257 2268                                  if (last_reclaim == ARC_RECLAIM_CONS) {
2258 2269                                          last_reclaim = ARC_RECLAIM_AGGR;
2259 2270                                  } else {
2260 2271                                          last_reclaim = ARC_RECLAIM_CONS;
2261 2272                                  }
2262 2273                          } else {
2263 2274                                  arc_no_grow = TRUE;
2264 2275                                  last_reclaim = ARC_RECLAIM_AGGR;
2265 2276                                  membar_producer();
2266 2277                          }
2267 2278  
2268 2279                          /* reset the growth delay for every reclaim */
2269 2280                          growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2270 2281  
2271 2282                          arc_kmem_reap_now(last_reclaim);
2272 2283                          arc_warm = B_TRUE;
2273 2284  
2274 2285                  } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2275 2286                          arc_no_grow = FALSE;
2276 2287                  }
2277 2288  
2278 2289                  arc_adjust();
2279 2290  
2280 2291                  if (arc_eviction_list != NULL)
2281 2292                          arc_do_user_evicts();
2282 2293  
2283 2294                  /* block until needed, or one second, whichever is shorter */
2284 2295                  CALLB_CPR_SAFE_BEGIN(&cpr);
2285 2296                  (void) cv_timedwait(&arc_reclaim_thr_cv,
2286 2297                      &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2287 2298                  CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2288 2299          }
2289 2300  
                   /* acknowledge the exit request and wake anyone waiting on it */
2290 2301          arc_thread_exit = 0;
2291 2302          cv_broadcast(&arc_reclaim_thr_cv);
2292 2303          CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2293 2304          thread_exit();
2294 2305  }
2295 2306  
2296 2307  /*
2297 2308   * Adapt arc info given the number of bytes we are trying to add and
2298 2309   * the state that we are coming from.  This function is only called
2299 2310   * when we are adding new content to the cache.
            *
            * bytes must be positive.  Nothing is done for arc_l2c_only.  For a
            * hit in either ghost list, arc_p (the MRU target) is moved first.
            * Then, if arc_reclaim_needed() reports pressure, the reclaim thread
            * is signalled and the function returns without growing.  Otherwise,
            * provided growth is allowed and arc_c is below arc_c_max, arc_c (and,
            * for anonymous data, arc_p) may be increased by bytes.
2300 2311   */
2301 2312  static void
2302 2313  arc_adapt(int bytes, arc_state_t *state)
2303 2314  {
2304 2315          int mult;
                   /* lower bound for arc_p, and (c - this) is its upper bound below */
2305 2316          uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2306 2317  
2307 2318          if (state == arc_l2c_only)
2308 2319                  return;
2309 2320  
2310 2321          ASSERT(bytes > 0);
2311 2322          /*
2312 2323           * Adapt the target size of the MRU list:
2313 2324           *      - if we just hit in the MRU ghost list, then increase
2314 2325           *        the target size of the MRU list.
2315 2326           *      - if we just hit in the MFU ghost list, then increase
2316 2327           *        the target size of the MFU list by decreasing the
2317 2328           *        target size of the MRU list.
                    *
                    * The step is bytes scaled by the ratio of the other ghost list's
                    * size to this one's (at least 1, capped at 10).
                    *
                    * NOTE(review): each division uses the size of the ghost list
                    * that was hit as the divisor; this presumably cannot be zero
                    * because the hit came from that list -- confirm.
2318 2329           */
2319 2330          if (state == arc_mru_ghost) {
2320 2331                  mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2321 2332                      1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2322 2333                  mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2323 2334  
2324 2335                  arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2325 2336          } else if (state == arc_mfu_ghost) {
2326 2337                  uint64_t delta;
2327 2338  
2328 2339                  mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2329 2340                      1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2330 2341                  mult = MIN(mult, 10);
2331 2342  
                           /* never subtract more than arc_p itself */
2332 2343                  delta = MIN(bytes * mult, arc_p);
2333 2344                  arc_p = MAX(arc_p_min, arc_p - delta);
2334 2345          }
2335 2346          ASSERT((int64_t)arc_p >= 0);
2336 2347  
2337 2348          if (arc_reclaim_needed()) {
2338 2349                  cv_signal(&arc_reclaim_thr_cv);
2339 2350                  return;
2340 2351          }
2341 2352  
2342 2353          if (arc_no_grow)
2343 2354                  return;
2344 2355  
2345 2356          if (arc_c >= arc_c_max)
2346 2357                  return;
2347 2358  
2348 2359          /*
2349 2360           * If we're within (2 * maxblocksize) bytes of the target
2350 2361           * cache size, increment the target cache size
                    *
                    * arc_p is bumped along with arc_c only for anonymous data, and
                    * only when arc_c did not have to be clamped to arc_c_max.
2351 2362           */
2352 2363          if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2353 2364                  atomic_add_64(&arc_c, (int64_t)bytes);
2354 2365                  if (arc_c > arc_c_max)
2355 2366                          arc_c = arc_c_max;
2356 2367                  else if (state == arc_anon)
2357 2368                          atomic_add_64(&arc_p, (int64_t)bytes);
2358 2369                  if (arc_p > arc_c)
2359 2370                          arc_p = arc_c;
2360 2371          }
2361 2372          ASSERT((int64_t)arc_p >= 0);
2362 2373  }
2363 2374  
2364 2375  /*
2365 2376   * Check if the cache has reached its limits and eviction is required
2366 2377   * prior to insert.
            *
            * Returns non-zero when a buffer of the given type should be obtained
            * by eviction: the type is metadata and arc_meta_used has reached
            * arc_meta_limit, arc_reclaim_needed() reports memory pressure, or
            * arc_size exceeds arc_c.  Returns 0 otherwise.
2367 2378   */
2368 2379  static int
2369 2380  arc_evict_needed(arc_buf_contents_t type)
2370 2381  {
2371 2382          if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2372 2383                  return (1);
2373 2384  
2374 2385          if (arc_reclaim_needed())
2375 2386                  return (1);
2376 2387  
2377 2388          return (arc_size > arc_c);
2378 2389  }
2379 2390  
2380 2391  /*
2381 2392   * The buffer, supplied as the first argument, needs a data block.
2382 2393   * So, if we are at cache max, determine which cache should be victimized.
2383 2394   * We have the following cases:
2384 2395   *
2385 2396   * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2386 2397   * In this situation if we're out of space, but the resident size of the MFU is
2387 2398   * under the limit, victimize the MFU cache to satisfy this insertion request.
2388 2399   *
2389 2400   * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2390 2401   * Here, we've used up all of the available space for the MRU, so we need to
2391 2402   * evict from our own cache instead.  Evict from the set of resident MRU
2392 2403   * entries.
2393 2404   *
2394 2405   * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2395 2406   * c minus p represents the MFU space in the cache, since p is the size of the
2396 2407   * cache that is dedicated to the MRU.  In this situation there's still space on
2397 2408   * the MFU side, so the MRU side needs to be victimized.
2398 2409   *
2399 2410   * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2400 2411   * MFU's resident set is consuming more space than it has been allotted.  In
2401 2412   * this situation, we must victimize our own cache, the MFU, for this insertion.
            *
            * In cases 1 and 3 the other cache is only victimized if its evictable
            * size for this buffer type (arcs_lsize[type]) is at least the requested
            * size; otherwise the buffer's own cache is used.
            *
            * On return buf->b_data is set, either from arc_evict() or from a fresh
            * allocation, and the size accounting of the header's state has been
            * updated unless that state is a ghost state.
2402 2413   */
2403 2414  static void
2404 2415  arc_get_data_buf(arc_buf_t *buf)
2405 2416  {
2406 2417          arc_state_t             *state = buf->b_hdr->b_state;
2407 2418          uint64_t                size = buf->b_hdr->b_size;
2408 2419          arc_buf_contents_t      type = buf->b_hdr->b_type;
2409 2420  
                   /* let the targets (arc_p, arc_c) react to this insertion first */
2410 2421          arc_adapt(size, state);
2411 2422  
2412 2423          /*
2413 2424           * We have not yet reached cache maximum size,
2414 2425           * just allocate a new buffer.
2415 2426           */
2416 2427          if (!arc_evict_needed(type)) {
2417 2428                  if (type == ARC_BUFC_METADATA) {
2418 2429                          buf->b_data = zio_buf_alloc(size);
2419 2430                          arc_space_consume(size, ARC_SPACE_DATA);
2420 2431                  } else {
2421 2432                          ASSERT(type == ARC_BUFC_DATA);
2422 2433                          buf->b_data = zio_data_buf_alloc(size);
2423 2434                          ARCSTAT_INCR(arcstat_data_size, size);
2424 2435                          atomic_add_64(&arc_size, size);
2425 2436                  }
2426 2437                  goto out;
2427 2438          }
2428 2439  
2429 2440          /*
2430 2441           * If we are prefetching from the mfu ghost list, this buffer
2431 2442           * will end up on the mru list; so steal space from there.
                    * A non-prefetch hit in the mfu ghost list is treated as MFU,
                    * and any hit in the mru ghost list is treated as MRU.
2432 2443           */
2433 2444          if (state == arc_mfu_ghost)
2434 2445                  state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2435 2446          else if (state == arc_mru_ghost)
2436 2447                  state = arc_mru;
2437 2448  
2438 2449          if (state == arc_mru || state == arc_anon) {
                           /* MRU cases (1 and 2 above) */
2439 2450                  uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2440 2451                  state = (arc_mfu->arcs_lsize[type] >= size &&
2441 2452                      arc_p > mru_used) ? arc_mfu : arc_mru;
2442 2453          } else {
2443 2454                  /* MFU cases */
2444 2455                  uint64_t mfu_space = arc_c - arc_p;
2445 2456                  state =  (arc_mru->arcs_lsize[type] >= size &&
2446 2457                      mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2447 2458          }
                   /*
                    * Try to obtain a data block from the chosen state.  If
                    * arc_evict() returns no block, allocate a fresh one exactly as
                    * in the no-eviction path above and count a recycle miss.
                    */
2448 2459          if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2449 2460                  if (type == ARC_BUFC_METADATA) {
2450 2461                          buf->b_data = zio_buf_alloc(size);
2451 2462                          arc_space_consume(size, ARC_SPACE_DATA);
2452 2463                  } else {
2453 2464                          ASSERT(type == ARC_BUFC_DATA);
2454 2465                          buf->b_data = zio_data_buf_alloc(size);
2455 2466                          ARCSTAT_INCR(arcstat_data_size, size);
2456 2467                          atomic_add_64(&arc_size, size);
2457 2468                  }
2458 2469                  ARCSTAT_BUMP(arcstat_recycle_miss);
2459 2470          }
2460 2471          ASSERT(buf->b_data != NULL);
2461 2472  out:
2462 2473          /*
2463 2474           * Update the state size.  Note that ghost states have a
2464 2475           * "ghost size" and so don't need to be updated.
2465 2476           */
2466 2477          if (!GHOST_STATE(buf->b_hdr->b_state)) {
2467 2478                  arc_buf_hdr_t *hdr = buf->b_hdr;
2468 2479  
2469 2480                  atomic_add_64(&hdr->b_state->arcs_size, size);
                           /* headers linked on a state list also count as evictable */
2470 2481                  if (list_link_active(&hdr->b_arc_node)) {
2471 2482                          ASSERT(refcount_is_zero(&hdr->b_refcnt));
2472 2483                          atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2473 2484                  }
2474 2485                  /*
2475 2486                   * If we are growing the cache, and we are adding anonymous
2476 2487                   * data, and we have outgrown arc_p, update arc_p
2477 2488                   */
2478 2489                  if (arc_size < arc_c && hdr->b_state == arc_anon &&
2479 2490                      arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2480 2491                          arc_p = MIN(arc_c, arc_p + size);
2481 2492          }
2482 2493  }
2483 2494  
2484 2495  /*
2485 2496   * This routine is called whenever a buffer is accessed.
2486 2497   * NOTE: the hash lock is dropped in this function.
            *
            * NOTE(review): the caller must hold hash_lock (asserted below), but
            * no mutex_exit(hash_lock) appears in this function itself; any drop
            * would have to happen inside arc_change_state() -- confirm.
            *
            * The header's current state selects the action:
            *   anon       -> moved to MRU
            *   MRU        -> prefetch: timestamp refreshed, state unchanged;
            *                 otherwise moved to MFU once more than ARC_MINTIME
            *                 has passed since the last recorded access
            *   MRU ghost  -> moved to MRU (prefetch) or MFU (otherwise)
            *   MFU        -> stays in MFU, timestamp refreshed
            *   MFU ghost  -> moved to MRU (prefetch) or MFU (otherwise)
            *   l2c_only   -> moved to MFU
            * Hit statistics are bumped for the MRU, MFU and ghost cases.
2487 2498   */
2488 2499  static void
2489 2500  arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2490 2501  {
2491 2502          clock_t now;
2492 2503  
2493 2504          ASSERT(MUTEX_HELD(hash_lock));
2494 2505  
2495 2506          if (buf->b_state == arc_anon) {
2496 2507                  /*
2497 2508                   * This buffer is not in the cache, and does not
2498 2509                   * appear in our "ghost" list.  Add the new buffer
2499 2510                   * to the MRU state.
2500 2511                   */
2501 2512  
2502 2513                  ASSERT(buf->b_arc_access == 0);
2503 2514                  buf->b_arc_access = ddi_get_lbolt();
2504 2515                  DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2505 2516                  arc_change_state(arc_mru, buf, hash_lock);
2506 2517  
2507 2518          } else if (buf->b_state == arc_mru) {
2508 2519                  now = ddi_get_lbolt();
2509 2520  
2510 2521                  /*
2511 2522                   * If this buffer is here because of a prefetch, then either:
2512 2523                   * - clear the flag if this is a "referencing" read
2513 2524                   *   (any subsequent access will bump this into the MFU state).
2514 2525                   * or
2515 2526                   * - move the buffer to the head of the list if this is
2516 2527                   *   another prefetch (to make it less likely to be evicted).
                            *
                            * NOTE(review): in the "another prefetch" case the code
                            * below only asserts that the header is on a list and
                            * refreshes its access time; no list move is visible
                            * here -- confirm where (or whether) it happens.
2517 2528                   */
2518 2529                  if ((buf->b_flags & ARC_PREFETCH) != 0) {
2519 2530                          if (refcount_count(&buf->b_refcnt) == 0) {
2520 2531                                  ASSERT(list_link_active(&buf->b_arc_node));
2521 2532                          } else {
2522 2533                                  buf->b_flags &= ~ARC_PREFETCH;
2523 2534                                  ARCSTAT_BUMP(arcstat_mru_hits);
2524 2535                          }
2525 2536                          buf->b_arc_access = now;
2526 2537                          return;
2527 2538                  }
2528 2539  
2529 2540                  /*
2530 2541                   * This buffer has been "accessed" only once so far,
2531 2542                   * but it is still in the cache. Move it to the MFU
2532 2543                   * state.
2533 2544                   */
2534 2545                  if (now > buf->b_arc_access + ARC_MINTIME) {
2535 2546                          /*
2536 2547                           * More than 125ms have passed since we
2537 2548                           * instantiated this buffer.  Move it to the
2538 2549                           * most frequently used state.
2539 2550                           */
2540 2551                          buf->b_arc_access = now;
2541 2552                          DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2542 2553                          arc_change_state(arc_mfu, buf, hash_lock);
2543 2554                  }
2544 2555                  ARCSTAT_BUMP(arcstat_mru_hits);
2545 2556          } else if (buf->b_state == arc_mru_ghost) {
2546 2557                  arc_state_t     *new_state;
2547 2558                  /*
2548 2559                   * This buffer has been "accessed" recently, but
2549 2560                   * was evicted from the cache.  Move it to the
2550 2561                   * MFU state, unless this access is a prefetch, in
                            * which case it goes back to the MRU state (and the
                            * prefetch flag is cleared if the header is referenced).
2551 2562                   */
2552 2563  
2553 2564                  if (buf->b_flags & ARC_PREFETCH) {
2554 2565                          new_state = arc_mru;
2555 2566                          if (refcount_count(&buf->b_refcnt) > 0)
2556 2567                                  buf->b_flags &= ~ARC_PREFETCH;
2557 2568                          DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2558 2569                  } else {
2559 2570                          new_state = arc_mfu;
2560 2571                          DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2561 2572                  }
2562 2573  
2563 2574                  buf->b_arc_access = ddi_get_lbolt();
2564 2575                  arc_change_state(new_state, buf, hash_lock);
2565 2576  
2566 2577                  ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2567 2578          } else if (buf->b_state == arc_mfu) {
2568 2579                  /*
2569 2580                   * This buffer has been accessed more than once and is
2570 2581                   * still in the cache.  Keep it in the MFU state.
2571 2582                   *
2572 2583                   * NOTE: an add_reference() that occurred when we did
2573 2584                   * the arc_read() will have kicked this off the list.
2574 2585                   * If it was a prefetch, we will explicitly move it to
2575 2586                   * the head of the list now.
2576 2587                   */
2577 2588                  if ((buf->b_flags & ARC_PREFETCH) != 0) {
2578 2589                          ASSERT(refcount_count(&buf->b_refcnt) == 0);
2579 2590                          ASSERT(list_link_active(&buf->b_arc_node));
2580 2591                  }
2581 2592                  ARCSTAT_BUMP(arcstat_mfu_hits);
2582 2593                  buf->b_arc_access = ddi_get_lbolt();
2583 2594          } else if (buf->b_state == arc_mfu_ghost) {
2584 2595                  arc_state_t     *new_state = arc_mfu;
2585 2596                  /*
2586 2597                   * This buffer has been accessed more than once but has
2587 2598                   * been evicted from the cache.  Move it back to the
2588 2599                   * MFU state.
2589 2600                   */
2590 2601  
2591 2602                  if (buf->b_flags & ARC_PREFETCH) {
2592 2603                          /*
2593 2604                           * This is a prefetch access...
2594 2605                           * move this block back to the MRU state.
2595 2606                           */
2596 2607                          ASSERT0(refcount_count(&buf->b_refcnt));
2597 2608                          new_state = arc_mru;
2598 2609                  }
2599 2610  
2600 2611                  buf->b_arc_access = ddi_get_lbolt();
                           /*
                            * NOTE(review): the probe below is named new_state__mfu
                            * even when new_state was set to arc_mru above.
                            */
2601 2612                  DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2602 2613                  arc_change_state(new_state, buf, hash_lock);
2603 2614  
2604 2615                  ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2605 2616          } else if (buf->b_state == arc_l2c_only) {
2606 2617                  /*
2607 2618                   * This buffer is on the 2nd Level ARC.
                            * Bring it straight into the MFU state.
2608 2619                   */
2609 2620  
2610 2621                  buf->b_arc_access = ddi_get_lbolt();
2611 2622                  DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2612 2623                  arc_change_state(arc_mfu, buf, hash_lock);
2613 2624          } else {
2614 2625                  ASSERT(!"invalid arc state");
2615 2626          }
2616 2627  }
2617 2628  
2618 2629  /*
            * a generic arc_done_func_t which you can use
            *
            * If there was no zio, or the zio completed without error, copies
            * buf->b_hdr->b_size bytes from the buffer's data into the memory that
            * arg points to.  In every case the buffer reference tagged with arg
            * is then removed; arc_buf_remove_ref() must return non-zero.
            */
2619 2630  /* ARGSUSED */
2620 2631  void
2621 2632  arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2622 2633  {
2623 2634          if (zio == NULL || zio->io_error == 0)
2624 2635                  bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2625 2636          VERIFY(arc_buf_remove_ref(buf, arg));
2626 2637  }
2627 2638  
2628 2639  /*
            * a generic arc_done_func_t
            *
            * arg is treated as a pointer to an arc_buf_t pointer.  If the zio
            * failed, the buffer reference tagged with arg is removed and NULL is
            * stored through arg; otherwise (no zio, or no error) the buffer itself
            * is stored through arg and the caller keeps the reference.
            */
2629 2640  void
2630 2641  arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2631 2642  {
2632 2643          arc_buf_t **bufp = arg;
2633 2644          if (zio && zio->io_error) {
2634 2645                  VERIFY(arc_buf_remove_ref(buf, arg));
2635 2646                  *bufp = NULL;
2636 2647          } else {
2637 2648                  *bufp = buf;
2638 2649                  ASSERT(buf->b_data);
2639 2650          }
2640 2651  }
2641 2652  
2642 2653  static void
2643 2654  arc_read_done(zio_t *zio)
2644 2655  {
2645 2656          arc_buf_hdr_t   *hdr, *found;
2646 2657          arc_buf_t       *buf;
2647 2658          arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2648 2659          kmutex_t        *hash_lock;
2649 2660          arc_callback_t  *callback_list, *acb;
2650 2661          int             freeable = FALSE;
2651 2662  
2652 2663          buf = zio->io_private;
2653 2664          hdr = buf->b_hdr;
2654 2665  
2655 2666          /*
2656 2667           * The hdr was inserted into hash-table and removed from lists
2657 2668           * prior to starting I/O.  We should find this header, since
2658 2669           * it's in the hash table, and it should be legit since it's
2659 2670           * not possible to evict it during the I/O.  The only possible
2660 2671           * reason for it not to be found is if it was freed during the
2661 2672           * read.
2662 2673           */
2663 2674          found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2664 2675              &hash_lock);
2665 2676  
2666 2677          ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2667 2678              (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2668 2679              (found == hdr && HDR_L2_READING(hdr)));
2669 2680  
2670 2681          hdr->b_flags &= ~ARC_L2_EVICTED;
2671 2682          if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2672 2683                  hdr->b_flags &= ~ARC_L2CACHE;
2673 2684  
2674 2685          /* byteswap if necessary */
2675 2686          callback_list = hdr->b_acb;
2676 2687          ASSERT(callback_list != NULL);
2677 2688          if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2678 2689                  dmu_object_byteswap_t bswap =
2679 2690                      DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2680 2691                  arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2681 2692                      byteswap_uint64_array :
2682 2693                      dmu_ot_byteswap[bswap].ob_func;
2683 2694                  func(buf->b_data, hdr->b_size);
2684 2695          }
2685 2696  
2686 2697          arc_cksum_compute(buf, B_FALSE);
2687 2698          arc_buf_watch(buf);
2688 2699  
2689 2700          if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2690 2701                  /*
2691 2702                   * Only call arc_access on anonymous buffers.  This is because
2692 2703                   * if we've issued an I/O for an evicted buffer, we've already
2693 2704                   * called arc_access (to prevent any simultaneous readers from
2694 2705                   * getting confused).
2695 2706                   */
2696 2707                  arc_access(hdr, hash_lock);
2697 2708          }
2698 2709  
2699 2710          /* create copies of the data buffer for the callers */
2700 2711          abuf = buf;
2701 2712          for (acb = callback_list; acb; acb = acb->acb_next) {
2702 2713                  if (acb->acb_done) {
2703 2714                          if (abuf == NULL) {
2704 2715                                  ARCSTAT_BUMP(arcstat_duplicate_reads);
2705 2716                                  abuf = arc_buf_clone(buf);
2706 2717                          }
2707 2718                          acb->acb_buf = abuf;
2708 2719                          abuf = NULL;
2709 2720                  }
2710 2721          }
2711 2722          hdr->b_acb = NULL;
2712 2723          hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2713 2724          ASSERT(!HDR_BUF_AVAILABLE(hdr));
2714 2725          if (abuf == buf) {
2715 2726                  ASSERT(buf->b_efunc == NULL);
2716 2727                  ASSERT(hdr->b_datacnt == 1);
2717 2728                  hdr->b_flags |= ARC_BUF_AVAILABLE;
2718 2729          }
2719 2730  
2720 2731          ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2721 2732  
2722 2733          if (zio->io_error != 0) {
2723 2734                  hdr->b_flags |= ARC_IO_ERROR;
2724 2735                  if (hdr->b_state != arc_anon)
2725 2736                          arc_change_state(arc_anon, hdr, hash_lock);
2726 2737                  if (HDR_IN_HASH_TABLE(hdr))
2727 2738                          buf_hash_remove(hdr);
2728 2739                  freeable = refcount_is_zero(&hdr->b_refcnt);
2729 2740          }
2730 2741  
2731 2742          /*
2732 2743           * Broadcast before we drop the hash_lock to avoid the possibility
2733 2744           * that the hdr (and hence the cv) might be freed before we get to
2734 2745           * the cv_broadcast().
2735 2746           */
2736 2747          cv_broadcast(&hdr->b_cv);
2737 2748  
2738 2749          if (hash_lock) {
2739 2750                  mutex_exit(hash_lock);
2740 2751          } else {
2741 2752                  /*
2742 2753                   * This block was freed while we waited for the read to
2743 2754                   * complete.  It has been removed from the hash table and
2744 2755                   * moved to the anonymous state (so that it won't show up
2745 2756                   * in the cache).
2746 2757                   */
2747 2758                  ASSERT3P(hdr->b_state, ==, arc_anon);
2748 2759                  freeable = refcount_is_zero(&hdr->b_refcnt);
2749 2760          }
2750 2761  
2751 2762          /* execute each callback and free its structure */
2752 2763          while ((acb = callback_list) != NULL) {
2753 2764                  if (acb->acb_done)
2754 2765                          acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2755 2766  
2756 2767                  if (acb->acb_zio_dummy != NULL) {
2757 2768                          acb->acb_zio_dummy->io_error = zio->io_error;
2758 2769                          zio_nowait(acb->acb_zio_dummy);
2759 2770                  }
2760 2771  
2761 2772                  callback_list = acb->acb_next;
2762 2773                  kmem_free(acb, sizeof (arc_callback_t));
2763 2774          }
2764 2775  
2765 2776          if (freeable)
2766 2777                  arc_hdr_destroy(hdr);
2767 2778  }
2768 2779  
2769 2780  /*
2770 2781   * "Read" the block at the specified DVA (in bp) via the
2771 2782   * cache.  If the block is found in the cache, invoke the provided
2772 2783   * callback immediately and return.  Note that the `zio' parameter
2773 2784   * in the callback will be NULL in this case, since no IO was
2774 2785   * required.  If the block is not in the cache pass the read request
2775 2786   * on to the spa with a substitute callback function, so that the
2776 2787   * requested block will be added to the cache.
2777 2788   *
2778 2789   * If a read request arrives for a block that has a read in-progress,
2779 2790   * either wait for the in-progress read to complete (and return the
2780 2791   * results); or, if this is a read with a "done" func, add a record
2781 2792   * to the read to invoke the "done" func when the read completes,
2782 2793   * and return; or just return.
2783 2794   *
2784 2795   * arc_read_done() will invoke all the requested "done" functions
2785 2796   * for readers of this block.
2786 2797   */
2787 2798  int
2788 2799  arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2789 2800      void *private, int priority, int zio_flags, uint32_t *arc_flags,
2790 2801      const zbookmark_t *zb)
2791 2802  {
2792 2803          arc_buf_hdr_t *hdr;
2793 2804          arc_buf_t *buf = NULL;
2794 2805          kmutex_t *hash_lock;
2795 2806          zio_t *rzio;
2796 2807          uint64_t guid = spa_load_guid(spa);
2797 2808  
2798 2809  top:
2799 2810          hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2800 2811              &hash_lock);
2801 2812          if (hdr && hdr->b_datacnt > 0) {
2802 2813  
2803 2814                  *arc_flags |= ARC_CACHED;
2804 2815  
2805 2816                  if (HDR_IO_IN_PROGRESS(hdr)) {
2806 2817  
2807 2818                          if (*arc_flags & ARC_WAIT) {
2808 2819                                  cv_wait(&hdr->b_cv, hash_lock);
2809 2820                                  mutex_exit(hash_lock);
2810 2821                                  goto top;
2811 2822                          }
2812 2823                          ASSERT(*arc_flags & ARC_NOWAIT);
2813 2824  
2814 2825                          if (done) {
2815 2826                                  arc_callback_t  *acb = NULL;
2816 2827  
2817 2828                                  acb = kmem_zalloc(sizeof (arc_callback_t),
2818 2829                                      KM_SLEEP);
2819 2830                                  acb->acb_done = done;
2820 2831                                  acb->acb_private = private;
2821 2832                                  if (pio != NULL)
2822 2833                                          acb->acb_zio_dummy = zio_null(pio,
2823 2834                                              spa, NULL, NULL, NULL, zio_flags);
2824 2835  
2825 2836                                  ASSERT(acb->acb_done != NULL);
2826 2837                                  acb->acb_next = hdr->b_acb;
2827 2838                                  hdr->b_acb = acb;
2828 2839                                  add_reference(hdr, hash_lock, private);
2829 2840                                  mutex_exit(hash_lock);
2830 2841                                  return (0);
2831 2842                          }
2832 2843                          mutex_exit(hash_lock);
2833 2844                          return (0);
2834 2845                  }
2835 2846  
2836 2847                  ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2837 2848  
2838 2849                  if (done) {
2839 2850                          add_reference(hdr, hash_lock, private);
2840 2851                          /*
2841 2852                           * If this block is already in use, create a new
2842 2853                           * copy of the data so that we will be guaranteed
2843 2854                           * that arc_release() will always succeed.
2844 2855                           */
2845 2856                          buf = hdr->b_buf;
2846 2857                          ASSERT(buf);
2847 2858                          ASSERT(buf->b_data);
2848 2859                          if (HDR_BUF_AVAILABLE(hdr)) {
2849 2860                                  ASSERT(buf->b_efunc == NULL);
2850 2861                                  hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2851 2862                          } else {
2852 2863                                  buf = arc_buf_clone(buf);
2853 2864                          }
2854 2865  
2855 2866                  } else if (*arc_flags & ARC_PREFETCH &&
2856 2867                      refcount_count(&hdr->b_refcnt) == 0) {
2857 2868                          hdr->b_flags |= ARC_PREFETCH;
2858 2869                  }
2859 2870                  DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2860 2871                  arc_access(hdr, hash_lock);
2861 2872                  if (*arc_flags & ARC_L2CACHE)
2862 2873                          hdr->b_flags |= ARC_L2CACHE;
2863 2874                  mutex_exit(hash_lock);
2864 2875                  ARCSTAT_BUMP(arcstat_hits);
2865 2876                  ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2866 2877                      demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2867 2878                      data, metadata, hits);
2868 2879  
2869 2880                  if (done)
2870 2881                          done(NULL, buf, private);
2871 2882          } else {
2872 2883                  uint64_t size = BP_GET_LSIZE(bp);
2873 2884                  arc_callback_t  *acb;
2874 2885                  vdev_t *vd = NULL;
2875 2886                  uint64_t addr = 0;
2876 2887                  boolean_t devw = B_FALSE;
2877 2888  
2878 2889                  if (hdr == NULL) {
2879 2890                          /* this block is not in the cache */
2880 2891                          arc_buf_hdr_t   *exists;
2881 2892                          arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2882 2893                          buf = arc_buf_alloc(spa, size, private, type);
2883 2894                          hdr = buf->b_hdr;
2884 2895                          hdr->b_dva = *BP_IDENTITY(bp);
2885 2896                          hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2886 2897                          hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2887 2898                          exists = buf_hash_insert(hdr, &hash_lock);
2888 2899                          if (exists) {
2889 2900                                  /* somebody beat us to the hash insert */
2890 2901                                  mutex_exit(hash_lock);
2891 2902                                  buf_discard_identity(hdr);
2892 2903                                  (void) arc_buf_remove_ref(buf, private);
2893 2904                                  goto top; /* restart the IO request */
2894 2905                          }
2895 2906                          /* if this is a prefetch, we don't have a reference */
2896 2907                          if (*arc_flags & ARC_PREFETCH) {
2897 2908                                  (void) remove_reference(hdr, hash_lock,
2898 2909                                      private);
2899 2910                                  hdr->b_flags |= ARC_PREFETCH;
2900 2911                          }
2901 2912                          if (*arc_flags & ARC_L2CACHE)
2902 2913                                  hdr->b_flags |= ARC_L2CACHE;
2903 2914                          if (BP_GET_LEVEL(bp) > 0)
2904 2915                                  hdr->b_flags |= ARC_INDIRECT;
2905 2916                  } else {
2906 2917                          /* this block is in the ghost cache */
2907 2918                          ASSERT(GHOST_STATE(hdr->b_state));
2908 2919                          ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2909 2920                          ASSERT0(refcount_count(&hdr->b_refcnt));
2910 2921                          ASSERT(hdr->b_buf == NULL);
2911 2922  
2912 2923                          /* if this is a prefetch, we don't have a reference */
2913 2924                          if (*arc_flags & ARC_PREFETCH)
2914 2925                                  hdr->b_flags |= ARC_PREFETCH;
2915 2926                          else
2916 2927                                  add_reference(hdr, hash_lock, private);
2917 2928                          if (*arc_flags & ARC_L2CACHE)
2918 2929                                  hdr->b_flags |= ARC_L2CACHE;
2919 2930                          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2920 2931                          buf->b_hdr = hdr;
2921 2932                          buf->b_data = NULL;
2922 2933                          buf->b_efunc = NULL;
2923 2934                          buf->b_private = NULL;
2924 2935                          buf->b_next = NULL;
2925 2936                          hdr->b_buf = buf;
2926 2937                          ASSERT(hdr->b_datacnt == 0);
2927 2938                          hdr->b_datacnt = 1;
2928 2939                          arc_get_data_buf(buf);
2929 2940                          arc_access(hdr, hash_lock);
2930 2941                  }
2931 2942  
2932 2943                  ASSERT(!GHOST_STATE(hdr->b_state));
2933 2944  
2934 2945                  acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2935 2946                  acb->acb_done = done;
2936 2947                  acb->acb_private = private;
2937 2948  
2938 2949                  ASSERT(hdr->b_acb == NULL);
2939 2950                  hdr->b_acb = acb;
2940 2951                  hdr->b_flags |= ARC_IO_IN_PROGRESS;
2941 2952  
2942 2953                  if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2943 2954                      (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2944 2955                          devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2945 2956                          addr = hdr->b_l2hdr->b_daddr;

↓ open down ↓

2678 lines elided

↑ open up ↑

2946 2957                          /*
2947 2958                           * Lock out device removal.
2948 2959                           */
2949 2960                          if (vdev_is_dead(vd) ||
2950 2961                              !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2951 2962                                  vd = NULL;
2952 2963                  }
2953 2964  
2954 2965                  mutex_exit(hash_lock);
2955 2966  
     2967 +                /*
     2968 +                 * At this point, we have a level 1 cache miss.  Try again in
     2969 +                 * L2ARC if possible.
     2970 +                 */
2956 2971                  ASSERT3U(hdr->b_size, ==, size);
2957 2972                  DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2958 2973                      uint64_t, size, zbookmark_t *, zb);
2959 2974                  ARCSTAT_BUMP(arcstat_misses);
2960 2975                  ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2961 2976                      demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2962 2977                      data, metadata, misses);
2963 2978  
2964 2979                  if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2965 2980                          /*

2966 2981                           * Read from the L2ARC if the following are true:
2967 2982                           * 1. The L2ARC vdev was previously cached.
2968 2983                           * 2. This buffer still has L2ARC metadata.
2969 2984                           * 3. This buffer isn't currently writing to the L2ARC.
2970 2985                           * 4. The L2ARC entry wasn't evicted, which may
2971 2986                           *    also have invalidated the vdev.
2972 2987                           * 5. This isn't prefetch or l2arc_noprefetch is 0.
2973 2988                           */
2974 2989                          if (hdr->b_l2hdr != NULL &&
2975 2990                              !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2976 2991                              !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2977 2992                                  l2arc_read_callback_t *cb;
2978 2993  
2979 2994                                  DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2980 2995                                  ARCSTAT_BUMP(arcstat_l2_hits);
2981 2996  
2982 2997                                  cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2983 2998                                      KM_SLEEP);
2984 2999                                  cb->l2rcb_buf = buf;
2985 3000                                  cb->l2rcb_spa = spa;
2986 3001                                  cb->l2rcb_bp = *bp;
2987 3002                                  cb->l2rcb_zb = *zb;
2988 3003                                  cb->l2rcb_flags = zio_flags;
2989 3004  
2990 3005                                  ASSERT(addr >= VDEV_LABEL_START_SIZE &&
2991 3006                                      addr + size < vd->vdev_psize -
2992 3007                                      VDEV_LABEL_END_SIZE);
2993 3008  
2994 3009                                  /*
2995 3010                                   * l2arc read.  The SCL_L2ARC lock will be
2996 3011                                   * released by l2arc_read_done().
2997 3012                                   */
2998 3013                                  rzio = zio_read_phys(pio, vd, addr, size,
2999 3014                                      buf->b_data, ZIO_CHECKSUM_OFF,
3000 3015                                      l2arc_read_done, cb, priority, zio_flags |
3001 3016                                      ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3002 3017                                      ZIO_FLAG_DONT_PROPAGATE |
3003 3018                                      ZIO_FLAG_DONT_RETRY, B_FALSE);
3004 3019                                  DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3005 3020                                      zio_t *, rzio);
3006 3021                                  ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3007 3022  
3008 3023                                  if (*arc_flags & ARC_NOWAIT) {
3009 3024                                          zio_nowait(rzio);
3010 3025                                          return (0);
3011 3026                                  }
3012 3027  
3013 3028                                  ASSERT(*arc_flags & ARC_WAIT);
3014 3029                                  if (zio_wait(rzio) == 0)
3015 3030                                          return (0);
3016 3031  
3017 3032                                  /* l2arc read error; fall back to zio_read() */
3018 3033                          } else {
3019 3034                                  DTRACE_PROBE1(l2arc__miss,
3020 3035                                      arc_buf_hdr_t *, hdr);
3021 3036                                  ARCSTAT_BUMP(arcstat_l2_misses);
3022 3037                                  if (HDR_L2_WRITING(hdr))
3023 3038                                          ARCSTAT_BUMP(arcstat_l2_rw_clash);
3024 3039                                  spa_config_exit(spa, SCL_L2ARC, vd);
3025 3040                          }
3026 3041                  } else {
3027 3042                          if (vd != NULL)
3028 3043                                  spa_config_exit(spa, SCL_L2ARC, vd);
3029 3044                          if (l2arc_ndev != 0) {
3030 3045                                  DTRACE_PROBE1(l2arc__miss,
3031 3046                                      arc_buf_hdr_t *, hdr);
3032 3047                                  ARCSTAT_BUMP(arcstat_l2_misses);
3033 3048                          }
3034 3049                  }
3035 3050  
3036 3051                  rzio = zio_read(pio, spa, bp, buf->b_data, size,
3037 3052                      arc_read_done, buf, priority, zio_flags, zb);
3038 3053  
3039 3054                  if (*arc_flags & ARC_WAIT)
3040 3055                          return (zio_wait(rzio));
3041 3056  
3042 3057                  ASSERT(*arc_flags & ARC_NOWAIT);
3043 3058                  zio_nowait(rzio);
3044 3059          }
3045 3060          return (0);
3046 3061  }
3047 3062  
3048 3063  void
3049 3064  arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3050 3065  {
3051 3066          ASSERT(buf->b_hdr != NULL);
3052 3067          ASSERT(buf->b_hdr->b_state != arc_anon);
3053 3068          ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3054 3069          ASSERT(buf->b_efunc == NULL);
3055 3070          ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3056 3071  
3057 3072          buf->b_efunc = func;
3058 3073          buf->b_private = private;
3059 3074  }
3060 3075  
3061 3076  /*
3062 3077   * This is used by the DMU to let the ARC know that a buffer is
3063 3078   * being evicted, so the ARC should clean up.  If this arc buf
3064 3079   * is not yet in the evicted state, it will be put there.
3065 3080   */
3066 3081  int
3067 3082  arc_buf_evict(arc_buf_t *buf)
3068 3083  {
3069 3084          arc_buf_hdr_t *hdr;
3070 3085          kmutex_t *hash_lock;
3071 3086          arc_buf_t **bufp;
3072 3087  
3073 3088          mutex_enter(&buf->b_evict_lock);
3074 3089          hdr = buf->b_hdr;
3075 3090          if (hdr == NULL) {
3076 3091                  /*
3077 3092                   * We are in arc_do_user_evicts().
3078 3093                   */
3079 3094                  ASSERT(buf->b_data == NULL);
3080 3095                  mutex_exit(&buf->b_evict_lock);
3081 3096                  return (0);
3082 3097          } else if (buf->b_data == NULL) {
3083 3098                  arc_buf_t copy = *buf; /* structure assignment */
3084 3099                  /*
3085 3100                   * We are on the eviction list; process this buffer now
3086 3101                   * but let arc_do_user_evicts() do the reaping.
3087 3102                   */
3088 3103                  buf->b_efunc = NULL;
3089 3104                  mutex_exit(&buf->b_evict_lock);
3090 3105                  VERIFY(copy.b_efunc(&copy) == 0);
3091 3106                  return (1);
3092 3107          }
3093 3108          hash_lock = HDR_LOCK(hdr);
3094 3109          mutex_enter(hash_lock);
3095 3110          hdr = buf->b_hdr;
3096 3111          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3097 3112  
3098 3113          ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3099 3114          ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3100 3115  
3101 3116          /*
3102 3117           * Pull this buffer off of the hdr
3103 3118           */
3104 3119          bufp = &hdr->b_buf;
3105 3120          while (*bufp != buf)
3106 3121                  bufp = &(*bufp)->b_next;
3107 3122          *bufp = buf->b_next;
3108 3123  
3109 3124          ASSERT(buf->b_data != NULL);
3110 3125          arc_buf_destroy(buf, FALSE, FALSE);
3111 3126  
3112 3127          if (hdr->b_datacnt == 0) {
3113 3128                  arc_state_t *old_state = hdr->b_state;
3114 3129                  arc_state_t *evicted_state;
3115 3130  
3116 3131                  ASSERT(hdr->b_buf == NULL);
3117 3132                  ASSERT(refcount_is_zero(&hdr->b_refcnt));
3118 3133  
3119 3134                  evicted_state =
3120 3135                      (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3121 3136  
3122 3137                  mutex_enter(&old_state->arcs_mtx);
3123 3138                  mutex_enter(&evicted_state->arcs_mtx);
3124 3139  
3125 3140                  arc_change_state(evicted_state, hdr, hash_lock);
3126 3141                  ASSERT(HDR_IN_HASH_TABLE(hdr));
3127 3142                  hdr->b_flags |= ARC_IN_HASH_TABLE;
3128 3143                  hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3129 3144  
3130 3145                  mutex_exit(&evicted_state->arcs_mtx);
3131 3146                  mutex_exit(&old_state->arcs_mtx);
3132 3147          }
3133 3148          mutex_exit(hash_lock);
3134 3149          mutex_exit(&buf->b_evict_lock);
3135 3150

↓ open down ↓

170 lines elided

↑ open up ↑

3136 3151          VERIFY(buf->b_efunc(buf) == 0);
3137 3152          buf->b_efunc = NULL;
3138 3153          buf->b_private = NULL;
3139 3154          buf->b_hdr = NULL;
3140 3155          buf->b_next = NULL;
3141 3156          kmem_cache_free(buf_cache, buf);
3142 3157          return (1);
3143 3158  }
3144 3159  
3145 3160  /*
3146      - * Release this buffer from the cache.  This must be done
3147      - * after a read and prior to modifying the buffer contents.
     3161 + * Release this buffer from the cache, making it an anonymous buffer.  This
     3162 + * must be done after a read and prior to modifying the buffer contents.
3148 3163   * If the hdr is shared by more than one buf, we must make
3149 3164   * a new hdr for the buffer.
3150 3165   */
3151 3166  void
3152 3167  arc_release(arc_buf_t *buf, void *tag)
3153 3168  {
3154 3169          arc_buf_hdr_t *hdr;
3155 3170          kmutex_t *hash_lock = NULL;
3156 3171          l2arc_buf_hdr_t *l2hdr;
3157 3172          uint64_t buf_size;

3158 3173  
3159 3174          /*
3160 3175           * It would be nice to assert that if it's DMU metadata (level >
3161 3176           * 0 || it's the dnode file), then it must be syncing context.
3162 3177           * But we don't know that information at this level.
3163 3178           */
3164 3179  
3165 3180          mutex_enter(&buf->b_evict_lock);
3166 3181          hdr = buf->b_hdr;
3167 3182  
3168 3183          /* this buffer is not on any list */
3169 3184          ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3170 3185  
3171 3186          if (hdr->b_state == arc_anon) {
3172 3187                  /* this buffer is already released */
3173 3188                  ASSERT(buf->b_efunc == NULL);
3174 3189          } else {
3175 3190                  hash_lock = HDR_LOCK(hdr);
3176 3191                  mutex_enter(hash_lock);
3177 3192                  hdr = buf->b_hdr;
3178 3193                  ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3179 3194          }
3180 3195  
3181 3196          l2hdr = hdr->b_l2hdr;
3182 3197          if (l2hdr) {
3183 3198                  mutex_enter(&l2arc_buflist_mtx);
3184 3199                  hdr->b_l2hdr = NULL;
3185 3200          }
3186 3201          buf_size = hdr->b_size;
3187 3202  
3188 3203          /*
3189 3204           * Do we have more than one buf?
3190 3205           */
3191 3206          if (hdr->b_datacnt > 1) {
3192 3207                  arc_buf_hdr_t *nhdr;
3193 3208                  arc_buf_t **bufp;
3194 3209                  uint64_t blksz = hdr->b_size;
3195 3210                  uint64_t spa = hdr->b_spa;
3196 3211                  arc_buf_contents_t type = hdr->b_type;
3197 3212                  uint32_t flags = hdr->b_flags;
3198 3213  
3199 3214                  ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3200 3215                  /*
3201 3216                   * Pull the data off of this hdr and attach it to
3202 3217                   * a new anonymous hdr.
3203 3218                   */
3204 3219                  (void) remove_reference(hdr, hash_lock, tag);
3205 3220                  bufp = &hdr->b_buf;
3206 3221                  while (*bufp != buf)
3207 3222                          bufp = &(*bufp)->b_next;
3208 3223                  *bufp = buf->b_next;
3209 3224                  buf->b_next = NULL;
3210 3225  
3211 3226                  ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3212 3227                  atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3213 3228                  if (refcount_is_zero(&hdr->b_refcnt)) {
3214 3229                          uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3215 3230                          ASSERT3U(*size, >=, hdr->b_size);
3216 3231                          atomic_add_64(size, -hdr->b_size);
3217 3232                  }
3218 3233  
3219 3234                  /*
3220 3235                   * We're releasing a duplicate user data buffer, update
3221 3236                   * our statistics accordingly.
3222 3237                   */
3223 3238                  if (hdr->b_type == ARC_BUFC_DATA) {
3224 3239                          ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3225 3240                          ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3226 3241                              -hdr->b_size);
3227 3242                  }
3228 3243                  hdr->b_datacnt -= 1;
3229 3244                  arc_cksum_verify(buf);
3230 3245                  arc_buf_unwatch(buf);
3231 3246  
3232 3247                  mutex_exit(hash_lock);
3233 3248  
3234 3249                  nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3235 3250                  nhdr->b_size = blksz;
3236 3251                  nhdr->b_spa = spa;
3237 3252                  nhdr->b_type = type;
3238 3253                  nhdr->b_buf = buf;
3239 3254                  nhdr->b_state = arc_anon;
3240 3255                  nhdr->b_arc_access = 0;
3241 3256                  nhdr->b_flags = flags & ARC_L2_WRITING;
3242 3257                  nhdr->b_l2hdr = NULL;
3243 3258                  nhdr->b_datacnt = 1;
3244 3259                  nhdr->b_freeze_cksum = NULL;
3245 3260                  (void) refcount_add(&nhdr->b_refcnt, tag);
3246 3261                  buf->b_hdr = nhdr;
3247 3262                  mutex_exit(&buf->b_evict_lock);
3248 3263                  atomic_add_64(&arc_anon->arcs_size, blksz);
3249 3264          } else {
3250 3265                  mutex_exit(&buf->b_evict_lock);
3251 3266                  ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3252 3267                  ASSERT(!list_link_active(&hdr->b_arc_node));
3253 3268                  ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3254 3269                  if (hdr->b_state != arc_anon)
3255 3270                          arc_change_state(arc_anon, hdr, hash_lock);
3256 3271                  hdr->b_arc_access = 0;
3257 3272                  if (hash_lock)
3258 3273                          mutex_exit(hash_lock);
3259 3274  
3260 3275                  buf_discard_identity(hdr);
3261 3276                  arc_buf_thaw(buf);
3262 3277          }
3263 3278          buf->b_efunc = NULL;
3264 3279          buf->b_private = NULL;
3265 3280  
3266 3281          if (l2hdr) {
3267 3282                  list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3268 3283                  kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3269 3284                  ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3270 3285                  mutex_exit(&l2arc_buflist_mtx);
3271 3286          }
3272 3287  }
3273 3288  
3274 3289  int
3275 3290  arc_released(arc_buf_t *buf)
3276 3291  {
                   /*
                    * Report whether buf still has data attached (b_data != NULL)
                    * and its header is in the arc_anon state.  Both fields are
                    * sampled while holding buf->b_evict_lock.  Returns nonzero
                    * if both conditions hold, 0 otherwise.
                    */
3277 3292          int released;
3278 3293  
3279 3294          mutex_enter(&buf->b_evict_lock);
3280 3295          released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3281 3296          mutex_exit(&buf->b_evict_lock);
3282 3297          return (released);
3283 3298  }
3284 3299  
3285 3300  int
3286 3301  arc_has_callback(arc_buf_t *buf)
3287 3302  {
                   /*
                    * Report whether buf has an eviction callback registered,
                    * i.e. whether buf->b_efunc is non-NULL.  The field is read
                    * while holding buf->b_evict_lock.  Returns nonzero if a
                    * callback is set, 0 otherwise.
                    */
3288 3303          int callback;
3289 3304  
3290 3305          mutex_enter(&buf->b_evict_lock);
3291 3306          callback = (buf->b_efunc != NULL);
3292 3307          mutex_exit(&buf->b_evict_lock);
3293 3308          return (callback);
3294 3309  }
3295 3310  
3296 3311  #ifdef ZFS_DEBUG
3297 3312  int
3298 3313  arc_referenced(arc_buf_t *buf)
3299 3314  {
                   /*
                    * Return the current reference count of buf's header
                    * (refcount_count() of b_hdr->b_refcnt), read while holding
                    * buf->b_evict_lock.  The value is narrowed to int.  This
                    * function is only compiled when ZFS_DEBUG is defined.
                    */
3300 3315          int referenced;
3301 3316  
3302 3317          mutex_enter(&buf->b_evict_lock);
3303 3318          referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3304 3319          mutex_exit(&buf->b_evict_lock);
3305 3320          return (referenced);
3306 3321  }
3307 3322  #endif
3308 3323  
3309 3324  static void
3310 3325  arc_write_ready(zio_t *zio)
3311 3326  {
                   /*
                    * "Ready" callback that arc_write() hands to zio_write().
                    * zio->io_private is the arc_write_callback_t allocated by
                    * arc_write().  This forwards to the caller-supplied
                    * awcb_ready hook, then (re)computes the buffer checksum and
                    * marks the header ARC_IO_IN_PROGRESS.
                    */
3312 3327          arc_write_callback_t *callback = zio->io_private;
3313 3328          arc_buf_t *buf = callback->awcb_buf;
3314 3329          arc_buf_hdr_t *hdr = buf->b_hdr;
3315 3330  
3316 3331          ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3317 3332          callback->awcb_ready(zio, buf, callback->awcb_private);
3318 3333  
3319 3334          /*
3320 3335           * If the IO is already in progress, then this is a re-write
3321 3336           * attempt, so we need to thaw and re-compute the cksum.
3322 3337           * It is the responsibility of the callback to handle the
3323 3338           * accounting for any re-write attempt.
3324 3339           */
3325 3340          if (HDR_IO_IN_PROGRESS(hdr)) {
                           /* Drop any saved checksum, under b_freeze_lock. */
3326 3341                  mutex_enter(&hdr->b_freeze_lock);
3327 3342                  if (hdr->b_freeze_cksum != NULL) {
3328 3343                          kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3329 3344                          hdr->b_freeze_cksum = NULL;
3330 3345                  }
3331 3346                  mutex_exit(&hdr->b_freeze_lock);
3332 3347          }
3333 3348          arc_cksum_compute(buf, B_FALSE);
3334 3349          hdr->b_flags |= ARC_IO_IN_PROGRESS;
3335 3350  }
3336 3351  
3337 3352  static void
3338 3353  arc_write_done(zio_t *zio)
3339 3354  {
                   /*
                    * "Done" callback that arc_write() hands to zio_write().
                    * On success the block's identity (DVA, physical birth and
                    * first checksum word) is copied from zio->io_bp into the
                    * header.  A header that ends up with an identity is inserted
                    * into the buffer hash table, resolving any collision with an
                    * existing header.  ARC_IO_IN_PROGRESS is then cleared, the
                    * caller's awcb_done hook is invoked, and the callback record
                    * allocated by arc_write() is freed.
                    */
3340 3355          arc_write_callback_t *callback = zio->io_private;
3341 3356          arc_buf_t *buf = callback->awcb_buf;
3342 3357          arc_buf_hdr_t *hdr = buf->b_hdr;
3343 3358  
3344 3359          ASSERT(hdr->b_acb == NULL);
3345 3360  
3346 3361          if (zio->io_error == 0) {
3347 3362                  hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3348 3363                  hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3349 3364                  hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3350 3365          } else {
3351 3366                  ASSERT(BUF_EMPTY(hdr));
3352 3367          }
3353 3368  
3354 3369          /*
3355 3370           * If the block to be written was all-zero, we may have
3356 3371           * compressed it away.  In this case no write was performed
3357 3372           * so there will be no dva/birth/checksum.  The buffer must
3358 3373           * therefore remain anonymous (and uncached).
3359 3374           */
3360 3375          if (!BUF_EMPTY(hdr)) {
3361 3376                  arc_buf_hdr_t *exists;
3362 3377                  kmutex_t *hash_lock;
3363 3378  
3364 3379                  ASSERT(zio->io_error == 0);
3365 3380  
3366 3381                  arc_cksum_verify(buf);
3367 3382  
                           /* hash_lock is set here and released below. */
3368 3383                  exists = buf_hash_insert(hdr, &hash_lock);
3369 3384                  if (exists) {
3370 3385                          /*
3371 3386                           * This can only happen if we overwrite for
3372 3387                           * sync-to-convergence, because we remove
3373 3388                           * buffers from the hash table when we arc_free().
3374 3389                           */
3375 3390                          if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3376 3391                                  if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3377 3392                                          panic("bad overwrite, hdr=%p exists=%p",
3378 3393                                              (void *)hdr, (void *)exists);
3379 3394                                  ASSERT(refcount_is_zero(&exists->b_refcnt));
                                           /*
                                            * Destroy the old header, then
                                            * retry the insert, which must
                                            * now succeed.
                                            */
3380 3395                                  arc_change_state(arc_anon, exists, hash_lock);
3381 3396                                  mutex_exit(hash_lock);
3382 3397                                  arc_hdr_destroy(exists);
3383 3398                                  exists = buf_hash_insert(hdr, &hash_lock);
3384 3399                                  ASSERT3P(exists, ==, NULL);
3385 3400                          } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3386 3401                                  /* nopwrite */
3387 3402                                  ASSERT(zio->io_prop.zp_nopwrite);
3388 3403                                  if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3389 3404                                          panic("bad nopwrite, hdr=%p exists=%p",
3390 3405                                              (void *)hdr, (void *)exists);
3391 3406                          } else {
3392 3407                                  /* Dedup */
3393 3408                                  ASSERT(hdr->b_datacnt == 1);
3394 3409                                  ASSERT(hdr->b_state == arc_anon);
3395 3410                                  ASSERT(BP_GET_DEDUP(zio->io_bp));
3396 3411                                  ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3397 3412                          }
3398 3413                  }
3399 3414                  hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3400 3415                  /* if it's not anon, we are doing a scrub */
3401 3416                  if (!exists && hdr->b_state == arc_anon)
3402 3417                          arc_access(hdr, hash_lock);
3403 3418                  mutex_exit(hash_lock);
3404 3419          } else {
3405 3420                  hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3406 3421          }
3407 3422  
3408 3423          ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3409 3424          callback->awcb_done(zio, buf, callback->awcb_private);
3410 3425  
3411 3426          kmem_free(callback, sizeof (arc_write_callback_t));
3412 3427  }
3413 3428  
3414 3429  zio_t *
3415 3430  arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3416 3431      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3417 3432      arc_done_func_t *ready, arc_done_func_t *done, void *private,
3418 3433      int priority, int zio_flags, const zbookmark_t *zb)
3419 3434  {
                   /*
                    * Create a write zio for buf's data (hdr->b_size bytes).
                    *
                    * ready and done must both be non-NULL; they are invoked,
                    * with private, from arc_write_ready() and arc_write_done()
                    * respectively.  If l2arc is set, the header is flagged
                    * ARC_L2CACHE.  The header must not have an I/O in progress,
                    * a recorded I/O error, or a pending b_acb.
                    *
                    * Returns the zio produced by zio_write(); this function
                    * does not itself issue or wait on it.  The callback record
                    * allocated here is freed in arc_write_done().
                    */
3420 3435          arc_buf_hdr_t *hdr = buf->b_hdr;
3421 3436          arc_write_callback_t *callback;
3422 3437          zio_t *zio;
3423 3438  
3424 3439          ASSERT(ready != NULL);
3425 3440          ASSERT(done != NULL);
3426 3441          ASSERT(!HDR_IO_ERROR(hdr));
3427 3442          ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3428 3443          ASSERT(hdr->b_acb == NULL);
3429 3444          if (l2arc)
3430 3445                  hdr->b_flags |= ARC_L2CACHE;
3431 3446          callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3432 3447          callback->awcb_ready = ready;
3433 3448          callback->awcb_done = done;
3434 3449          callback->awcb_private = private;
3435 3450          callback->awcb_buf = buf;
3436 3451  
3437 3452          zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3438 3453              arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3439 3454  
3440 3455          return (zio);
3441 3456  }
3442 3457  
3443 3458  static int
3444 3459  arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3445 3460  {
                   /*
                    * Decide whether a write reservation should be throttled
                    * because free memory is low.
                    *
                    * reserve        size of the reservation being requested
                    * inflight_data  dirty data already in flight (the caller
                    *                visible here passes the anon size)
                    * txg            transaction group of the request
                    *
                    * Returns 0 to proceed, or ERESTART / EAGAIN to throttle.
                    * Outside of _KERNEL builds this always returns 0.
                    *
                    * NOTE(review): page_load and last_txg are function-static
                    * and are updated with no lock visible in this function.
                    */
3446 3461  #ifdef _KERNEL
3447 3462          uint64_t available_memory = ptob(freemem);
3448 3463          static uint64_t page_load = 0;
3449 3464          static uint64_t last_txg = 0;
3450 3465  
3451 3466  #if defined(__i386)
3452 3467          available_memory =
3453 3468              MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3454 3469  #endif
                   /* Plenty of free memory: no throttling needed. */
3455 3470          if (available_memory >= zfs_write_limit_max)
3456 3471                  return (0);
3457 3472  
                   /* A newer txg restarts the pageout load accounting. */
3458 3473          if (txg > last_txg) {
3459 3474                  last_txg = txg;
3460 3475                  page_load = 0;
3461 3476          }
3462 3477          /*
3463 3478           * If we are in pageout, we know that memory is already tight,
3464 3479           * the arc is already going to be evicting, so we just want to
3465 3480           * continue to let page writes occur as quickly as possible.
3466 3481           */
3467 3482          if (curproc == proc_pageout) {
3468 3483                  if (page_load > MAX(ptob(minfree), available_memory) / 4)
3469 3484                          return (SET_ERROR(ERESTART));
3470 3485                  /* Note: reserve is inflated, so we deflate */
3471 3486                  page_load += reserve / 8;
3472 3487                  return (0);
3473 3488          } else if (page_load > 0 && arc_reclaim_needed()) {
3474 3489                  /* memory is low, delay before restarting */
3475 3490                  ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3476 3491                  return (SET_ERROR(EAGAIN));
3477 3492          }
3478 3493          page_load = 0;
3479 3494  
                   /*
                    * Count evictable MRU/MFU data and metadata as available,
                    * capped by how far arc_size is above arc_c_min.
                    */
3480 3495          if (arc_size > arc_c_min) {
3481 3496                  uint64_t evictable_memory =
3482 3497                      arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3483 3498                      arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3484 3499                      arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3485 3500                      arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3486 3501                  available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3487 3502          }
3488 3503  
                   /* Throttle once in-flight data exceeds 1/4 of that total. */
3489 3504          if (inflight_data > available_memory / 4) {
3490 3505                  ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3491 3506                  return (SET_ERROR(ERESTART));
3492 3507          }
3493 3508  #endif
3494 3509          return (0);
3495 3510  }
3496 3511  
3497 3512  void
3498 3513  arc_tempreserve_clear(uint64_t reserve)
3499 3514  {
                   /*
                    * Release reserve bytes from arc_tempreserve by atomically
                    * adding -reserve, undoing the addition made by a successful
                    * arc_tempreserve_space().  The assertion catches a release
                    * that drives the counter negative.
                    */
3500 3515          atomic_add_64(&arc_tempreserve, -reserve);
3501 3516          ASSERT((int64_t)arc_tempreserve >= 0);
3502 3517  }
3503 3518  
3504 3519  int
3505 3520  arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3506 3521  {
                   /*
                    * Try to reserve `reserve` bytes of ARC space for txg.
                    *
                    * Returns 0 on success, after adding reserve to
                    * arc_tempreserve.  Otherwise returns:
                    *   ENOMEM    reserve exceeds arc_c (after arc_c has been
                    *             grown towards arc_c_max where permitted)
                    *   ERESTART  dirty data is over the limits checked below,
                    *             or (ZFS_DEBUG only) a random injected failure
                    *   other     whatever arc_memory_throttle() returned
                    */
3507 3522          int error;
3508 3523          uint64_t anon_size;
3509 3524  
3510 3525  #ifdef ZFS_DEBUG
3511 3526          /*
3512 3527           * Once in a while, fail for no reason.  Everything should cope.
3513 3528           */
3514 3529          if (spa_get_random(10000) == 0) {
3515 3530                  dprintf("forcing random failure\n");
3516 3531                  return (SET_ERROR(ERESTART));
3517 3532          }
3518 3533  #endif
                   /* Grow the target size for a large reservation if allowed. */
3519 3534          if (reserve > arc_c/4 && !arc_no_grow)
3520 3535                  arc_c = MIN(arc_c_max, reserve * 4);
3521 3536          if (reserve > arc_c)
3522 3537                  return (SET_ERROR(ENOMEM));
3523 3538  
3524 3539          /*
3525 3540           * Don't count loaned bufs as in flight dirty data to prevent long
3526 3541           * network delays from blocking transactions that are ready to be
3527 3542           * assigned to a txg.
3528 3543           */
3529 3544          anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3530 3545  
3531 3546          /*
3532 3547           * Writes will, almost always, require additional memory allocations
3533 3548           * in order to compress/encrypt/etc the data.  We therefore need to
3534 3549           * make sure that there is sufficient available memory for this.
3535 3550           */
3536 3551          if (error = arc_memory_throttle(reserve, anon_size, txg))
3537 3552                  return (error);
3538 3553  
3539 3554          /*
3540 3555           * Throttle writes when the amount of dirty data in the cache
3541 3556           * gets too large.  We try to keep the cache less than half full
3542 3557           * of dirty blocks so that our sync times don't grow too large.
3543 3558           * Note: if two requests come in concurrently, we might let them
3544 3559           * both succeed, when one of them should fail.  Not a huge deal.
3545 3560           */
3546 3561  
3547 3562          if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3548 3563              anon_size > arc_c / 4) {
3549 3564                  dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3550 3565                      "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3551 3566                      arc_tempreserve>>10,
3552 3567                      arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3553 3568                      arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3554 3569                      reserve>>10, arc_c>>10);
3555 3570                  return (SET_ERROR(ERESTART));
3556 3571          }
3557 3572          atomic_add_64(&arc_tempreserve, reserve);
3558 3573          return (0);
3559 3574  }
3560 3575  
3561 3576  void
3562 3577  arc_init(void)
3563 3578  {
                   /*
                    * One-time ARC initialization.  In order, this:
                    *  - sets up the reclaim thread's lock and condvar,
                    *  - computes arc_c, arc_c_min, arc_c_max, arc_p and
                    *    arc_meta_limit from physical memory, then applies the
                    *    zfs_arc_* tunables where they pass the checks below,
                    *  - binds the arc_* state pointers to their static state
                    *    structures and initializes each state's mutex and lists,
                    *  - initializes the buffer hash via buf_init(),
                    *  - creates and installs the "arcstats" kstat,
                    *  - starts arc_reclaim_thread, and
                    *  - derives zfs_write_limit_max when it was left at 0.
                    */
3564 3579          mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3565 3580          cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3566 3581  
3567 3582          /* Convert seconds to clock ticks */
3568 3583          arc_min_prefetch_lifespan = 1 * hz;
3569 3584  
3570 3585          /* Start out with 1/8 of all memory */
3571 3586          arc_c = physmem * PAGESIZE / 8;
3572 3587  
3573 3588  #ifdef _KERNEL
3574 3589          /*
3575 3590           * On architectures where the physical memory can be larger
3576 3591           * than the addressable space (intel in 32-bit mode), we may
3577 3592           * need to limit the cache to 1/8 of VM size.
3578 3593           */
3579 3594          arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3580 3595  #endif
3581 3596  
3582 3597          /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3583 3598          arc_c_min = MAX(arc_c / 4, 64<<20);
3584 3599          /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3585 3600          if (arc_c * 8 >= 1<<30)
3586 3601                  arc_c_max = (arc_c * 8) - (1<<30);
3587 3602          else
3588 3603                  arc_c_max = arc_c_min;
3589 3604          arc_c_max = MAX(arc_c * 6, arc_c_max);
3590 3605  
3591 3606          /*
3592 3607           * Allow the tunables to override our calculations if they are
3593 3608           * reasonable (ie. over 64MB)
3594 3609           */
3595 3610          if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3596 3611                  arc_c_max = zfs_arc_max;
3597 3612          if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3598 3613                  arc_c_min = zfs_arc_min;
3599 3614  
                   /* Start with the target at its maximum, split evenly by arc_p. */
3600 3615          arc_c = arc_c_max;
3601 3616          arc_p = (arc_c >> 1);
3602 3617  
3603 3618          /* limit meta-data to 1/4 of the arc capacity */
3604 3619          arc_meta_limit = arc_c_max / 4;
3605 3620  
3606 3621          /* Allow the tunable to override if it is reasonable */
3607 3622          if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3608 3623                  arc_meta_limit = zfs_arc_meta_limit;
3609 3624  
                   /*
                    * Unless zfs_arc_min was set, raise the minimum to at least
                    * half of the metadata limit.
                    */
3610 3625          if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3611 3626                  arc_c_min = arc_meta_limit / 2;
3612 3627  
3613 3628          if (zfs_arc_grow_retry > 0)
3614 3629                  arc_grow_retry = zfs_arc_grow_retry;
3615 3630  
3616 3631          if (zfs_arc_shrink_shift > 0)
3617 3632                  arc_shrink_shift = zfs_arc_shrink_shift;
3618 3633  
3619 3634          if (zfs_arc_p_min_shift > 0)
3620 3635                  arc_p_min_shift = zfs_arc_p_min_shift;
3621 3636  
3622 3637          /* if kmem_flags are set, let's try to use less memory */
3623 3638          if (kmem_debugging())
3624 3639                  arc_c = arc_c / 2;
3625 3640          if (arc_c < arc_c_min)
3626 3641                  arc_c = arc_c_min;
3627 3642  
3628 3643          arc_anon = &ARC_anon;
3629 3644          arc_mru = &ARC_mru;
3630 3645          arc_mru_ghost = &ARC_mru_ghost;
3631 3646          arc_mfu = &ARC_mfu;
3632 3647          arc_mfu_ghost = &ARC_mfu_ghost;
3633 3648          arc_l2c_only = &ARC_l2c_only;
3634 3649          arc_size = 0;
3635 3650  
3636 3651          mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3637 3652          mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3638 3653          mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3639 3654          mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3640 3655          mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3641 3656          mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3642 3657  
                   /*
                    * Each state other than arc_anon gets a metadata list and a
                    * data list of headers, linked through b_arc_node.
                    */
3643 3658          list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3644 3659              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3645 3660          list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3646 3661              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3647 3662          list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3648 3663              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3649 3664          list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3650 3665              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3651 3666          list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3652 3667              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3653 3668          list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3654 3669              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3655 3670          list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3656 3671              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3657 3672          list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3658 3673              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3659 3674          list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3660 3675              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3661 3676          list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3662 3677              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3663 3678  
3664 3679          buf_init();
3665 3680  
3666 3681          arc_thread_exit = 0;
3667 3682          arc_eviction_list = NULL;
3668 3683          mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3669 3684          bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3670 3685  
3671 3686          arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3672 3687              sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3673 3688  
                   /* kstat creation may fail; the ARC runs without it then. */
3674 3689          if (arc_ksp != NULL) {
3675 3690                  arc_ksp->ks_data = &arc_stats;
3676 3691                  kstat_install(arc_ksp);
3677 3692          }
3678 3693  
3679 3694          (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3680 3695              TS_RUN, minclsyspri);
3681 3696  
3682 3697          arc_dead = FALSE;
3683 3698          arc_warm = B_FALSE;
3684 3699  
                   /*
                    * If no explicit write limit was given, derive one from
                    * physical memory; otherwise zero the shift.
                    */
3685 3700          if (zfs_write_limit_max == 0)
3686 3701                  zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3687 3702          else
3688 3703                  zfs_write_limit_shift = 0;
3689 3704          mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3690 3705  }
3691 3706  
3692 3707  void
3693 3708  arc_fini(void)
3694 3709  {
                   /*
                    * Tear down the ARC: request that the reclaim thread exit and
                    * wait for the acknowledgement, flush the cache, remove the
                    * kstat, then destroy the locks, condvar and state lists set
                    * up by arc_init() before calling buf_fini().
                    *
                    * NOTE(review): arc_init() creates lists for arc_l2c_only,
                    * but no list_destroy() for them appears here; confirm
                    * whether that is intentional.
                    */
3695 3710          mutex_enter(&arc_reclaim_thr_lock);
                   /*
                    * Presumably the reclaim thread resets arc_thread_exit to 0
                    * and signals the cv on its way out; verify in
                    * arc_reclaim_thread().
                    */
3696 3711          arc_thread_exit = 1;
3697 3712          while (arc_thread_exit != 0)
3698 3713                  cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3699 3714          mutex_exit(&arc_reclaim_thr_lock);
3700 3715  
3701 3716          arc_flush(NULL);
3702 3717  
3703 3718          arc_dead = TRUE;
3704 3719  
3705 3720          if (arc_ksp != NULL) {
3706 3721                  kstat_delete(arc_ksp);
3707 3722                  arc_ksp = NULL;
3708 3723          }
3709 3724  
3710 3725          mutex_destroy(&arc_eviction_mtx);
3711 3726          mutex_destroy(&arc_reclaim_thr_lock);
3712 3727          cv_destroy(&arc_reclaim_thr_cv);
3713 3728  
3714 3729          list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3715 3730          list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3716 3731          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3717 3732          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3718 3733          list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3719 3734          list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3720 3735          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3721 3736          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3722 3737  
3723 3738          mutex_destroy(&arc_anon->arcs_mtx);
3724 3739          mutex_destroy(&arc_mru->arcs_mtx);
3725 3740          mutex_destroy(&arc_mru_ghost->arcs_mtx);
3726 3741          mutex_destroy(&arc_mfu->arcs_mtx);
3727 3742          mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3728 3743          mutex_destroy(&arc_l2c_only->arcs_mtx);
3729 3744  
3730 3745          mutex_destroy(&zfs_write_limit_lock);
3731 3746  
3732 3747          buf_fini();
3733 3748  
                   /* Every loaned buffer must have been returned by now. */
3734 3749          ASSERT(arc_loaned_bytes == 0);
3735 3750  }
3736 3751  
3737 3752  /*
3738 3753   * Level 2 ARC
3739 3754   *
3740 3755   * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3741 3756   * It uses dedicated storage devices to hold cached data, which are populated
3742 3757   * using large infrequent writes.  The main role of this cache is to boost
3743 3758   * the performance of random read workloads.  The intended L2ARC devices
3744 3759   * include short-stroked disks, solid state disks, and other media with
3745 3760   * substantially faster read latency than disk.
3746 3761   *
3747 3762   *                 +-----------------------+
3748 3763   *                 |         ARC           |
3749 3764   *                 +-----------------------+
3750 3765   *                    |         ^     ^
3751 3766   *                    |         |     |
3752 3767   *      l2arc_feed_thread()    arc_read()
3753 3768   *                    |         |     |
3754 3769   *                    |  l2arc read   |
3755 3770   *                    V         |     |
3756 3771   *               +---------------+    |
3757 3772   *               |     L2ARC     |    |
3758 3773   *               +---------------+    |
3759 3774   *                   |    ^           |
3760 3775   *          l2arc_write() |           |
3761 3776   *                   |    |           |
3762 3777   *                   V    |           |
3763 3778   *                 +-------+      +-------+
3764 3779   *                 | vdev  |      | vdev  |
3765 3780   *                 | cache |      | cache |
3766 3781   *                 +-------+      +-------+
3767 3782   *                 +=========+     .-----.
3768 3783   *                 :  L2ARC  :    |-_____-|
3769 3784   *                 : devices :    | Disks |
3770 3785   *                 +=========+    `-_____-'
3771 3786   *
3772 3787   * Read requests are satisfied from the following sources, in order:
3773 3788   *
3774 3789   *      1) ARC
3775 3790   *      2) vdev cache of L2ARC devices
3776 3791   *      3) L2ARC devices
3777 3792   *      4) vdev cache of disks
3778 3793   *      5) disks
3779 3794   *
3780 3795   * Some L2ARC device types exhibit extremely slow write performance.
3781 3796   * To accommodate this, there are some significant differences between
3782 3797   * the L2ARC and traditional cache design:
3783 3798   *
3784 3799   * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3785 3800   * the ARC behave as usual, freeing buffers and placing headers on ghost
3786 3801   * lists.  The ARC does not send buffers to the L2ARC during eviction as
3787 3802   * this would add inflated write latencies for all ARC memory pressure.
3788 3803   *
3789 3804   * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3790 3805   * It does this by periodically scanning buffers from the eviction-end of
3791 3806   * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3792 3807   * not already there.  It scans until a headroom of buffers is satisfied,
3793 3808   * which itself is a buffer for ARC eviction.  The thread that does this is
3794 3809   * l2arc_feed_thread(), illustrated below; example sizes are included to
3795 3810   * provide a better sense of ratio than this diagram:
3796 3811   *
3797 3812   *             head -->                        tail
3798 3813   *              +---------------------+----------+
3799 3814   *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3800 3815   *              +---------------------+----------+   |   o L2ARC eligible
3801 3816   *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3802 3817   *              +---------------------+----------+   |
3803 3818   *                   15.9 Gbytes      ^ 32 Mbytes    |
3804 3819   *                                 headroom          |
3805 3820   *                                            l2arc_feed_thread()
3806 3821   *                                                   |
3807 3822   *                       l2arc write hand <--[oooo]--'
3808 3823   *                               |           8 Mbyte
3809 3824   *                               |          write max
3810 3825   *                               V
3811 3826   *                +==============================+
3812 3827   *      L2ARC dev |####|#|###|###|    |####| ... |
3813 3828   *                +==============================+
3814 3829   *                           32 Gbytes
3815 3830   *
3816 3831   * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3817 3832   * evicted, then the L2ARC has cached a buffer much sooner than it probably
3818 3833   * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3819 3834   * safe to say that this is an uncommon case, since buffers at the end of
3820 3835   * the ARC lists have moved there due to inactivity.
3821 3836   *
3822 3837   * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3823 3838   * then the L2ARC simply misses copying some buffers.  This serves as a
3824 3839   * pressure valve to prevent heavy read workloads from both stalling the ARC
3825 3840   * with waits and clogging the L2ARC with writes.  This also helps prevent
3826 3841   * the potential for the L2ARC to churn if it attempts to cache content too
3827 3842   * quickly, such as during backups of the entire pool.
3828 3843   *
3829 3844   * 5. After system boot and before the ARC has filled main memory, there are
3830 3845   * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3831 3846   * lists can remain mostly static.  Instead of searching from the tail of these
3832 3847   * lists as pictured, the l2arc_feed_thread() will search from the list heads
3833 3848   * for eligible buffers, greatly increasing its chance of finding them.
3834 3849   *
3835 3850   * The L2ARC device write speed is also boosted during this time so that
3836 3851   * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3837 3852   * there are no L2ARC reads, and no fear of degrading read performance
3838 3853   * through increased writes.
3839 3854   *
3840 3855   * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3841 3856   * the vdev queue can aggregate them into larger and fewer writes.  Each
3842 3857   * device is written to in a rotor fashion, sweeping writes through
3843 3858   * available space then repeating.
3844 3859   *
3845 3860   * 7. The L2ARC does not store dirty content.  It never needs to flush
3846 3861   * write buffers back to disk based storage.
3847 3862   *
3848 3863   * 8. If an ARC buffer is written (and dirtied) which also exists in the
3849 3864   * L2ARC, the now stale L2ARC buffer is immediately dropped.
3850 3865   *
3851 3866   * The performance of the L2ARC can be tweaked by a number of tunables, which
3852 3867   * may be necessary for different workloads:
3853 3868   *
3854 3869   *      l2arc_write_max         max write bytes per interval
3855 3870   *      l2arc_write_boost       extra write bytes during device warmup
3856 3871   *      l2arc_noprefetch        skip caching prefetched buffers
3857 3872   *      l2arc_headroom          number of max device writes to precache
3858 3873   *      l2arc_feed_secs         seconds between L2ARC writing
3859 3874   *
3860 3875   * Tunables may be removed or added as future performance improvements are
3861 3876   * integrated, and also may become zpool properties.
3862 3877   *
3863 3878   * There are three key functions that control how the L2ARC warms up:
3864 3879   *
3865 3880   *      l2arc_write_eligible()  check if a buffer is eligible to cache
3866 3881   *      l2arc_write_size()      calculate how much to write
3867 3882   *      l2arc_write_interval()  calculate sleep delay between writes
3868 3883   *
3869 3884   * These three functions determine what to write, how much, and how quickly
3870 3885   * to send writes.
3871 3886   */
3872 3887  
3873 3888  static boolean_t
3874 3889  l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3875 3890  {
                   /*
                    * Decide whether header ab may be written to the L2ARC on
                    * behalf of the pool identified by spa_guid.  Returns B_TRUE
                    * if eligible and B_FALSE if any of the exclusions listed
                    * below applies.
                    */
3876 3891          /*
3877 3892           * A buffer is *not* eligible for the L2ARC if it:
3878 3893           * 1. belongs to a different spa.
3879 3894           * 2. is already cached on the L2ARC.
3880 3895           * 3. has an I/O in progress (it may be an incomplete read).
3881 3896           * 4. is flagged not eligible (zfs property).
3882 3897           */
3883 3898          if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3884 3899              HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3885 3900                  return (B_FALSE);
3886 3901  
3887 3902          return (B_TRUE);
3888 3903  }
3889 3904  
3890 3905  static uint64_t
3891 3906  l2arc_write_size(l2arc_dev_t *dev)
3892 3907  {
                   /*
                    * Return how much to write to dev in one pass: the device's
                    * l2ad_write amount, plus its l2ad_boost while the ARC is not
                    * yet warm (arc_warm == B_FALSE).
                    */
3893 3908          uint64_t size;
3894 3909  
3895 3910          size = dev->l2ad_write;
3896 3911  
3897 3912          if (arc_warm == B_FALSE)
3898 3913                  size += dev->l2ad_boost;
3899 3914  
3900 3915          return (size);
3901 3916  
3902 3917  }
3903 3918  
3904 3919  static clock_t
3905 3920  l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3906 3921  {
                   /*
                    * Compute when the next L2ARC write pass should start.
                    *
                    * began   lbolt value at which the previous pass started
                    * wanted  how much the previous pass tried to write
                    * wrote   how much it actually wrote
                    *
                    * Returns an lbolt tick value, never earlier than now.
                    */
3907 3922          clock_t interval, next, now;
3908 3923  
3909 3924          /*
3910 3925           * If the ARC lists are busy, increase our write rate; if the
3911 3926           * lists are stale, idle back.  This is achieved by checking
3912 3927           * how much we previously wrote - if it was more than half of
3913 3928           * what we wanted, schedule the next write much sooner.
3914 3929           */
3915 3930          if (l2arc_feed_again && wrote > (wanted / 2))
3916 3931                  interval = (hz * l2arc_feed_min_ms) / 1000;
3917 3932          else
3918 3933                  interval = hz * l2arc_feed_secs;
3919 3934  
                   /*
                    * Time the interval from whichever of `began` and `now` is
                    * earlier, but never return a tick in the past.
                    */
3920 3935          now = ddi_get_lbolt();
3921 3936          next = MAX(now, MIN(now + interval, began + interval));
3922 3937  
3923 3938          return (next);
3924 3939  }
3925 3940  
3926 3941  static void
3927 3942  l2arc_hdr_stat_add(void)
3928 3943  {
3929 3944          ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3930 3945          ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3931 3946  }
3932 3947  
3933 3948  static void
3934 3949  l2arc_hdr_stat_remove(void)
3935 3950  {
3936 3951          ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3937 3952          ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3938 3953  }
3939 3954  
3940 3955  /*
3941 3956   * Cycle through L2ARC devices.  This is how L2ARC load balances.
3942 3957   * If a device is returned, this also returns holding the spa config lock.
3943 3958   */
3944 3959  static l2arc_dev_t *
3945 3960  l2arc_dev_get_next(void)
3946 3961  {
3947 3962          l2arc_dev_t *first, *next = NULL;
3948 3963  
3949 3964          /*
3950 3965           * Lock out the removal of spas (spa_namespace_lock), then removal
3951 3966           * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3952 3967           * both locks will be dropped and a spa config lock held instead.
3953 3968           */
3954 3969          mutex_enter(&spa_namespace_lock);
3955 3970          mutex_enter(&l2arc_dev_mtx);
3956 3971  
3957 3972          /* if there are no vdevs, there is nothing to do */
3958 3973          if (l2arc_ndev == 0)
3959 3974                  goto out;
3960 3975  
3961 3976          first = NULL;
3962 3977          next = l2arc_dev_last;
3963 3978          do {
3964 3979                  /* loop around the list looking for a non-faulted vdev */
3965 3980                  if (next == NULL) {
3966 3981                          next = list_head(l2arc_dev_list);
3967 3982                  } else {
3968 3983                          next = list_next(l2arc_dev_list, next);
3969 3984                          if (next == NULL)
3970 3985                                  next = list_head(l2arc_dev_list);
3971 3986                  }
3972 3987  
3973 3988                  /* if we have come back to the start, bail out */
3974 3989                  if (first == NULL)
3975 3990                          first = next;
3976 3991                  else if (next == first)
3977 3992                          break;
3978 3993  
3979 3994          } while (vdev_is_dead(next->l2ad_vdev));
3980 3995  
3981 3996          /* if we were unable to find any usable vdevs, return NULL */
3982 3997          if (vdev_is_dead(next->l2ad_vdev))
3983 3998                  next = NULL;
3984 3999  
3985 4000          l2arc_dev_last = next;
3986 4001  
3987 4002  out:
3988 4003          mutex_exit(&l2arc_dev_mtx);
3989 4004  
3990 4005          /*
3991 4006           * Grab the config lock to prevent the 'next' device from being
3992 4007           * removed while we are writing to it.
3993 4008           */
3994 4009          if (next != NULL)
3995 4010                  spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3996 4011          mutex_exit(&spa_namespace_lock);
3997 4012  
3998 4013          return (next);
3999 4014  }
4000 4015  
4001 4016  /*
4002 4017   * Free buffers that were tagged for destruction.
4003 4018   */
4004 4019  static void
4005 4020  l2arc_do_free_on_write()
4006 4021  {
4007 4022          list_t *buflist;
4008 4023          l2arc_data_free_t *df, *df_prev;
4009 4024  
4010 4025          mutex_enter(&l2arc_free_on_write_mtx);
4011 4026          buflist = l2arc_free_on_write;
4012 4027  
4013 4028          for (df = list_tail(buflist); df; df = df_prev) {
4014 4029                  df_prev = list_prev(buflist, df);
4015 4030                  ASSERT(df->l2df_data != NULL);
4016 4031                  ASSERT(df->l2df_func != NULL);
4017 4032                  df->l2df_func(df->l2df_data, df->l2df_size);
4018 4033                  list_remove(buflist, df);
4019 4034                  kmem_free(df, sizeof (l2arc_data_free_t));
4020 4035          }
4021 4036  
4022 4037          mutex_exit(&l2arc_free_on_write_mtx);
4023 4038  }
4024 4039  
/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 *
 * This is the done callback that l2arc_write_buffers() attaches to its
 * root zio.  zio->io_private is the l2arc_write_callback_t naming the
 * device and the dummy "head" marker header that was put on the device
 * buflist.  Both the marker and the callback structure are freed here.
 */
static void
l2arc_write_done(zio_t *zio)
{
        l2arc_write_callback_t *cb;
        l2arc_dev_t *dev;
        list_t *buflist;
        arc_buf_hdr_t *head, *ab, *ab_prev;
        l2arc_buf_hdr_t *abl2;
        kmutex_t *hash_lock;

        cb = zio->io_private;
        ASSERT(cb != NULL);
        dev = cb->l2wcb_dev;
        ASSERT(dev != NULL);
        head = cb->l2wcb_head;
        ASSERT(head != NULL);
        buflist = dev->l2ad_buflist;
        ASSERT(buflist != NULL);
        DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
            l2arc_write_callback_t *, cb);

        if (zio->io_error != 0)
                ARCSTAT_BUMP(arcstat_l2_writes_error);

        mutex_enter(&l2arc_buflist_mtx);

        /*
         * All writes completed, or an error was hit.
         *
         * Walk from the marker toward the front of the buflist.
         * l2arc_write_buffers() inserts the marker at the list head and
         * then inserts each written buffer at the head after it, so the
         * buffers of this write precede the marker.
         */
        for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
                /* Fetch the predecessor first; 'ab' may be unlinked below. */
                ab_prev = list_prev(buflist, ab);

                hash_lock = HDR_LOCK(ab);
                if (!mutex_tryenter(hash_lock)) {
                        /*
                         * This buffer misses out.  It may be in a stage
                         * of eviction.  Its ARC_L2_WRITING flag will be
                         * left set, denying reads to this buffer.
                         */
                        ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
                        continue;
                }

                if (zio->io_error != 0) {
                        /*
                         * Error - drop L2ARC entry.  The header is
                         * unlinked from the buflist, its L2ARC header is
                         * freed, and its size is subtracted from
                         * arcstat_l2_size.
                         */
                        list_remove(buflist, ab);
                        abl2 = ab->b_l2hdr;
                        ab->b_l2hdr = NULL;
                        kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
                        ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
                }

                /*
                 * Allow ARC to begin reads to this L2ARC entry.
                 */
                ab->b_flags &= ~ARC_L2_WRITING;

                mutex_exit(hash_lock);
        }

        /* Count the completed write, then retire the marker header. */
        atomic_inc_64(&l2arc_writes_done);
        list_remove(buflist, head);
        kmem_cache_free(hdr_cache, head);
        mutex_exit(&l2arc_buflist_mtx);

        /* Release any buffers that were queued to be freed after a write. */
        l2arc_do_free_on_write();

        kmem_free(cb, sizeof (l2arc_write_callback_t));
}
4100 4115  
4101 4116  /*
4102 4117   * A read to a cache device completed.  Validate buffer contents before
4103 4118   * handing over to the regular ARC routines.
4104 4119   */
4105 4120  static void
4106 4121  l2arc_read_done(zio_t *zio)
4107 4122  {
4108 4123          l2arc_read_callback_t *cb;
4109 4124          arc_buf_hdr_t *hdr;
4110 4125          arc_buf_t *buf;
4111 4126          kmutex_t *hash_lock;
4112 4127          int equal;
4113 4128  
4114 4129          ASSERT(zio->io_vd != NULL);
4115 4130          ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4116 4131  
4117 4132          spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4118 4133  
4119 4134          cb = zio->io_private;
4120 4135          ASSERT(cb != NULL);
4121 4136          buf = cb->l2rcb_buf;
4122 4137          ASSERT(buf != NULL);
4123 4138  
4124 4139          hash_lock = HDR_LOCK(buf->b_hdr);
4125 4140          mutex_enter(hash_lock);
4126 4141          hdr = buf->b_hdr;
4127 4142          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4128 4143  
4129 4144          /*
4130 4145           * Check this survived the L2ARC journey.
4131 4146           */
4132 4147          equal = arc_cksum_equal(buf);
4133 4148          if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4134 4149                  mutex_exit(hash_lock);
4135 4150                  zio->io_private = buf;
4136 4151                  zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4137 4152                  zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
4138 4153                  arc_read_done(zio);
4139 4154          } else {
4140 4155                  mutex_exit(hash_lock);
4141 4156                  /*
4142 4157                   * Buffer didn't survive caching.  Increment stats and
4143 4158                   * reissue to the original storage device.
4144 4159                   */
4145 4160                  if (zio->io_error != 0) {
4146 4161                          ARCSTAT_BUMP(arcstat_l2_io_error);
4147 4162                  } else {
4148 4163                          zio->io_error = SET_ERROR(EIO);
4149 4164                  }
4150 4165                  if (!equal)
4151 4166                          ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4152 4167  
4153 4168                  /*
4154 4169                   * If there's no waiter, issue an async i/o to the primary
4155 4170                   * storage now.  If there *is* a waiter, the caller must
4156 4171                   * issue the i/o in a context where it's OK to block.
4157 4172                   */
4158 4173                  if (zio->io_waiter == NULL) {
4159 4174                          zio_t *pio = zio_unique_parent(zio);
4160 4175  
4161 4176                          ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4162 4177  
4163 4178                          zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4164 4179                              buf->b_data, zio->io_size, arc_read_done, buf,
4165 4180                              zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4166 4181                  }
4167 4182          }
4168 4183  
4169 4184          kmem_free(cb, sizeof (l2arc_read_callback_t));
4170 4185  }
4171 4186  
4172 4187  /*
4173 4188   * This is the list priority from which the L2ARC will search for pages to
4174 4189   * cache.  This is used within loops (0..3) to cycle through lists in the
4175 4190   * desired order.  This order can have a significant effect on cache
4176 4191   * performance.
4177 4192   *
4178 4193   * Currently the metadata lists are hit first, MFU then MRU, followed by
4179 4194   * the data lists.  This function returns a locked list, and also returns
4180 4195   * the lock pointer.
4181 4196   */
4182 4197  static list_t *
4183 4198  l2arc_list_locked(int list_num, kmutex_t **lock)
4184 4199  {
4185 4200          list_t *list = NULL;
4186 4201  
4187 4202          ASSERT(list_num >= 0 && list_num <= 3);
4188 4203  
4189 4204          switch (list_num) {
4190 4205          case 0:
4191 4206                  list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4192 4207                  *lock = &arc_mfu->arcs_mtx;
4193 4208                  break;
4194 4209          case 1:
4195 4210                  list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4196 4211                  *lock = &arc_mru->arcs_mtx;
4197 4212                  break;
4198 4213          case 2:
4199 4214                  list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4200 4215                  *lock = &arc_mfu->arcs_mtx;
4201 4216                  break;
4202 4217          case 3:
4203 4218                  list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4204 4219                  *lock = &arc_mru->arcs_mtx;
4205 4220                  break;
4206 4221          }
4207 4222  
4208 4223          ASSERT(!(MUTEX_HELD(*lock)));
4209 4224          mutex_enter(*lock);
4210 4225          return (list);
4211 4226  }
4212 4227  
/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 *
 *      dev             cache device whose buflist is scanned
 *      distance        number of bytes ahead of dev->l2ad_hand to clear
 *      all             B_TRUE to ignore the address window entirely
 *
 * On completion dev->l2ad_evict is advanced to the eviction target
 * address and the vdev space accounting is reduced by the span between
 * the old l2ad_evict and that target.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
        list_t *buflist;
        l2arc_buf_hdr_t *abl2;
        arc_buf_hdr_t *ab, *ab_prev;
        kmutex_t *hash_lock;
        uint64_t taddr;

        buflist = dev->l2ad_buflist;

        if (buflist == NULL)
                return;

        if (!all && dev->l2ad_first) {
                /*
                 * This is the first sweep through the device.  There is
                 * nothing to evict.
                 */
                return;
        }

        /* Compute the target address: how far eviction should reach. */
        if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
                /*
                 * When nearing the end of the device, evict to the end
                 * before the device write hand jumps to the start.
                 */
                taddr = dev->l2ad_end;
        } else {
                taddr = dev->l2ad_hand + distance;
        }
        DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
            uint64_t, taddr, boolean_t, all);

top:
        mutex_enter(&l2arc_buflist_mtx);
        /* Scan from the tail of the buflist toward its head. */
        for (ab = list_tail(buflist); ab; ab = ab_prev) {
                /* Fetch the predecessor first; 'ab' may be unlinked below. */
                ab_prev = list_prev(buflist, ab);

                hash_lock = HDR_LOCK(ab);
                if (!mutex_tryenter(hash_lock)) {
                        /*
                         * Missed the hash lock.  Retry.
                         *
                         * The buflist mutex is dropped first, then the
                         * hash lock is taken (blocking) and immediately
                         * released, so the scan only restarts from the
                         * tail once the current holder has let go.
                         */
                        ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
                        mutex_exit(&l2arc_buflist_mtx);
                        mutex_enter(hash_lock);
                        mutex_exit(hash_lock);
                        goto top;
                }

                if (HDR_L2_WRITE_HEAD(ab)) {
                        /*
                         * We hit a write head node.  Leave it for
                         * l2arc_write_done().
                         */
                        list_remove(buflist, ab);
                        mutex_exit(hash_lock);
                        continue;
                }

                if (!all && ab->b_l2hdr != NULL &&
                    (ab->b_l2hdr->b_daddr > taddr ||
                    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
                        /*
                         * We've evicted to the target address,
                         * or the end of the device.
                         */
                        mutex_exit(hash_lock);
                        break;
                }

                if (HDR_FREE_IN_PROGRESS(ab)) {
                        /*
                         * Already on the path to destruction.
                         */
                        mutex_exit(hash_lock);
                        continue;
                }

                if (ab->b_state == arc_l2c_only) {
                        ASSERT(!HDR_L2_READING(ab));
                        /*
                         * This doesn't exist in the ARC.  Destroy.
                         * arc_hdr_destroy() will call list_remove()
                         * and decrement arcstat_l2_size.
                         */
                        arc_change_state(arc_anon, ab, hash_lock);
                        arc_hdr_destroy(ab);
                } else {
                        /*
                         * Invalidate issued or about to be issued
                         * reads, since we may be about to write
                         * over this location.
                         */
                        if (HDR_L2_READING(ab)) {
                                ARCSTAT_BUMP(arcstat_l2_evict_reading);
                                ab->b_flags |= ARC_L2_EVICTED;
                        }

                        /*
                         * Tell ARC this no longer exists in L2ARC.
                         */
                        if (ab->b_l2hdr != NULL) {
                                abl2 = ab->b_l2hdr;
                                ab->b_l2hdr = NULL;
                                kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
                                ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
                        }
                        list_remove(buflist, ab);

                        /*
                         * This may have been leftover after a
                         * failed write.
                         */
                        ab->b_flags &= ~ARC_L2_WRITING;
                }
                mutex_exit(hash_lock);
        }
        mutex_exit(&l2arc_buflist_mtx);

        /*
         * Subtract the span from the previous eviction point up to the
         * target address from the vdev's space accounting, then record
         * the new eviction point.
         */
        vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
        dev->l2ad_evict = taddr;
}
4343 4358  
/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 *
 *      spa             pool whose buffers are eligible (matched by load guid)
 *      dev             cache device to write to, starting at dev->l2ad_hand
 *      target_sz       upper bound on the number of bytes to write
 *
 * Returns the number of bytes written, counted in device-allocation-sized
 * units, or 0 if no eligible buffer was found.  When something was
 * written this function waits for the root zio before returning;
 * l2arc_write_done() runs as that zio's done callback.
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
        arc_buf_hdr_t *ab, *ab_prev, *head;
        l2arc_buf_hdr_t *hdrl2;
        list_t *list;
        uint64_t passed_sz, write_sz, buf_sz, headroom;
        void *buf_data;
        kmutex_t *hash_lock, *list_lock;
        boolean_t have_lock, full;
        l2arc_write_callback_t *cb;
        zio_t *pio, *wzio;
        uint64_t guid = spa_load_guid(spa);

        ASSERT(dev->l2ad_vdev != NULL);

        pio = NULL;
        write_sz = 0;
        full = B_FALSE;
        /*
         * Allocate the dummy marker header up front.  It is only linked
         * onto the buflist once the first eligible buffer is found.
         */
        head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
        head->b_flags |= ARC_L2_WRITE_HEAD;

        /*
         * Copy buffers for L2ARC writing.
         */
        mutex_enter(&l2arc_buflist_mtx);
        /* Visit the four ARC lists in l2arc_list_locked() priority order. */
        for (int try = 0; try <= 3; try++) {
                list = l2arc_list_locked(try, &list_lock);
                passed_sz = 0;

                /*
                 * L2ARC fast warmup.
                 *
                 * Until the ARC is warm and starts to evict, read from the
                 * head of the ARC lists rather than the tail.
                 */
                headroom = target_sz * l2arc_headroom;
                if (arc_warm == B_FALSE)
                        ab = list_head(list);
                else
                        ab = list_tail(list);

                for (; ab; ab = ab_prev) {
                        /*
                         * Despite its name, ab_prev is the next header
                         * in scan direction: forward before warmup,
                         * backward afterwards.
                         */
                        if (arc_warm == B_FALSE)
                                ab_prev = list_next(list, ab);
                        else
                                ab_prev = list_prev(list, ab);

                        hash_lock = HDR_LOCK(ab);
                        /*
                         * NOTE(review): every path below exits hash_lock
                         * regardless of have_lock -- confirm this thread
                         * never arrives already holding a hash lock.
                         */
                        have_lock = MUTEX_HELD(hash_lock);
                        if (!have_lock && !mutex_tryenter(hash_lock)) {
                                /*
                                 * Skip this buffer rather than waiting.
                                 */
                                continue;
                        }

                        /*
                         * passed_sz counts every buffer examined on this
                         * list, eligible or not, and bounds the scan.
                         */
                        passed_sz += ab->b_size;
                        if (passed_sz > headroom) {
                                /*
                                 * Searched too far.
                                 */
                                mutex_exit(hash_lock);
                                break;
                        }

                        if (!l2arc_write_eligible(guid, ab)) {
                                mutex_exit(hash_lock);
                                continue;
                        }

                        /* Stop all scanning once the target would be exceeded. */
                        if ((write_sz + ab->b_size) > target_sz) {
                                full = B_TRUE;
                                mutex_exit(hash_lock);
                                break;
                        }

                        if (pio == NULL) {
                                /*
                                 * Insert a dummy header on the buflist so
                                 * l2arc_write_done() can find where the
                                 * write buffers begin without searching.
                                 */
                                list_insert_head(dev->l2ad_buflist, head);

                                cb = kmem_alloc(
                                    sizeof (l2arc_write_callback_t), KM_SLEEP);
                                cb->l2wcb_dev = dev;
                                cb->l2wcb_head = head;
                                pio = zio_root(spa, l2arc_write_done, cb,
                                    ZIO_FLAG_CANFAIL);
                        }

                        /*
                         * Create and add a new L2ARC header.
                         */
                        hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
                        hdrl2->b_dev = dev;
                        hdrl2->b_daddr = dev->l2ad_hand;

                        ab->b_flags |= ARC_L2_WRITING;
                        ab->b_l2hdr = hdrl2;
                        list_insert_head(dev->l2ad_buflist, ab);
                        /* Capture data pointer and size under the hash lock. */
                        buf_data = ab->b_buf->b_data;
                        buf_sz = ab->b_size;

                        /*
                         * Compute and store the buffer cksum before
                         * writing.  On debug the cksum is verified first.
                         */
                        arc_cksum_verify(ab->b_buf);
                        arc_cksum_compute(ab->b_buf, B_TRUE);

                        mutex_exit(hash_lock);

                        /* Issue the physical write as a child of pio. */
                        wzio = zio_write_phys(pio, dev->l2ad_vdev,
                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
                            NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_CANFAIL, B_FALSE);

                        DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
                            zio_t *, wzio);
                        (void) zio_nowait(wzio);

                        /*
                         * Keep the clock hand suitably device-aligned.
                         */
                        buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);

                        write_sz += buf_sz;
                        dev->l2ad_hand += buf_sz;
                }

                mutex_exit(list_lock);

                if (full == B_TRUE)
                        break;
        }
        mutex_exit(&l2arc_buflist_mtx);

        if (pio == NULL) {
                /* Nothing was eligible; the marker was never linked. */
                ASSERT0(write_sz);
                kmem_cache_free(hdr_cache, head);
                return (0);
        }

        /*
         * NOTE(review): arcstat_l2_size is credited here with the
         * asize-rounded total, while l2arc_write_done() and l2arc_evict()
         * debit ab->b_size per buffer -- confirm the two stay in step.
         */
        ASSERT3U(write_sz, <=, target_sz);
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
        ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
        vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);

        /*
         * Bump device hand to the device start if it is approaching the end.
         * l2arc_evict() will already have evicted ahead for this case.
         */
        if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
                vdev_space_update(dev->l2ad_vdev,
                    dev->l2ad_end - dev->l2ad_hand, 0, 0);
                dev->l2ad_hand = dev->l2ad_start;
                dev->l2ad_evict = dev->l2ad_start;
                dev->l2ad_first = B_FALSE;
        }

        /* l2ad_writing is set only for the duration of the wait. */
        dev->l2ad_writing = B_TRUE;
        (void) zio_wait(pio);
        dev->l2ad_writing = B_FALSE;

        return (write_sz);
}
4520 4535  
/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 *
 * Each pass waits on l2arc_feed_thr_cv until the scheduled time, selects
 * a cache device, evicts the region about to be overwritten, writes
 * eligible ARC buffers, and computes the time of the next pass.  The loop
 * ends when l2arc_thread_exit becomes non-zero; the thread then clears
 * that flag, broadcasts on the cv, and exits.
 */
static void
l2arc_feed_thread(void)
{
        callb_cpr_t cpr;
        l2arc_dev_t *dev;
        spa_t *spa;
        uint64_t size, wrote;
        clock_t begin, next = ddi_get_lbolt();

        CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

        mutex_enter(&l2arc_feed_thr_lock);

        while (l2arc_thread_exit == 0) {
                CALLB_CPR_SAFE_BEGIN(&cpr);
                (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
                    next);
                CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
                /*
                 * Default wakeup for any pass that bails out early below:
                 * hz ticks from now.
                 */
                next = ddi_get_lbolt() + hz;

                /*
                 * Quick check for L2ARC devices.
                 */
                mutex_enter(&l2arc_dev_mtx);
                if (l2arc_ndev == 0) {
                        mutex_exit(&l2arc_dev_mtx);
                        continue;
                }
                mutex_exit(&l2arc_dev_mtx);
                /* Start-of-cycle time, used by l2arc_write_interval(). */
                begin = ddi_get_lbolt();

                /*
                 * This selects the next l2arc device to write to, and in
                 * doing so the next spa to feed from: dev->l2ad_spa.   This
                 * will return NULL if there are now no l2arc devices or if
                 * they are all faulted.
                 *
                 * If a device is returned, its spa's config lock is also
                 * held to prevent device removal.  l2arc_dev_get_next()
                 * will grab and release l2arc_dev_mtx.
                 */
                if ((dev = l2arc_dev_get_next()) == NULL)
                        continue;

                spa = dev->l2ad_spa;
                ASSERT(spa != NULL);

                /*
                 * If the pool is read-only then force the feed thread to
                 * sleep a little longer.
                 */
                if (!spa_writeable(spa)) {
                        next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
                        spa_config_exit(spa, SCL_L2ARC, dev);
                        continue;
                }

                /*
                 * Avoid contributing to memory pressure.
                 */
                if (arc_reclaim_needed()) {
                        ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
                        spa_config_exit(spa, SCL_L2ARC, dev);
                        continue;
                }

                ARCSTAT_BUMP(arcstat_l2_feeds);

                size = l2arc_write_size(dev);

                /*
                 * Evict L2ARC buffers that will be overwritten.
                 */
                l2arc_evict(dev, size, B_FALSE);

                /*
                 * Write ARC buffers.
                 */
                wrote = l2arc_write_buffers(spa, dev, size);

                /*
                 * Calculate interval between writes.
                 */
                next = l2arc_write_interval(begin, size, wrote);
                /* Drop the config lock taken by l2arc_dev_get_next(). */
                spa_config_exit(spa, SCL_L2ARC, dev);
        }

        /* Acknowledge the exit request and wake anyone waiting on the cv. */
        l2arc_thread_exit = 0;
        cv_broadcast(&l2arc_feed_thr_cv);
        CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
        thread_exit();
}
4617 4632  
4618 4633  boolean_t
4619 4634  l2arc_vdev_present(vdev_t *vd)
4620 4635  {
4621 4636          l2arc_dev_t *dev;
4622 4637  
4623 4638          mutex_enter(&l2arc_dev_mtx);
4624 4639          for (dev = list_head(l2arc_dev_list); dev != NULL;
4625 4640              dev = list_next(l2arc_dev_list, dev)) {
4626 4641                  if (dev->l2ad_vdev == vd)
4627 4642                          break;
4628 4643          }
4629 4644          mutex_exit(&l2arc_dev_mtx);
4630 4645  
4631 4646          return (dev != NULL);
4632 4647  }
4633 4648  
4634 4649  /*
4635 4650   * Add a vdev for use by the L2ARC.  By this point the spa has already
4636 4651   * validated the vdev and opened it.
4637 4652   */
4638 4653  void
4639 4654  l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4640 4655  {
4641 4656          l2arc_dev_t *adddev;
4642 4657  
4643 4658          ASSERT(!l2arc_vdev_present(vd));
4644 4659  
4645 4660          /*
4646 4661           * Create a new l2arc device entry.
4647 4662           */
4648 4663          adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4649 4664          adddev->l2ad_spa = spa;
4650 4665          adddev->l2ad_vdev = vd;
4651 4666          adddev->l2ad_write = l2arc_write_max;
4652 4667          adddev->l2ad_boost = l2arc_write_boost;
4653 4668          adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4654 4669          adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4655 4670          adddev->l2ad_hand = adddev->l2ad_start;
4656 4671          adddev->l2ad_evict = adddev->l2ad_start;
4657 4672          adddev->l2ad_first = B_TRUE;
4658 4673          adddev->l2ad_writing = B_FALSE;
4659 4674          ASSERT3U(adddev->l2ad_write, >, 0);
4660 4675  
4661 4676          /*
4662 4677           * This is a list of all ARC buffers that are still valid on the
4663 4678           * device.
4664 4679           */
4665 4680          adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4666 4681          list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4667 4682              offsetof(arc_buf_hdr_t, b_l2node));
4668 4683  
4669 4684          vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4670 4685  
4671 4686          /*
4672 4687           * Add device to global list
4673 4688           */
4674 4689          mutex_enter(&l2arc_dev_mtx);
4675 4690          list_insert_head(l2arc_dev_list, adddev);
4676 4691          atomic_inc_64(&l2arc_ndev);
4677 4692          mutex_exit(&l2arc_dev_mtx);
4678 4693  }
4679 4694  
4680 4695  /*
4681 4696   * Remove a vdev from the L2ARC.
4682 4697   */
4683 4698  void
4684 4699  l2arc_remove_vdev(vdev_t *vd)
4685 4700  {
4686 4701          l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4687 4702  
4688 4703          /*
4689 4704           * Find the device by vdev
4690 4705           */
4691 4706          mutex_enter(&l2arc_dev_mtx);
4692 4707          for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4693 4708                  nextdev = list_next(l2arc_dev_list, dev);
4694 4709                  if (vd == dev->l2ad_vdev) {
4695 4710                          remdev = dev;
4696 4711                          break;
4697 4712                  }
4698 4713          }
4699 4714          ASSERT(remdev != NULL);
4700 4715  
4701 4716          /*
4702 4717           * Remove device from global list
4703 4718           */
4704 4719          list_remove(l2arc_dev_list, remdev);
4705 4720          l2arc_dev_last = NULL;          /* may have been invalidated */
4706 4721          atomic_dec_64(&l2arc_ndev);
4707 4722          mutex_exit(&l2arc_dev_mtx);
4708 4723  
4709 4724          /*
4710 4725           * Clear all buflists and ARC references.  L2ARC device flush.
4711 4726           */
4712 4727          l2arc_evict(remdev, 0, B_TRUE);
4713 4728          list_destroy(remdev->l2ad_buflist);
4714 4729          kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4715 4730          kmem_free(remdev, sizeof (l2arc_dev_t));
4716 4731  }
4717 4732  
4718 4733  void
4719 4734  l2arc_init(void)
4720 4735  {
4721 4736          l2arc_thread_exit = 0;
4722 4737          l2arc_ndev = 0;
4723 4738          l2arc_writes_sent = 0;
4724 4739          l2arc_writes_done = 0;
4725 4740  
4726 4741          mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4727 4742          cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4728 4743          mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4729 4744          mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4730 4745          mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4731 4746  
4732 4747          l2arc_dev_list = &L2ARC_dev_list;
4733 4748          l2arc_free_on_write = &L2ARC_free_on_write;
4734 4749          list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4735 4750              offsetof(l2arc_dev_t, l2ad_node));
4736 4751          list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4737 4752              offsetof(l2arc_data_free_t, l2df_list_node));
4738 4753  }
4739 4754  
4740 4755  void
4741 4756  l2arc_fini(void)
4742 4757  {
4743 4758          /*
4744 4759           * This is called from dmu_fini(), which is called from spa_fini();
4745 4760           * Because of this, we can assume that all l2arc devices have
4746 4761           * already been removed when the pools themselves were removed.
4747 4762           */
4748 4763  
4749 4764          l2arc_do_free_on_write();
4750 4765  
4751 4766          mutex_destroy(&l2arc_feed_thr_lock);
4752 4767          cv_destroy(&l2arc_feed_thr_cv);
4753 4768          mutex_destroy(&l2arc_dev_mtx);
4754 4769          mutex_destroy(&l2arc_buflist_mtx);
4755 4770          mutex_destroy(&l2arc_free_on_write_mtx);
4756 4771  
4757 4772          list_destroy(l2arc_dev_list);
4758 4773          list_destroy(l2arc_free_on_write);
4759 4774  }
4760 4775  
4761 4776  void
4762 4777  l2arc_start(void)
4763 4778  {
4764 4779          if (!(spa_mode_global & FWRITE))
4765 4780                  return;
4766 4781  
4767 4782          (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4768 4783              TS_RUN, minclsyspri);
4769 4784  }
4770 4785  
4771 4786  void
4772 4787  l2arc_stop(void)
4773 4788  {
4774 4789          if (!(spa_mode_global & FWRITE))
4775 4790                  return;
4776 4791  
4777 4792          mutex_enter(&l2arc_feed_thr_lock);
4778 4793          cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
4779 4794          l2arc_thread_exit = 1;
4780 4795          while (l2arc_thread_exit != 0)
4781 4796                  cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4782 4797          mutex_exit(&l2arc_feed_thr_lock);
4783 4798  }

↓ open down ↓

1626 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX