illumos-3748 Wdiff usr/src/uts/common/fs/zfs/arc.c

Print this page

3748 zfs headers should be C++ compatible
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Will Andrews <willa@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * DVA-based Adjustable Replacement Cache
  29   29   *
  30   30   * While much of the theory of operation used here is
  31   31   * based on the self-tuning, low overhead replacement cache
  32   32   * presented by Megiddo and Modha at FAST 2003, there are some
  33   33   * significant differences:
  34   34   *
  35   35   * 1. The Megiddo and Modha model assumes any page is evictable.
  36   36   * Pages in its cache cannot be "locked" into memory.  This makes
  37   37   * the eviction algorithm simple: evict the last page in the list.
  38   38   * This also makes the performance characteristics easy to reason
  39   39   * about.  Our cache is not so simple.  At any given moment, some
  40   40   * subset of the blocks in the cache are un-evictable because we
  41   41   * have handed out a reference to them.  Blocks are only evictable
  42   42   * when there are no external references active.  This makes
  43   43   * eviction far more problematic:  we choose to evict the evictable
  44   44   * blocks that are the "lowest" in the list.
  45   45   *
  46   46   * There are times when it is not possible to evict the requested
  47   47   * space.  In these circumstances we are unable to adjust the cache
  48   48   * size.  To prevent the cache growing unbounded at these times we
  49   49   * implement a "cache throttle" that slows the flow of new data
  50   50   * into the cache until we can make space available.
  51   51   *
  52   52   * 2. The Megiddo and Modha model assumes a fixed cache size.
  53   53   * Pages are evicted when the cache is full and there is a cache
  54   54   * miss.  Our model has a variable sized cache.  It grows with
  55   55   * high use, but also tries to react to memory pressure from the
  56   56   * operating system: decreasing its size when system memory is
  57   57   * tight.
  58   58   *
  59   59   * 3. The Megiddo and Modha model assumes a fixed page size. All
  60   60   * elements of the cache are therefore exactly the same size.  So
  61   61   * when adjusting the cache size following a cache miss, it's simply
  62   62   * a matter of choosing a single page to evict.  In our model, we
  63   63   * have variable sized cache blocks (ranging from 512 bytes to
  64   64   * 128K bytes).  We therefore choose a set of blocks to evict to make
  65   65   * space for a cache miss that approximates as closely as possible
  66   66   * the space used by the new block.
  67   67   *
  68   68   * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69   69   * by N. Megiddo & D. Modha, FAST 2003
  70   70   */
  71   71  
  72   72  /*
  73   73   * The locking model:
  74   74   *
  75   75   * A new reference to a cache buffer can be obtained in two
  76   76   * ways: 1) via a hash table lookup using the DVA as a key,
  77   77   * or 2) via one of the ARC lists.  The arc_read() interface
  78   78   * uses method 1, while the internal arc algorithms for
  79   79   * adjusting the cache use method 2.  We therefore provide two
  80   80   * types of locks: 1) the hash table lock array, and 2) the
  81   81   * arc list locks.
  82   82   *
  83   83   * Buffers do not have their own mutexes, rather they rely on the
  84   84   * hash table mutexes for the bulk of their protection (i.e. most
  85   85   * fields in the arc_buf_hdr_t are protected by these mutexes).
  86   86   *
  87   87   * buf_hash_find() returns the appropriate mutex (held) when it
  88   88   * locates the requested buffer in the hash table.  It returns
  89   89   * NULL for the mutex if the buffer was not in the table.
  90   90   *
  91   91   * buf_hash_remove() expects the appropriate hash mutex to be
  92   92   * already held before it is invoked.
  93   93   *
  94   94   * Each arc state also has a mutex which is used to protect the
  95   95   * buffer list associated with the state.  When attempting to
  96   96   * obtain a hash table lock while holding an arc list lock you
  97   97   * must use: mutex_tryenter() to avoid deadlock.  Also note that
  98   98   * the active state mutex must be held before the ghost state mutex.
  99   99   *
 100  100   * Arc buffers may have an associated eviction callback function.
 101  101   * This function will be invoked prior to removing the buffer (e.g.
 102  102   * in arc_do_user_evicts()).  Note however that the data associated
 103  103   * with the buffer may be evicted prior to the callback.  The callback
 104  104   * must be made with *no locks held* (to prevent deadlock).  Additionally,
 105  105   * the users of callbacks must ensure that their private data is
 106  106   * protected from simultaneous callbacks from arc_buf_evict()
 107  107   * and arc_do_user_evicts().
 108  108   *
 109  109   * Note that the majority of the performance stats are manipulated
 110  110   * with atomic operations.
 111  111   *
 112  112   * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 113  113   *
 114  114   *      - L2ARC buflist creation
 115  115   *      - L2ARC buflist eviction
 116  116   *      - L2ARC write completion, which walks L2ARC buflists
 117  117   *      - ARC header destruction, as it removes from L2ARC buflists
 118  118   *      - ARC header release, as it removes from L2ARC buflists
 119  119   */
 120  120  
 121  121  #include <sys/spa.h>
 122  122  #include <sys/zio.h>
 123  123  #include <sys/zfs_context.h>
 124  124  #include <sys/arc.h>
 125  125  #include <sys/refcount.h>
 126  126  #include <sys/vdev.h>
 127  127  #include <sys/vdev_impl.h>
 128  128  #ifdef _KERNEL
 129  129  #include <sys/vmsystm.h>
 130  130  #include <vm/anon.h>
 131  131  #include <sys/fs/swapnode.h>
 132  132  #include <sys/dnlc.h>
 133  133  #endif
 134  134  #include <sys/callb.h>
 135  135  #include <sys/kstat.h>
 136  136  #include <zfs_fletcher.h>
 137  137  
 138  138  #ifndef _KERNEL
 139  139  /* non-kernel builds only: set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 140  140  boolean_t arc_watch = B_FALSE;
 141  141  int arc_procfd;         /* NOTE(review): presumably the fd used to install the watchpoints -- confirm */
 142  142  #endif
 143  143  
 144  144  static kmutex_t         arc_reclaim_thr_lock;   /* NOTE(review): presumably paired with the cv below -- confirm */
 145  145  static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 146  146  static uint8_t          arc_thread_exit;
 147  147  
 148  148  extern int zfs_write_limit_shift;       /* these three are defined in another file */
 149  149  extern uint64_t zfs_write_limit_max;
 150  150  extern kmutex_t zfs_write_limit_lock;
 151  151  
 152  152  #define ARC_REDUCE_DNLC_PERCENT 3       /* initial value of arc_reduce_dnlc_percent */
 153  153  uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154  154  
 155  155  typedef enum arc_reclaim_strategy {     /* which reclaim strategy to apply */
 156  156          ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157  157          ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158  158  } arc_reclaim_strategy_t;
 159  159  
 160  160  /* number of seconds before growing cache again */
 161  161  static int              arc_grow_retry = 60;
 162  162  
 163  163  /* shift of arc_c for calculating both min and max arc_p */
 164  164  static int              arc_p_min_shift = 4;
 165  165  
 166  166  /* log2(fraction of arc to reclaim) */
 167  167  static int              arc_shrink_shift = 5;
 168  168  
 169  169  /*
 170  170   * minimum lifespan of a prefetch block in clock ticks
 171  171   * (initialized in arc_init())
 172  172   */
 173  173  static int              arc_min_prefetch_lifespan;
 174  174  
 175  175  static int arc_dead;    /* NOTE(review): undocumented; presumably nonzero once the ARC is shut down -- confirm */
 176  176  
 177  177  /*
 178  178   * The arc has filled available memory and has now warmed up.
 179  179   */
 180  180  static boolean_t arc_warm;
 181  181  
 182  182  /*
 183  183   * These tunables are for performance analysis.
 184  184   */
 185  185  uint64_t zfs_arc_max;
 186  186  uint64_t zfs_arc_min;
 187  187  uint64_t zfs_arc_meta_limit = 0;
 188  188  int zfs_arc_grow_retry = 0;     /* NOTE(review): this and the next two presumably override the like-named arc_* statics when nonzero -- confirm */
 189  189  int zfs_arc_shrink_shift = 0;
 190  190  int zfs_arc_p_min_shift = 0;
 191  191  int zfs_disable_dup_eviction = 0;
 192  192  
 193  193  /*
 194  194   * Note that buffers can be in one of 6 states:
 195  195   *      ARC_anon        - anonymous (discussed below)
 196  196   *      ARC_mru         - recently used, currently cached
 197  197   *      ARC_mru_ghost   - recently used, no longer in cache
 198  198   *      ARC_mfu         - frequently used, currently cached
 199  199   *      ARC_mfu_ghost   - frequently used, no longer in cache
 200  200   *      ARC_l2c_only    - exists in L2ARC but not other states
 201  201   * When there are no active references to a buffer, it is
 202  202   * linked onto a list in one of these arc states.  These are
 203  203   * the only buffers that can be evicted or deleted.  Within each
 204  204   * state there are multiple lists, one for meta-data and one for
 205  205   * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 206  206   * etc.) is tracked separately so that it can be managed more
 207  207   * explicitly: favored over data, limited explicitly.
 208  208   *
 209  209   * Anonymous buffers are buffers that are not associated with
 210  210   * a DVA.  These are buffers that hold dirty block copies
 211  211   * before they are written to stable storage.  By definition,
 212  212   * they are "ref'd" and are considered part of arc_mru
 213  213   * that cannot be freed.  Generally, they will acquire a DVA
 214  214   * as they are written and migrate onto the arc_mru list.
 215  215   *
 216  216   * The ARC_l2c_only state is for buffers that are in the second
 217  217   * level ARC but no longer in any of the ARC_m* lists.  The second
 218  218   * level ARC itself may also contain buffers that are in any of
 219  219   * the ARC_m* states - meaning that a buffer can exist in two
 220  220   * places.  The reason for the ARC_l2c_only state is to keep the
 221  221   * buffer header in the hash table, so that reads that hit the
 222  222   * second level ARC benefit from these fast lookups.
 223  223   */
 224  224  
 225  225  typedef struct arc_state {
 226  226          list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 227  227          uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 228  228          uint64_t arcs_size;     /* total amount of data in this state */
 229  229          kmutex_t arcs_mtx;      /* protects arcs_list (see "The locking model") */
 230  230  } arc_state_t;
 231  231  
 232  232  /* The 6 states (storage; presumably reached via the lowercase arc_* pointers -- confirm): */
 233  233  static arc_state_t ARC_anon;
 234  234  static arc_state_t ARC_mru;
 235  235  static arc_state_t ARC_mru_ghost;
 236  236  static arc_state_t ARC_mfu;
 237  237  static arc_state_t ARC_mfu_ghost;
 238  238  static arc_state_t ARC_l2c_only;
 239  239  
 240  240  typedef struct arc_stats {      /* field order must match the positional arc_stats initializer */
 241  241          kstat_named_t arcstat_hits;
 242  242          kstat_named_t arcstat_misses;
 243  243          kstat_named_t arcstat_demand_data_hits;
 244  244          kstat_named_t arcstat_demand_data_misses;
 245  245          kstat_named_t arcstat_demand_metadata_hits;
 246  246          kstat_named_t arcstat_demand_metadata_misses;
 247  247          kstat_named_t arcstat_prefetch_data_hits;
 248  248          kstat_named_t arcstat_prefetch_data_misses;
 249  249          kstat_named_t arcstat_prefetch_metadata_hits;
 250  250          kstat_named_t arcstat_prefetch_metadata_misses;
 251  251          kstat_named_t arcstat_mru_hits;
 252  252          kstat_named_t arcstat_mru_ghost_hits;
 253  253          kstat_named_t arcstat_mfu_hits;
 254  254          kstat_named_t arcstat_mfu_ghost_hits;
 255  255          kstat_named_t arcstat_deleted;
 256  256          kstat_named_t arcstat_recycle_miss;
 257  257          kstat_named_t arcstat_mutex_miss;
 258  258          kstat_named_t arcstat_evict_skip;
 259  259          kstat_named_t arcstat_evict_l2_cached;
 260  260          kstat_named_t arcstat_evict_l2_eligible;
 261  261          kstat_named_t arcstat_evict_l2_ineligible;
 262  262          kstat_named_t arcstat_hash_elements;
 263  263          kstat_named_t arcstat_hash_elements_max;
 264  264          kstat_named_t arcstat_hash_collisions;
 265  265          kstat_named_t arcstat_hash_chains;
 266  266          kstat_named_t arcstat_hash_chain_max;
 267  267          kstat_named_t arcstat_p;
 268  268          kstat_named_t arcstat_c;
 269  269          kstat_named_t arcstat_c_min;
 270  270          kstat_named_t arcstat_c_max;
 271  271          kstat_named_t arcstat_size;
 272  272          kstat_named_t arcstat_hdr_size;
 273  273          kstat_named_t arcstat_data_size;
 274  274          kstat_named_t arcstat_other_size;
 275  275          kstat_named_t arcstat_l2_hits;
 276  276          kstat_named_t arcstat_l2_misses;
 277  277          kstat_named_t arcstat_l2_feeds;
 278  278          kstat_named_t arcstat_l2_rw_clash;
 279  279          kstat_named_t arcstat_l2_read_bytes;
 280  280          kstat_named_t arcstat_l2_write_bytes;
 281  281          kstat_named_t arcstat_l2_writes_sent;
 282  282          kstat_named_t arcstat_l2_writes_done;
 283  283          kstat_named_t arcstat_l2_writes_error;
 284  284          kstat_named_t arcstat_l2_writes_hdr_miss;
 285  285          kstat_named_t arcstat_l2_evict_lock_retry;
 286  286          kstat_named_t arcstat_l2_evict_reading;
 287  287          kstat_named_t arcstat_l2_free_on_write;
 288  288          kstat_named_t arcstat_l2_abort_lowmem;
 289  289          kstat_named_t arcstat_l2_cksum_bad;
 290  290          kstat_named_t arcstat_l2_io_error;
 291  291          kstat_named_t arcstat_l2_size;
 292  292          kstat_named_t arcstat_l2_hdr_size;
 293  293          kstat_named_t arcstat_memory_throttle_count;
 294  294          kstat_named_t arcstat_duplicate_buffers;
 295  295          kstat_named_t arcstat_duplicate_buffers_size;
 296  296          kstat_named_t arcstat_duplicate_reads;
 297  297          kstat_named_t arcstat_meta_used;
 298  298          kstat_named_t arcstat_meta_limit;
 299  299          kstat_named_t arcstat_meta_max;
 300  300  } arc_stats_t;
 301  301  
 302  302  static arc_stats_t arc_stats = {        /* positional: entries follow arc_stats_t field order */
 303  303          { "hits",                       KSTAT_DATA_UINT64 },
 304  304          { "misses",                     KSTAT_DATA_UINT64 },
 305  305          { "demand_data_hits",           KSTAT_DATA_UINT64 },
 306  306          { "demand_data_misses",         KSTAT_DATA_UINT64 },
 307  307          { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 308  308          { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 309  309          { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 310  310          { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 311  311          { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 312  312          { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 313  313          { "mru_hits",                   KSTAT_DATA_UINT64 },
 314  314          { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 315  315          { "mfu_hits",                   KSTAT_DATA_UINT64 },
 316  316          { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 317  317          { "deleted",                    KSTAT_DATA_UINT64 },
 318  318          { "recycle_miss",               KSTAT_DATA_UINT64 },
 319  319          { "mutex_miss",                 KSTAT_DATA_UINT64 },
 320  320          { "evict_skip",                 KSTAT_DATA_UINT64 },
 321  321          { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 322  322          { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 323  323          { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 324  324          { "hash_elements",              KSTAT_DATA_UINT64 },
 325  325          { "hash_elements_max",          KSTAT_DATA_UINT64 },
 326  326          { "hash_collisions",            KSTAT_DATA_UINT64 },
 327  327          { "hash_chains",                KSTAT_DATA_UINT64 },
 328  328          { "hash_chain_max",             KSTAT_DATA_UINT64 },
 329  329          { "p",                          KSTAT_DATA_UINT64 },
 330  330          { "c",                          KSTAT_DATA_UINT64 },
 331  331          { "c_min",                      KSTAT_DATA_UINT64 },
 332  332          { "c_max",                      KSTAT_DATA_UINT64 },
 333  333          { "size",                       KSTAT_DATA_UINT64 },
 334  334          { "hdr_size",                   KSTAT_DATA_UINT64 },
 335  335          { "data_size",                  KSTAT_DATA_UINT64 },
 336  336          { "other_size",                 KSTAT_DATA_UINT64 },
 337  337          { "l2_hits",                    KSTAT_DATA_UINT64 },
 338  338          { "l2_misses",                  KSTAT_DATA_UINT64 },
 339  339          { "l2_feeds",                   KSTAT_DATA_UINT64 },
 340  340          { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 341  341          { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 342  342          { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 343  343          { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 344  344          { "l2_writes_done",             KSTAT_DATA_UINT64 },
 345  345          { "l2_writes_error",            KSTAT_DATA_UINT64 },
 346  346          { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 347  347          { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 348  348          { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 349  349          { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 350  350          { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 351  351          { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 352  352          { "l2_io_error",                KSTAT_DATA_UINT64 },
 353  353          { "l2_size",                    KSTAT_DATA_UINT64 },
 354  354          { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 355  355          { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 356  356          { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 357  357          { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 358  358          { "duplicate_reads",            KSTAT_DATA_UINT64 },
 359  359          { "arc_meta_used",              KSTAT_DATA_UINT64 },
 360  360          { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 361  361          { "arc_meta_max",               KSTAT_DATA_UINT64 }
 362  362  };
 363  363  
 364  364  #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)     /* lvalue of one statistic */
 365  365  /* Atomically add val (may be negative).  No trailing ';': the caller supplies it. */
 366  366  #define ARCSTAT_INCR(stat, val) \
 367  367          atomic_add_64(&arc_stats.stat.value.ui64, (val))
 368  368  /* Convenience wrappers: adjust a statistic by +1 / -1. */
 369  369  #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 370  370  #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 371  371  
 372  372  #define ARCSTAT_MAX(stat, val) {  /* stat = max(stat, val); val is evaluated repeatedly */ \
 373  373          uint64_t m;                                                     \
 374  374          while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 375  375              (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 376  376                  continue;                                               \
 377  377  }
 378  378  /* Raise the <stat>_max statistic to the current value of <stat>. */
 379  379  #define ARCSTAT_MAXSTAT(stat) \
 380  380          ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 381  381  
 382  382  /*
 383  383   * Bump one of four hit (or miss) counters, selected by two conditions, for
 384  384   * eight statistics in total.  NOTE: this expands to a bare if/else
 385  385   * statement, so brace it when it is the body of another if.
 386  386   */
 387  387  #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 388  388          if (cond1) {                                                    \
 389  389                  if (cond2) {                                            \
 390  390                          ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 391  391                  } else {                                                \
 392  392                          ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 393  393                  }                                                       \
 394  394          } else {                                                        \
 395  395                  if (cond2) {                                            \
 396  396                          ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 397  397                  } else {                                                \
 398  398                          ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 399  399                  }                                                       \
 400  400          }
 401  401  
 402  402  kstat_t                 *arc_ksp;       /* NOTE(review): presumably the kstat that exports arc_stats -- confirm */
 403  403  static arc_state_t      *arc_anon;      /* NOTE(review): these six presumably point at the ARC_* objects -- confirm */
 404  404  static arc_state_t      *arc_mru;
 405  405  static arc_state_t      *arc_mru_ghost;
 406  406  static arc_state_t      *arc_mfu;
 407  407  static arc_state_t      *arc_mfu_ghost;
 408  408  static arc_state_t      *arc_l2c_only;
 409  409  
 410  410  /*
 411  411   * There are several ARC variables that are critical to export as kstats --
 412  412   * but we don't want to have to grovel around in the kstat whenever we wish to
 413  413   * manipulate them.  For these variables, we therefore define them to be in
 414  414   * terms of the statistic variable.  This assures that we are not introducing
 415  415   * the possibility of inconsistency by having shadow copies of the variables,
 416  416   * while keeping the code readable (each name expands to an assignable lvalue).
 417  417   */
 418  418  #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 419  419  #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 420  420  #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 421  421  #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 422  422  #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 423  423  #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 424  424  #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 425  425  #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 426  426  
 427  427  static int              arc_no_grow;    /* Don't try to grow cache size */
 428  428  static uint64_t         arc_tempreserve;        /* NOTE(review): undocumented; presumably bytes temporarily reserved -- confirm */
 429  429  static uint64_t         arc_loaned_bytes;       /* NOTE(review): undocumented; presumably bytes in loaned buffers -- confirm */
 430  430  
 431  431  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;   /* forward declaration */
 432  432  
 433  433  typedef struct arc_callback arc_callback_t;
 434  434  
 435  435  struct arc_callback {
 436  436          void                    *acb_private;   /* NOTE(review): presumably handed to acb_done -- confirm */
 437  437          arc_done_func_t         *acb_done;
 438  438          arc_buf_t               *acb_buf;
 439  439          zio_t                   *acb_zio_dummy;
 440  440          arc_callback_t          *acb_next;      /* link to another arc_callback_t */
 441  441  };
 442  442  
 443  443  typedef struct arc_write_callback arc_write_callback_t;
 444  444  
 445  445  struct arc_write_callback {
 446  446          void            *awcb_private;  /* NOTE(review): presumably handed to the two callbacks -- confirm */
 447  447          arc_done_func_t *awcb_ready;    /* both callbacks use the arc_done_func_t type */
 448  448          arc_done_func_t *awcb_done;
 449  449          arc_buf_t       *awcb_buf;
 450  450  };
 451  451  
 452  452  struct arc_buf_hdr {
 453  453          /* protected by hash lock */
 454  454          dva_t                   b_dva;
 455  455          uint64_t                b_birth;
 456  456          uint64_t                b_cksum0;
 457  457  
 458  458          kmutex_t                b_freeze_lock;
 459  459          zio_cksum_t             *b_freeze_cksum;
 460  460          void                    *b_thawed;
 461  461  
 462  462          arc_buf_hdr_t           *b_hash_next;
 463  463          arc_buf_t               *b_buf;
 464  464          uint32_t                b_flags;        /* ARC_* flag bits; tested via HDR_*() */
 465  465          uint32_t                b_datacnt;
 466  466  
 467  467          arc_callback_t          *b_acb;
 468  468          kcondvar_t              b_cv;
 469  469  
 470  470          /* immutable */
 471  471          arc_buf_contents_t      b_type;
 472  472          uint64_t                b_size;
 473  473          uint64_t                b_spa;
 474  474  
 475  475          /* protected by arc state mutex */
 476  476          arc_state_t             *b_state;
 477  477          list_node_t             b_arc_node;
 478  478  
 479  479          /* updated atomically */
 480  480          clock_t                 b_arc_access;
 481  481  
 482  482          /* self protecting */
 483  483          refcount_t              b_refcnt;
 484  484  
 485  485          l2arc_buf_hdr_t         *b_l2hdr;       /* non-NULL is part of the HDR_L2_READING() test */
 486  486          list_node_t             b_l2node;
 487  487  };
 488  488  
 489  489  static arc_buf_t *arc_eviction_list;
 490  490  static kmutex_t arc_eviction_mtx;       /* NOTE(review): presumably protects arc_eviction_list -- confirm */
 491  491  static arc_buf_hdr_t arc_eviction_hdr;
 492  492  static void arc_get_data_buf(arc_buf_t *buf);   /* forward declarations of file-local functions follow */
 493  493  static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 494  494  static int arc_evict_needed(arc_buf_contents_t type);
 495  495  static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 496  496  static void arc_buf_watch(arc_buf_t *buf);
 497  497  
 498  498  static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 499  499  
 500  500  #define GHOST_STATE(state)      /* is state a ghost or the L2-only state? */ \
 501  501          ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 502  502          (state) == arc_l2c_only)
 503  503  
 504  504  /*
 505  505   * Private ARC flags.  These flags are private ARC only flags that will show up
 506  506   * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 507  507   * be passed in as arc_flags in things like arc_read.  However, these flags
 508  508   * should never be passed and should only be set by ARC code.  When adding new
 509  509   * public flags, make sure not to smash the private ones (bits 9 through 18).
 510  510   */
 511  511  
 512  512  #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 513  513  #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 514  514  #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 515  515  #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 516  516  #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 517  517  #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 518  518  #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 519  519  #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 520  520  #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 521  521  #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 522  522  
 523  523  #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)    /* flag tests yield the masked bit, not 1 */
 524  524  #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 525  525  #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 526  526  #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 527  527  #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 528  528  #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 529  529  #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 530  530  #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 531  531  #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
 532  532                                      (hdr)->b_l2hdr != NULL)     /* I/O in progress and hdr has an L2 header */
 533  533  #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 534  534  #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 535  535  #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 536  536  
 537  537  /*
 538  538   * Header sizes as signed 64-bit values (presumably so they can be negated).
 539  539   */
 540  540  
 541  541  #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 542  542  #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 543  543  
 544  544  /*
 545  545   * Hash table routines
 546  546   */
 547  547  
 548  548  #define HT_LOCK_PAD     64      /* kernel builds pad each ht_lock out to this many bytes */
 549  549  
 550  550  struct ht_lock {
 551  551          kmutex_t        ht_lock;
 552  552  #ifdef _KERNEL
 553  553          unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 554  554  #endif
 555  555  };
 556  556  
 557  557  #define BUF_LOCKS 256   /* power of two: BUF_HASH_LOCK_NTRY masks with BUF_LOCKS-1 */
 558  558  typedef struct buf_hash_table {
 559  559          uint64_t ht_mask;       /* ANDed with buf_hash() in BUF_HASH_INDEX */
 560  560          arc_buf_hdr_t **ht_table;
 561  561          struct ht_lock ht_locks[BUF_LOCKS];
 562  562  } buf_hash_table_t;
 563  563  
 564  564  static buf_hash_table_t buf_hash_table;
 565  565  
 566  566  #define BUF_HASH_INDEX(spa, dva, birth) /* hash value masked to a table index */ \
 567  567          (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 568  568  #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
 569  569  #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 570  570  #define HDR_LOCK(hdr) /* hash mutex for hdr; hdr may be any pointer expression */ \
 571  571          (BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, (hdr)->b_birth)))
 572  572  
 573  573  uint64_t zfs_crc64_table[256];  /* NOTE(review): presumably the CRC-64 table used by buf_hash() -- confirm */
 574  574  
 575  575  /*
 576  576   * Level 2 ARC: compile-time defaults for the tunables declared below
 577  577   */
 578  578  
 579  579  #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 580  580  #define L2ARC_HEADROOM          2               /* num of writes */
 581  581  #define L2ARC_FEED_SECS         1               /* caching interval secs */
 582  582  #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 583  583  
 584  584  #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent) /* kstat-backed alias */
 585  585  #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done) /* kstat-backed alias */
 586  586  
 587  587  /*
 588  588   * L2ARC Performance Tunables (numeric defaults are the L2ARC_* constants)
 589  589   */
 590  590  uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 591  591  uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 592  592  uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 593  593  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 594  594  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 595  595  boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 596  596  boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 597  597  boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 597  597  boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 598  598  
 599  599  /*
 600  600   * L2ARC Internals
            *
            * l2arc_dev_t is the per-device state.  Instances are linked onto a
            * device list through l2ad_node, and each one points at its own
            * buffer list via l2ad_buflist.
 601  601   */
 602  602  typedef struct l2arc_dev {
 603  603          vdev_t                  *l2ad_vdev;     /* vdev */
 604  604          spa_t                   *l2ad_spa;      /* spa */
 605  605          uint64_t                l2ad_hand;      /* next write location */
 606  606          uint64_t                l2ad_write;     /* desired write size, bytes */
 607  607          uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 608  608          uint64_t                l2ad_start;     /* first addr on device */
 609  609          uint64_t                l2ad_end;       /* last addr on device */
 610  610          uint64_t                l2ad_evict;     /* last addr eviction reached */
 611  611          boolean_t               l2ad_first;     /* first sweep through */
 612  612          boolean_t               l2ad_writing;   /* currently writing */
 613  613          list_t                  *l2ad_buflist;  /* buffer list */
 614  614          list_node_t             l2ad_node;      /* device list node */
 615  615  } l2arc_dev_t;
 616  616  
           /*
            * File-private L2ARC bookkeeping.  The upper-case list_t objects are
            * the storage; the lower-case pointers of the same name are presumably
            * pointed at them during L2ARC initialization, which is not part of
            * this section -- confirm.
            */
 617  617  static list_t L2ARC_dev_list;                   /* device list */
 618  618  static list_t *l2arc_dev_list;                  /* device list pointer */
 619  619  static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 620  620  static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 621  621  static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 622  622  static list_t L2ARC_free_on_write;              /* free after write buf list */
 623  623  static list_t *l2arc_free_on_write;             /* free after write list ptr */
 624  624  static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 625  625  static uint64_t l2arc_ndev;                     /* number of devices */
 626  626  
           /*
            * State captured for one L2ARC read: the buffer being filled plus the
            * original block pointer, bookmark and flags of the request
            * (presumably consumed by l2arc_read_done() -- confirm).
            */
 627  627  typedef struct l2arc_read_callback {
 628  628          arc_buf_t       *l2rcb_buf;             /* read buffer */
 629  629          spa_t           *l2rcb_spa;             /* spa */
 630  630          blkptr_t        l2rcb_bp;               /* original blkptr */
 631  631          zbookmark_t     l2rcb_zb;               /* original bookmark */
 632  632          int             l2rcb_flags;            /* original flags */
 633  633  } l2arc_read_callback_t;
 634  634  
           /*
            * State captured for one L2ARC write: the target device and the
            * header that marks the head of the list of buffers being written.
            */
 635  635  typedef struct l2arc_write_callback {
 636  636          l2arc_dev_t     *l2wcb_dev;             /* device info */
 637  637          arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 638  638  } l2arc_write_callback_t;
 639  639  
           /*
            * L2ARC-specific part of a buffer header: which cache device holds
            * the buffer and the byte offset at which it is stored.
            */
 640  640  struct l2arc_buf_hdr {
 641  641          /* protected by arc_buf_hdr mutex */
 642  642          l2arc_dev_t     *b_dev;                 /* L2ARC device */
 643  643          uint64_t        b_daddr;                /* disk address, offset byte */
 644  644  };
 645  645  
           /*
            * One deferred-free record, linked through l2df_list_node.  It stores
            * a data pointer, its size, and a function taking (void *, size_t);
            * presumably l2df_func(l2df_data, l2df_size) is invoked once the
            * in-flight write has completed -- confirm against the consumer.
            */
 646  646  typedef struct l2arc_data_free {
 647  647          /* protected by l2arc_free_on_write_mtx */
 648  648          void            *l2df_data;
 649  649          size_t          l2df_size;
 650  650          void            (*l2df_func)(void *, size_t);
 651  651          list_node_t     l2df_list_node;
 652  652  } l2arc_data_free_t;
 653  653  
           /*
            * Lock, condition variable and flag for the L2ARC feed thread
            * (l2arc_thread_exit is presumably the shutdown request/ack flag --
            * confirm), followed by forward declarations of L2ARC helpers that
            * are used before their definitions.
            */
 654  654  static kmutex_t l2arc_feed_thr_lock;
 655  655  static kcondvar_t l2arc_feed_thr_cv;
 656  656  static uint8_t l2arc_thread_exit;
 657  657  
 658  658  static void l2arc_read_done(zio_t *zio);
 659  659  static void l2arc_hdr_stat_add(void);
 660  660  static void l2arc_hdr_stat_remove(void);
 661  661  
           /*
            * Hash a buffer identity (spa, DVA, birth) to a 64-bit value.
            *
            * The raw bytes of *dva are run through a table-driven CRC-64 seeded
            * with all ones; the result is then XORed with (spa >> 8) and birth.
            * BUF_HASH_INDEX masks the return value to obtain a table index.
            *
            * zfs_crc64_table must already be populated; this is asserted.
            * *dva is only read, so the byte view keeps the const qualifier, and
            * the loop index is size_t to match the type of sizeof.
            */
 662  662  static uint64_t
 663  663  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 664  664  {
 665  665          const uint8_t *vdva = (const uint8_t *)dva;
 666  666          uint64_t crc = -1ULL;
 667  667          size_t i;
 668  668  
 669  669          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 670  670  
 671  671          for (i = 0; i < sizeof (dva_t); i++)
 672  672                  crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 673  673  
 674  674          crc ^= (spa>>8) ^ birth;
 675  675  
 676  676          return (crc);
 677  677  }
 678  678  
           /*
            * BUF_EMPTY(buf) is true when the header carries no identity: both
            * DVA words and the birth value are zero.
            */
 679  679  #define BUF_EMPTY(buf)                                          \
 680  680          ((buf)->b_dva.dva_word[0] == 0 &&                       \
 681  681          (buf)->b_dva.dva_word[1] == 0 &&                        \
 682  682          (buf)->b_birth == 0)
 683  683  
           /*
            * BUF_EQUAL(spa, dva, birth, buf) is true when buf's identity matches
            * the given (spa, dva, birth) triple.  The whole replacement list and
            * every parameter are parenthesized so the macro can be negated or
            * combined with other operators safely.
            */
 684  684  #define BUF_EQUAL(spa, dva, birth, buf)                         \
 685  685          (((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&    \
 686  686          ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 687  687          ((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
 688  688  
           /*
            * Strip the identity from a header: zero both DVA words and the
            * birth value (after which BUF_EMPTY(hdr) holds) and also clear
            * b_cksum0.  b_spa is left untouched.
            */
 689  689  static void
 690  690  buf_discard_identity(arc_buf_hdr_t *hdr)
 691  691  {
 692  692          hdr->b_dva.dva_word[0] = 0;
 693  693          hdr->b_dva.dva_word[1] = 0;
 694  694          hdr->b_birth = 0;
 695  695          hdr->b_cksum0 = 0;
 696  696  }
 697  697  
           /*
            * Look up the header whose identity is (spa, dva, birth).
            *
            * On a hit the header is returned with its hash lock still held and
            * *lockp set to that lock; the caller is responsible for dropping it.
            * On a miss the lock is released, *lockp is set to NULL and NULL is
            * returned.
            */
 698  698  static arc_buf_hdr_t *
 699  699  buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 700  700  {
 701  701          uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 702  702          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 703  703          arc_buf_hdr_t *buf;
 704  704  
 705  705          mutex_enter(hash_lock);
           /* walk the chain for this bucket */
 706  706          for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 707  707              buf = buf->b_hash_next) {
 708  708                  if (BUF_EQUAL(spa, dva, birth, buf)) {
           /* found: return with hash_lock held */
 709  709                          *lockp = hash_lock;
 710  710                          return (buf);
 711  711                  }
 712  712          }
 713  713          mutex_exit(hash_lock);
 714  714          *lockp = NULL;
 715  715          return (NULL);
 716  716  }
 717  717  
 718  718  /*
 719  719   * Insert an entry into the hash table.  If there is already an element
 720  720   * equal to elem in the hash table, then the already existing element
 721  721   * will be returned and the new element will not be inserted.
 722  722   * Otherwise returns NULL.
            *
            * In both cases *lockp is set to the hash lock for buf's chain and
            * that lock is held on return; neither return path releases it.
            * buf must not already be in the hash table (asserted).
 723  723   */
 724  724  static arc_buf_hdr_t *
 725  725  buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 726  726  {
 727  727          uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 728  728          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 729  729          arc_buf_hdr_t *fbuf;
 730  730          uint32_t i;
 731  731  
 732  732          ASSERT(!HDR_IN_HASH_TABLE(buf));
 733  733          *lockp = hash_lock;
 734  734          mutex_enter(hash_lock);
           /* i counts the entries already on this chain */
 735  735          for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 736  736              fbuf = fbuf->b_hash_next, i++) {
 737  737                  if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 738  738                          return (fbuf);
 739  739          }
 740  740  
           /* no duplicate: push buf onto the head of the chain */
 741  741          buf->b_hash_next = buf_hash_table.ht_table[idx];
 742  742          buf_hash_table.ht_table[idx] = buf;
 743  743          buf->b_flags |= ARC_IN_HASH_TABLE;
 744  744  
 745  745          /* collect some hash table performance data */
 746  746          if (i > 0) {
 747  747                  ARCSTAT_BUMP(arcstat_hash_collisions);
           /* chain just grew from one entry to two */
 748  748                  if (i == 1)
 749  749                          ARCSTAT_BUMP(arcstat_hash_chains);
 750  750  
 751  751                  ARCSTAT_MAX(arcstat_hash_chain_max, i);
 752  752          }
 753  753  
 754  754          ARCSTAT_BUMP(arcstat_hash_elements);
 755  755          ARCSTAT_MAXSTAT(arcstat_hash_elements);
 756  756  
 757  757          return (NULL);
 758  758  }
 759  759  
           /*
            * Unlink buf from its hash chain and clear ARC_IN_HASH_TABLE.
            *
            * The caller must already hold the hash lock for buf's chain, and
            * buf must currently be in the table; both are asserted.  The walk
            * asserts rather than checks for running off the end of the chain.
            */
 760  760  static void
 761  761  buf_hash_remove(arc_buf_hdr_t *buf)
 762  762  {
 763  763          arc_buf_hdr_t *fbuf, **bufp;
 764  764          uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 765  765  
 766  766          ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 767  767          ASSERT(HDR_IN_HASH_TABLE(buf));
 768  768  
           /* bufp tracks the link that points at the current entry */
 769  769          bufp = &buf_hash_table.ht_table[idx];
 770  770          while ((fbuf = *bufp) != buf) {
 771  771                  ASSERT(fbuf != NULL);
 772  772                  bufp = &fbuf->b_hash_next;
 773  773          }
 774  774          *bufp = buf->b_hash_next;
 775  775          buf->b_hash_next = NULL;
 776  776          buf->b_flags &= ~ARC_IN_HASH_TABLE;
 777  777  
 778  778          /* collect some hash table performance data */
 779  779          ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 780  780  
           /* exactly one entry left: this bucket no longer counts as a chain */
 781  781          if (buf_hash_table.ht_table[idx] &&
 782  782              buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 783  783                  ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 784  784  }
 785  785  
 786  786  /*
 787  787   * Global data structures and functions for the buf kmem cache.
            *
            * hdr_cache supplies arc_buf_hdr_t objects and buf_cache supplies
            * arc_buf_t objects; both are created in buf_init() and destroyed in
            * buf_fini().
 788  788   */
 789  789  static kmem_cache_t *hdr_cache;
 790  790  static kmem_cache_t *buf_cache;
 791  791  
           /*
            * Tear down what buf_init() built: free the hash table array, destroy
            * every hash lock, then destroy the header and buffer kmem caches.
            */
 792  792  static void
 793  793  buf_fini(void)
 794  794  {
 795  795          int i;
 796  796  
           /* ht_mask is (entries - 1), so ht_mask + 1 recovers the entry count */
 797  797          kmem_free(buf_hash_table.ht_table,
 798  798              (buf_hash_table.ht_mask + 1) * sizeof (void *));
 799  799          for (i = 0; i < BUF_LOCKS; i++)
 800  800                  mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 801  801          kmem_cache_destroy(hdr_cache);
 802  802          kmem_cache_destroy(buf_cache);
 803  803  }
 804  804  
 805  805  /*
 806  806   * Constructor callback - called when the cache is empty
 807  807   * and a new buf is requested.
            *
            * hdr_cons zeroes the arc_buf_hdr_t, initializes its refcount,
            * condition variable and freeze lock, and charges
            * sizeof (arc_buf_hdr_t) to ARC_SPACE_HDRS.  It always returns 0;
            * the 'unused' and 'kmflag' arguments are ignored.
 808  808   */
 809  809  /* ARGSUSED */
 810  810  static int
 811  811  hdr_cons(void *vbuf, void *unused, int kmflag)
 812  812  {
 813  813          arc_buf_hdr_t *buf = vbuf;
 814  814  
 815  815          bzero(buf, sizeof (arc_buf_hdr_t));
 816  816          refcount_create(&buf->b_refcnt);
 817  817          cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 818  818          mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 819  819          arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 820  820  
 821  821          return (0);
 822  822  }
 823  823  
           /*
            * Constructor callback for buf_cache: zeroes the arc_buf_t,
            * initializes its evict lock, and charges sizeof (arc_buf_t) to
            * ARC_SPACE_HDRS.  Always returns 0; 'unused' and 'kmflag' are
            * ignored.
            */
 824  824  /* ARGSUSED */
 825  825  static int
 826  826  buf_cons(void *vbuf, void *unused, int kmflag)
 827  827  {
 828  828          arc_buf_t *buf = vbuf;
 829  829  
 830  830          bzero(buf, sizeof (arc_buf_t));
 831  831          mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 832  832          arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 833  833  
 834  834          return (0);
 835  835  }
 836  836  
 837  837  /*
 838  838   * Destructor callback - called when a cached buf is
 839  839   * no longer required.
            *
            * hdr_dest undoes hdr_cons: it destroys the refcount, condition
            * variable and freeze lock and returns sizeof (arc_buf_hdr_t) to
            * ARC_SPACE_HDRS.  The header must carry no identity (BUF_EMPTY is
            * asserted).
 840  840   */
 841  841  /* ARGSUSED */
 842  842  static void
 843  843  hdr_dest(void *vbuf, void *unused)
 844  844  {
 845  845          arc_buf_hdr_t *buf = vbuf;
 846  846  
 847  847          ASSERT(BUF_EMPTY(buf));
 848  848          refcount_destroy(&buf->b_refcnt);
 849  849          cv_destroy(&buf->b_cv);
 850  850          mutex_destroy(&buf->b_freeze_lock);
 851  851          arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 852  852  }
 853  853  
           /*
            * Destructor callback for buf_cache; undoes buf_cons by destroying
            * the evict lock and returning sizeof (arc_buf_t) to ARC_SPACE_HDRS.
            */
 854  854  /* ARGSUSED */
 855  855  static void
 856  856  buf_dest(void *vbuf, void *unused)
 857  857  {
 858  858          arc_buf_t *buf = vbuf;
 859  859  
 860  860          mutex_destroy(&buf->b_evict_lock);
 861  861          arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 862  862  }
 863  863  
 864  864  /*
 865  865   * Reclaim callback -- invoked when memory is low.
            *
            * Registered for hdr_cache in buf_init().  It does no reclaiming
            * itself; it only signals arc_reclaim_thr_cv, and does nothing once
            * arc_dead is set.
 866  866   */
 867  867  /* ARGSUSED */
 868  868  static void
 869  869  hdr_recl(void *unused)
 870  870  {
 871  871          dprintf("hdr_recl called\n");
 872  872          /*
 873  873           * umem calls the reclaim func when we destroy the buf cache,
 874  874           * which is after we do arc_fini().
 875  875           */
 876  876          if (!arc_dead)
 877  877                  cv_signal(&arc_reclaim_thr_cv);
 878  878  }
 879  879  
           /*
            * Build the header hash table and its supporting state:
            *
            *  - size the table (a power of two, at least 2^12 entries) and
            *    allocate it, halving the size and retrying if the non-sleeping
            *    allocation fails;
            *  - create the arc_buf_hdr_t and arc_buf_t kmem caches;
            *  - generate the CRC-64 lookup table used by buf_hash();
            *  - initialize the BUF_LOCKS hash mutexes.
            */
 880  880  static void
 881  881  buf_init(void)
 882  882  {
 883  883          uint64_t *ct;
 884  884          uint64_t hsize = 1ULL << 12;
 885  885          int i, j;
 886  886  
 887  887          /*
 888  888           * The hash table is big enough to fill all of physical memory
 889  889           * with an average 64K block size.  The table will take up
 890  890           * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
                   *
                   * The byte count is computed in 64 bits: physmem and PAGESIZE
                   * are declared elsewhere, and if their native product is
                   * narrower than 64 bits it could wrap before the comparison.
 891  891           */
 892  892          while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
 893  893                  hsize <<= 1;
 894  894  retry:
 895  895          buf_hash_table.ht_mask = hsize - 1;
 896  896          buf_hash_table.ht_table =
 897  897              kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 898  898          if (buf_hash_table.ht_table == NULL) {
                   /* allocation failed: try again with half as many buckets */
 899  899                  ASSERT(hsize > (1ULL << 8));
 900  900                  hsize >>= 1;
 901  901                  goto retry;
 902  902          }
 903  903  
 904  904          hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 905  905              0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 906  906          buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 907  907              0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 908  908  
                   /* bitwise CRC-64 of each byte value, 8 shift/xor steps apiece */
 909  909          for (i = 0; i < 256; i++)
 910  910                  for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 911  911                          *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 912  912  
 913  913          for (i = 0; i < BUF_LOCKS; i++) {
 914  914                  mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 915  915                      NULL, MUTEX_DEFAULT, NULL);
 916  916          }
 917  917  }
 918  918  
           /*
            * hz / 16.  Assuming hz is the number of clock ticks per second,
            * this is 1/16 of a second expressed in ticks.
            */
 919  919  #define ARC_MINTIME     (hz>>4) /* 62 ms */
 920  920  
           /*
            * Debug aid: check that a frozen buffer has not been modified.
            *
            * Does nothing unless ZFS_DEBUG_MODIFY is set in zfs_flags.  Under
            * the header's b_freeze_lock, returns quietly if no freeze checksum
            * has been recorded or the header has ARC_IO_ERROR set; otherwise
            * recomputes the fletcher-2 checksum of the data and panics if it
            * differs from the stored one.
            */
 921  921  static void
 922  922  arc_cksum_verify(arc_buf_t *buf)
 923  923  {
 924  924          zio_cksum_t zc;
 925  925  
 926  926          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 927  927                  return;
 928  928  
 929  929          mutex_enter(&buf->b_hdr->b_freeze_lock);
 930  930          if (buf->b_hdr->b_freeze_cksum == NULL ||
 931  931              (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 932  932                  mutex_exit(&buf->b_hdr->b_freeze_lock);
 933  933                  return;
 934  934          }
 935  935          fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 936  936          if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 937  937                  panic("buffer modified while frozen!");
 938  938          mutex_exit(&buf->b_hdr->b_freeze_lock);
 939  939  }
 940  940  
           /*
            * Return non-zero if the buffer's current fletcher-2 checksum matches
            * the stored freeze checksum, zero otherwise.  Runs under
            * b_freeze_lock.
            *
            * NOTE(review): b_freeze_cksum is dereferenced without a NULL check
            * and there is no ZFS_DEBUG_MODIFY gate here, so callers must ensure
            * a freeze checksum has already been computed.
            */
 941  941  static int
 942  942  arc_cksum_equal(arc_buf_t *buf)
 943  943  {
 944  944          zio_cksum_t zc;
 945  945          int equal;
 946  946  
 947  947          mutex_enter(&buf->b_hdr->b_freeze_lock);
 948  948          fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 949  949          equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 950  950          mutex_exit(&buf->b_hdr->b_freeze_lock);
 951  951  
 952  952          return (equal);
 953  953  }
 954  954  
           /*
            * Record a freeze checksum for the buffer if it does not have one.
            *
            * Skipped entirely unless 'force' is set or ZFS_DEBUG_MODIFY is
            * enabled.  If a checksum already exists nothing is changed and
            * arc_buf_watch() is not called.  Otherwise a zio_cksum_t is
            * allocated (KM_SLEEP), filled with the fletcher-2 checksum of the
            * data under b_freeze_lock, and arc_buf_watch() is called after the
            * lock is dropped.
            */
 955  955  static void
 956  956  arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 957  957  {
 958  958          if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 959  959                  return;
 960  960  
 961  961          mutex_enter(&buf->b_hdr->b_freeze_lock);
 962  962          if (buf->b_hdr->b_freeze_cksum != NULL) {
 963  963                  mutex_exit(&buf->b_hdr->b_freeze_lock);
 964  964                  return;
 965  965          }
 966  966          buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 967  967          fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 968  968              buf->b_hdr->b_freeze_cksum);
 969  969          mutex_exit(&buf->b_hdr->b_freeze_lock);
 970  970          arc_buf_watch(buf);
 971  971  }
 972  972  
           /*
            * Userland-only (!_KERNEL) message written to arc_procfd by
            * arc_buf_watch() and arc_buf_unwatch(): a command word followed by
            * a prwatch_t (presumably the /proc control-message layout for
            * PCWATCH -- confirm).
            */
 973  973  #ifndef _KERNEL
 974  974  typedef struct procctl {
 975  975          long cmd;
 976  976          prwatch_t prwatch;
 977  977  } procctl_t;
 978  978  #endif
 979  979  
           /*
            * Userland-only; compiles to an empty function under _KERNEL.
            *
            * When arc_watch is set, writes a PCWATCH request to arc_procfd for
            * the buffer's data address with pr_size and pr_wflags both zero
            * (presumably this clears the watchpoint installed by
            * arc_buf_watch() -- confirm).  The write must transfer the whole
            * message; this is asserted.
            */
 980  980  /* ARGSUSED */
 981  981  static void
 982  982  arc_buf_unwatch(arc_buf_t *buf)
 983  983  {
 984  984  #ifndef _KERNEL
 985  985          if (arc_watch) {
 986  986                  int result;
 987  987                  procctl_t ctl;
 988  988                  ctl.cmd = PCWATCH;
 989  989                  ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 990  990                  ctl.prwatch.pr_size = 0;
 991  991                  ctl.prwatch.pr_wflags = 0;
 992  992                  result = write(arc_procfd, &ctl, sizeof (ctl));
 993  993                  ASSERT3U(result, ==, sizeof (ctl));
 994  994          }
 995  995  #endif
 996  996  }
 997  997  
           /*
            * Userland-only; compiles to an empty function under _KERNEL.
            *
            * When arc_watch is set, writes a PCWATCH request to arc_procfd
            * covering b_size bytes at the buffer's data address with WA_WRITE
            * (presumably so that any write to a frozen buffer traps --
            * confirm).  The write must transfer the whole message; this is
            * asserted.
            */
 998  998  /* ARGSUSED */
 999  999  static void
1000 1000  arc_buf_watch(arc_buf_t *buf)
1001 1001  {
1002 1002  #ifndef _KERNEL
1003 1003          if (arc_watch) {
1004 1004                  int result;
1005 1005                  procctl_t ctl;
1006 1006                  ctl.cmd = PCWATCH;
1007 1007                  ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1008 1008                  ctl.prwatch.pr_size = buf->b_hdr->b_size;
1009 1009                  ctl.prwatch.pr_wflags = WA_WRITE;
1010 1010                  result = write(arc_procfd, &ctl, sizeof (ctl));
1011 1011                  ASSERT3U(result, ==, sizeof (ctl));
1012 1012          }
1013 1013  #endif
1014 1014  }
1015 1015  
           /*
            * Make a frozen buffer modifiable again by discarding its freeze
            * checksum.
            *
            * With ZFS_DEBUG_MODIFY set this first panics if the buffer is not
            * in the arc_anon state or has I/O in progress, and verifies the
            * existing checksum.  The checksum is then freed under
            * b_freeze_lock.  Also under ZFS_DEBUG_MODIFY, b_thawed is replaced
            * with a fresh 1-byte allocation (its purpose is not evident from
            * this function; presumably a debugging marker -- confirm).
            * Finally arc_buf_unwatch() is called.
            */
1016 1016  void
1017 1017  arc_buf_thaw(arc_buf_t *buf)
1018 1018  {
1019 1019          if (zfs_flags & ZFS_DEBUG_MODIFY) {
1020 1020                  if (buf->b_hdr->b_state != arc_anon)
1021 1021                          panic("modifying non-anon buffer!");
1022 1022                  if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1023 1023                          panic("modifying buffer while i/o in progress!");
1024 1024                  arc_cksum_verify(buf);
1025 1025          }
1026 1026  
1027 1027          mutex_enter(&buf->b_hdr->b_freeze_lock);
1028 1028          if (buf->b_hdr->b_freeze_cksum != NULL) {
1029 1029                  kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1030 1030                  buf->b_hdr->b_freeze_cksum = NULL;
1031 1031          }
1032 1032  
1033 1033          if (zfs_flags & ZFS_DEBUG_MODIFY) {
1034 1034                  if (buf->b_hdr->b_thawed)
1035 1035                          kmem_free(buf->b_hdr->b_thawed, 1);
1036 1036                  buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1037 1037          }
1038 1038  
1039 1039          mutex_exit(&buf->b_hdr->b_freeze_lock);
1040 1040  
1041 1041          arc_buf_unwatch(buf);
1042 1042  }
1043 1043  
           /*
            * Freeze a buffer by recording its checksum so later modifications
            * can be detected.
            *
            * Does nothing unless ZFS_DEBUG_MODIFY is set.  Holds the header's
            * hash lock while asserting that the buffer either already has a
            * freeze checksum or is in the arc_anon state, and while calling
            * arc_cksum_compute() with force == B_FALSE.
            */
1044 1044  void
1045 1045  arc_buf_freeze(arc_buf_t *buf)
1046 1046  {
1047 1047          kmutex_t *hash_lock;
1048 1048  
1049 1049          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1050 1050                  return;
1051 1051  
1052 1052          hash_lock = HDR_LOCK(buf->b_hdr);
1053 1053          mutex_enter(hash_lock);
1054 1054  
1055 1055          ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1056 1056              buf->b_hdr->b_state == arc_anon);
1057 1057          arc_cksum_compute(buf, B_FALSE);
1058 1058          mutex_exit(hash_lock);
1059 1059  
1060 1060  }
1061 1061  
           /*
            * Add a reference, identified by 'tag', to header 'ab'.  The caller
            * must hold hash_lock (asserted).
            *
            * If this is the header's first reference and it is not in the
            * arc_anon state, the header is taken off its state's list for its
            * buffer type (under arcs_mtx, which must not already be held) and
            * its size is subtracted from that list's arcs_lsize.  The size is
            * b_size * b_datacnt, or just b_size for a ghost state.  In that
            * same case ARC_PREFETCH is cleared from b_flags.
            */
1062 1062  static void
1063 1063  add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1064 1064  {
1065 1065          ASSERT(MUTEX_HELD(hash_lock));
1066 1066  
1067 1067          if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1068 1068              (ab->b_state != arc_anon)) {
1069 1069                  uint64_t delta = ab->b_size * ab->b_datacnt;
1070 1070                  list_t *list = &ab->b_state->arcs_list[ab->b_type];
1071 1071                  uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1072 1072  
1073 1073                  ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1074 1074                  mutex_enter(&ab->b_state->arcs_mtx);
1075 1075                  ASSERT(list_link_active(&ab->b_arc_node));
1076 1076                  list_remove(list, ab);
1077 1077                  if (GHOST_STATE(ab->b_state)) {
           /* ghost headers hold no data buffers; account b_size once */
1078 1078                          ASSERT0(ab->b_datacnt);
1079 1079                          ASSERT3P(ab->b_buf, ==, NULL);
1080 1080                          delta = ab->b_size;
1081 1081                  }
1082 1082                  ASSERT(delta > 0);
1083 1083                  ASSERT3U(*size, >=, delta);
1084 1084                  atomic_add_64(size, -delta);
1085 1085                  mutex_exit(&ab->b_state->arcs_mtx);
1086 1086                  /* remove the prefetch flag if we get a reference */
1087 1087                  if (ab->b_flags & ARC_PREFETCH)
1088 1088                          ab->b_flags &= ~ARC_PREFETCH;
1089 1089          }
1090 1090  }
1091 1091  
           /*
            * Drop the reference identified by 'tag' from header 'ab' and return
            * the remaining reference count.
            *
            * hash_lock must be held unless the header is in the arc_anon state
            * (asserted), and the header must not be in a ghost state.  When the
            * count reaches zero on a non-anon header, the header is inserted at
            * the head of its state's list for its buffer type (under arcs_mtx)
            * and b_size * b_datacnt is added to that list's arcs_lsize.  The
            * state used throughout is the one sampled on entry.
            */
1092 1092  static int
1093 1093  remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1094 1094  {
1095 1095          int cnt;
1096 1096          arc_state_t *state = ab->b_state;
1097 1097  
1098 1098          ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1099 1099          ASSERT(!GHOST_STATE(state));
1100 1100  
1101 1101          if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1102 1102              (state != arc_anon)) {
1103 1103                  uint64_t *size = &state->arcs_lsize[ab->b_type];
1104 1104  
1105 1105                  ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1106 1106                  mutex_enter(&state->arcs_mtx);
1107 1107                  ASSERT(!list_link_active(&ab->b_arc_node));
1108 1108                  list_insert_head(&state->arcs_list[ab->b_type], ab);
1109 1109                  ASSERT(ab->b_datacnt > 0);
1110 1110                  atomic_add_64(size, ab->b_size * ab->b_datacnt);
1111 1111                  mutex_exit(&state->arcs_mtx);
1112 1112          }
1113 1113          return (cnt);
1114 1114  }
1115 1115  
1116 1116  /*
1117 1117   * Move the supplied buffer to the indicated state.  The mutex
1118 1118   * for the buffer must be held by the caller.
            *
            * Two kinds of accounting are adjusted.  The per-list arcs_lsize
            * counters and list membership only change when the header has no
            * references (refcnt == 0).  The per-state arcs_size totals are
            * adjusted regardless of the reference count.  A header moving to
            * arc_anon is also removed from the hash table, and the L2ARC header
            * statistics are updated when entering or leaving arc_l2c_only.
1119 1119   */
1120 1120  static void
1121 1121  arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1122 1122  {
1123 1123          arc_state_t *old_state = ab->b_state;
1124 1124          int64_t refcnt = refcount_count(&ab->b_refcnt);
1125 1125          uint64_t from_delta, to_delta;
1126 1126  
1127 1127          ASSERT(MUTEX_HELD(hash_lock));
1128 1128          ASSERT(new_state != old_state);
1129 1129          ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1130 1130          ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1131 1131          ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1132 1132  
           /* bytes leaving old_state / entering new_state; overridden for ghosts */
1133 1133          from_delta = to_delta = ab->b_datacnt * ab->b_size;
1134 1134  
1135 1135          /*
1136 1136           * If this buffer is evictable, transfer it from the
1137 1137           * old state list to the new state list.
1138 1138           */
1139 1139          if (refcnt == 0) {
1140 1140                  if (old_state != arc_anon) {
           /* take arcs_mtx only if this thread does not already hold it */
1141 1141                          int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1142 1142                          uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1143 1143  
1144 1144                          if (use_mutex)
1145 1145                                  mutex_enter(&old_state->arcs_mtx);
1146 1146  
1147 1147                          ASSERT(list_link_active(&ab->b_arc_node));
1148 1148                          list_remove(&old_state->arcs_list[ab->b_type], ab);
1149 1149  
1150 1150                          /*
1151 1151                           * If prefetching out of the ghost cache,
1152 1152                           * we will have a non-zero datacnt.
1153 1153                           */
1154 1154                          if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1155 1155                                  /* ghost elements have a ghost size */
1156 1156                                  ASSERT(ab->b_buf == NULL);
1157 1157                                  from_delta = ab->b_size;
1158 1158                          }
1159 1159                          ASSERT3U(*size, >=, from_delta);
1160 1160                          atomic_add_64(size, -from_delta);
1161 1161  
1162 1162                          if (use_mutex)
1163 1163                                  mutex_exit(&old_state->arcs_mtx);
1164 1164                  }
1165 1165                  if (new_state != arc_anon) {
           /* same conditional locking as for old_state above */
1166 1166                          int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1167 1167                          uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1168 1168  
1169 1169                          if (use_mutex)
1170 1170                                  mutex_enter(&new_state->arcs_mtx);
1171 1171  
1172 1172                          list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1173 1173  
1174 1174                          /* ghost elements have a ghost size */
1175 1175                          if (GHOST_STATE(new_state)) {
1176 1176                                  ASSERT(ab->b_datacnt == 0);
1177 1177                                  ASSERT(ab->b_buf == NULL);
1178 1178                                  to_delta = ab->b_size;
1179 1179                          }
1180 1180                          atomic_add_64(size, to_delta);
1181 1181  
1182 1182                          if (use_mutex)
1183 1183                                  mutex_exit(&new_state->arcs_mtx);
1184 1184                  }
1185 1185          }
1186 1186  
1187 1187          ASSERT(!BUF_EMPTY(ab));
           /* anonymous headers are not kept in the hash table */
1188 1188          if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1189 1189                  buf_hash_remove(ab);
1190 1190  
1191 1191          /* adjust state sizes */
1192 1192          if (to_delta)
1193 1193                  atomic_add_64(&new_state->arcs_size, to_delta);
1194 1194          if (from_delta) {
1195 1195                  ASSERT3U(old_state->arcs_size, >=, from_delta);
1196 1196                  atomic_add_64(&old_state->arcs_size, -from_delta);
1197 1197          }
1198 1198          ab->b_state = new_state;
1199 1199  
1200 1200          /* adjust l2arc hdr stats */
1201 1201          if (new_state == arc_l2c_only)
1202 1202                  l2arc_hdr_stat_add();
1203 1203          else if (old_state == arc_l2c_only)
1204 1204                  l2arc_hdr_stat_remove();
1205 1205  }
1206 1206  
           /*
            * Account for 'space' bytes of ARC overhead of the given type.
            *
            * The matching per-type statistic is incremented, and then, for
            * every type, arcstat_meta_used and arc_size are increased by the
            * same amount.  'type' must be a valid arc_space_type_t (asserted).
            */
1207 1207  void
1208 1208  arc_space_consume(uint64_t space, arc_space_type_t type)
1209 1209  {
1210 1210          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1211 1211  
1212 1212          switch (type) {
1213 1213          case ARC_SPACE_DATA:
1214 1214                  ARCSTAT_INCR(arcstat_data_size, space);
1215 1215                  break;
1216 1216          case ARC_SPACE_OTHER:
1217 1217                  ARCSTAT_INCR(arcstat_other_size, space);
1218 1218                  break;
1219 1219          case ARC_SPACE_HDRS:
1220 1220                  ARCSTAT_INCR(arcstat_hdr_size, space);
1221 1221                  break;
1222 1222          case ARC_SPACE_L2HDRS:
1223 1223                  ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1224 1224                  break;
1225 1225          }
1226 1226  
1227 1227          ARCSTAT_INCR(arcstat_meta_used, space);
1228 1228          atomic_add_64(&arc_size, space);
1229 1229  }
1230 1230  
           /*
            * Inverse of arc_space_consume(): give back 'space' bytes of the
            * given type.
            *
            * The matching per-type statistic is decremented.  Before
            * arcstat_meta_used is reduced, arc_meta_max is raised to the
            * current arc_meta_used if that is larger, so it records the
            * high-water mark.  arc_size is then reduced by the same amount.
            * Underflow of arc_meta_used or arc_size is caught by assertions
            * only.
            */
1231 1231  void
1232 1232  arc_space_return(uint64_t space, arc_space_type_t type)
1233 1233  {
1234 1234          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1235 1235  
1236 1236          switch (type) {
1237 1237          case ARC_SPACE_DATA:
1238 1238                  ARCSTAT_INCR(arcstat_data_size, -space);
1239 1239                  break;
1240 1240          case ARC_SPACE_OTHER:
1241 1241                  ARCSTAT_INCR(arcstat_other_size, -space);
1242 1242                  break;
1243 1243          case ARC_SPACE_HDRS:
1244 1244                  ARCSTAT_INCR(arcstat_hdr_size, -space);
1245 1245                  break;
1246 1246          case ARC_SPACE_L2HDRS:
1247 1247                  ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1248 1248                  break;
1249 1249          }
1250 1250  
1251 1251          ASSERT(arc_meta_used >= space);
1252 1252          if (arc_meta_max < arc_meta_used)
1253 1253                  arc_meta_max = arc_meta_used;
1254 1254          ARCSTAT_INCR(arcstat_meta_used, -space);
1255 1255          ASSERT(arc_size >= space);
1256 1256          atomic_add_64(&arc_size, -space);
1257 1257  }
1258 1258  
           /*
            * Allocate a 'size'-byte data buffer with zio_data_buf_alloc() and
            * add 'size' to arc_size.  Before allocating, the reclaim thread's
            * condition variable is signalled if
            * arc_evict_needed(ARC_BUFC_DATA) reports that eviction is needed;
            * the allocation itself does not wait for it.
            */
1259 1259  void *
1260 1260  arc_data_buf_alloc(uint64_t size)
1261 1261  {
1262 1262          if (arc_evict_needed(ARC_BUFC_DATA))
1263 1263                  cv_signal(&arc_reclaim_thr_cv);
1264 1264          atomic_add_64(&arc_size, size);
1265 1265          return (zio_data_buf_alloc(size));
1266 1266  }
1267 1267  
           /*
            * Free a data buffer obtained from arc_data_buf_alloc() and subtract
            * its size from arc_size.  'size' must be the size it was allocated
            * with; underflow of arc_size is caught by an assertion only.
            */
1268 1268  void
1269 1269  arc_data_buf_free(void *buf, uint64_t size)
1270 1270  {
1271 1271          zio_data_buf_free(buf, size);
1272 1272          ASSERT(arc_size >= size);
1273 1273          atomic_add_64(&arc_size, -size);
1274 1274  }
1275 1275  
           /*
            * Allocate a new anonymous buffer of 'size' bytes (which must be
            * greater than zero) for the given spa.
            *
            * A header and an arc_buf_t are taken from their kmem caches with
            * KM_PUSHPAGE.  The header is put in the arc_anon state, tagged
            * with the spa's load guid and the requested contents type, and
            * linked to the single new buffer (b_datacnt == 1).  The data
            * itself is attached by arc_get_data_buf().  The returned buffer
            * holds exactly one reference, owned by 'tag'.
            */
1276 1276  arc_buf_t *
1277 1277  arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1278 1278  {
1279 1279          arc_buf_hdr_t *hdr;
1280 1280          arc_buf_t *buf;
1281 1281  
1282 1282          ASSERT3U(size, >, 0);
1283 1283          hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
           /* headers come out of the cache with no identity */
1284 1284          ASSERT(BUF_EMPTY(hdr));
1285 1285          hdr->b_size = size;
1286 1286          hdr->b_type = type;
1287 1287          hdr->b_spa = spa_load_guid(spa);
1288 1288          hdr->b_state = arc_anon;
1289 1289          hdr->b_arc_access = 0;
1290 1290          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1291 1291          buf->b_hdr = hdr;
1292 1292          buf->b_data = NULL;
1293 1293          buf->b_efunc = NULL;
1294 1294          buf->b_private = NULL;
1295 1295          buf->b_next = NULL;
1296 1296          hdr->b_buf = buf;
1297 1297          arc_get_data_buf(buf);
1298 1298          hdr->b_datacnt = 1;
1299 1299          hdr->b_flags = 0;
1300 1300          ASSERT(refcount_is_zero(&hdr->b_refcnt));
1301 1301          (void) refcount_add(&hdr->b_refcnt, tag);
1302 1302  
1303 1303          return (buf);
1304 1304  }
1305 1305  
           /* Reference tag that owns a buffer while it is out on loan. */
1306 1306  static char *arc_onloan_tag = "onloan";
1307 1307  
1308 1308  /*
1309 1309   * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1310 1310   * flight data by arc_tempreserve_space() until they are "returned". Loaned
1311 1311   * buffers must be returned to the arc before they can be used by the DMU or
1312 1312   * freed.
            *
            * The buffer is an ARC_BUFC_DATA buffer allocated with
            * arc_buf_alloc(); its single reference is held by arc_onloan_tag
            * and 'size' is added to arc_loaned_bytes.
1313 1313   */
1314 1314  arc_buf_t *
1315 1315  arc_loan_buf(spa_t *spa, int size)
1316 1316  {
1317 1317          arc_buf_t *buf;
1318 1318  
1319 1319          buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1320 1320  
1321 1321          atomic_add_64(&arc_loaned_bytes, size);
1322 1322          return (buf);
1323 1323  }
1324 1324  
1325 1325  /*
1326 1326   * Return a loaned arc buffer to the arc.
            *
            * Ownership of the reference moves from arc_onloan_tag to 'tag', and
            * the buffer's size is subtracted from arc_loaned_bytes.  The new
            * reference is added before the loan reference is removed
            * (presumably so the count never transiently reaches zero --
            * confirm).
1327 1327   */
1328 1328  void
1329 1329  arc_return_buf(arc_buf_t *buf, void *tag)
1330 1330  {
1331 1331          arc_buf_hdr_t *hdr = buf->b_hdr;
1332 1332  
1333 1333          ASSERT(buf->b_data != NULL);
1334 1334          (void) refcount_add(&hdr->b_refcnt, tag);
1335 1335          (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1336 1336  
1337 1337          atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1338 1338  }
1339 1339  
1340 1340  /* Detach an arc_buf from a dbuf (tag) */
1341 1341  void
1342 1342  arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1343 1343  {
1344 1344          arc_buf_hdr_t *hdr;
1345 1345  
1346 1346          ASSERT(buf->b_data != NULL);
1347 1347          hdr = buf->b_hdr;
1348 1348          (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1349 1349          (void) refcount_remove(&hdr->b_refcnt, tag);
1350 1350          buf->b_efunc = NULL;
1351 1351          buf->b_private = NULL;
1352 1352  
1353 1353          atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1354 1354  }
1355 1355  
1356 1356  static arc_buf_t *
1357 1357  arc_buf_clone(arc_buf_t *from)
1358 1358  {
1359 1359          arc_buf_t *buf;
1360 1360          arc_buf_hdr_t *hdr = from->b_hdr;
1361 1361          uint64_t size = hdr->b_size;
1362 1362  
1363 1363          ASSERT(hdr->b_state != arc_anon);
1364 1364  
1365 1365          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1366 1366          buf->b_hdr = hdr;
1367 1367          buf->b_data = NULL;
1368 1368          buf->b_efunc = NULL;
1369 1369          buf->b_private = NULL;
1370 1370          buf->b_next = hdr->b_buf;
1371 1371          hdr->b_buf = buf;
1372 1372          arc_get_data_buf(buf);
1373 1373          bcopy(from->b_data, buf->b_data, size);
1374 1374  
1375 1375          /*
1376 1376           * This buffer already exists in the arc so create a duplicate
1377 1377           * copy for the caller.  If the buffer is associated with user data
1378 1378           * then track the size and number of duplicates.  These stats will be
1379 1379           * updated as duplicate buffers are created and destroyed.
1380 1380           */
1381 1381          if (hdr->b_type == ARC_BUFC_DATA) {
1382 1382                  ARCSTAT_BUMP(arcstat_duplicate_buffers);
1383 1383                  ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1384 1384          }
1385 1385          hdr->b_datacnt += 1;
1386 1386          return (buf);
1387 1387  }
1388 1388  
1389 1389  void
1390 1390  arc_buf_add_ref(arc_buf_t *buf, void* tag)
1391 1391  {
1392 1392          arc_buf_hdr_t *hdr;
1393 1393          kmutex_t *hash_lock;
1394 1394  
1395 1395          /*
1396 1396           * Check to see if this buffer is evicted.  Callers
1397 1397           * must verify b_data != NULL to know if the add_ref
1398 1398           * was successful.
1399 1399           */
1400 1400          mutex_enter(&buf->b_evict_lock);
1401 1401          if (buf->b_data == NULL) {
1402 1402                  mutex_exit(&buf->b_evict_lock);
1403 1403                  return;
1404 1404          }
1405 1405          hash_lock = HDR_LOCK(buf->b_hdr);
1406 1406          mutex_enter(hash_lock);
1407 1407          hdr = buf->b_hdr;
1408 1408          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1409 1409          mutex_exit(&buf->b_evict_lock);
1410 1410  
1411 1411          ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1412 1412          add_reference(hdr, hash_lock, tag);
1413 1413          DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1414 1414          arc_access(hdr, hash_lock);
1415 1415          mutex_exit(hash_lock);
1416 1416          ARCSTAT_BUMP(arcstat_hits);
1417 1417          ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1418 1418              demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1419 1419              data, metadata, hits);
1420 1420  }
1421 1421  
1422 1422  /*
1423 1423   * Free the arc data buffer.  If it is an l2arc write in progress,
1424 1424   * the buffer is placed on l2arc_free_on_write to be freed later.
1425 1425   */
1426 1426  static void
1427 1427  arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1428 1428  {
1429 1429          arc_buf_hdr_t *hdr = buf->b_hdr;
1430 1430  
1431 1431          if (HDR_L2_WRITING(hdr)) {
1432 1432                  l2arc_data_free_t *df;
1433 1433                  df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1434 1434                  df->l2df_data = buf->b_data;
1435 1435                  df->l2df_size = hdr->b_size;
1436 1436                  df->l2df_func = free_func;
1437 1437                  mutex_enter(&l2arc_free_on_write_mtx);
1438 1438                  list_insert_head(l2arc_free_on_write, df);
1439 1439                  mutex_exit(&l2arc_free_on_write_mtx);
1440 1440                  ARCSTAT_BUMP(arcstat_l2_free_on_write);
1441 1441          } else {
1442 1442                  free_func(buf->b_data, hdr->b_size);
1443 1443          }
1444 1444  }
1445 1445  
1446 1446  static void
1447 1447  arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1448 1448  {
1449 1449          arc_buf_t **bufp;
1450 1450  
1451 1451          /* free up data associated with the buf */
1452 1452          if (buf->b_data) {
1453 1453                  arc_state_t *state = buf->b_hdr->b_state;
1454 1454                  uint64_t size = buf->b_hdr->b_size;
1455 1455                  arc_buf_contents_t type = buf->b_hdr->b_type;
1456 1456  
1457 1457                  arc_cksum_verify(buf);
1458 1458                  arc_buf_unwatch(buf);
1459 1459  
1460 1460                  if (!recycle) {
1461 1461                          if (type == ARC_BUFC_METADATA) {
1462 1462                                  arc_buf_data_free(buf, zio_buf_free);
1463 1463                                  arc_space_return(size, ARC_SPACE_DATA);
1464 1464                          } else {
1465 1465                                  ASSERT(type == ARC_BUFC_DATA);
1466 1466                                  arc_buf_data_free(buf, zio_data_buf_free);
1467 1467                                  ARCSTAT_INCR(arcstat_data_size, -size);
1468 1468                                  atomic_add_64(&arc_size, -size);
1469 1469                          }
1470 1470                  }
1471 1471                  if (list_link_active(&buf->b_hdr->b_arc_node)) {
1472 1472                          uint64_t *cnt = &state->arcs_lsize[type];
1473 1473  
1474 1474                          ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1475 1475                          ASSERT(state != arc_anon);
1476 1476  
1477 1477                          ASSERT3U(*cnt, >=, size);
1478 1478                          atomic_add_64(cnt, -size);
1479 1479                  }
1480 1480                  ASSERT3U(state->arcs_size, >=, size);
1481 1481                  atomic_add_64(&state->arcs_size, -size);
1482 1482                  buf->b_data = NULL;
1483 1483  
1484 1484                  /*
1485 1485                   * If we're destroying a duplicate buffer make sure
1486 1486                   * that the appropriate statistics are updated.
1487 1487                   */
1488 1488                  if (buf->b_hdr->b_datacnt > 1 &&
1489 1489                      buf->b_hdr->b_type == ARC_BUFC_DATA) {
1490 1490                          ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1491 1491                          ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1492 1492                  }
1493 1493                  ASSERT(buf->b_hdr->b_datacnt > 0);
1494 1494                  buf->b_hdr->b_datacnt -= 1;
1495 1495          }
1496 1496  
1497 1497          /* only remove the buf if requested */
1498 1498          if (!all)
1499 1499                  return;
1500 1500  
1501 1501          /* remove the buf from the hdr list */
1502 1502          for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1503 1503                  continue;
1504 1504          *bufp = buf->b_next;
1505 1505          buf->b_next = NULL;
1506 1506  
1507 1507          ASSERT(buf->b_efunc == NULL);
1508 1508  
1509 1509          /* clean up the buf */
1510 1510          buf->b_hdr = NULL;
1511 1511          kmem_cache_free(buf_cache, buf);
1512 1512  }
1513 1513  
/*
 * Tear down a header and return it to hdr_cache.
 *
 * The header must have no holds, be in the anonymous state, and have no
 * I/O in progress (all asserted).  In order, this:
 *   - detaches and frees the header's l2arc header, if it has one;
 *   - discards the header's identity if it is not empty;
 *   - disposes of every buf on the header's buf list;
 *   - frees the frozen checksum and the b_thawed allocation, if present.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
        ASSERT(refcount_is_zero(&hdr->b_refcnt));
        ASSERT3P(hdr->b_state, ==, arc_anon);
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
        l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;

        if (l2hdr != NULL) {
                /* Remember whether the caller already owns the list lock. */
                boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
                /*
                 * To prevent arc_free() and l2arc_evict() from
                 * attempting to free the same buffer at the same time,
                 * a FREE_IN_PROGRESS flag is given to arc_free() to
                 * give it priority.  l2arc_evict() can't destroy this
                 * header while we are waiting on l2arc_buflist_mtx.
                 *
                 * The hdr may be removed from l2ad_buflist before we
                 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
                 */
                if (!buflist_held) {
                        mutex_enter(&l2arc_buflist_mtx);
                        l2hdr = hdr->b_l2hdr;
                }

                if (l2hdr != NULL) {
                        list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                        ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
                        kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
                        if (hdr->b_state == arc_l2c_only)
                                l2arc_hdr_stat_remove();
                        hdr->b_l2hdr = NULL;
                }

                /* Only drop the lock if it was taken here. */
                if (!buflist_held)
                        mutex_exit(&l2arc_buflist_mtx);
        }

        if (!BUF_EMPTY(hdr)) {
                ASSERT(!HDR_IN_HASH_TABLE(hdr));
                buf_discard_identity(hdr);
        }
        while (hdr->b_buf) {
                arc_buf_t *buf = hdr->b_buf;

                if (buf->b_efunc) {
                        /*
                         * The buf has an eviction callback: free only its
                         * data here, then move the buf itself from this
                         * header onto arc_eviction_list, pointing its
                         * b_hdr at arc_eviction_hdr.
                         */
                        mutex_enter(&arc_eviction_mtx);
                        mutex_enter(&buf->b_evict_lock);
                        ASSERT(buf->b_hdr != NULL);
                        arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
                        hdr->b_buf = buf->b_next;
                        buf->b_hdr = &arc_eviction_hdr;
                        buf->b_next = arc_eviction_list;
                        arc_eviction_list = buf;
                        mutex_exit(&buf->b_evict_lock);
                        mutex_exit(&arc_eviction_mtx);
                } else {
                        /* No callback: free the data and the buf itself. */
                        arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
                }
        }
        if (hdr->b_freeze_cksum != NULL) {
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }
        if (hdr->b_thawed) {
                kmem_free(hdr->b_thawed, 1);
                hdr->b_thawed = NULL;
        }

        ASSERT(!list_link_active(&hdr->b_arc_node));
        ASSERT3P(hdr->b_hash_next, ==, NULL);
        ASSERT3P(hdr->b_acb, ==, NULL);
        kmem_cache_free(hdr_cache, hdr);
}
1588 1588  
1589 1589  void
1590 1590  arc_buf_free(arc_buf_t *buf, void *tag)
1591 1591  {
1592 1592          arc_buf_hdr_t *hdr = buf->b_hdr;
1593 1593          int hashed = hdr->b_state != arc_anon;
1594 1594  
1595 1595          ASSERT(buf->b_efunc == NULL);
1596 1596          ASSERT(buf->b_data != NULL);
1597 1597  
1598 1598          if (hashed) {
1599 1599                  kmutex_t *hash_lock = HDR_LOCK(hdr);
1600 1600  
1601 1601                  mutex_enter(hash_lock);
1602 1602                  hdr = buf->b_hdr;
1603 1603                  ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1604 1604  
1605 1605                  (void) remove_reference(hdr, hash_lock, tag);
1606 1606                  if (hdr->b_datacnt > 1) {
1607 1607                          arc_buf_destroy(buf, FALSE, TRUE);
1608 1608                  } else {
1609 1609                          ASSERT(buf == hdr->b_buf);
1610 1610                          ASSERT(buf->b_efunc == NULL);
1611 1611                          hdr->b_flags |= ARC_BUF_AVAILABLE;
1612 1612                  }
1613 1613                  mutex_exit(hash_lock);
1614 1614          } else if (HDR_IO_IN_PROGRESS(hdr)) {
1615 1615                  int destroy_hdr;
1616 1616                  /*
1617 1617                   * We are in the middle of an async write.  Don't destroy
1618 1618                   * this buffer unless the write completes before we finish
1619 1619                   * decrementing the reference count.
1620 1620                   */
1621 1621                  mutex_enter(&arc_eviction_mtx);
1622 1622                  (void) remove_reference(hdr, NULL, tag);
1623 1623                  ASSERT(refcount_is_zero(&hdr->b_refcnt));
1624 1624                  destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1625 1625                  mutex_exit(&arc_eviction_mtx);
1626 1626                  if (destroy_hdr)
1627 1627                          arc_hdr_destroy(hdr);
1628 1628          } else {
1629 1629                  if (remove_reference(hdr, NULL, tag) > 0)
1630 1630                          arc_buf_destroy(buf, FALSE, TRUE);
1631 1631                  else
1632 1632                          arc_hdr_destroy(hdr);
1633 1633          }
1634 1634  }
1635 1635  
1636 1636  boolean_t
1637 1637  arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1638 1638  {
1639 1639          arc_buf_hdr_t *hdr = buf->b_hdr;
1640 1640          kmutex_t *hash_lock = HDR_LOCK(hdr);
1641 1641          boolean_t no_callback = (buf->b_efunc == NULL);
1642 1642  
1643 1643          if (hdr->b_state == arc_anon) {
1644 1644                  ASSERT(hdr->b_datacnt == 1);
1645 1645                  arc_buf_free(buf, tag);
1646 1646                  return (no_callback);
1647 1647          }
1648 1648  
1649 1649          mutex_enter(hash_lock);
1650 1650          hdr = buf->b_hdr;
1651 1651          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1652 1652          ASSERT(hdr->b_state != arc_anon);
1653 1653          ASSERT(buf->b_data != NULL);
1654 1654  
1655 1655          (void) remove_reference(hdr, hash_lock, tag);
1656 1656          if (hdr->b_datacnt > 1) {
1657 1657                  if (no_callback)
1658 1658                          arc_buf_destroy(buf, FALSE, TRUE);
1659 1659          } else if (no_callback) {
1660 1660                  ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1661 1661                  ASSERT(buf->b_efunc == NULL);
1662 1662                  hdr->b_flags |= ARC_BUF_AVAILABLE;
1663 1663          }
1664 1664          ASSERT(no_callback || hdr->b_datacnt > 1 ||
1665 1665              refcount_is_zero(&hdr->b_refcnt));
1666 1666          mutex_exit(hash_lock);
1667 1667          return (no_callback);
1668 1668  }
1669 1669  
1670 1670  int
1671 1671  arc_buf_size(arc_buf_t *buf)
1672 1672  {
1673 1673          return (buf->b_hdr->b_size);
1674 1674  }
1675 1675  
1676 1676  /*
1677 1677   * Called from the DMU to determine if the current buffer should be
1678 1678   * evicted. In order to ensure proper locking, the eviction must be initiated
1679 1679   * from the DMU. Return true if the buffer is associated with user data and
1680 1680   * duplicate buffers still exist.
1681 1681   */
1682 1682  boolean_t
1683 1683  arc_buf_eviction_needed(arc_buf_t *buf)
1684 1684  {
1685 1685          arc_buf_hdr_t *hdr;
1686 1686          boolean_t evict_needed = B_FALSE;
1687 1687  
1688 1688          if (zfs_disable_dup_eviction)
1689 1689                  return (B_FALSE);
1690 1690  
1691 1691          mutex_enter(&buf->b_evict_lock);
1692 1692          hdr = buf->b_hdr;
1693 1693          if (hdr == NULL) {
1694 1694                  /*
1695 1695                   * We are in arc_do_user_evicts(); let that function
1696 1696                   * perform the eviction.
1697 1697                   */
1698 1698                  ASSERT(buf->b_data == NULL);
1699 1699                  mutex_exit(&buf->b_evict_lock);
1700 1700                  return (B_FALSE);
1701 1701          } else if (buf->b_data == NULL) {
1702 1702                  /*
1703 1703                   * We have already been added to the arc eviction list;
1704 1704                   * recommend eviction.
1705 1705                   */
1706 1706                  ASSERT3P(hdr, ==, &arc_eviction_hdr);
1707 1707                  mutex_exit(&buf->b_evict_lock);
1708 1708                  return (B_TRUE);
1709 1709          }
1710 1710  
1711 1711          if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1712 1712                  evict_needed = B_TRUE;
1713 1713  
1714 1714          mutex_exit(&buf->b_evict_lock);
1715 1715          return (evict_needed);
1716 1716  }
1717 1717  
1718 1718  /*
1719 1719   * Evict buffers from list until we've removed the specified number of
1720 1720   * bytes.  Move the removed buffers to the appropriate evict state.
1721 1721   * If the recycle flag is set, then attempt to "recycle" a buffer:
1722 1722   * - look for a buffer to evict that is `bytes' long.
1723 1723   * - return the data block from this buffer rather than freeing it.
1724 1724   * This flag is used by callers that are trying to make space for a
1725 1725   * new buffer in a full arc cache.
1726 1726   *
1727 1727   * This function makes a "best effort".  It skips over any buffers
1728 1728   * it can't get a hash_lock on, and so may not catch all candidates.
1729 1729   * It may also return without evicting as much space as requested.
1730 1730   */
1731 1731  static void *
1732 1732  arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1733 1733      arc_buf_contents_t type)
1734 1734  {
1735 1735          arc_state_t *evicted_state;
1736 1736          uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1737 1737          arc_buf_hdr_t *ab, *ab_prev = NULL;
1738 1738          list_t *list = &state->arcs_list[type];
1739 1739          kmutex_t *hash_lock;
1740 1740          boolean_t have_lock;
1741 1741          void *stolen = NULL;
1742 1742  
1743 1743          ASSERT(state == arc_mru || state == arc_mfu);
1744 1744  
1745 1745          evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1746 1746  
1747 1747          mutex_enter(&state->arcs_mtx);
1748 1748          mutex_enter(&evicted_state->arcs_mtx);
1749 1749  
1750 1750          for (ab = list_tail(list); ab; ab = ab_prev) {
1751 1751                  ab_prev = list_prev(list, ab);
1752 1752                  /* prefetch buffers have a minimum lifespan */
1753 1753                  if (HDR_IO_IN_PROGRESS(ab) ||
1754 1754                      (spa && ab->b_spa != spa) ||
1755 1755                      (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1756 1756                      ddi_get_lbolt() - ab->b_arc_access <
1757 1757                      arc_min_prefetch_lifespan)) {
1758 1758                          skipped++;
1759 1759                          continue;
1760 1760                  }
1761 1761                  /* "lookahead" for better eviction candidate */
1762 1762                  if (recycle && ab->b_size != bytes &&
1763 1763                      ab_prev && ab_prev->b_size == bytes)
1764 1764                          continue;
1765 1765                  hash_lock = HDR_LOCK(ab);
1766 1766                  have_lock = MUTEX_HELD(hash_lock);
1767 1767                  if (have_lock || mutex_tryenter(hash_lock)) {
1768 1768                          ASSERT0(refcount_count(&ab->b_refcnt));
1769 1769                          ASSERT(ab->b_datacnt > 0);
1770 1770                          while (ab->b_buf) {
1771 1771                                  arc_buf_t *buf = ab->b_buf;
1772 1772                                  if (!mutex_tryenter(&buf->b_evict_lock)) {
1773 1773                                          missed += 1;
1774 1774                                          break;
1775 1775                                  }
1776 1776                                  if (buf->b_data) {
1777 1777                                          bytes_evicted += ab->b_size;
1778 1778                                          if (recycle && ab->b_type == type &&
1779 1779                                              ab->b_size == bytes &&
1780 1780                                              !HDR_L2_WRITING(ab)) {
1781 1781                                                  stolen = buf->b_data;
1782 1782                                                  recycle = FALSE;
1783 1783                                          }
1784 1784                                  }
1785 1785                                  if (buf->b_efunc) {
1786 1786                                          mutex_enter(&arc_eviction_mtx);
1787 1787                                          arc_buf_destroy(buf,
1788 1788                                              buf->b_data == stolen, FALSE);
1789 1789                                          ab->b_buf = buf->b_next;
1790 1790                                          buf->b_hdr = &arc_eviction_hdr;
1791 1791                                          buf->b_next = arc_eviction_list;
1792 1792                                          arc_eviction_list = buf;
1793 1793                                          mutex_exit(&arc_eviction_mtx);
1794 1794                                          mutex_exit(&buf->b_evict_lock);
1795 1795                                  } else {
1796 1796                                          mutex_exit(&buf->b_evict_lock);
1797 1797                                          arc_buf_destroy(buf,
1798 1798                                              buf->b_data == stolen, TRUE);
1799 1799                                  }
1800 1800                          }
1801 1801  
1802 1802                          if (ab->b_l2hdr) {
1803 1803                                  ARCSTAT_INCR(arcstat_evict_l2_cached,
1804 1804                                      ab->b_size);
1805 1805                          } else {
1806 1806                                  if (l2arc_write_eligible(ab->b_spa, ab)) {
1807 1807                                          ARCSTAT_INCR(arcstat_evict_l2_eligible,
1808 1808                                              ab->b_size);
1809 1809                                  } else {
1810 1810                                          ARCSTAT_INCR(
1811 1811                                              arcstat_evict_l2_ineligible,
1812 1812                                              ab->b_size);
1813 1813                                  }
1814 1814                          }
1815 1815  
1816 1816                          if (ab->b_datacnt == 0) {
1817 1817                                  arc_change_state(evicted_state, ab, hash_lock);
1818 1818                                  ASSERT(HDR_IN_HASH_TABLE(ab));
1819 1819                                  ab->b_flags |= ARC_IN_HASH_TABLE;
1820 1820                                  ab->b_flags &= ~ARC_BUF_AVAILABLE;
1821 1821                                  DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1822 1822                          }
1823 1823                          if (!have_lock)
1824 1824                                  mutex_exit(hash_lock);
1825 1825                          if (bytes >= 0 && bytes_evicted >= bytes)
1826 1826                                  break;
1827 1827                  } else {
1828 1828                          missed += 1;
1829 1829                  }
1830 1830          }
1831 1831  
1832 1832          mutex_exit(&evicted_state->arcs_mtx);
1833 1833          mutex_exit(&state->arcs_mtx);
1834 1834  
1835 1835          if (bytes_evicted < bytes)
1836 1836                  dprintf("only evicted %lld bytes from %x",
1837 1837                      (longlong_t)bytes_evicted, state);
1838 1838  
1839 1839          if (skipped)
1840 1840                  ARCSTAT_INCR(arcstat_evict_skip, skipped);
1841 1841  
1842 1842          if (missed)
1843 1843                  ARCSTAT_INCR(arcstat_mutex_miss, missed);
1844 1844  
1845 1845          /*
1846 1846           * We have just evicted some data into the ghost state, make
1847 1847           * sure we also adjust the ghost state size if necessary.
1848 1848           */
1849 1849          if (arc_no_grow &&
1850 1850              arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1851 1851                  int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1852 1852                      arc_mru_ghost->arcs_size - arc_c;
1853 1853  
1854 1854                  if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1855 1855                          int64_t todelete =
1856 1856                              MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1857 1857                          arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1858 1858                  } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1859 1859                          int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1860 1860                              arc_mru_ghost->arcs_size +
1861 1861                              arc_mfu_ghost->arcs_size - arc_c);
1862 1862                          arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1863 1863                  }
1864 1864          }
1865 1865  
1866 1866          return (stolen);
1867 1867  }
1868 1868  
1869 1869  /*
1870 1870   * Remove buffers from list until we've removed the specified number of
1871 1871   * bytes.  Destroy the buffers that are removed.
1872 1872   */
1873 1873  static void
1874 1874  arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1875 1875  {
1876 1876          arc_buf_hdr_t *ab, *ab_prev;
1877 1877          arc_buf_hdr_t marker = { 0 };
1878 1878          list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1879 1879          kmutex_t *hash_lock;
1880 1880          uint64_t bytes_deleted = 0;
1881 1881          uint64_t bufs_skipped = 0;
1882 1882  
1883 1883          ASSERT(GHOST_STATE(state));
1884 1884  top:
1885 1885          mutex_enter(&state->arcs_mtx);
1886 1886          for (ab = list_tail(list); ab; ab = ab_prev) {
1887 1887                  ab_prev = list_prev(list, ab);
1888 1888                  if (spa && ab->b_spa != spa)
1889 1889                          continue;
1890 1890  
1891 1891                  /* ignore markers */
1892 1892                  if (ab->b_spa == 0)
1893 1893                          continue;
1894 1894  
1895 1895                  hash_lock = HDR_LOCK(ab);
1896 1896                  /* caller may be trying to modify this buffer, skip it */
1897 1897                  if (MUTEX_HELD(hash_lock))
1898 1898                          continue;
1899 1899                  if (mutex_tryenter(hash_lock)) {
1900 1900                          ASSERT(!HDR_IO_IN_PROGRESS(ab));
1901 1901                          ASSERT(ab->b_buf == NULL);
1902 1902                          ARCSTAT_BUMP(arcstat_deleted);
1903 1903                          bytes_deleted += ab->b_size;
1904 1904  
1905 1905                          if (ab->b_l2hdr != NULL) {
1906 1906                                  /*
1907 1907                                   * This buffer is cached on the 2nd Level ARC;
1908 1908                                   * don't destroy the header.
1909 1909                                   */
1910 1910                                  arc_change_state(arc_l2c_only, ab, hash_lock);
1911 1911                                  mutex_exit(hash_lock);
1912 1912                          } else {
1913 1913                                  arc_change_state(arc_anon, ab, hash_lock);
1914 1914                                  mutex_exit(hash_lock);
1915 1915                                  arc_hdr_destroy(ab);
1916 1916                          }
1917 1917  
1918 1918                          DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1919 1919                          if (bytes >= 0 && bytes_deleted >= bytes)
1920 1920                                  break;
1921 1921                  } else if (bytes < 0) {
1922 1922                          /*
1923 1923                           * Insert a list marker and then wait for the
1924 1924                           * hash lock to become available. Once its
1925 1925                           * available, restart from where we left off.
1926 1926                           */
1927 1927                          list_insert_after(list, ab, &marker);
1928 1928                          mutex_exit(&state->arcs_mtx);
1929 1929                          mutex_enter(hash_lock);
1930 1930                          mutex_exit(hash_lock);
1931 1931                          mutex_enter(&state->arcs_mtx);
1932 1932                          ab_prev = list_prev(list, &marker);
1933 1933                          list_remove(list, &marker);
1934 1934                  } else
1935 1935                          bufs_skipped += 1;
1936 1936          }
1937 1937          mutex_exit(&state->arcs_mtx);
1938 1938  
1939 1939          if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1940 1940              (bytes < 0 || bytes_deleted < bytes)) {
1941 1941                  list = &state->arcs_list[ARC_BUFC_METADATA];
1942 1942                  goto top;
1943 1943          }
1944 1944  
1945 1945          if (bufs_skipped) {
1946 1946                  ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1947 1947                  ASSERT(bytes >= 0);
1948 1948          }
1949 1949  
1950 1950          if (bytes_deleted < bytes)
1951 1951                  dprintf("only deleted %lld bytes from %p",
1952 1952                      (longlong_t)bytes_deleted, state);
1953 1953  }
1954 1954  
1955 1955  static void
1956 1956  arc_adjust(void)
1957 1957  {
1958 1958          int64_t adjustment, delta;
1959 1959  
1960 1960          /*
1961 1961           * Adjust MRU size
1962 1962           */
1963 1963  
1964 1964          adjustment = MIN((int64_t)(arc_size - arc_c),
1965 1965              (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1966 1966              arc_p));
1967 1967  
1968 1968          if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1969 1969                  delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1970 1970                  (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
1971 1971                  adjustment -= delta;
1972 1972          }
1973 1973  
1974 1974          if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1975 1975                  delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1976 1976                  (void) arc_evict(arc_mru, NULL, delta, FALSE,
1977 1977                      ARC_BUFC_METADATA);
1978 1978          }
1979 1979  
1980 1980          /*
1981 1981           * Adjust MFU size
1982 1982           */
1983 1983  
1984 1984          adjustment = arc_size - arc_c;
1985 1985  
1986 1986          if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1987 1987                  delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1988 1988                  (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
1989 1989                  adjustment -= delta;
1990 1990          }
1991 1991  
1992 1992          if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1993 1993                  int64_t delta = MIN(adjustment,
1994 1994                      arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1995 1995                  (void) arc_evict(arc_mfu, NULL, delta, FALSE,
1996 1996                      ARC_BUFC_METADATA);
1997 1997          }
1998 1998  
1999 1999          /*
2000 2000           * Adjust ghost lists
2001 2001           */
2002 2002  
2003 2003          adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2004 2004  
2005 2005          if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2006 2006                  delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2007 2007                  arc_evict_ghost(arc_mru_ghost, NULL, delta);
2008 2008          }
2009 2009  
2010 2010          adjustment =
2011 2011              arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2012 2012  
2013 2013          if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2014 2014                  delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2015 2015                  arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2016 2016          }
2017 2017  }
2018 2018  
2019 2019  static void
2020 2020  arc_do_user_evicts(void)
2021 2021  {
2022 2022          mutex_enter(&arc_eviction_mtx);
2023 2023          while (arc_eviction_list != NULL) {
2024 2024                  arc_buf_t *buf = arc_eviction_list;
2025 2025                  arc_eviction_list = buf->b_next;
2026 2026                  mutex_enter(&buf->b_evict_lock);
2027 2027                  buf->b_hdr = NULL;
2028 2028                  mutex_exit(&buf->b_evict_lock);
2029 2029                  mutex_exit(&arc_eviction_mtx);
2030 2030  
2031 2031                  if (buf->b_efunc != NULL)
2032 2032                          VERIFY(buf->b_efunc(buf) == 0);
2033 2033  
2034 2034                  buf->b_efunc = NULL;
2035 2035                  buf->b_private = NULL;
2036 2036                  kmem_cache_free(buf_cache, buf);
2037 2037                  mutex_enter(&arc_eviction_mtx);
2038 2038          }
2039 2039          mutex_exit(&arc_eviction_mtx);
2040 2040  }
2041 2041  
2042 2042  /*
2043 2043   * Flush all *evictable* data from the cache for the given spa.
2044 2044   * NOTE: this will not touch "active" (i.e. referenced) data.
2045 2045   */
2046 2046  void
2047 2047  arc_flush(spa_t *spa)
2048 2048  {
2049 2049          uint64_t guid = 0;
2050 2050  
2051 2051          if (spa)
2052 2052                  guid = spa_load_guid(spa);
2053 2053  
2054 2054          while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2055 2055                  (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2056 2056                  if (spa)
2057 2057                          break;
2058 2058          }
2059 2059          while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2060 2060                  (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2061 2061                  if (spa)
2062 2062                          break;
2063 2063          }
2064 2064          while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2065 2065                  (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2066 2066                  if (spa)
2067 2067                          break;
2068 2068          }
2069 2069          while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2070 2070                  (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2071 2071                  if (spa)
2072 2072                          break;
2073 2073          }
2074 2074  
2075 2075          arc_evict_ghost(arc_mru_ghost, guid, -1);
2076 2076          arc_evict_ghost(arc_mfu_ghost, guid, -1);
2077 2077  
2078 2078          mutex_enter(&arc_reclaim_thr_lock);
2079 2079          arc_do_user_evicts();
2080 2080          mutex_exit(&arc_reclaim_thr_lock);
2081 2081          ASSERT(spa || arc_eviction_list == NULL);
2082 2082  }
2083 2083  
2084 2084  void
2085 2085  arc_shrink(void)
2086 2086  {
2087 2087          if (arc_c > arc_c_min) {
2088 2088                  uint64_t to_free;
2089 2089  
2090 2090  #ifdef _KERNEL
2091 2091                  to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2092 2092  #else
2093 2093                  to_free = arc_c >> arc_shrink_shift;
2094 2094  #endif
2095 2095                  if (arc_c > arc_c_min + to_free)
2096 2096                          atomic_add_64(&arc_c, -to_free);
2097 2097                  else
2098 2098                          arc_c = arc_c_min;
2099 2099  
2100 2100                  atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2101 2101                  if (arc_c > arc_size)
2102 2102                          arc_c = MAX(arc_size, arc_c_min);
2103 2103                  if (arc_p > arc_c)
2104 2104                          arc_p = (arc_c >> 1);
2105 2105                  ASSERT(arc_c >= arc_c_min);
2106 2106                  ASSERT((int64_t)arc_p >= 0);
2107 2107          }
2108 2108  
2109 2109          if (arc_size > arc_c)
2110 2110                  arc_adjust();
2111 2111  }
2112 2112  
2113 2113  /*
2114 2114   * Determine if the system is under memory pressure and is asking
2115 2115   * to reclaim memory. A return value of 1 indicates that the system
2116 2116   * is under memory pressure and that the arc should adjust accordingly.
2117 2117   */
2118 2118  static int
2119 2119  arc_reclaim_needed(void)
2120 2120  {
2121 2121          uint64_t extra;
2122 2122  
2123 2123  #ifdef _KERNEL
2124 2124  
2125 2125          if (needfree)
2126 2126                  return (1);
2127 2127  
2128 2128          /*
2129 2129           * take 'desfree' extra pages, so we reclaim sooner, rather than later
2130 2130           */
2131 2131          extra = desfree;
2132 2132  
2133 2133          /*
2134 2134           * check that we're out of range of the pageout scanner.  It starts to
2135 2135           * schedule paging if freemem is less than lotsfree and needfree.
2136 2136           * lotsfree is the high-water mark for pageout, and needfree is the
2137 2137           * number of needed free pages.  We add extra pages here to make sure
2138 2138           * the scanner doesn't start up while we're freeing memory.
2139 2139           */
2140 2140          if (freemem < lotsfree + needfree + extra)
2141 2141                  return (1);
2142 2142  
2143 2143          /*
2144 2144           * check to make sure that swapfs has enough space so that anon
2145 2145           * reservations can still succeed. anon_resvmem() checks that the
2146 2146           * availrmem is greater than swapfs_minfree, and the number of reserved
2147 2147           * swap pages.  We also add a bit of extra here just to prevent
2148 2148           * circumstances from getting really dire.
2149 2149           */
2150 2150          if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2151 2151                  return (1);
2152 2152  
2153 2153  #if defined(__i386)
2154 2154          /*
2155 2155           * If we're on an i386 platform, it's possible that we'll exhaust the
2156 2156           * kernel heap space before we ever run out of available physical
2157 2157           * memory.  Most checks of the size of the heap_area compare against
2158 2158           * tune.t_minarmem, which is the minimum available real memory that we
2159 2159           * can have in the system.  However, this is generally fixed at 25 pages
2160 2160           * which is so low that it's useless.  In this comparison, we seek to
2161 2161           * calculate the total heap-size, and reclaim if more than 3/4ths of the
2162 2162           * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2163 2163           * free)
2164 2164           */
2165 2165          if (vmem_size(heap_arena, VMEM_FREE) <
2166 2166              (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2167 2167                  return (1);
2168 2168  #endif
2169 2169  
2170 2170          /*
2171 2171           * If zio data pages are being allocated out of a separate heap segment,
2172 2172           * then enforce that the size of available vmem for this arena remains
2173 2173           * above about 1/16th free.
2174 2174           *
2175 2175           * Note: The 1/16th arena free requirement was put in place
2176 2176           * to aggressively evict memory from the arc in order to avoid
2177 2177           * memory fragmentation issues.
2178 2178           */
2179 2179          if (zio_arena != NULL &&
2180 2180              vmem_size(zio_arena, VMEM_FREE) <
2181 2181              (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2182 2182                  return (1);
2183 2183  #else
2184 2184          if (spa_get_random(100) == 0)
2185 2185                  return (1);
2186 2186  #endif
2187 2187          return (0);
2188 2188  }
2189 2189  
2190 2190  static void
2191 2191  arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2192 2192  {
2193 2193          size_t                  i;
2194 2194          kmem_cache_t            *prev_cache = NULL;
2195 2195          kmem_cache_t            *prev_data_cache = NULL;
2196 2196          extern kmem_cache_t     *zio_buf_cache[];
2197 2197          extern kmem_cache_t     *zio_data_buf_cache[];
2198 2198  
2199 2199  #ifdef _KERNEL
2200 2200          if (arc_meta_used >= arc_meta_limit) {
2201 2201                  /*
2202 2202                   * We are exceeding our meta-data cache limit.
2203 2203                   * Purge some DNLC entries to release holds on meta-data.
2204 2204                   */
2205 2205                  dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2206 2206          }
2207 2207  #if defined(__i386)
2208 2208          /*
2209 2209           * Reclaim unused memory from all kmem caches.
2210 2210           */
2211 2211          kmem_reap();
2212 2212  #endif
2213 2213  #endif
2214 2214  
2215 2215          /*
2216 2216           * An aggressive reclamation will shrink the cache size as well as
2217 2217           * reap free buffers from the arc kmem caches.
2218 2218           */
2219 2219          if (strat == ARC_RECLAIM_AGGR)
2220 2220                  arc_shrink();
2221 2221  
2222 2222          for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2223 2223                  if (zio_buf_cache[i] != prev_cache) {
2224 2224                          prev_cache = zio_buf_cache[i];
2225 2225                          kmem_cache_reap_now(zio_buf_cache[i]);
2226 2226                  }
2227 2227                  if (zio_data_buf_cache[i] != prev_data_cache) {
2228 2228                          prev_data_cache = zio_data_buf_cache[i];
2229 2229                          kmem_cache_reap_now(zio_data_buf_cache[i]);
2230 2230                  }
2231 2231          }
2232 2232          kmem_cache_reap_now(buf_cache);
2233 2233          kmem_cache_reap_now(hdr_cache);
2234 2234  
2235 2235          /*
2236 2236           * Ask the vmem areana to reclaim unused memory from its
2237 2237           * quantum caches.
2238 2238           */
2239 2239          if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2240 2240                  vmem_qcache_reap(zio_arena);
2241 2241  }
2242 2242  
2243 2243  static void
2244 2244  arc_reclaim_thread(void)
2245 2245  {
2246 2246          clock_t                 growtime = 0;
2247 2247          arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2248 2248          callb_cpr_t             cpr;
2249 2249  
2250 2250          CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2251 2251  
2252 2252          mutex_enter(&arc_reclaim_thr_lock);
2253 2253          while (arc_thread_exit == 0) {
2254 2254                  if (arc_reclaim_needed()) {
2255 2255  
2256 2256                          if (arc_no_grow) {
2257 2257                                  if (last_reclaim == ARC_RECLAIM_CONS) {
2258 2258                                          last_reclaim = ARC_RECLAIM_AGGR;
2259 2259                                  } else {
2260 2260                                          last_reclaim = ARC_RECLAIM_CONS;
2261 2261                                  }
2262 2262                          } else {
2263 2263                                  arc_no_grow = TRUE;
2264 2264                                  last_reclaim = ARC_RECLAIM_AGGR;
2265 2265                                  membar_producer();
2266 2266                          }
2267 2267  
2268 2268                          /* reset the growth delay for every reclaim */
2269 2269                          growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2270 2270  
2271 2271                          arc_kmem_reap_now(last_reclaim);
2272 2272                          arc_warm = B_TRUE;
2273 2273  
2274 2274                  } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2275 2275                          arc_no_grow = FALSE;
2276 2276                  }
2277 2277  
2278 2278                  arc_adjust();
2279 2279  
2280 2280                  if (arc_eviction_list != NULL)
2281 2281                          arc_do_user_evicts();
2282 2282  
2283 2283                  /* block until needed, or one second, whichever is shorter */
2284 2284                  CALLB_CPR_SAFE_BEGIN(&cpr);
2285 2285                  (void) cv_timedwait(&arc_reclaim_thr_cv,
2286 2286                      &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2287 2287                  CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2288 2288          }
2289 2289  
2290 2290          arc_thread_exit = 0;
2291 2291          cv_broadcast(&arc_reclaim_thr_cv);
2292 2292          CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2293 2293          thread_exit();
2294 2294  }
2295 2295  
2296 2296  /*
2297 2297   * Adapt arc info given the number of bytes we are trying to add and
2298 2298   * the state that we are comming from.  This function is only called
2299 2299   * when we are adding new content to the cache.
2300 2300   */
2301 2301  static void
2302 2302  arc_adapt(int bytes, arc_state_t *state)
2303 2303  {
2304 2304          int mult;
2305 2305          uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2306 2306  
2307 2307          if (state == arc_l2c_only)
2308 2308                  return;
2309 2309  
2310 2310          ASSERT(bytes > 0);
2311 2311          /*
2312 2312           * Adapt the target size of the MRU list:
2313 2313           *      - if we just hit in the MRU ghost list, then increase
2314 2314           *        the target size of the MRU list.
2315 2315           *      - if we just hit in the MFU ghost list, then increase
2316 2316           *        the target size of the MFU list by decreasing the
2317 2317           *        target size of the MRU list.
2318 2318           */
2319 2319          if (state == arc_mru_ghost) {
2320 2320                  mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2321 2321                      1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2322 2322                  mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2323 2323  
2324 2324                  arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2325 2325          } else if (state == arc_mfu_ghost) {
2326 2326                  uint64_t delta;
2327 2327  
2328 2328                  mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2329 2329                      1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2330 2330                  mult = MIN(mult, 10);
2331 2331  
2332 2332                  delta = MIN(bytes * mult, arc_p);
2333 2333                  arc_p = MAX(arc_p_min, arc_p - delta);
2334 2334          }
2335 2335          ASSERT((int64_t)arc_p >= 0);
2336 2336  
2337 2337          if (arc_reclaim_needed()) {
2338 2338                  cv_signal(&arc_reclaim_thr_cv);
2339 2339                  return;
2340 2340          }
2341 2341  
2342 2342          if (arc_no_grow)
2343 2343                  return;
2344 2344  
2345 2345          if (arc_c >= arc_c_max)
2346 2346                  return;
2347 2347  
2348 2348          /*
2349 2349           * If we're within (2 * maxblocksize) bytes of the target
2350 2350           * cache size, increment the target cache size
2351 2351           */
2352 2352          if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2353 2353                  atomic_add_64(&arc_c, (int64_t)bytes);
2354 2354                  if (arc_c > arc_c_max)
2355 2355                          arc_c = arc_c_max;
2356 2356                  else if (state == arc_anon)
2357 2357                          atomic_add_64(&arc_p, (int64_t)bytes);
2358 2358                  if (arc_p > arc_c)
2359 2359                          arc_p = arc_c;
2360 2360          }
2361 2361          ASSERT((int64_t)arc_p >= 0);
2362 2362  }
2363 2363  
2364 2364  /*
2365 2365   * Check if the cache has reached its limits and eviction is required
2366 2366   * prior to insert.
2367 2367   */
2368 2368  static int
2369 2369  arc_evict_needed(arc_buf_contents_t type)
2370 2370  {
2371 2371          if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2372 2372                  return (1);
2373 2373  
2374 2374          if (arc_reclaim_needed())
2375 2375                  return (1);
2376 2376  
2377 2377          return (arc_size > arc_c);
2378 2378  }
2379 2379  
2380 2380  /*
2381 2381   * The buffer, supplied as the first argument, needs a data block.
2382 2382   * So, if we are at cache max, determine which cache should be victimized.
2383 2383   * We have the following cases:
2384 2384   *
2385 2385   * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2386 2386   * In this situation if we're out of space, but the resident size of the MFU is
2387 2387   * under the limit, victimize the MFU cache to satisfy this insertion request.
2388 2388   *
2389 2389   * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2390 2390   * Here, we've used up all of the available space for the MRU, so we need to
2391 2391   * evict from our own cache instead.  Evict from the set of resident MRU
2392 2392   * entries.
2393 2393   *
2394 2394   * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2395 2395   * c minus p represents the MFU space in the cache, since p is the size of the
2396 2396   * cache that is dedicated to the MRU.  In this situation there's still space on
2397 2397   * the MFU side, so the MRU side needs to be victimized.
2398 2398   *
2399 2399   * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2400 2400   * MFU's resident set is consuming more space than it has been allotted.  In
2401 2401   * this situation, we must victimize our own cache, the MFU, for this insertion.
2402 2402   */
2403 2403  static void
2404 2404  arc_get_data_buf(arc_buf_t *buf)
2405 2405  {
2406 2406          arc_state_t             *state = buf->b_hdr->b_state;
2407 2407          uint64_t                size = buf->b_hdr->b_size;
2408 2408          arc_buf_contents_t      type = buf->b_hdr->b_type;
2409 2409  
2410 2410          arc_adapt(size, state);
2411 2411  
2412 2412          /*
2413 2413           * We have not yet reached cache maximum size,
2414 2414           * just allocate a new buffer.
2415 2415           */
2416 2416          if (!arc_evict_needed(type)) {
2417 2417                  if (type == ARC_BUFC_METADATA) {
2418 2418                          buf->b_data = zio_buf_alloc(size);
2419 2419                          arc_space_consume(size, ARC_SPACE_DATA);
2420 2420                  } else {
2421 2421                          ASSERT(type == ARC_BUFC_DATA);
2422 2422                          buf->b_data = zio_data_buf_alloc(size);
2423 2423                          ARCSTAT_INCR(arcstat_data_size, size);
2424 2424                          atomic_add_64(&arc_size, size);
2425 2425                  }
2426 2426                  goto out;
2427 2427          }
2428 2428  
2429 2429          /*
2430 2430           * If we are prefetching from the mfu ghost list, this buffer
2431 2431           * will end up on the mru list; so steal space from there.
2432 2432           */
2433 2433          if (state == arc_mfu_ghost)
2434 2434                  state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2435 2435          else if (state == arc_mru_ghost)
2436 2436                  state = arc_mru;
2437 2437  
2438 2438          if (state == arc_mru || state == arc_anon) {
2439 2439                  uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2440 2440                  state = (arc_mfu->arcs_lsize[type] >= size &&
2441 2441                      arc_p > mru_used) ? arc_mfu : arc_mru;
2442 2442          } else {
2443 2443                  /* MFU cases */
2444 2444                  uint64_t mfu_space = arc_c - arc_p;
2445 2445                  state =  (arc_mru->arcs_lsize[type] >= size &&
2446 2446                      mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2447 2447          }
2448 2448          if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2449 2449                  if (type == ARC_BUFC_METADATA) {
2450 2450                          buf->b_data = zio_buf_alloc(size);
2451 2451                          arc_space_consume(size, ARC_SPACE_DATA);
2452 2452                  } else {
2453 2453                          ASSERT(type == ARC_BUFC_DATA);
2454 2454                          buf->b_data = zio_data_buf_alloc(size);
2455 2455                          ARCSTAT_INCR(arcstat_data_size, size);
2456 2456                          atomic_add_64(&arc_size, size);
2457 2457                  }
2458 2458                  ARCSTAT_BUMP(arcstat_recycle_miss);
2459 2459          }
2460 2460          ASSERT(buf->b_data != NULL);
2461 2461  out:
2462 2462          /*
2463 2463           * Update the state size.  Note that ghost states have a
2464 2464           * "ghost size" and so don't need to be updated.
2465 2465           */
2466 2466          if (!GHOST_STATE(buf->b_hdr->b_state)) {
2467 2467                  arc_buf_hdr_t *hdr = buf->b_hdr;
2468 2468  
2469 2469                  atomic_add_64(&hdr->b_state->arcs_size, size);
2470 2470                  if (list_link_active(&hdr->b_arc_node)) {
2471 2471                          ASSERT(refcount_is_zero(&hdr->b_refcnt));
2472 2472                          atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2473 2473                  }
2474 2474                  /*
2475 2475                   * If we are growing the cache, and we are adding anonymous
2476 2476                   * data, and we have outgrown arc_p, update arc_p
2477 2477                   */
2478 2478                  if (arc_size < arc_c && hdr->b_state == arc_anon &&
2479 2479                      arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2480 2480                          arc_p = MIN(arc_c, arc_p + size);
2481 2481          }
2482 2482  }
2483 2483  
2484 2484  /*
2485 2485   * This routine is called whenever a buffer is accessed.
2486 2486   * NOTE: the hash lock is dropped in this function.
2487 2487   */
2488 2488  static void
2489 2489  arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2490 2490  {
2491 2491          clock_t now;
2492 2492  
2493 2493          ASSERT(MUTEX_HELD(hash_lock));
2494 2494  
2495 2495          if (buf->b_state == arc_anon) {
2496 2496                  /*
2497 2497                   * This buffer is not in the cache, and does not
2498 2498                   * appear in our "ghost" list.  Add the new buffer
2499 2499                   * to the MRU state.
2500 2500                   */
2501 2501  
2502 2502                  ASSERT(buf->b_arc_access == 0);
2503 2503                  buf->b_arc_access = ddi_get_lbolt();
2504 2504                  DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2505 2505                  arc_change_state(arc_mru, buf, hash_lock);
2506 2506  
2507 2507          } else if (buf->b_state == arc_mru) {
2508 2508                  now = ddi_get_lbolt();
2509 2509  
2510 2510                  /*
2511 2511                   * If this buffer is here because of a prefetch, then either:
2512 2512                   * - clear the flag if this is a "referencing" read
2513 2513                   *   (any subsequent access will bump this into the MFU state).
2514 2514                   * or
2515 2515                   * - move the buffer to the head of the list if this is
2516 2516                   *   another prefetch (to make it less likely to be evicted).
2517 2517                   */
2518 2518                  if ((buf->b_flags & ARC_PREFETCH) != 0) {
2519 2519                          if (refcount_count(&buf->b_refcnt) == 0) {
2520 2520                                  ASSERT(list_link_active(&buf->b_arc_node));
2521 2521                          } else {
2522 2522                                  buf->b_flags &= ~ARC_PREFETCH;
2523 2523                                  ARCSTAT_BUMP(arcstat_mru_hits);
2524 2524                          }
2525 2525                          buf->b_arc_access = now;
2526 2526                          return;
2527 2527                  }
2528 2528  
2529 2529                  /*
2530 2530                   * This buffer has been "accessed" only once so far,
2531 2531                   * but it is still in the cache. Move it to the MFU
2532 2532                   * state.
2533 2533                   */
2534 2534                  if (now > buf->b_arc_access + ARC_MINTIME) {
2535 2535                          /*
2536 2536                           * More than 125ms have passed since we
2537 2537                           * instantiated this buffer.  Move it to the
2538 2538                           * most frequently used state.
2539 2539                           */
2540 2540                          buf->b_arc_access = now;
2541 2541                          DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2542 2542                          arc_change_state(arc_mfu, buf, hash_lock);
2543 2543                  }
2544 2544                  ARCSTAT_BUMP(arcstat_mru_hits);
2545 2545          } else if (buf->b_state == arc_mru_ghost) {
2546 2546                  arc_state_t     *new_state;
2547 2547                  /*
2548 2548                   * This buffer has been "accessed" recently, but
2549 2549                   * was evicted from the cache.  Move it to the
2550 2550                   * MFU state.
2551 2551                   */
2552 2552  
2553 2553                  if (buf->b_flags & ARC_PREFETCH) {
2554 2554                          new_state = arc_mru;
2555 2555                          if (refcount_count(&buf->b_refcnt) > 0)
2556 2556                                  buf->b_flags &= ~ARC_PREFETCH;
2557 2557                          DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2558 2558                  } else {
2559 2559                          new_state = arc_mfu;
2560 2560                          DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2561 2561                  }
2562 2562  
2563 2563                  buf->b_arc_access = ddi_get_lbolt();
2564 2564                  arc_change_state(new_state, buf, hash_lock);
2565 2565  
2566 2566                  ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2567 2567          } else if (buf->b_state == arc_mfu) {
2568 2568                  /*
2569 2569                   * This buffer has been accessed more than once and is
2570 2570                   * still in the cache.  Keep it in the MFU state.
2571 2571                   *
2572 2572                   * NOTE: an add_reference() that occurred when we did
2573 2573                   * the arc_read() will have kicked this off the list.
2574 2574                   * If it was a prefetch, we will explicitly move it to
2575 2575                   * the head of the list now.
2576 2576                   */
2577 2577                  if ((buf->b_flags & ARC_PREFETCH) != 0) {
2578 2578                          ASSERT(refcount_count(&buf->b_refcnt) == 0);
2579 2579                          ASSERT(list_link_active(&buf->b_arc_node));
2580 2580                  }
2581 2581                  ARCSTAT_BUMP(arcstat_mfu_hits);
2582 2582                  buf->b_arc_access = ddi_get_lbolt();
2583 2583          } else if (buf->b_state == arc_mfu_ghost) {
2584 2584                  arc_state_t     *new_state = arc_mfu;
2585 2585                  /*
2586 2586                   * This buffer has been accessed more than once but has
2587 2587                   * been evicted from the cache.  Move it back to the
2588 2588                   * MFU state.
2589 2589                   */
2590 2590  
2591 2591                  if (buf->b_flags & ARC_PREFETCH) {
2592 2592                          /*
2593 2593                           * This is a prefetch access...
2594 2594                           * move this block back to the MRU state.
2595 2595                           */
2596 2596                          ASSERT0(refcount_count(&buf->b_refcnt));
2597 2597                          new_state = arc_mru;
2598 2598                  }
2599 2599  
2600 2600                  buf->b_arc_access = ddi_get_lbolt();
2601 2601                  DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2602 2602                  arc_change_state(new_state, buf, hash_lock);
2603 2603  
2604 2604                  ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2605 2605          } else if (buf->b_state == arc_l2c_only) {
2606 2606                  /*
2607 2607                   * This buffer is on the 2nd Level ARC.
2608 2608                   */
2609 2609  
2610 2610                  buf->b_arc_access = ddi_get_lbolt();
2611 2611                  DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2612 2612                  arc_change_state(arc_mfu, buf, hash_lock);
2613 2613          } else {
2614 2614                  ASSERT(!"invalid arc state");
2615 2615          }
2616 2616  }
2617 2617  
2618 2618  /* a generic arc_done_func_t which you can use */
2619 2619  /* ARGSUSED */
2620 2620  void
2621 2621  arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2622 2622  {
2623 2623          if (zio == NULL || zio->io_error == 0)
2624 2624                  bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2625 2625          VERIFY(arc_buf_remove_ref(buf, arg));
2626 2626  }
2627 2627  
2628 2628  /* a generic arc_done_func_t */
2629 2629  void
2630 2630  arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2631 2631  {
2632 2632          arc_buf_t **bufp = arg;
2633 2633          if (zio && zio->io_error) {
2634 2634                  VERIFY(arc_buf_remove_ref(buf, arg));
2635 2635                  *bufp = NULL;
2636 2636          } else {
2637 2637                  *bufp = buf;
2638 2638                  ASSERT(buf->b_data);
2639 2639          }
2640 2640  }
2641 2641  
2642 2642  static void
2643 2643  arc_read_done(zio_t *zio)
2644 2644  {
                   /*
                    * Read-completion handler.  arc_read() passes this function
                    * and the arc_buf_t to zio_read(), so zio->io_private is the
                    * buffer that was being filled.  Finishes the hdr's in-progress
                    * I/O state, hands a buffer to every registered arc_callback_t,
                    * runs the callbacks, and destroys the hdr when the read failed
                    * (or the block was freed meanwhile) and nothing references it.
                    */
2645 2645          arc_buf_hdr_t   *hdr, *found;
2646 2646          arc_buf_t       *buf;
2647 2647          arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2648 2648          kmutex_t        *hash_lock;
2649 2649          arc_callback_t  *callback_list, *acb;
2650 2650          int             freeable = FALSE;
2651 2651  
2652 2652          buf = zio->io_private;
2653 2653          hdr = buf->b_hdr;
2654 2654  
2655 2655          /*
2656 2656           * The hdr was inserted into hash-table and removed from lists
2657 2657           * prior to starting I/O.  We should find this header, since
2658 2658           * it's in the hash table, and it should be legit since it's
2659 2659           * not possible to evict it during the I/O.  The only possible
2660 2660           * reason for it not to be found is if we were freed during the
2661 2661           * read.
2662 2662           */
2663 2663          found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2664 2664              &hash_lock);
2665 2665  
2666 2666          ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2667 2667              (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2668 2668              (found == hdr && HDR_L2_READING(hdr)));
2669 2669  
2670 2670          hdr->b_flags &= ~ARC_L2_EVICTED;
2671 2671          if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2672 2672                  hdr->b_flags &= ~ARC_L2CACHE;
2673 2673  
2674 2674          /* byteswap if necessary */
2675 2675          callback_list = hdr->b_acb;
2676 2676          ASSERT(callback_list != NULL);
2677 2677          if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                           /*
                            * Blocks above level 0 are swapped as uint64 arrays;
                            * level-0 blocks use the per-type function from
                            * dmu_ot_byteswap[].
                            */
2678 2678                  dmu_object_byteswap_t bswap =
2679 2679                      DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2680 2680                  arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2681 2681                      byteswap_uint64_array :
2682 2682                      dmu_ot_byteswap[bswap].ob_func;
2683 2683                  func(buf->b_data, hdr->b_size);
2684 2684          }
2685 2685  
2686 2686          arc_cksum_compute(buf, B_FALSE);
2687 2687          arc_buf_watch(buf);
2688 2688  
2689 2689          if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2690 2690                  /*
2691 2691                   * Only call arc_access on anonymous buffers.  This is because
2692 2692                   * if we've issued an I/O for an evicted buffer, we've already
2693 2693                   * called arc_access (to prevent any simultaneous readers from
2694 2694                   * getting confused).
2695 2695                   */
2696 2696                  arc_access(hdr, hash_lock);
2697 2697          }
2698 2698  
2699 2699          /* create copies of the data buffer for the callers */
                   /*
                    * The first callback that has a done func receives buf itself;
                    * every later one gets its own arc_buf_clone() and bumps the
                    * duplicate_reads statistic.
                    */
2700 2700          abuf = buf;
2701 2701          for (acb = callback_list; acb; acb = acb->acb_next) {
2702 2702                  if (acb->acb_done) {
2703 2703                          if (abuf == NULL) {
2704 2704                                  ARCSTAT_BUMP(arcstat_duplicate_reads);
2705 2705                                  abuf = arc_buf_clone(buf);
2706 2706                          }
2707 2707                          acb->acb_buf = abuf;
2708 2708                          abuf = NULL;
2709 2709                  }
2710 2710          }
2711 2711          hdr->b_acb = NULL;
2712 2712          hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2713 2713          ASSERT(!HDR_BUF_AVAILABLE(hdr));
                   /*
                    * abuf still equals buf only when no callback had a done func,
                    * i.e. nobody took the buffer; flag it as available.
                    */
2714 2714          if (abuf == buf) {
2715 2715                  ASSERT(buf->b_efunc == NULL);
2716 2716                  ASSERT(hdr->b_datacnt == 1);
2717 2717                  hdr->b_flags |= ARC_BUF_AVAILABLE;
2718 2718          }
2719 2719  
2720 2720          ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2721 2721  
                   /*
                    * Failed read: flag the hdr, make it anonymous and take it out
                    * of the hash table.  It is destroyed at the bottom of this
                    * function if no references remain.
                    */
2722 2722          if (zio->io_error != 0) {
2723 2723                  hdr->b_flags |= ARC_IO_ERROR;
2724 2724                  if (hdr->b_state != arc_anon)
2725 2725                          arc_change_state(arc_anon, hdr, hash_lock);
2726 2726                  if (HDR_IN_HASH_TABLE(hdr))
2727 2727                          buf_hash_remove(hdr);
2728 2728                  freeable = refcount_is_zero(&hdr->b_refcnt);
2729 2729          }
2730 2730  
2731 2731          /*
2732 2732           * Broadcast before we drop the hash_lock to avoid the possibility
2733 2733           * that the hdr (and hence the cv) might be freed before we get to
2734 2734           * the cv_broadcast().
2735 2735           */
2736 2736          cv_broadcast(&hdr->b_cv);
2737 2737  
2738 2738          if (hash_lock) {
2739 2739                  mutex_exit(hash_lock);
2740 2740          } else {
2741 2741                  /*
2742 2742                   * This block was freed while we waited for the read to
2743 2743                   * complete.  It has been removed from the hash table and
2744 2744                   * moved to the anonymous state (so that it won't show up
2745 2745                   * in the cache).
2746 2746                   */
2747 2747                  ASSERT3P(hdr->b_state, ==, arc_anon);
2748 2748                  freeable = refcount_is_zero(&hdr->b_refcnt);
2749 2749          }
2750 2750  
2751 2751          /* execute each callback and free its structure */
2752 2752          while ((acb = callback_list) != NULL) {
2753 2753                  if (acb->acb_done)
2754 2754                          acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2755 2755  
                           /*
                            * A dummy zio attached to this callback inherits the
                            * read's error and is dispatched with zio_nowait().
                            */
2756 2756                  if (acb->acb_zio_dummy != NULL) {
2757 2757                          acb->acb_zio_dummy->io_error = zio->io_error;
2758 2758                          zio_nowait(acb->acb_zio_dummy);
2759 2759                  }
2760 2760  
2761 2761                  callback_list = acb->acb_next;
2762 2762                  kmem_free(acb, sizeof (arc_callback_t));
2763 2763          }
2764 2764  
2765 2765          if (freeable)
2766 2766                  arc_hdr_destroy(hdr);
2767 2767  }
2768 2768  
2769 2769  /*
2770 2770   * "Read" the block at the specified DVA (in bp) via the
2771 2771   * cache.  If the block is found in the cache, invoke the provided
2772 2772   * callback immediately and return.  Note that the `zio' parameter
2773 2773   * in the callback will be NULL in this case, since no IO was
2774 2774   * required.  If the block is not in the cache pass the read request
2775 2775   * on to the spa with a substitute callback function, so that the
2776 2776   * requested block will be added to the cache.
2777 2777   *
2778 2778   * If a read request arrives for a block that has a read in-progress,

↓ open down ↓

2778 lines elided

↑ open up ↑

2779 2779   * either wait for the in-progress read to complete (and return the
2780 2780   * results); or, if this is a read with a "done" func, add a record
2781 2781   * to the read to invoke the "done" func when the read completes,
2782 2782   * and return; or just return.
2783 2783   *
2784 2784   * arc_read_done() will invoke all the requested "done" functions
2785 2785   * for readers of this block.
2786 2786   */
2787 2787  int
2788 2788  arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2789      -    void *private, int priority, int zio_flags, uint32_t *arc_flags,
     2789 +    void *cb_private, int priority, int zio_flags, uint32_t *arc_flags,
2790 2790      const zbookmark_t *zb)
2791 2791  {
                   /*
                    * cb_private (named `private' in the old column of this diff)
                    * is handed to `done' and is also the tag for the references
                    * taken and dropped below.  *arc_flags is both input
                    * (ARC_WAIT/ARC_NOWAIT/ARC_PREFETCH/ARC_L2CACHE) and output
                    * (ARC_CACHED is set on a hit).
                    *
                    * Returns 0, except when ARC_WAIT is set and the read goes
                    * through zio_read(), in which case the result of zio_wait()
                    * is returned.
                    */
2792 2792          arc_buf_hdr_t *hdr;
2793 2793          arc_buf_t *buf = NULL;
2794 2794          kmutex_t *hash_lock;
2795 2795          zio_t *rzio;
2796 2796          uint64_t guid = spa_load_guid(spa);
2797 2797  
2798 2798  top:
2799 2799          hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),

2800 2800              &hash_lock);
                   /* hit: a hdr was found and it has at least one data buffer */
2801 2801          if (hdr && hdr->b_datacnt > 0) {
2802 2802  
2803 2803                  *arc_flags |= ARC_CACHED;
2804 2804  
2805 2805                  if (HDR_IO_IN_PROGRESS(hdr)) {
2806 2806  
                                   /*
                                    * Synchronous caller: sleep on b_cv (which
                                    * arc_read_done() broadcasts) and then start
                                    * the lookup over.
                                    */
2807 2807                          if (*arc_flags & ARC_WAIT) {
2808 2808                                  cv_wait(&hdr->b_cv, hash_lock);
2809 2809                                  mutex_exit(hash_lock);

↓ open down ↓

10 lines elided

↑ open up ↑

2810 2810                                  goto top;
2811 2811                          }
2812 2812                          ASSERT(*arc_flags & ARC_NOWAIT);
2813 2813  
                                   /*
                                    * Asynchronous caller with a done func: push a
                                    * callback record onto hdr->b_acb so that
                                    * arc_read_done() invokes it, and take a
                                    * reference for the caller.
                                    */
2814 2814                          if (done) {
2815 2815                                  arc_callback_t  *acb = NULL;
2816 2816  
2817 2817                                  acb = kmem_zalloc(sizeof (arc_callback_t),
2818 2818                                      KM_SLEEP);
2819 2819                                  acb->acb_done = done;
2820      -                                acb->acb_private = private;
     2820 +                                acb->acb_private = cb_private;
2821 2821                                  if (pio != NULL)
2822 2822                                          acb->acb_zio_dummy = zio_null(pio,
2823 2823                                              spa, NULL, NULL, NULL, zio_flags);
2824 2824  
2825 2825                                  ASSERT(acb->acb_done != NULL);
2826 2826                                  acb->acb_next = hdr->b_acb;
2827 2827                                  hdr->b_acb = acb;
2828      -                                add_reference(hdr, hash_lock, private);
     2828 +                                add_reference(hdr, hash_lock, cb_private);
2829 2829                                  mutex_exit(hash_lock);
2830 2830                                  return (0);
2831 2831                          }
2832 2832                          mutex_exit(hash_lock);
2833 2833                          return (0);
2834 2834                  }
2835 2835  
2836 2836                  ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2837 2837  
2838 2838                  if (done) {
2839      -                        add_reference(hdr, hash_lock, private);
     2839 +                        add_reference(hdr, hash_lock, cb_private);
2840 2840                          /*
2841 2841                           * If this block is already in use, create a new
2842 2842                           * copy of the data so that we will be guaranteed
2843 2843                           * that arc_release() will always succeed.
2844 2844                           */
2845 2845                          buf = hdr->b_buf;
2846 2846                          ASSERT(buf);
2847 2847                          ASSERT(buf->b_data);
2848 2848                          if (HDR_BUF_AVAILABLE(hdr)) {
2849 2849                                  ASSERT(buf->b_efunc == NULL);

2850 2850                                  hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2851 2851                          } else {
2852 2852                                  buf = arc_buf_clone(buf);
2853 2853                          }
2854 2854  
2855 2855                  } else if (*arc_flags & ARC_PREFETCH &&
2856 2856                      refcount_count(&hdr->b_refcnt) == 0) {
2857 2857                          hdr->b_flags |= ARC_PREFETCH;
2858 2858                  }
2859 2859                  DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);

↓ open down ↓

10 lines elided

↑ open up ↑

2860 2860                  arc_access(hdr, hash_lock);
2861 2861                  if (*arc_flags & ARC_L2CACHE)
2862 2862                          hdr->b_flags |= ARC_L2CACHE;
2863 2863                  mutex_exit(hash_lock);
2864 2864                  ARCSTAT_BUMP(arcstat_hits);
2865 2865                  ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2866 2866                      demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2867 2867                      data, metadata, hits);
2868 2868  
                           /* no I/O was needed, so the callback sees a NULL zio */
2869 2869                  if (done)
2870      -                        done(NULL, buf, private);
     2870 +                        done(NULL, buf, cb_private);
2871 2871          } else {
                           /* miss: either no hdr at all, or a hdr with no data */
2872 2872                  uint64_t size = BP_GET_LSIZE(bp);
2873 2873                  arc_callback_t  *acb;
2874 2874                  vdev_t *vd = NULL;
2875 2875                  uint64_t addr = 0;
2876 2876                  boolean_t devw = B_FALSE;
2877 2877  
2878 2878                  if (hdr == NULL) {
2879 2879                          /* this block is not in the cache */
2880 2880                          arc_buf_hdr_t   *exists;
2881 2881                          arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2882      -                        buf = arc_buf_alloc(spa, size, private, type);
     2882 +                        buf = arc_buf_alloc(spa, size, cb_private, type);
2883 2883                          hdr = buf->b_hdr;
2884 2884                          hdr->b_dva = *BP_IDENTITY(bp);
2885 2885                          hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2886 2886                          hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2887 2887                          exists = buf_hash_insert(hdr, &hash_lock);
2888 2888                          if (exists) {
2889 2889                                  /* somebody beat us to the hash insert */
2890 2890                                  mutex_exit(hash_lock);
2891 2891                                  buf_discard_identity(hdr);
2892      -                                (void) arc_buf_remove_ref(buf, private);
     2892 +                                (void) arc_buf_remove_ref(buf, cb_private);
2893 2893                                  goto top; /* restart the IO request */
2894 2894                          }
2895 2895                          /* if this is a prefetch, we don't have a reference */
2896 2896                          if (*arc_flags & ARC_PREFETCH) {
2897 2897                                  (void) remove_reference(hdr, hash_lock,
2898      -                                    private);
     2898 +                                    cb_private);
2899 2899                                  hdr->b_flags |= ARC_PREFETCH;
2900 2900                          }
2901 2901                          if (*arc_flags & ARC_L2CACHE)
2902 2902                                  hdr->b_flags |= ARC_L2CACHE;
2903 2903                          if (BP_GET_LEVEL(bp) > 0)
2904 2904                                  hdr->b_flags |= ARC_INDIRECT;
2905 2905                  } else {
2906 2906                          /* this block is in the ghost cache */
2907 2907                          ASSERT(GHOST_STATE(hdr->b_state));
2908 2908                          ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2909 2909                          ASSERT0(refcount_count(&hdr->b_refcnt));
2910 2910                          ASSERT(hdr->b_buf == NULL);
2911 2911  
2912 2912                          /* if this is a prefetch, we don't have a reference */
2913 2913                          if (*arc_flags & ARC_PREFETCH)
2914 2914                                  hdr->b_flags |= ARC_PREFETCH;
2915 2915                          else
2916      -                                add_reference(hdr, hash_lock, private);
     2916 +                                add_reference(hdr, hash_lock, cb_private);
2917 2917                          if (*arc_flags & ARC_L2CACHE)
2918 2918                                  hdr->b_flags |= ARC_L2CACHE;
                                   /*
                                    * Hand-build a fresh arc_buf_t, make it the
                                    * hdr's only buffer, and get data space for it.
                                    */
2919 2919                          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2920 2920                          buf->b_hdr = hdr;
2921 2921                          buf->b_data = NULL;
2922 2922                          buf->b_efunc = NULL;
2923 2923                          buf->b_private = NULL;
2924 2924                          buf->b_next = NULL;
2925 2925                          hdr->b_buf = buf;
2926 2926                          ASSERT(hdr->b_datacnt == 0);
2927 2927                          hdr->b_datacnt = 1;
2928 2928                          arc_get_data_buf(buf);
2929 2929                          arc_access(hdr, hash_lock);
2930 2930                  }
2931 2931  
2932 2932                  ASSERT(!GHOST_STATE(hdr->b_state));
2933 2933  
                           /*
                            * Record the caller's callback on the hdr and mark the
                            * I/O as in progress; arc_read_done() runs the callback
                            * and frees the record.
                            */
2934 2934                  acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2935 2935                  acb->acb_done = done;
2936      -                acb->acb_private = private;
     2936 +                acb->acb_private = cb_private;
2937 2937  
2938 2938                  ASSERT(hdr->b_acb == NULL);
2939 2939                  hdr->b_acb = acb;
2940 2940                  hdr->b_flags |= ARC_IO_IN_PROGRESS;
2941 2941  
                           /*
                            * While the hash lock is still held, note the L2ARC
                            * device, its "writing" state and the block's address
                            * on it.
                            */
2942 2942                  if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2943 2943                      (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2944 2944                          devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2945 2945                          addr = hdr->b_l2hdr->b_daddr;
2946 2946                          /*

2947 2947                           * Lock out device removal.
2948 2948                           */
2949 2949                          if (vdev_is_dead(vd) ||
2950 2950                              !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2951 2951                                  vd = NULL;
2952 2952                  }
2953 2953  
2954 2954                  mutex_exit(hash_lock);
2955 2955  
2956 2956                  ASSERT3U(hdr->b_size, ==, size);
2957 2957                  DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2958 2958                      uint64_t, size, zbookmark_t *, zb);
2959 2959                  ARCSTAT_BUMP(arcstat_misses);
2960 2960                  ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2961 2961                      demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2962 2962                      data, metadata, misses);
2963 2963  
                           /*
                            * A non-NULL vd here means SCL_L2ARC is held as reader;
                            * every branch below either passes that hold on to the
                            * l2arc read or drops it with spa_config_exit().
                            */
2964 2964                  if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2965 2965                          /*
2966 2966                           * Read from the L2ARC if the following are true:
2967 2967                           * 1. The L2ARC vdev was previously cached.
2968 2968                           * 2. This buffer still has L2ARC metadata.
2969 2969                           * 3. This buffer isn't currently writing to the L2ARC.
2970 2970                           * 4. The L2ARC entry wasn't evicted, which may
2971 2971                           *    also have invalidated the vdev.
2972 2972                           * 5. This isn't prefetch and l2arc_noprefetch is set.
2973 2973                           */
2974 2974                          if (hdr->b_l2hdr != NULL &&
2975 2975                              !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2976 2976                              !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2977 2977                                  l2arc_read_callback_t *cb;
2978 2978  
2979 2979                                  DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2980 2980                                  ARCSTAT_BUMP(arcstat_l2_hits);
2981 2981  
                                           /*
                                            * The callback record carries copies of
                                            * the bp and bookmark plus the spa, buf
                                            * and zio flags for l2arc_read_done().
                                            */
2982 2982                                  cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2983 2983                                      KM_SLEEP);
2984 2984                                  cb->l2rcb_buf = buf;
2985 2985                                  cb->l2rcb_spa = spa;
2986 2986                                  cb->l2rcb_bp = *bp;
2987 2987                                  cb->l2rcb_zb = *zb;
2988 2988                                  cb->l2rcb_flags = zio_flags;
2989 2989  
2990 2990                                  ASSERT(addr >= VDEV_LABEL_START_SIZE &&
2991 2991                                      addr + size < vd->vdev_psize -
2992 2992                                      VDEV_LABEL_END_SIZE);
2993 2993  
2994 2994                                  /*
2995 2995                                   * l2arc read.  The SCL_L2ARC lock will be
2996 2996                                   * released by l2arc_read_done().
2997 2997                                   */
2998 2998                                  rzio = zio_read_phys(pio, vd, addr, size,
2999 2999                                      buf->b_data, ZIO_CHECKSUM_OFF,
3000 3000                                      l2arc_read_done, cb, priority, zio_flags |
3001 3001                                      ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3002 3002                                      ZIO_FLAG_DONT_PROPAGATE |
3003 3003                                      ZIO_FLAG_DONT_RETRY, B_FALSE);
3004 3004                                  DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3005 3005                                      zio_t *, rzio);
3006 3006                                  ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3007 3007  
3008 3008                                  if (*arc_flags & ARC_NOWAIT) {
3009 3009                                          zio_nowait(rzio);
3010 3010                                          return (0);
3011 3011                                  }
3012 3012  
3013 3013                                  ASSERT(*arc_flags & ARC_WAIT);
3014 3014                                  if (zio_wait(rzio) == 0)
3015 3015                                          return (0);
3016 3016  
3017 3017                                  /* l2arc read error; goto zio_read() */
3018 3018                          } else {
3019 3019                                  DTRACE_PROBE1(l2arc__miss,
3020 3020                                      arc_buf_hdr_t *, hdr);
3021 3021                                  ARCSTAT_BUMP(arcstat_l2_misses);
3022 3022                                  if (HDR_L2_WRITING(hdr))
3023 3023                                          ARCSTAT_BUMP(arcstat_l2_rw_clash);
3024 3024                                  spa_config_exit(spa, SCL_L2ARC, vd);
3025 3025                          }
3026 3026                  } else {
3027 3027                          if (vd != NULL)
3028 3028                                  spa_config_exit(spa, SCL_L2ARC, vd);
3029 3029                          if (l2arc_ndev != 0) {
3030 3030                                  DTRACE_PROBE1(l2arc__miss,
3031 3031                                      arc_buf_hdr_t *, hdr);
3032 3032                                  ARCSTAT_BUMP(arcstat_l2_misses);
3033 3033                          }
3034 3034                  }
3035 3035  
                           /*
                            * Read via zio_read(); this is also where a waited-for
                            * l2arc read that failed ends up.  arc_read_done() gets
                            * buf back as the zio's private pointer.
                            */
3036 3036                  rzio = zio_read(pio, spa, bp, buf->b_data, size,
3037 3037                      arc_read_done, buf, priority, zio_flags, zb);
3038 3038

↓ open down ↓

92 lines elided

↑ open up ↑

3039 3039                  if (*arc_flags & ARC_WAIT)
3040 3040                          return (zio_wait(rzio));
3041 3041  
3042 3042                  ASSERT(*arc_flags & ARC_NOWAIT);
3043 3043                  zio_nowait(rzio);
3044 3044          }
3045 3045          return (0);
3046 3046  }
3047 3047  
3048 3048  void
3049      -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
     3049 +arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *cb_private)
3050 3050  {
                   /*
                    * Store func and its argument in buf->b_efunc / buf->b_private.
                    * The assertions require that buf has a hdr which is not in
                    * arc_anon, that no callback is set yet, that the hdr is not
                    * flagged as having an available buf, and that the hdr's
                    * refcount is non-zero unless func is NULL.
                    */
3051 3051          ASSERT(buf->b_hdr != NULL);
3052 3052          ASSERT(buf->b_hdr->b_state != arc_anon);
3053 3053          ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3054 3054          ASSERT(buf->b_efunc == NULL);
3055 3055          ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3056 3056  
3057 3057          buf->b_efunc = func;
3058      -        buf->b_private = private;
     3058 +        buf->b_private = cb_private;
3059 3059  }
3060 3060  
3061 3061  /*
3062 3062   * This is used by the DMU to let the ARC know that a buffer is
3063 3063   * being evicted, so the ARC should clean up.  If this arc buf
3064 3064   * is not yet in the evicted state, it will be put there.
3065 3065   */
3066 3066  int
3067 3067  arc_buf_evict(arc_buf_t *buf)
3068 3068  {
                   /*
                    * Returns 0 when buf no longer has a hdr (nothing is done
                    * here), otherwise 1 after buf's b_efunc has been invoked.
                    */

3069 3069          arc_buf_hdr_t *hdr;
3070 3070          kmutex_t *hash_lock;
3071 3071          arc_buf_t **bufp;
3072 3072  
3073 3073          mutex_enter(&buf->b_evict_lock);
3074 3074          hdr = buf->b_hdr;
3075 3075          if (hdr == NULL) {
3076 3076                  /*
3077 3077                   * We are in arc_do_user_evicts().
3078 3078                   */
3079 3079                  ASSERT(buf->b_data == NULL);
3080 3080                  mutex_exit(&buf->b_evict_lock);
3081 3081                  return (0);
3082 3082          } else if (buf->b_data == NULL) {
3083 3083                  arc_buf_t copy = *buf; /* structure assignment */
3084 3084                  /*
3085 3085                   * We are on the eviction list; process this buffer now
3086 3086                   * but let arc_do_user_evicts() do the reaping.
3087 3087                   */
                           /*
                            * b_efunc is cleared on the real buf while the lock is
                            * held; the callback then runs on the stack copy after
                            * the lock has been dropped.
                            */
3088 3088                  buf->b_efunc = NULL;
3089 3089                  mutex_exit(&buf->b_evict_lock);
3090 3090                  VERIFY(copy.b_efunc(&copy) == 0);
3091 3091                  return (1);
3092 3092          }
                   /* buf still has data: take the hash lock, re-read b_hdr under it */
3093 3093          hash_lock = HDR_LOCK(hdr);
3094 3094          mutex_enter(hash_lock);
3095 3095          hdr = buf->b_hdr;
3096 3096          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3097 3097  
3098 3098          ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3099 3099          ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3100 3100  
3101 3101          /*
3102 3102           * Pull this buffer off of the hdr
3103 3103           */
3104 3104          bufp = &hdr->b_buf;
3105 3105          while (*bufp != buf)
3106 3106                  bufp = &(*bufp)->b_next;
3107 3107          *bufp = buf->b_next;
3108 3108  
3109 3109          ASSERT(buf->b_data != NULL);
3110 3110          arc_buf_destroy(buf, FALSE, FALSE);
3111 3111  
                   /*
                    * No data buffers are left on the hdr: move it to the ghost
                    * state matching its current one (mru -> mru_ghost, otherwise
                    * mfu_ghost), holding both state mutexes across the change.
                    */
3112 3112          if (hdr->b_datacnt == 0) {
3113 3113                  arc_state_t *old_state = hdr->b_state;
3114 3114                  arc_state_t *evicted_state;
3115 3115  
3116 3116                  ASSERT(hdr->b_buf == NULL);
3117 3117                  ASSERT(refcount_is_zero(&hdr->b_refcnt));
3118 3118  
3119 3119                  evicted_state =
3120 3120                      (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3121 3121  
3122 3122                  mutex_enter(&old_state->arcs_mtx);
3123 3123                  mutex_enter(&evicted_state->arcs_mtx);
3124 3124  
3125 3125                  arc_change_state(evicted_state, hdr, hash_lock);
3126 3126                  ASSERT(HDR_IN_HASH_TABLE(hdr));
                           /*
                            * NOTE(review): setting ARC_IN_HASH_TABLE right after
                            * asserting HDR_IN_HASH_TABLE() looks redundant if the
                            * macro tests that same flag -- confirm.
                            */
3127 3127                  hdr->b_flags |= ARC_IN_HASH_TABLE;
3128 3128                  hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3129 3129  
3130 3130                  mutex_exit(&evicted_state->arcs_mtx);
3131 3131                  mutex_exit(&old_state->arcs_mtx);
3132 3132          }
3133 3133          mutex_exit(hash_lock);
3134 3134          mutex_exit(&buf->b_evict_lock);
3135 3135  
                   /*
                    * Both locks are dropped: run the callback, then clear the
                    * arc_buf_t's fields and return it to buf_cache.
                    */
3136 3136          VERIFY(buf->b_efunc(buf) == 0);
3137 3137          buf->b_efunc = NULL;
3138 3138          buf->b_private = NULL;
3139 3139          buf->b_hdr = NULL;
3140 3140          buf->b_next = NULL;
3141 3141          kmem_cache_free(buf_cache, buf);
3142 3142          return (1);
3143 3143  }
3144 3144  
3145 3145  /*
3146 3146   * Release this buffer from the cache.  This must be done
3147 3147   * after a read and prior to modifying the buffer contents.
3148 3148   * If the buffer has more than one reference, we must make
3149 3149   * a new hdr for the buffer.
3150 3150   */
3151 3151  void
3152 3152  arc_release(arc_buf_t *buf, void *tag)
3153 3153  {
                   /*
                    * `tag' identifies the caller's reference on buf's hdr.  On
                    * return buf hangs off a hdr in the arc_anon state (a newly
                    * allocated one if the old hdr had other bufs) and buf's
                    * b_efunc/b_private are cleared.  Any L2ARC header attached to
                    * the old hdr is detached and freed.
                    */
3154 3154          arc_buf_hdr_t *hdr;
3155 3155          kmutex_t *hash_lock = NULL;
3156 3156          l2arc_buf_hdr_t *l2hdr;
3157 3157          uint64_t buf_size;
3158 3158  
3159 3159          /*
3160 3160           * It would be nice to assert that if it's DMU metadata (level >
3161 3161           * 0 || it's the dnode file), then it must be syncing context.
3162 3162           * But we don't know that information at this level.
3163 3163           */
3164 3164  
3165 3165          mutex_enter(&buf->b_evict_lock);
3166 3166          hdr = buf->b_hdr;
3167 3167  
3168 3168          /* this buffer is not on any list */
3169 3169          ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3170 3170  
3171 3171          if (hdr->b_state == arc_anon) {
3172 3172                  /* this buffer is already released */
3173 3173                  ASSERT(buf->b_efunc == NULL);
3174 3174          } else {
                           /* hash_lock stays NULL for the already-anonymous case */
3175 3175                  hash_lock = HDR_LOCK(hdr);
3176 3176                  mutex_enter(hash_lock);
3177 3177                  hdr = buf->b_hdr;
3178 3178                  ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3179 3179          }
3180 3180  
                   /*
                    * Detach the L2ARC header now.  l2arc_buflist_mtx is taken here
                    * and held until the header is unlinked and freed at the bottom
                    * of the function.
                    */
3181 3181          l2hdr = hdr->b_l2hdr;
3182 3182          if (l2hdr) {
3183 3183                  mutex_enter(&l2arc_buflist_mtx);
3184 3184                  hdr->b_l2hdr = NULL;
3185 3185          }
3186 3186          buf_size = hdr->b_size;
3187 3187  
3188 3188          /*
3189 3189           * Do we have more than one buf?
3190 3190           */
3191 3191          if (hdr->b_datacnt > 1) {
3192 3192                  arc_buf_hdr_t *nhdr;
3193 3193                  arc_buf_t **bufp;
3194 3194                  uint64_t blksz = hdr->b_size;
3195 3195                  uint64_t spa = hdr->b_spa;
3196 3196                  arc_buf_contents_t type = hdr->b_type;
3197 3197                  uint32_t flags = hdr->b_flags;
3198 3198  
3199 3199                  ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3200 3200                  /*
3201 3201                   * Pull the data off of this hdr and attach it to
3202 3202                   * a new anonymous hdr.
3203 3203                   */
3204 3204                  (void) remove_reference(hdr, hash_lock, tag);
3205 3205                  bufp = &hdr->b_buf;
3206 3206                  while (*bufp != buf)
3207 3207                          bufp = &(*bufp)->b_next;
3208 3208                  *bufp = buf->b_next;
3209 3209                  buf->b_next = NULL;
3210 3210  
                           /*
                            * Take this buf's bytes out of the old state's size
                            * and, when the hdr has no references left, out of the
                            * state's per-type lsize as well.
                            */
3211 3211                  ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3212 3212                  atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3213 3213                  if (refcount_is_zero(&hdr->b_refcnt)) {
3214 3214                          uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3215 3215                          ASSERT3U(*size, >=, hdr->b_size);
3216 3216                          atomic_add_64(size, -hdr->b_size);
3217 3217                  }
3218 3218  
3219 3219                  /*
3220 3220                   * We're releasing a duplicate user data buffer, update
3221 3221                   * our statistics accordingly.
3222 3222                   */
3223 3223                  if (hdr->b_type == ARC_BUFC_DATA) {
3224 3224                          ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3225 3225                          ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3226 3226                              -hdr->b_size);
3227 3227                  }
3228 3228                  hdr->b_datacnt -= 1;
3229 3229                  arc_cksum_verify(buf);
3230 3230                  arc_buf_unwatch(buf);
3231 3231  
3232 3232                  mutex_exit(hash_lock);
3233 3233  
                           /*
                            * Build the new hdr from the values saved above: same
                            * size, spa and type, anonymous state, one buf, and one
                            * reference held under `tag'.  Only the ARC_L2_WRITING
                            * bit of the old flags is carried over.
                            */
3234 3234                  nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3235 3235                  nhdr->b_size = blksz;
3236 3236                  nhdr->b_spa = spa;
3237 3237                  nhdr->b_type = type;
3238 3238                  nhdr->b_buf = buf;
3239 3239                  nhdr->b_state = arc_anon;
3240 3240                  nhdr->b_arc_access = 0;
3241 3241                  nhdr->b_flags = flags & ARC_L2_WRITING;
3242 3242                  nhdr->b_l2hdr = NULL;
3243 3243                  nhdr->b_datacnt = 1;
3244 3244                  nhdr->b_freeze_cksum = NULL;
3245 3245                  (void) refcount_add(&nhdr->b_refcnt, tag);
3246 3246                  buf->b_hdr = nhdr;
3247 3247                  mutex_exit(&buf->b_evict_lock);
3248 3248                  atomic_add_64(&arc_anon->arcs_size, blksz);
3249 3249          } else {
                           /*
                            * buf is the hdr's only buf: keep the hdr, make it
                            * anonymous if it is not already, discard its identity
                            * and thaw the buf.
                            */
3250 3250                  mutex_exit(&buf->b_evict_lock);
3251 3251                  ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3252 3252                  ASSERT(!list_link_active(&hdr->b_arc_node));
3253 3253                  ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3254 3254                  if (hdr->b_state != arc_anon)
3255 3255                          arc_change_state(arc_anon, hdr, hash_lock);
3256 3256                  hdr->b_arc_access = 0;
3257 3257                  if (hash_lock)
3258 3258                          mutex_exit(hash_lock);
3259 3259  
3260 3260                  buf_discard_identity(hdr);
3261 3261                  arc_buf_thaw(buf);
3262 3262          }
3263 3263          buf->b_efunc = NULL;
3264 3264          buf->b_private = NULL;
3265 3265  
                   /*
                    * Finish the L2ARC detach started above; buf_size was sampled
                    * from the original hdr before either branch ran.
                    */
3266 3266          if (l2hdr) {
3267 3267                  list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3268 3268                  kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3269 3269                  ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3270 3270                  mutex_exit(&l2arc_buflist_mtx);
3271 3271          }
3272 3272  }
3273 3273  
3274 3274  int
3275 3275  arc_released(arc_buf_t *buf)
3276 3276  {
                   /*
                    * Report whether buf has data attached (b_data != NULL) and its
                    * header is in the arc_anon state.  Both fields are read while
                    * holding buf->b_evict_lock; the value returned is the snapshot
                    * taken under that lock (nonzero = released).
                    */
3277 3277          int released;
3278 3278  
3279 3279          mutex_enter(&buf->b_evict_lock);
3280 3280          released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3281 3281          mutex_exit(&buf->b_evict_lock);
3282 3282          return (released);
3283 3283  }
3284 3284  
3285 3285  int
3286 3286  arc_has_callback(arc_buf_t *buf)
3287 3287  {
                   /*
                    * Return nonzero if buf has a b_efunc callback set.  The
                    * pointer is sampled while holding buf->b_evict_lock.
                    */
3288 3288          int callback;
3289 3289  
3290 3290          mutex_enter(&buf->b_evict_lock);
3291 3291          callback = (buf->b_efunc != NULL);
3292 3292          mutex_exit(&buf->b_evict_lock);
3293 3293          return (callback);
3294 3294  }
3295 3295  
3296 3296  #ifdef ZFS_DEBUG
3297 3297  int
3298 3298  arc_referenced(arc_buf_t *buf)
3299 3299  {
                   /*
                    * Only built when ZFS_DEBUG is defined.  Returns the current
                    * reference count of buf's header (refcount_count() on
                    * b_hdr->b_refcnt), read while holding buf->b_evict_lock.
                    *
                    * NOTE(review): the count is stored in an int; if
                    * refcount_count() returns a wider type the value is narrowed
                    * here -- confirm that is acceptable for callers.
                    */
3300 3300          int referenced;
3301 3301  
3302 3302          mutex_enter(&buf->b_evict_lock);
3303 3303          referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3304 3304          mutex_exit(&buf->b_evict_lock);
3305 3305          return (referenced);
3306 3306  }
3307 3307  #endif
3308 3308  
3309 3309  static void
3310 3310  arc_write_ready(zio_t *zio)
3311 3311  {
                   /*
                    * Callback handed to zio_write() by arc_write().  zio->io_private
                    * is the arc_write_callback_t built there.  This routine:
                    *   1. invokes the client's awcb_ready callback,
                    *   2. drops any previously frozen checksum if the header is
                    *      already flagged as having I/O in progress,
                    *   3. computes a checksum for the buffer, and
                    *   4. sets ARC_IO_IN_PROGRESS on the header.
                    */
3312 3312          arc_write_callback_t *callback = zio->io_private;
3313 3313          arc_buf_t *buf = callback->awcb_buf;
3314 3314          arc_buf_hdr_t *hdr = buf->b_hdr;
3315 3315  
3316 3316          ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3317 3317          callback->awcb_ready(zio, buf, callback->awcb_private);
3318 3318  
3319 3319          /*
3320 3320           * If the IO is already in progress, then this is a re-write
3321 3321           * attempt, so we need to thaw and re-compute the cksum.
3322 3322           * It is the responsibility of the callback to handle the
3323 3323           * accounting for any re-write attempt.
3324 3324           */
3325 3325          if (HDR_IO_IN_PROGRESS(hdr)) {
                           /* b_freeze_cksum is only touched under b_freeze_lock */
3326 3326                  mutex_enter(&hdr->b_freeze_lock);
3327 3327                  if (hdr->b_freeze_cksum != NULL) {
3328 3328                          kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3329 3329                          hdr->b_freeze_cksum = NULL;
3330 3330                  }
3331 3331                  mutex_exit(&hdr->b_freeze_lock);
3332 3332          }
3333 3333          arc_cksum_compute(buf, B_FALSE);
3334 3334          hdr->b_flags |= ARC_IO_IN_PROGRESS;
3335 3335  }
3336 3336  
3337 3337  static void
3338 3338  arc_write_done(zio_t *zio)
3339 3339  {
                   /*
                    * Completion callback handed to zio_write() by arc_write().
                    * zio->io_private is the arc_write_callback_t allocated there;
                    * it is freed at the end of this function.
                    *
                    * On success the header takes its identity (DVA, birth, first
                    * checksum word) from the written block pointer.  A header that
                    * ends up with an identity is inserted into the hash table,
                    * resolving a collision with an existing header according to the
                    * zio flags.  In every case ARC_IO_IN_PROGRESS is cleared and the
                    * client's awcb_done callback is invoked.
                    */
3340 3340          arc_write_callback_t *callback = zio->io_private;
3341 3341          arc_buf_t *buf = callback->awcb_buf;
3342 3342          arc_buf_hdr_t *hdr = buf->b_hdr;
3343 3343  
3344 3344          ASSERT(hdr->b_acb == NULL);
3345 3345  
3346 3346          if (zio->io_error == 0) {
3347 3347                  hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3348 3348                  hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3349 3349                  hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3350 3350          } else {
3351 3351                  ASSERT(BUF_EMPTY(hdr));
3352 3352          }
3353 3353  
3354 3354          /*
3355 3355           * If the block to be written was all-zero, we may have
3356 3356           * compressed it away.  In this case no write was performed
3357 3357           * so there will be no dva/birth/checksum.  The buffer must
3358 3358           * therefore remain anonymous (and uncached).
3359 3359           */
3360 3360          if (!BUF_EMPTY(hdr)) {
3361 3361                  arc_buf_hdr_t *exists;
3362 3362                  kmutex_t *hash_lock;
3363 3363  
3364 3364                  ASSERT(zio->io_error == 0);
3365 3365  
3366 3366                  arc_cksum_verify(buf);
3367 3367  
                           /*
                            * buf_hash_insert() fills in hash_lock; it is released
                            * with mutex_exit() below, so it is presumably returned
                            * held -- confirm against buf_hash_insert().
                            */
3368 3368                  exists = buf_hash_insert(hdr, &hash_lock);
3369 3369                  if (exists) {
3370 3370                          /*
3371 3371                           * This can only happen if we overwrite for
3372 3372                           * sync-to-convergence, because we remove
3373 3373                           * buffers from the hash table when we arc_free().
3374 3374                           */
3375 3375                          if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3376 3376                                  if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3377 3377                                          panic("bad overwrite, hdr=%p exists=%p",
3378 3378                                              (void *)hdr, (void *)exists);
3379 3379                                  ASSERT(refcount_is_zero(&exists->b_refcnt));
                                           /*
                                            * Retire the old header and insert ours
                                            * again; the second insert must not
                                            * collide.
                                            */
3380 3380                                  arc_change_state(arc_anon, exists, hash_lock);
3381 3381                                  mutex_exit(hash_lock);
3382 3382                                  arc_hdr_destroy(exists);
3383 3383                                  exists = buf_hash_insert(hdr, &hash_lock);
3384 3384                                  ASSERT3P(exists, ==, NULL);
3385 3385                          } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3386 3386                                  /* nopwrite */
3387 3387                                  ASSERT(zio->io_prop.zp_nopwrite);
3388 3388                                  if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3389 3389                                          panic("bad nopwrite, hdr=%p exists=%p",
3390 3390                                              (void *)hdr, (void *)exists);
3391 3391                          } else {
3392 3392                                  /* Dedup */
3393 3393                                  ASSERT(hdr->b_datacnt == 1);
3394 3394                                  ASSERT(hdr->b_state == arc_anon);
3395 3395                                  ASSERT(BP_GET_DEDUP(zio->io_bp));
3396 3396                                  ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3397 3397                          }
3398 3398                  }
3399 3399                  hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3400 3400                  /* if it's not anon, we are doing a scrub */
3401 3401                  if (!exists && hdr->b_state == arc_anon)
3402 3402                          arc_access(hdr, hash_lock);
3403 3403                  mutex_exit(hash_lock);
3404 3404          } else {
3405 3405                  hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3406 3406          }

↓ open down ↓

338 lines elided

↑ open up ↑

3407 3407  
3408 3408          ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3409 3409          callback->awcb_done(zio, buf, callback->awcb_private);
3410 3410  
3411 3411          kmem_free(callback, sizeof (arc_write_callback_t));
3412 3412  }
3413 3413  
3414 3414  zio_t *
3415 3415  arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3416 3416      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3417      -    arc_done_func_t *ready, arc_done_func_t *done, void *private,
     3417 +    arc_done_func_t *ready, arc_done_func_t *done, void *cb_private,
3418 3418      int priority, int zio_flags, const zbookmark_t *zb)
3419 3419  {
                   /*
                    * Create a write zio for buf's data.
                    *
                    * The caller's ready/done callbacks and private argument are
                    * stored in a freshly allocated arc_write_callback_t, which is
                    * passed to zio_write() as the private data for
                    * arc_write_ready() and arc_write_done(); arc_write_done() frees
                    * it.  Both callbacks must be non-NULL, and the header must have
                    * no I/O error, no I/O in progress and no pending b_acb (all
                    * asserted).  If l2arc is set the header is flagged ARC_L2CACHE.
                    *
                    * Returns the zio produced by zio_write().
                    */
3420 3420          arc_buf_hdr_t *hdr = buf->b_hdr;
3421 3421          arc_write_callback_t *callback;
3422 3422          zio_t *zio;
3423 3423  
3424 3424          ASSERT(ready != NULL);
3425 3425          ASSERT(done != NULL);
3426 3426          ASSERT(!HDR_IO_ERROR(hdr));
3427 3427          ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3428 3428          ASSERT(hdr->b_acb == NULL);
3429 3429          if (l2arc)
3430 3430                  hdr->b_flags |= ARC_L2CACHE;
3431 3431          callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3432 3432          callback->awcb_ready = ready;
3433 3433          callback->awcb_done = done;
3434      -        callback->awcb_private = private;
     3434 +        callback->awcb_private = cb_private;
3435 3435          callback->awcb_buf = buf;
3436 3436  
3437 3437          zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3438 3438              arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3439 3439  
3440 3440          return (zio);
3441 3441  }
3442 3442  
3443 3443  static int
3444 3444  arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)

3445 3445  {
                   /*
                    * Decide whether a write reservation should be refused because
                    * memory is short.
                    *
                    * Returns 0 when the reservation may proceed, ERESTART when the
                    * pageout load or the in-flight data exceeds a quarter of the
                    * memory considered available, and EAGAIN when pageout load has
                    * been recorded for this txg and arc_reclaim_needed() is true.
                    * Outside of _KERNEL builds this always returns 0.
                    *
                    * page_load and last_txg are function-static and are reset
                    * whenever a newer txg is seen.
                    * NOTE(review): no lock is taken around these statics in this
                    * function -- confirm callers tolerate or serialize the updates.
                    */
3446 3446  #ifdef _KERNEL
3447 3447          uint64_t available_memory = ptob(freemem);
3448 3448          static uint64_t page_load = 0;
3449 3449          static uint64_t last_txg = 0;
3450 3450  
3451 3451  #if defined(__i386)
3452 3452          available_memory =
3453 3453              MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3454 3454  #endif
                   /* plenty of free memory: no need to throttle at all */
3455 3455          if (available_memory >= zfs_write_limit_max)
3456 3456                  return (0);
3457 3457  
3458 3458          if (txg > last_txg) {
3459 3459                  last_txg = txg;
3460 3460                  page_load = 0;
3461 3461          }
3462 3462          /*
3463 3463           * If we are in pageout, we know that memory is already tight,
3464 3464           * the arc is already going to be evicting, so we just want to
3465 3465           * continue to let page writes occur as quickly as possible.
3466 3466           */
3467 3467          if (curproc == proc_pageout) {
3468 3468                  if (page_load > MAX(ptob(minfree), available_memory) / 4)
3469 3469                          return (SET_ERROR(ERESTART));
3470 3470                  /* Note: reserve is inflated, so we deflate */
3471 3471                  page_load += reserve / 8;
3472 3472                  return (0);
3473 3473          } else if (page_load > 0 && arc_reclaim_needed()) {
3474 3474                  /* memory is low, delay before restarting */
3475 3475                  ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3476 3476                  return (SET_ERROR(EAGAIN));
3477 3477          }
3478 3478          page_load = 0;
3479 3479  
                   /*
                    * Count evictable MRU/MFU data and metadata as available too,
                    * but only as far as the ARC can shrink before reaching arc_c_min.
                    */
3480 3480          if (arc_size > arc_c_min) {
3481 3481                  uint64_t evictable_memory =
3482 3482                      arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3483 3483                      arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3484 3484                      arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3485 3485                      arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3486 3486                  available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3487 3487          }
3488 3488  
3489 3489          if (inflight_data > available_memory / 4) {
3490 3490                  ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3491 3491                  return (SET_ERROR(ERESTART));
3492 3492          }
3493 3493  #endif
3494 3494          return (0);
3495 3495  }
3496 3496  
3497 3497  void
3498 3498  arc_tempreserve_clear(uint64_t reserve)
3499 3499  {
                   /*
                    * Give back `reserve' bytes previously added to arc_tempreserve.
                    * reserve is unsigned, so adding its negation subtracts it via
                    * modular 64-bit arithmetic.  The assertion checks that the
                    * counter, viewed as signed, has not dropped below zero.
                    */
3500 3500          atomic_add_64(&arc_tempreserve, -reserve);
3501 3501          ASSERT((int64_t)arc_tempreserve >= 0);
3502 3502  }
3503 3503  
3504 3504  int
3505 3505  arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3506 3506  {
                   /*
                    * Try to reserve `reserve' bytes of ARC space for dirty data
                    * belonging to txg.
                    *
                    * Returns 0 after adding reserve to arc_tempreserve, or:
                    *   ENOMEM    reserve is larger than arc_c (after trying to
                    *             grow arc_c toward reserve * 4, capped at
                    *             arc_c_max),
                    *   ERESTART  too much dirty data is cached, or (ZFS_DEBUG
                    *             only) a deliberate random failure,
                    *   other     whatever arc_memory_throttle() returned.
                    */
3507 3507          int error;
3508 3508          uint64_t anon_size;
3509 3509  
3510 3510  #ifdef ZFS_DEBUG
3511 3511          /*
3512 3512           * Once in a while, fail for no reason.  Everything should cope.
3513 3513           */
3514 3514          if (spa_get_random(10000) == 0) {
3515 3515                  dprintf("forcing random failure\n");
3516 3516                  return (SET_ERROR(ERESTART));
3517 3517          }
3518 3518  #endif
3519 3519          if (reserve > arc_c/4 && !arc_no_grow)
3520 3520                  arc_c = MIN(arc_c_max, reserve * 4);
3521 3521          if (reserve > arc_c)
3522 3522                  return (SET_ERROR(ENOMEM));
3523 3523  
3524 3524          /*
3525 3525           * Don't count loaned bufs as in flight dirty data to prevent long
3526 3526           * network delays from blocking transactions that are ready to be
3527 3527           * assigned to a txg.
3528 3528           */
3529 3529          anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3530 3530  
3531 3531          /*
3532 3532           * Writes will, almost always, require additional memory allocations
3533 3533           * in order to compress/encrypt/etc the data.  We therefore need to
3534 3534           * make sure that there is sufficient available memory for this.
3535 3535           */
                   /* the assignment inside the condition is intentional */
3536 3536          if (error = arc_memory_throttle(reserve, anon_size, txg))
3537 3537                  return (error);
3538 3538  
3539 3539          /*
3540 3540           * Throttle writes when the amount of dirty data in the cache
3541 3541           * gets too large.  We try to keep the cache less than half full
3542 3542           * of dirty blocks so that our sync times don't grow too large.
3543 3543           * Note: if two requests come in concurrently, we might let them
3544 3544           * both succeed, when one of them should fail.  Not a huge deal.
3545 3545           */
3546 3546  
3547 3547          if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3548 3548              anon_size > arc_c / 4) {
3549 3549                  dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3550 3550                      "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3551 3551                      arc_tempreserve>>10,
3552 3552                      arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3553 3553                      arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3554 3554                      reserve>>10, arc_c>>10);
3555 3555                  return (SET_ERROR(ERESTART));
3556 3556          }
3557 3557          atomic_add_64(&arc_tempreserve, reserve);
3558 3558          return (0);
3559 3559  }
3560 3560  
3561 3561  void
3562 3562  arc_init(void)
3563 3563  {
                   /*
                    * One-time ARC setup.  In order, this:
                    *   - initializes the reclaim thread's lock and cv,
                    *   - derives arc_c, arc_c_min, arc_c_max, arc_p and
                    *     arc_meta_limit from physical memory, then applies the
                    *     zfs_arc_* tunables where they pass the sanity checks,
                    *   - points the arc_* state pointers at their static state
                    *     structures and creates each state's mutex and lists,
                    *   - calls buf_init() and resets the eviction list,
                    *   - creates and installs the "arcstats" kstat,
                    *   - starts arc_reclaim_thread, and
                    *   - sets up zfs_write_limit_max and its lock.
                    */
3564 3564          mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3565 3565          cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3566 3566  
3567 3567          /* Convert seconds to clock ticks */
3568 3568          arc_min_prefetch_lifespan = 1 * hz;
3569 3569  
3570 3570          /* Start out with 1/8 of all memory */
3571 3571          arc_c = physmem * PAGESIZE / 8;
3572 3572  
3573 3573  #ifdef _KERNEL
3574 3574          /*
3575 3575           * On architectures where the physical memory can be larger
3576 3576           * than the addressable space (intel in 32-bit mode), we may
3577 3577           * need to limit the cache to 1/8 of VM size.
3578 3578           */
3579 3579          arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3580 3580  #endif
3581 3581  
3582 3582          /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3583 3583          arc_c_min = MAX(arc_c / 4, 64<<20);
3584 3584          /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3585 3585          if (arc_c * 8 >= 1<<30)
3586 3586                  arc_c_max = (arc_c * 8) - (1<<30);
3587 3587          else
3588 3588                  arc_c_max = arc_c_min;
3589 3589          arc_c_max = MAX(arc_c * 6, arc_c_max);
3590 3590  
3591 3591          /*
3592 3592           * Allow the tunables to override our calculations if they are
3593 3593           * reasonable (i.e. over 64MB)
3594 3594           */
3595 3595          if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3596 3596                  arc_c_max = zfs_arc_max;
3597 3597          if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3598 3598                  arc_c_min = zfs_arc_min;
3599 3599  
                   /* start with the target at its maximum, split evenly for arc_p */
3600 3600          arc_c = arc_c_max;
3601 3601          arc_p = (arc_c >> 1);
3602 3602  
3603 3603          /* limit meta-data to 1/4 of the arc capacity */
3604 3604          arc_meta_limit = arc_c_max / 4;
3605 3605  
3606 3606          /* Allow the tunable to override if it is reasonable */
3607 3607          if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3608 3608                  arc_meta_limit = zfs_arc_meta_limit;
3609 3609  
                   /*
                    * Unless zfs_arc_min was set, keep the minimum at no less than
                    * half of the metadata limit.
                    */
3610 3610          if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3611 3611                  arc_c_min = arc_meta_limit / 2;
3612 3612  
3613 3613          if (zfs_arc_grow_retry > 0)
3614 3614                  arc_grow_retry = zfs_arc_grow_retry;
3615 3615  
3616 3616          if (zfs_arc_shrink_shift > 0)
3617 3617                  arc_shrink_shift = zfs_arc_shrink_shift;
3618 3618  
3619 3619          if (zfs_arc_p_min_shift > 0)
3620 3620                  arc_p_min_shift = zfs_arc_p_min_shift;
3621 3621  
3622 3622          /* if kmem_flags are set, let's try to use less memory */
3623 3623          if (kmem_debugging())
3624 3624                  arc_c = arc_c / 2;
3625 3625          if (arc_c < arc_c_min)
3626 3626                  arc_c = arc_c_min;
3627 3627  
3628 3628          arc_anon = &ARC_anon;
3629 3629          arc_mru = &ARC_mru;
3630 3630          arc_mru_ghost = &ARC_mru_ghost;
3631 3631          arc_mfu = &ARC_mfu;
3632 3632          arc_mfu_ghost = &ARC_mfu_ghost;
3633 3633          arc_l2c_only = &ARC_l2c_only;
3634 3634          arc_size = 0;
3635 3635  
3636 3636          mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3637 3637          mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3638 3638          mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3639 3639          mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3640 3640          mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3641 3641          mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3642 3642  
                   /*
                    * One metadata list and one data list per state (arc_anon gets
                    * none), each linking arc_buf_hdr_t through b_arc_node.
                    */
3643 3643          list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3644 3644              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3645 3645          list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3646 3646              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3647 3647          list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3648 3648              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3649 3649          list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3650 3650              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3651 3651          list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3652 3652              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3653 3653          list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3654 3654              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3655 3655          list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3656 3656              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3657 3657          list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3658 3658              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3659 3659          list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3660 3660              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3661 3661          list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3662 3662              sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3663 3663  
3664 3664          buf_init();
3665 3665  
3666 3666          arc_thread_exit = 0;
3667 3667          arc_eviction_list = NULL;
3668 3668          mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3669 3669          bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3670 3670  
3671 3671          arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3672 3672              sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3673 3673  
                   /* kstat creation may fail; the ARC simply runs without stats */
3674 3674          if (arc_ksp != NULL) {
3675 3675                  arc_ksp->ks_data = &arc_stats;
3676 3676                  kstat_install(arc_ksp);
3677 3677          }
3678 3678  
3679 3679          (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3680 3680              TS_RUN, minclsyspri);
3681 3681  
3682 3682          arc_dead = FALSE;
3683 3683          arc_warm = B_FALSE;
3684 3684  
                   /*
                    * Derive the write limit from physical memory unless it was
                    * preset; a preset limit zeroes the shift instead.
                    */
3685 3685          if (zfs_write_limit_max == 0)
3686 3686                  zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3687 3687          else
3688 3688                  zfs_write_limit_shift = 0;
3689 3689          mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3690 3690  }
3691 3691  
3692 3692  void
3693 3693  arc_fini(void)
3694 3694  {
                   /*
                    * Tear down ARC global state: stop the reclaim thread, flush the
                    * cache, remove the kstat, destroy the locks, cv and state lists,
                    * and call buf_fini().
                    */

                   /*
                    * Request reclaim-thread exit and wait until arc_thread_exit
                    * reads 0 again (presumably cleared by the thread as it exits --
                    * confirm against arc_reclaim_thread()).
                    */
3695 3695          mutex_enter(&arc_reclaim_thr_lock);
3696 3696          arc_thread_exit = 1;
3697 3697          while (arc_thread_exit != 0)
3698 3698                  cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3699 3699          mutex_exit(&arc_reclaim_thr_lock);
3700 3700  
3701 3701          arc_flush(NULL);
3702 3702  
3703 3703          arc_dead = TRUE;
3704 3704  
3705 3705          if (arc_ksp != NULL) {
3706 3706                  kstat_delete(arc_ksp);
3707 3707                  arc_ksp = NULL;
3708 3708          }
3709 3709  
3710 3710          mutex_destroy(&arc_eviction_mtx);
3711 3711          mutex_destroy(&arc_reclaim_thr_lock);
3712 3712          cv_destroy(&arc_reclaim_thr_cv);
3713 3713  
                   /*
                    * NOTE(review): arc_init() also creates the two arc_l2c_only
                    * lists, but there is no matching list_destroy() for them here
                    * -- confirm whether that omission is intentional.
                    */
3714 3714          list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3715 3715          list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3716 3716          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3717 3717          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3718 3718          list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3719 3719          list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3720 3720          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3721 3721          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3722 3722  
3723 3723          mutex_destroy(&arc_anon->arcs_mtx);
3724 3724          mutex_destroy(&arc_mru->arcs_mtx);
3725 3725          mutex_destroy(&arc_mru_ghost->arcs_mtx);
3726 3726          mutex_destroy(&arc_mfu->arcs_mtx);
3727 3727          mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3728 3728          mutex_destroy(&arc_l2c_only->arcs_mtx);
3729 3729  
3730 3730          mutex_destroy(&zfs_write_limit_lock);
3731 3731  
3732 3732          buf_fini();
3733 3733  
                   /* every loaned buffer must have been returned by now */
3734 3734          ASSERT(arc_loaned_bytes == 0);
3735 3735  }
3736 3736  
3737 3737  /*
3738 3738   * Level 2 ARC
3739 3739   *
3740 3740   * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3741 3741   * It uses dedicated storage devices to hold cached data, which are populated
3742 3742   * using large infrequent writes.  The main role of this cache is to boost
3743 3743   * the performance of random read workloads.  The intended L2ARC devices
3744 3744   * include short-stroked disks, solid state disks, and other media with
3745 3745   * substantially faster read latency than disk.
3746 3746   *
3747 3747   *                 +-----------------------+
3748 3748   *                 |         ARC           |
3749 3749   *                 +-----------------------+
3750 3750   *                    |         ^     ^
3751 3751   *                    |         |     |
3752 3752   *      l2arc_feed_thread()    arc_read()
3753 3753   *                    |         |     |
3754 3754   *                    |  l2arc read   |
3755 3755   *                    V         |     |
3756 3756   *               +---------------+    |
3757 3757   *               |     L2ARC     |    |
3758 3758   *               +---------------+    |
3759 3759   *                   |    ^           |
3760 3760   *          l2arc_write() |           |
3761 3761   *                   |    |           |
3762 3762   *                   V    |           |
3763 3763   *                 +-------+      +-------+
3764 3764   *                 | vdev  |      | vdev  |
3765 3765   *                 | cache |      | cache |
3766 3766   *                 +-------+      +-------+
3767 3767   *                 +=========+     .-----.
3768 3768   *                 :  L2ARC  :    |-_____-|
3769 3769   *                 : devices :    | Disks |
3770 3770   *                 +=========+    `-_____-'
3771 3771   *
3772 3772   * Read requests are satisfied from the following sources, in order:
3773 3773   *
3774 3774   *      1) ARC
3775 3775   *      2) vdev cache of L2ARC devices
3776 3776   *      3) L2ARC devices
3777 3777   *      4) vdev cache of disks
3778 3778   *      5) disks
3779 3779   *
3780 3780   * Some L2ARC device types exhibit extremely slow write performance.
3781 3781   * To accommodate this, there are some significant differences between
3782 3782   * the L2ARC and traditional cache design:
3783 3783   *
3784 3784   * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3785 3785   * the ARC behave as usual, freeing buffers and placing headers on ghost
3786 3786   * lists.  The ARC does not send buffers to the L2ARC during eviction as
3787 3787   * this would add inflated write latencies for all ARC memory pressure.
3788 3788   *
3789 3789   * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3790 3790   * It does this by periodically scanning buffers from the eviction-end of
3791 3791   * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3792 3792   * not already there.  It scans until a headroom of buffers is satisfied,
3793 3793   * which itself is a buffer for ARC eviction.  The thread that does this is
3794 3794   * l2arc_feed_thread(), illustrated below; example sizes are included to
3795 3795   * provide a better sense of ratio than this diagram:
3796 3796   *
3797 3797   *             head -->                        tail
3798 3798   *              +---------------------+----------+
3799 3799   *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3800 3800   *              +---------------------+----------+   |   o L2ARC eligible
3801 3801   *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3802 3802   *              +---------------------+----------+   |
3803 3803   *                   15.9 Gbytes      ^ 32 Mbytes    |
3804 3804   *                                 headroom          |
3805 3805   *                                            l2arc_feed_thread()
3806 3806   *                                                   |
3807 3807   *                       l2arc write hand <--[oooo]--'
3808 3808   *                               |           8 Mbyte
3809 3809   *                               |          write max
3810 3810   *                               V
3811 3811   *                +==============================+
3812 3812   *      L2ARC dev |####|#|###|###|    |####| ... |
3813 3813   *                +==============================+
3814 3814   *                           32 Gbytes
3815 3815   *
3816 3816   * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3817 3817   * evicted, then the L2ARC has cached a buffer much sooner than it probably
3818 3818   * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3819 3819   * safe to say that this is an uncommon case, since buffers at the end of
3820 3820   * the ARC lists have moved there due to inactivity.
3821 3821   *
3822 3822   * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3823 3823   * then the L2ARC simply misses copying some buffers.  This serves as a
3824 3824   * pressure valve to prevent heavy read workloads from both stalling the ARC
3825 3825   * with waits and clogging the L2ARC with writes.  This also helps prevent
3826 3826   * the potential for the L2ARC to churn if it attempts to cache content too
3827 3827   * quickly, such as during backups of the entire pool.
3828 3828   *
3829 3829   * 5. After system boot and before the ARC has filled main memory, there are
3830 3830   * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3831 3831   * lists can remain mostly static.  Instead of searching from tail of these
3832 3832   * lists as pictured, the l2arc_feed_thread() will search from the list heads
3833 3833   * for eligible buffers, greatly increasing its chance of finding them.
3834 3834   *
3835 3835   * The L2ARC device write speed is also boosted during this time so that
3836 3836   * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3837 3837   * there are no L2ARC reads, and no fear of degrading read performance
3838 3838   * through increased writes.
3839 3839   *
3840 3840   * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3841 3841   * the vdev queue can aggregate them into larger and fewer writes.  Each
3842 3842   * device is written to in a rotor fashion, sweeping writes through
3843 3843   * available space then repeating.
3844 3844   *
3845 3845   * 7. The L2ARC does not store dirty content.  It never needs to flush
3846 3846   * write buffers back to disk based storage.
3847 3847   *
3848 3848   * 8. If an ARC buffer is written (and dirtied) which also exists in the
3849 3849   * L2ARC, the now stale L2ARC buffer is immediately dropped.
3850 3850   *
3851 3851   * The performance of the L2ARC can be tweaked by a number of tunables, which
3852 3852   * may be necessary for different workloads:
3853 3853   *
3854 3854   *      l2arc_write_max         max write bytes per interval
3855 3855   *      l2arc_write_boost       extra write bytes during device warmup
3856 3856   *      l2arc_noprefetch        skip caching prefetched buffers
3857 3857   *      l2arc_headroom          number of max device writes to precache
3858 3858   *      l2arc_feed_secs         seconds between L2ARC writing
3859 3859   *
3860 3860   * Tunables may be removed or added as future performance improvements are
3861 3861   * integrated, and also may become zpool properties.
3862 3862   *
3863 3863   * There are three key functions that control how the L2ARC warms up:
3864 3864   *
3865 3865   *      l2arc_write_eligible()  check if a buffer is eligible to cache
3866 3866   *      l2arc_write_size()      calculate how much to write
3867 3867   *      l2arc_write_interval()  calculate sleep delay between writes
3868 3868   *
3869 3869   * These three functions determine what to write, how much, and how quickly
3870 3870   * to send writes.
3871 3871   */
3872 3872  
3873 3873  static boolean_t
3874 3874  l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3875 3875  {
3876 3876          /*
3877 3877           * A buffer is *not* eligible for the L2ARC if it:
3878 3878           * 1. belongs to a different spa.
3879 3879           * 2. is already cached on the L2ARC.
3880 3880           * 3. has an I/O in progress (it may be an incomplete read).
3881 3881           * 4. is flagged not eligible (zfs property).
3882 3882           */
3883 3883          if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3884 3884              HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3885 3885                  return (B_FALSE);
3886 3886  
                   /* none of the disqualifying conditions above applies */
3887 3887          return (B_TRUE);
3888 3888  }
3889 3889  
3890 3890  static uint64_t
3891 3891  l2arc_write_size(l2arc_dev_t *dev)
3892 3892  {
                   /*
                    * Return how much to write to dev in one pass: the device's
                    * l2ad_write, plus l2ad_boost for as long as arc_warm is still
                    * B_FALSE.
                    */
3893 3893          uint64_t size;
3894 3894  
3895 3895          size = dev->l2ad_write;
3896 3896  
3897 3897          if (arc_warm == B_FALSE)
3898 3898                  size += dev->l2ad_boost;
3899 3899  
3900 3900          return (size);
3901 3901  
3902 3902  }
3903 3903  
3904 3904  static clock_t
3905 3905  l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3906 3906  {
                   /*
                    * Compute the lbolt tick value at which the next L2ARC write
                    * should start.
                    *
                    *   began   tick at which the previous write pass started
                    *   wanted  amount the previous pass intended to write
                    *   wrote   amount the previous pass actually wrote
                    */
3907 3907          clock_t interval, next, now;
3908 3908  
3909 3909          /*
3910 3910           * If the ARC lists are busy, increase our write rate; if the
3911 3911           * lists are stale, idle back.  This is achieved by checking
3912 3912           * how much we previously wrote - if it was more than half of
3913 3913           * what we wanted, schedule the next write much sooner.
3914 3914           */
3915 3915          if (l2arc_feed_again && wrote > (wanted / 2))
3916 3916                  interval = (hz * l2arc_feed_min_ms) / 1000;
3917 3917          else
3918 3918                  interval = hz * l2arc_feed_secs;
3919 3919  
                   /*
                    * Measure the interval from whichever of `began' and `now' is
                    * earlier, but never schedule the next write in the past.
                    */
3920 3920          now = ddi_get_lbolt();
3921 3921          next = MAX(now, MIN(now + interval, began + interval));
3922 3922  
3923 3923          return (next);
3924 3924  }
3925 3925  
3926 3926  static void
3927 3927  l2arc_hdr_stat_add(void)
3928 3928  {
3929 3929          ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3930 3930          ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3931 3931  }
3932 3932  
3933 3933  static void
3934 3934  l2arc_hdr_stat_remove(void)
3935 3935  {
3936 3936          ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3937 3937          ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3938 3938  }
3939 3939  
3940 3940  /*
3941 3941   * Cycle through L2ARC devices.  This is how L2ARC load balances.
3942 3942   * If a device is returned, this also returns holding the spa config lock.
3943 3943   */
3944 3944  static l2arc_dev_t *
3945 3945  l2arc_dev_get_next(void)
3946 3946  {
3947 3947          l2arc_dev_t *first, *next = NULL;
3948 3948  
3949 3949          /*
3950 3950           * Lock out the removal of spas (spa_namespace_lock), then removal
3951 3951           * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3952 3952           * both locks will be dropped and a spa config lock held instead.
3953 3953           */
3954 3954          mutex_enter(&spa_namespace_lock);
3955 3955          mutex_enter(&l2arc_dev_mtx);
3956 3956  
3957 3957          /* if there are no vdevs, there is nothing to do */
3958 3958          if (l2arc_ndev == 0)
3959 3959                  goto out;
3960 3960  
3961 3961          first = NULL;
3962 3962          next = l2arc_dev_last;
3963 3963          do {
3964 3964                  /* loop around the list looking for a non-faulted vdev */
3965 3965                  if (next == NULL) {
3966 3966                          next = list_head(l2arc_dev_list);
3967 3967                  } else {
3968 3968                          next = list_next(l2arc_dev_list, next);
3969 3969                          if (next == NULL)
3970 3970                                  next = list_head(l2arc_dev_list);
3971 3971                  }
3972 3972  
3973 3973                  /* if we have come back to the start, bail out */
3974 3974                  if (first == NULL)
3975 3975                          first = next;
3976 3976                  else if (next == first)
3977 3977                          break;
3978 3978  
3979 3979          } while (vdev_is_dead(next->l2ad_vdev));
3980 3980  
3981 3981          /* if we were unable to find any usable vdevs, return NULL */
3982 3982          if (vdev_is_dead(next->l2ad_vdev))
3983 3983                  next = NULL;
3984 3984  
3985 3985          l2arc_dev_last = next;
3986 3986  
3987 3987  out:
3988 3988          mutex_exit(&l2arc_dev_mtx);
3989 3989  
3990 3990          /*
3991 3991           * Grab the config lock to prevent the 'next' device from being
3992 3992           * removed while we are writing to it.
3993 3993           */
3994 3994          if (next != NULL)
3995 3995                  spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3996 3996          mutex_exit(&spa_namespace_lock);
3997 3997  
3998 3998          return (next);
3999 3999  }
4000 4000  
4001 4001  /*
4002 4002   * Free buffers that were tagged for destruction.
4003 4003   */
4004 4004  static void
4005 4005  l2arc_do_free_on_write()
4006 4006  {
4007 4007          list_t *buflist;
4008 4008          l2arc_data_free_t *df, *df_prev;
4009 4009  
4010 4010          mutex_enter(&l2arc_free_on_write_mtx);
4011 4011          buflist = l2arc_free_on_write;
4012 4012  
4013 4013          for (df = list_tail(buflist); df; df = df_prev) {
4014 4014                  df_prev = list_prev(buflist, df);
4015 4015                  ASSERT(df->l2df_data != NULL);
4016 4016                  ASSERT(df->l2df_func != NULL);
4017 4017                  df->l2df_func(df->l2df_data, df->l2df_size);
4018 4018                  list_remove(buflist, df);
4019 4019                  kmem_free(df, sizeof (l2arc_data_free_t));
4020 4020          }
4021 4021  
4022 4022          mutex_exit(&l2arc_free_on_write_mtx);
4023 4023  }
4024 4024  
4025 4025  /*
4026 4026   * A write to a cache device has completed.  Update all headers to allow
4027 4027   * reads from these buffers to begin.
4028 4028   */
4029 4029  static void
4030 4030  l2arc_write_done(zio_t *zio)
4031 4031  {
4032 4032          l2arc_write_callback_t *cb;
4033 4033          l2arc_dev_t *dev;
4034 4034          list_t *buflist;
4035 4035          arc_buf_hdr_t *head, *ab, *ab_prev;
4036 4036          l2arc_buf_hdr_t *abl2;
4037 4037          kmutex_t *hash_lock;
4038 4038  
4039 4039          cb = zio->io_private;
4040 4040          ASSERT(cb != NULL);
4041 4041          dev = cb->l2wcb_dev;
4042 4042          ASSERT(dev != NULL);
4043 4043          head = cb->l2wcb_head;
4044 4044          ASSERT(head != NULL);
4045 4045          buflist = dev->l2ad_buflist;
4046 4046          ASSERT(buflist != NULL);
4047 4047          DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4048 4048              l2arc_write_callback_t *, cb);
4049 4049  
4050 4050          if (zio->io_error != 0)
4051 4051                  ARCSTAT_BUMP(arcstat_l2_writes_error);
4052 4052  
4053 4053          mutex_enter(&l2arc_buflist_mtx);
4054 4054  
4055 4055          /*
4056 4056           * All writes completed, or an error was hit.
4057 4057           */
4058 4058          for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4059 4059                  ab_prev = list_prev(buflist, ab);
4060 4060  
4061 4061                  hash_lock = HDR_LOCK(ab);
4062 4062                  if (!mutex_tryenter(hash_lock)) {
4063 4063                          /*
4064 4064                           * This buffer misses out.  It may be in a stage
4065 4065                           * of eviction.  Its ARC_L2_WRITING flag will be
4066 4066                           * left set, denying reads to this buffer.
4067 4067                           */
4068 4068                          ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4069 4069                          continue;
4070 4070                  }
4071 4071  
4072 4072                  if (zio->io_error != 0) {
4073 4073                          /*
4074 4074                           * Error - drop L2ARC entry.
4075 4075                           */
4076 4076                          list_remove(buflist, ab);
4077 4077                          abl2 = ab->b_l2hdr;
4078 4078                          ab->b_l2hdr = NULL;
4079 4079                          kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4080 4080                          ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4081 4081                  }
4082 4082  
4083 4083                  /*
4084 4084                   * Allow ARC to begin reads to this L2ARC entry.
4085 4085                   */
4086 4086                  ab->b_flags &= ~ARC_L2_WRITING;
4087 4087  
4088 4088                  mutex_exit(hash_lock);
4089 4089          }
4090 4090  
4091 4091          atomic_inc_64(&l2arc_writes_done);
4092 4092          list_remove(buflist, head);
4093 4093          kmem_cache_free(hdr_cache, head);
4094 4094          mutex_exit(&l2arc_buflist_mtx);
4095 4095  
4096 4096          l2arc_do_free_on_write();
4097 4097  
4098 4098          kmem_free(cb, sizeof (l2arc_write_callback_t));
4099 4099  }
4100 4100  
4101 4101  /*
4102 4102   * A read to a cache device completed.  Validate buffer contents before
4103 4103   * handing over to the regular ARC routines.
4104 4104   */
4105 4105  static void
4106 4106  l2arc_read_done(zio_t *zio)
4107 4107  {
4108 4108          l2arc_read_callback_t *cb;
4109 4109          arc_buf_hdr_t *hdr;
4110 4110          arc_buf_t *buf;
4111 4111          kmutex_t *hash_lock;
4112 4112          int equal;
4113 4113  
4114 4114          ASSERT(zio->io_vd != NULL);
4115 4115          ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4116 4116  
4117 4117          spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4118 4118  
4119 4119          cb = zio->io_private;
4120 4120          ASSERT(cb != NULL);
4121 4121          buf = cb->l2rcb_buf;
4122 4122          ASSERT(buf != NULL);
4123 4123  
4124 4124          hash_lock = HDR_LOCK(buf->b_hdr);
4125 4125          mutex_enter(hash_lock);
4126 4126          hdr = buf->b_hdr;
4127 4127          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4128 4128  
4129 4129          /*
4130 4130           * Check this survived the L2ARC journey.
4131 4131           */
4132 4132          equal = arc_cksum_equal(buf);
4133 4133          if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4134 4134                  mutex_exit(hash_lock);
4135 4135                  zio->io_private = buf;
4136 4136                  zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4137 4137                  zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
4138 4138                  arc_read_done(zio);
4139 4139          } else {
4140 4140                  mutex_exit(hash_lock);
4141 4141                  /*
4142 4142                   * Buffer didn't survive caching.  Increment stats and
4143 4143                   * reissue to the original storage device.
4144 4144                   */
4145 4145                  if (zio->io_error != 0) {
4146 4146                          ARCSTAT_BUMP(arcstat_l2_io_error);
4147 4147                  } else {
4148 4148                          zio->io_error = SET_ERROR(EIO);
4149 4149                  }
4150 4150                  if (!equal)
4151 4151                          ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4152 4152  
4153 4153                  /*
4154 4154                   * If there's no waiter, issue an async i/o to the primary
4155 4155                   * storage now.  If there *is* a waiter, the caller must
4156 4156                   * issue the i/o in a context where it's OK to block.
4157 4157                   */
4158 4158                  if (zio->io_waiter == NULL) {
4159 4159                          zio_t *pio = zio_unique_parent(zio);
4160 4160  
4161 4161                          ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4162 4162  
4163 4163                          zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4164 4164                              buf->b_data, zio->io_size, arc_read_done, buf,
4165 4165                              zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4166 4166                  }
4167 4167          }
4168 4168  
4169 4169          kmem_free(cb, sizeof (l2arc_read_callback_t));
4170 4170  }
4171 4171  
4172 4172  /*
4173 4173   * This is the list priority from which the L2ARC will search for pages to
4174 4174   * cache.  This is used within loops (0..3) to cycle through lists in the
4175 4175   * desired order.  This order can have a significant effect on cache
4176 4176   * performance.
4177 4177   *
4178 4178   * Currently the metadata lists are hit first, MFU then MRU, followed by
4179 4179   * the data lists.  This function returns a locked list, and also returns
4180 4180   * the lock pointer.
4181 4181   */
4182 4182  static list_t *
4183 4183  l2arc_list_locked(int list_num, kmutex_t **lock)
4184 4184  {
4185 4185          list_t *list = NULL;
4186 4186  
4187 4187          ASSERT(list_num >= 0 && list_num <= 3);
4188 4188  
4189 4189          switch (list_num) {
4190 4190          case 0:
4191 4191                  list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4192 4192                  *lock = &arc_mfu->arcs_mtx;
4193 4193                  break;
4194 4194          case 1:
4195 4195                  list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4196 4196                  *lock = &arc_mru->arcs_mtx;
4197 4197                  break;
4198 4198          case 2:
4199 4199                  list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4200 4200                  *lock = &arc_mfu->arcs_mtx;
4201 4201                  break;
4202 4202          case 3:
4203 4203                  list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4204 4204                  *lock = &arc_mru->arcs_mtx;
4205 4205                  break;
4206 4206          }
4207 4207  
4208 4208          ASSERT(!(MUTEX_HELD(*lock)));
4209 4209          mutex_enter(*lock);
4210 4210          return (list);
4211 4211  }
4212 4212  
4213 4213  /*
4214 4214   * Evict buffers from the device write hand to the distance specified in
4215 4215   * bytes.  This distance may span populated buffers, it may span nothing.
4216 4216   * This is clearing a region on the L2ARC device ready for writing.
4217 4217   * If the 'all' boolean is set, every buffer is evicted.
4218 4218   */
4219 4219  static void
4220 4220  l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4221 4221  {
4222 4222          list_t *buflist;
4223 4223          l2arc_buf_hdr_t *abl2;
4224 4224          arc_buf_hdr_t *ab, *ab_prev;
4225 4225          kmutex_t *hash_lock;
4226 4226          uint64_t taddr;
4227 4227  
4228 4228          buflist = dev->l2ad_buflist;
4229 4229  
4230 4230          if (buflist == NULL)
4231 4231                  return;
4232 4232  
4233 4233          if (!all && dev->l2ad_first) {
4234 4234                  /*
4235 4235                   * This is the first sweep through the device.  There is
4236 4236                   * nothing to evict.
4237 4237                   */
4238 4238                  return;
4239 4239          }
4240 4240  
4241 4241          if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4242 4242                  /*
4243 4243                   * When nearing the end of the device, evict to the end
4244 4244                   * before the device write hand jumps to the start.
4245 4245                   */
4246 4246                  taddr = dev->l2ad_end;
4247 4247          } else {
4248 4248                  taddr = dev->l2ad_hand + distance;
4249 4249          }
4250 4250          DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4251 4251              uint64_t, taddr, boolean_t, all);
4252 4252  
4253 4253  top:
4254 4254          mutex_enter(&l2arc_buflist_mtx);
4255 4255          for (ab = list_tail(buflist); ab; ab = ab_prev) {
4256 4256                  ab_prev = list_prev(buflist, ab);
4257 4257  
4258 4258                  hash_lock = HDR_LOCK(ab);
4259 4259                  if (!mutex_tryenter(hash_lock)) {
4260 4260                          /*
4261 4261                           * Missed the hash lock.  Retry.
4262 4262                           */
4263 4263                          ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4264 4264                          mutex_exit(&l2arc_buflist_mtx);
4265 4265                          mutex_enter(hash_lock);
4266 4266                          mutex_exit(hash_lock);
4267 4267                          goto top;
4268 4268                  }
4269 4269  
4270 4270                  if (HDR_L2_WRITE_HEAD(ab)) {
4271 4271                          /*
4272 4272                           * We hit a write head node.  Leave it for
4273 4273                           * l2arc_write_done().
4274 4274                           */
4275 4275                          list_remove(buflist, ab);
4276 4276                          mutex_exit(hash_lock);
4277 4277                          continue;
4278 4278                  }
4279 4279  
4280 4280                  if (!all && ab->b_l2hdr != NULL &&
4281 4281                      (ab->b_l2hdr->b_daddr > taddr ||
4282 4282                      ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4283 4283                          /*
4284 4284                           * We've evicted to the target address,
4285 4285                           * or the end of the device.
4286 4286                           */
4287 4287                          mutex_exit(hash_lock);
4288 4288                          break;
4289 4289                  }
4290 4290  
4291 4291                  if (HDR_FREE_IN_PROGRESS(ab)) {
4292 4292                          /*
4293 4293                           * Already on the path to destruction.
4294 4294                           */
4295 4295                          mutex_exit(hash_lock);
4296 4296                          continue;
4297 4297                  }
4298 4298  
4299 4299                  if (ab->b_state == arc_l2c_only) {
4300 4300                          ASSERT(!HDR_L2_READING(ab));
4301 4301                          /*
4302 4302                           * This doesn't exist in the ARC.  Destroy.
4303 4303                           * arc_hdr_destroy() will call list_remove()
4304 4304                           * and decrement arcstat_l2_size.
4305 4305                           */
4306 4306                          arc_change_state(arc_anon, ab, hash_lock);
4307 4307                          arc_hdr_destroy(ab);
4308 4308                  } else {
4309 4309                          /*
4310 4310                           * Invalidate issued or about to be issued
4311 4311                           * reads, since we may be about to write
4312 4312                           * over this location.
4313 4313                           */
4314 4314                          if (HDR_L2_READING(ab)) {
4315 4315                                  ARCSTAT_BUMP(arcstat_l2_evict_reading);
4316 4316                                  ab->b_flags |= ARC_L2_EVICTED;
4317 4317                          }
4318 4318  
4319 4319                          /*
4320 4320                           * Tell ARC this no longer exists in L2ARC.
4321 4321                           */
4322 4322                          if (ab->b_l2hdr != NULL) {
4323 4323                                  abl2 = ab->b_l2hdr;
4324 4324                                  ab->b_l2hdr = NULL;
4325 4325                                  kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4326 4326                                  ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4327 4327                          }
4328 4328                          list_remove(buflist, ab);
4329 4329  
4330 4330                          /*
4331 4331                           * This may have been leftover after a
4332 4332                           * failed write.
4333 4333                           */
4334 4334                          ab->b_flags &= ~ARC_L2_WRITING;
4335 4335                  }
4336 4336                  mutex_exit(hash_lock);
4337 4337          }
4338 4338          mutex_exit(&l2arc_buflist_mtx);
4339 4339  
4340 4340          vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4341 4341          dev->l2ad_evict = taddr;
4342 4342  }
4343 4343  
4344 4344  /*
4345 4345   * Find and write ARC buffers to the L2ARC device.
4346 4346   *
4347 4347   * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4348 4348   * for reading until they have completed writing.
4349 4349   */
4350 4350  static uint64_t
4351 4351  l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4352 4352  {
4353 4353          arc_buf_hdr_t *ab, *ab_prev, *head;
4354 4354          l2arc_buf_hdr_t *hdrl2;
4355 4355          list_t *list;
4356 4356          uint64_t passed_sz, write_sz, buf_sz, headroom;
4357 4357          void *buf_data;
4358 4358          kmutex_t *hash_lock, *list_lock;
4359 4359          boolean_t have_lock, full;
4360 4360          l2arc_write_callback_t *cb;
4361 4361          zio_t *pio, *wzio;
4362 4362          uint64_t guid = spa_load_guid(spa);
4363 4363  
4364 4364          ASSERT(dev->l2ad_vdev != NULL);
4365 4365  
4366 4366          pio = NULL;
4367 4367          write_sz = 0;
4368 4368          full = B_FALSE;
4369 4369          head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4370 4370          head->b_flags |= ARC_L2_WRITE_HEAD;
4371 4371  
4372 4372          /*
4373 4373           * Copy buffers for L2ARC writing.
4374 4374           */
4375 4375          mutex_enter(&l2arc_buflist_mtx);
4376 4376          for (int try = 0; try <= 3; try++) {
4377 4377                  list = l2arc_list_locked(try, &list_lock);
4378 4378                  passed_sz = 0;
4379 4379  
4380 4380                  /*
4381 4381                   * L2ARC fast warmup.
4382 4382                   *
4383 4383                   * Until the ARC is warm and starts to evict, read from the
4384 4384                   * head of the ARC lists rather than the tail.
4385 4385                   */
4386 4386                  headroom = target_sz * l2arc_headroom;
4387 4387                  if (arc_warm == B_FALSE)
4388 4388                          ab = list_head(list);
4389 4389                  else
4390 4390                          ab = list_tail(list);
4391 4391  
4392 4392                  for (; ab; ab = ab_prev) {
4393 4393                          if (arc_warm == B_FALSE)
4394 4394                                  ab_prev = list_next(list, ab);
4395 4395                          else
4396 4396                                  ab_prev = list_prev(list, ab);
4397 4397  
4398 4398                          hash_lock = HDR_LOCK(ab);
4399 4399                          have_lock = MUTEX_HELD(hash_lock);
4400 4400                          if (!have_lock && !mutex_tryenter(hash_lock)) {
4401 4401                                  /*
4402 4402                                   * Skip this buffer rather than waiting.
4403 4403                                   */
4404 4404                                  continue;
4405 4405                          }
4406 4406  
4407 4407                          passed_sz += ab->b_size;
4408 4408                          if (passed_sz > headroom) {
4409 4409                                  /*
4410 4410                                   * Searched too far.
4411 4411                                   */
4412 4412                                  mutex_exit(hash_lock);
4413 4413                                  break;
4414 4414                          }
4415 4415  
4416 4416                          if (!l2arc_write_eligible(guid, ab)) {
4417 4417                                  mutex_exit(hash_lock);
4418 4418                                  continue;
4419 4419                          }
4420 4420  
4421 4421                          if ((write_sz + ab->b_size) > target_sz) {
4422 4422                                  full = B_TRUE;
4423 4423                                  mutex_exit(hash_lock);
4424 4424                                  break;
4425 4425                          }
4426 4426  
4427 4427                          if (pio == NULL) {
4428 4428                                  /*
4429 4429                                   * Insert a dummy header on the buflist so
4430 4430                                   * l2arc_write_done() can find where the
4431 4431                                   * write buffers begin without searching.
4432 4432                                   */
4433 4433                                  list_insert_head(dev->l2ad_buflist, head);
4434 4434  
4435 4435                                  cb = kmem_alloc(
4436 4436                                      sizeof (l2arc_write_callback_t), KM_SLEEP);
4437 4437                                  cb->l2wcb_dev = dev;
4438 4438                                  cb->l2wcb_head = head;
4439 4439                                  pio = zio_root(spa, l2arc_write_done, cb,
4440 4440                                      ZIO_FLAG_CANFAIL);
4441 4441                          }
4442 4442  
4443 4443                          /*
4444 4444                           * Create and add a new L2ARC header.
4445 4445                           */
4446 4446                          hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4447 4447                          hdrl2->b_dev = dev;
4448 4448                          hdrl2->b_daddr = dev->l2ad_hand;
4449 4449  
4450 4450                          ab->b_flags |= ARC_L2_WRITING;
4451 4451                          ab->b_l2hdr = hdrl2;
4452 4452                          list_insert_head(dev->l2ad_buflist, ab);
4453 4453                          buf_data = ab->b_buf->b_data;
4454 4454                          buf_sz = ab->b_size;
4455 4455  
4456 4456                          /*
4457 4457                           * Compute and store the buffer cksum before
4458 4458                           * writing.  On debug the cksum is verified first.
4459 4459                           */
4460 4460                          arc_cksum_verify(ab->b_buf);
4461 4461                          arc_cksum_compute(ab->b_buf, B_TRUE);
4462 4462  
4463 4463                          mutex_exit(hash_lock);
4464 4464  
4465 4465                          wzio = zio_write_phys(pio, dev->l2ad_vdev,
4466 4466                              dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4467 4467                              NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4468 4468                              ZIO_FLAG_CANFAIL, B_FALSE);
4469 4469  
4470 4470                          DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4471 4471                              zio_t *, wzio);
4472 4472                          (void) zio_nowait(wzio);
4473 4473  
4474 4474                          /*
4475 4475                           * Keep the clock hand suitably device-aligned.
4476 4476                           */
4477 4477                          buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4478 4478  
4479 4479                          write_sz += buf_sz;
4480 4480                          dev->l2ad_hand += buf_sz;
4481 4481                  }
4482 4482  
4483 4483                  mutex_exit(list_lock);
4484 4484  
4485 4485                  if (full == B_TRUE)
4486 4486                          break;
4487 4487          }
4488 4488          mutex_exit(&l2arc_buflist_mtx);
4489 4489  
4490 4490          if (pio == NULL) {
4491 4491                  ASSERT0(write_sz);
4492 4492                  kmem_cache_free(hdr_cache, head);
4493 4493                  return (0);
4494 4494          }
4495 4495  
4496 4496          ASSERT3U(write_sz, <=, target_sz);
4497 4497          ARCSTAT_BUMP(arcstat_l2_writes_sent);
4498 4498          ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4499 4499          ARCSTAT_INCR(arcstat_l2_size, write_sz);
4500 4500          vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4501 4501  
4502 4502          /*
4503 4503           * Bump device hand to the device start if it is approaching the end.
4504 4504           * l2arc_evict() will already have evicted ahead for this case.
4505 4505           */
4506 4506          if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4507 4507                  vdev_space_update(dev->l2ad_vdev,
4508 4508                      dev->l2ad_end - dev->l2ad_hand, 0, 0);
4509 4509                  dev->l2ad_hand = dev->l2ad_start;
4510 4510                  dev->l2ad_evict = dev->l2ad_start;
4511 4511                  dev->l2ad_first = B_FALSE;
4512 4512          }
4513 4513  
4514 4514          dev->l2ad_writing = B_TRUE;
4515 4515          (void) zio_wait(pio);
4516 4516          dev->l2ad_writing = B_FALSE;
4517 4517  
4518 4518          return (write_sz);
4519 4519  }
4520 4520  
4521 4521  /*
4522 4522   * This thread feeds the L2ARC at regular intervals.  This is the beating
4523 4523   * heart of the L2ARC.
4524 4524   */
4525 4525  static void
4526 4526  l2arc_feed_thread(void)
4527 4527  {
4528 4528          callb_cpr_t cpr;
4529 4529          l2arc_dev_t *dev;
4530 4530          spa_t *spa;
4531 4531          uint64_t size, wrote;
4532 4532          clock_t begin, next = ddi_get_lbolt();
4533 4533  
4534 4534          CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4535 4535  
4536 4536          mutex_enter(&l2arc_feed_thr_lock);
4537 4537  
4538 4538          while (l2arc_thread_exit == 0) {
4539 4539                  CALLB_CPR_SAFE_BEGIN(&cpr);
4540 4540                  (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4541 4541                      next);
4542 4542                  CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4543 4543                  next = ddi_get_lbolt() + hz;
4544 4544  
4545 4545                  /*
4546 4546                   * Quick check for L2ARC devices.
4547 4547                   */
4548 4548                  mutex_enter(&l2arc_dev_mtx);
4549 4549                  if (l2arc_ndev == 0) {
4550 4550                          mutex_exit(&l2arc_dev_mtx);
4551 4551                          continue;
4552 4552                  }
4553 4553                  mutex_exit(&l2arc_dev_mtx);
4554 4554                  begin = ddi_get_lbolt();
4555 4555  
4556 4556                  /*
4557 4557                   * This selects the next l2arc device to write to, and in
4558 4558                   * doing so the next spa to feed from: dev->l2ad_spa.   This
4559 4559                   * will return NULL if there are now no l2arc devices or if
4560 4560                   * they are all faulted.
4561 4561                   *
4562 4562                   * If a device is returned, its spa's config lock is also
4563 4563                   * held to prevent device removal.  l2arc_dev_get_next()
4564 4564                   * will grab and release l2arc_dev_mtx.
4565 4565                   */
4566 4566                  if ((dev = l2arc_dev_get_next()) == NULL)
4567 4567                          continue;
4568 4568  
4569 4569                  spa = dev->l2ad_spa;
4570 4570                  ASSERT(spa != NULL);
4571 4571  
4572 4572                  /*
4573 4573                   * If the pool is read-only then force the feed thread to
4574 4574                   * sleep a little longer.
4575 4575                   */
4576 4576                  if (!spa_writeable(spa)) {
4577 4577                          next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4578 4578                          spa_config_exit(spa, SCL_L2ARC, dev);
4579 4579                          continue;
4580 4580                  }
4581 4581  
4582 4582                  /*
4583 4583                   * Avoid contributing to memory pressure.
4584 4584                   */
4585 4585                  if (arc_reclaim_needed()) {
4586 4586                          ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4587 4587                          spa_config_exit(spa, SCL_L2ARC, dev);
4588 4588                          continue;
4589 4589                  }
4590 4590  
4591 4591                  ARCSTAT_BUMP(arcstat_l2_feeds);
4592 4592  
4593 4593                  size = l2arc_write_size(dev);
4594 4594  
4595 4595                  /*
4596 4596                   * Evict L2ARC buffers that will be overwritten.
4597 4597                   */
4598 4598                  l2arc_evict(dev, size, B_FALSE);
4599 4599  
4600 4600                  /*
4601 4601                   * Write ARC buffers.
4602 4602                   */
4603 4603                  wrote = l2arc_write_buffers(spa, dev, size);
4604 4604  
4605 4605                  /*
4606 4606                   * Calculate interval between writes.
4607 4607                   */
4608 4608                  next = l2arc_write_interval(begin, size, wrote);
4609 4609                  spa_config_exit(spa, SCL_L2ARC, dev);
4610 4610          }
4611 4611  
4612 4612          l2arc_thread_exit = 0;
4613 4613          cv_broadcast(&l2arc_feed_thr_cv);
4614 4614          CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
4615 4615          thread_exit();
4616 4616  }
4617 4617  
4618 4618  boolean_t
4619 4619  l2arc_vdev_present(vdev_t *vd)
4620 4620  {
4621 4621          l2arc_dev_t *dev;
4622 4622  
4623 4623          mutex_enter(&l2arc_dev_mtx);
4624 4624          for (dev = list_head(l2arc_dev_list); dev != NULL;
4625 4625              dev = list_next(l2arc_dev_list, dev)) {
4626 4626                  if (dev->l2ad_vdev == vd)
4627 4627                          break;
4628 4628          }
4629 4629          mutex_exit(&l2arc_dev_mtx);
4630 4630  
4631 4631          return (dev != NULL);
4632 4632  }
4633 4633  
4634 4634  /*
4635 4635   * Add a vdev for use by the L2ARC.  By this point the spa has already
4636 4636   * validated the vdev and opened it.
4637 4637   */
4638 4638  void
4639 4639  l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4640 4640  {
4641 4641          l2arc_dev_t *adddev;
4642 4642  
4643 4643          ASSERT(!l2arc_vdev_present(vd));
4644 4644  
4645 4645          /*
4646 4646           * Create a new l2arc device entry.
4647 4647           */
4648 4648          adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4649 4649          adddev->l2ad_spa = spa;
4650 4650          adddev->l2ad_vdev = vd;
4651 4651          adddev->l2ad_write = l2arc_write_max;
4652 4652          adddev->l2ad_boost = l2arc_write_boost;
4653 4653          adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4654 4654          adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4655 4655          adddev->l2ad_hand = adddev->l2ad_start;
4656 4656          adddev->l2ad_evict = adddev->l2ad_start;
4657 4657          adddev->l2ad_first = B_TRUE;
4658 4658          adddev->l2ad_writing = B_FALSE;
4659 4659          ASSERT3U(adddev->l2ad_write, >, 0);
4660 4660  
4661 4661          /*
4662 4662           * This is a list of all ARC buffers that are still valid on the
4663 4663           * device.
4664 4664           */
4665 4665          adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4666 4666          list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4667 4667              offsetof(arc_buf_hdr_t, b_l2node));
4668 4668  
4669 4669          vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4670 4670  
4671 4671          /*
4672 4672           * Add device to global list
4673 4673           */
4674 4674          mutex_enter(&l2arc_dev_mtx);
4675 4675          list_insert_head(l2arc_dev_list, adddev);
4676 4676          atomic_inc_64(&l2arc_ndev);
4677 4677          mutex_exit(&l2arc_dev_mtx);
4678 4678  }
4679 4679  
4680 4680  /*
4681 4681   * Remove a vdev from the L2ARC.
4682 4682   */
4683 4683  void
4684 4684  l2arc_remove_vdev(vdev_t *vd)
4685 4685  {
4686 4686          l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4687 4687  
4688 4688          /*
4689 4689           * Find the device by vdev
4690 4690           */
4691 4691          mutex_enter(&l2arc_dev_mtx);
4692 4692          for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4693 4693                  nextdev = list_next(l2arc_dev_list, dev);
4694 4694                  if (vd == dev->l2ad_vdev) {
4695 4695                          remdev = dev;
4696 4696                          break;
4697 4697                  }
4698 4698          }
4699 4699          ASSERT(remdev != NULL);
4700 4700  
4701 4701          /*
4702 4702           * Remove device from global list
4703 4703           */
4704 4704          list_remove(l2arc_dev_list, remdev);
4705 4705          l2arc_dev_last = NULL;          /* may have been invalidated */
4706 4706          atomic_dec_64(&l2arc_ndev);
4707 4707          mutex_exit(&l2arc_dev_mtx);
4708 4708  
4709 4709          /*
4710 4710           * Clear all buflists and ARC references.  L2ARC device flush.
4711 4711           */
4712 4712          l2arc_evict(remdev, 0, B_TRUE);
4713 4713          list_destroy(remdev->l2ad_buflist);
4714 4714          kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4715 4715          kmem_free(remdev, sizeof (l2arc_dev_t));
4716 4716  }
4717 4717  
4718 4718  void
4719 4719  l2arc_init(void)
4720 4720  {
4721 4721          l2arc_thread_exit = 0;
4722 4722          l2arc_ndev = 0;
4723 4723          l2arc_writes_sent = 0;
4724 4724          l2arc_writes_done = 0;
4725 4725  
4726 4726          mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4727 4727          cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4728 4728          mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4729 4729          mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4730 4730          mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4731 4731  
4732 4732          l2arc_dev_list = &L2ARC_dev_list;
4733 4733          l2arc_free_on_write = &L2ARC_free_on_write;
4734 4734          list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4735 4735              offsetof(l2arc_dev_t, l2ad_node));
4736 4736          list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4737 4737              offsetof(l2arc_data_free_t, l2df_list_node));
4738 4738  }
4739 4739  
4740 4740  void
4741 4741  l2arc_fini(void)
4742 4742  {
4743 4743          /*
4744 4744           * This is called from dmu_fini(), which is called from spa_fini();
4745 4745           * Because of this, we can assume that all l2arc devices have
4746 4746           * already been removed when the pools themselves were removed.
4747 4747           */
4748 4748  
4749 4749          l2arc_do_free_on_write();
4750 4750  
4751 4751          mutex_destroy(&l2arc_feed_thr_lock);
4752 4752          cv_destroy(&l2arc_feed_thr_cv);
4753 4753          mutex_destroy(&l2arc_dev_mtx);
4754 4754          mutex_destroy(&l2arc_buflist_mtx);
4755 4755          mutex_destroy(&l2arc_free_on_write_mtx);
4756 4756  
4757 4757          list_destroy(l2arc_dev_list);
4758 4758          list_destroy(l2arc_free_on_write);
4759 4759  }
4760 4760  
4761 4761  void
4762 4762  l2arc_start(void)
4763 4763  {
4764 4764          if (!(spa_mode_global & FWRITE))
4765 4765                  return;
4766 4766  
4767 4767          (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4768 4768              TS_RUN, minclsyspri);
4769 4769  }
4770 4770  
4771 4771  void
4772 4772  l2arc_stop(void)
4773 4773  {
4774 4774          if (!(spa_mode_global & FWRITE))
4775 4775                  return;
4776 4776  
4777 4777          mutex_enter(&l2arc_feed_thr_lock);
4778 4778          cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
4779 4779          l2arc_thread_exit = 1;
4780 4780          while (l2arc_thread_exit != 0)
4781 4781                  cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4782 4782          mutex_exit(&l2arc_feed_thr_lock);
4783 4783  }

↓ open down ↓

1339 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX