illumos-gate Wdiff usr/src/uts/common/fs/zfs/arc.c

Print this page

6220 memleak in l2arc on debug build

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24   24   * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26   26   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  27   27   */
  28   28  
  29   29  /*
  30   30   * DVA-based Adjustable Replacement Cache
  31   31   *
  32   32   * While much of the theory of operation used here is
  33   33   * based on the self-tuning, low overhead replacement cache
  34   34   * presented by Megiddo and Modha at FAST 2003, there are some
  35   35   * significant differences:
  36   36   *
  37   37   * 1. The Megiddo and Modha model assumes any page is evictable.
  38   38   * Pages in its cache cannot be "locked" into memory.  This makes
  39   39   * the eviction algorithm simple: evict the last page in the list.
  40   40   * This also makes the performance characteristics easy to reason
  41   41   * about.  Our cache is not so simple.  At any given moment, some
  42   42   * subset of the blocks in the cache are un-evictable because we
  43   43   * have handed out a reference to them.  Blocks are only evictable
  44   44   * when there are no external references active.  This makes
  45   45   * eviction far more problematic:  we choose to evict the evictable
  46   46   * blocks that are the "lowest" in the list.
  47   47   *
  48   48   * There are times when it is not possible to evict the requested
  49   49   * space.  In these circumstances we are unable to adjust the cache
  50   50   * size.  To prevent the cache growing unbounded at these times we
  51   51   * implement a "cache throttle" that slows the flow of new data
  52   52   * into the cache until we can make space available.
  53   53   *
  54   54   * 2. The Megiddo and Modha model assumes a fixed cache size.
  55   55   * Pages are evicted when the cache is full and there is a cache
  56   56   * miss.  Our model has a variable sized cache.  It grows with
  57   57   * high use, but also tries to react to memory pressure from the
  58   58   * operating system: decreasing its size when system memory is
  59   59   * tight.
  60   60   *
  61   61   * 3. The Megiddo and Modha model assumes a fixed page size. All
  62   62   * elements of the cache are therefore exactly the same size.  So
  63   63   * when adjusting the cache size following a cache miss, it's simply
  64   64   * a matter of choosing a single page to evict.  In our model, we
  65   65   * have variable sized cache blocks (ranging from 512 bytes to
  66   66   * 128K bytes).  We therefore choose a set of blocks to evict to make
  67   67   * space for a cache miss that approximates as closely as possible
  68   68   * the space used by the new block.
  69   69   *
  70   70   * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71   71   * by N. Megiddo & D. Modha, FAST 2003
  72   72   */
  73   73  
  74   74  /*
  75   75   * The locking model:
  76   76   *
  77   77   * A new reference to a cache buffer can be obtained in two
  78   78   * ways: 1) via a hash table lookup using the DVA as a key,
  79   79   * or 2) via one of the ARC lists.  The arc_read() interface
  80   80   * uses method 1, while the internal arc algorithms for
  81   81   * adjusting the cache use method 2.  We therefore provide two
  82   82   * types of locks: 1) the hash table lock array, and 2) the
  83   83   * arc list locks.
  84   84   *
  85   85   * Buffers do not have their own mutexes, rather they rely on the
  86   86   * hash table mutexes for the bulk of their protection (i.e. most
  87   87   * fields in the arc_buf_hdr_t are protected by these mutexes).
  88   88   *
  89   89   * buf_hash_find() returns the appropriate mutex (held) when it
  90   90   * locates the requested buffer in the hash table.  It returns
  91   91   * NULL for the mutex if the buffer was not in the table.
  92   92   *
  93   93   * buf_hash_remove() expects the appropriate hash mutex to be
  94   94   * already held before it is invoked.
  95   95   *
  96   96   * Each arc state also has a mutex which is used to protect the
  97   97   * buffer list associated with the state.  When attempting to
  98   98   * obtain a hash table lock while holding an arc list lock you
  99   99   * must use: mutex_tryenter() to avoid deadlock.  Also note that
 100  100   * the active state mutex must be held before the ghost state mutex.
 101  101   *
 102  102   * Arc buffers may have an associated eviction callback function.
 103  103   * This function will be invoked prior to removing the buffer (e.g.
 104  104   * in arc_do_user_evicts()).  Note however that the data associated
 105  105   * with the buffer may be evicted prior to the callback.  The callback
 106  106   * must be made with *no locks held* (to prevent deadlock).  Additionally,
 107  107   * the users of callbacks must ensure that their private data is
 108  108   * protected from simultaneous callbacks from arc_clear_callback()
 109  109   * and arc_do_user_evicts().
 110  110   *
 111  111   * Note that the majority of the performance stats are manipulated
 112  112   * with atomic operations.
 113  113   *
 114  114   * The L2ARC uses the l2ad_mtx on each vdev for the following:
 115  115   *
 116  116   *      - L2ARC buflist creation
 117  117   *      - L2ARC buflist eviction
 118  118   *      - L2ARC write completion, which walks L2ARC buflists
 119  119   *      - ARC header destruction, as it removes from L2ARC buflists
 120  120   *      - ARC header release, as it removes from L2ARC buflists
 121  121   */
 122  122  
 123  123  #include <sys/spa.h>
 124  124  #include <sys/zio.h>
 125  125  #include <sys/zio_compress.h>
 126  126  #include <sys/zfs_context.h>
 127  127  #include <sys/arc.h>
 128  128  #include <sys/refcount.h>
 129  129  #include <sys/vdev.h>
 130  130  #include <sys/vdev_impl.h>
 131  131  #include <sys/dsl_pool.h>
 132  132  #include <sys/multilist.h>
 133  133  #ifdef _KERNEL
 134  134  #include <sys/vmsystm.h>
 135  135  #include <vm/anon.h>
 136  136  #include <sys/fs/swapnode.h>
 137  137  #include <sys/dnlc.h>
 138  138  #endif
 139  139  #include <sys/callb.h>
 140  140  #include <sys/kstat.h>
 141  141  #include <zfs_fletcher.h>
 142  142  
 143  143  #ifndef _KERNEL
 144  144  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 145  145  boolean_t arc_watch = B_FALSE;
 146  146  int arc_procfd;
 147  147  #endif
 148  148  
 149  149  static kmutex_t         arc_reclaim_lock;
 150  150  static kcondvar_t       arc_reclaim_thread_cv;
 151  151  static boolean_t        arc_reclaim_thread_exit;
 152  152  static kcondvar_t       arc_reclaim_waiters_cv;
 153  153  
 154  154  static kmutex_t         arc_user_evicts_lock;
 155  155  static kcondvar_t       arc_user_evicts_cv;
 156  156  static boolean_t        arc_user_evicts_thread_exit;
 157  157  
 158  158  uint_t arc_reduce_dnlc_percent = 3;
 159  159  
 160  160  /*
 161  161   * The number of headers to evict in arc_evict_state_impl() before
 162  162   * dropping the sublist lock and evicting from another sublist. A lower
 163  163   * value means we're more likely to evict the "correct" header (i.e. the
 164  164   * oldest header in the arc state), but comes with higher overhead
 165  165   * (i.e. more invocations of arc_evict_state_impl()).
 166  166   */
 167  167  int zfs_arc_evict_batch_limit = 10;
 168  168  
 169  169  /*
 170  170   * The number of sublists used for each of the arc state lists. If this
 171  171   * is not set to a suitable value by the user, it will be configured to
 172  172   * the number of CPUs on the system in arc_init().
 173  173   */
 174  174  int zfs_arc_num_sublists_per_state = 0;
 175  175  
 176  176  /* number of seconds before growing cache again */
 177  177  static int              arc_grow_retry = 60;
 178  178  
 179  179  /* shift of arc_c for calculating overflow limit in arc_get_data_buf */
 180  180  int             zfs_arc_overflow_shift = 8;
 181  181  
 182  182  /* shift of arc_c for calculating both min and max arc_p */
 183  183  static int              arc_p_min_shift = 4;
 184  184  
 185  185  /* log2(fraction of arc to reclaim) */
 186  186  static int              arc_shrink_shift = 7;
 187  187  
 188  188  /*
 189  189   * log2(fraction of ARC which must be free to allow growing).
 190  190   * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 191  191   * when reading a new block into the ARC, we will evict an equal-sized block
 192  192   * from the ARC.
 193  193   *
 194  194   * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 195  195   * we will still not allow it to grow.
 196  196   */
 197  197  int                     arc_no_grow_shift = 5;
 198  198  
 199  199  
 200  200  /*
 201  201   * minimum lifespan of a prefetch block in clock ticks
 202  202   * (initialized in arc_init())
 203  203   */
 204  204  static int              arc_min_prefetch_lifespan;
 205  205  
 206  206  /*
 207  207   * If this percent of memory is free, don't throttle.
 208  208   */
 209  209  int arc_lotsfree_percent = 10;
 210  210  
 211  211  static int arc_dead;
 212  212  
 213  213  /*
 214  214   * The arc has filled available memory and has now warmed up.
 215  215   */
 216  216  static boolean_t arc_warm;
 217  217  
 218  218  /*
 219  219   * These tunables are for performance analysis.
 220  220   */
 221  221  uint64_t zfs_arc_max;
 222  222  uint64_t zfs_arc_min;
 223  223  uint64_t zfs_arc_meta_limit = 0;
 224  224  uint64_t zfs_arc_meta_min = 0;
 225  225  int zfs_arc_grow_retry = 0;
 226  226  int zfs_arc_shrink_shift = 0;
 227  227  int zfs_arc_p_min_shift = 0;
 228  228  int zfs_disable_dup_eviction = 0;
 229  229  int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 230  230  
 231  231  /*
 232  232   * Note that buffers can be in one of 6 states:
 233  233   *      ARC_anon        - anonymous (discussed below)
 234  234   *      ARC_mru         - recently used, currently cached
 235  235   *      ARC_mru_ghost   - recently used, no longer in cache
 236  236   *      ARC_mfu         - frequently used, currently cached
 237  237   *      ARC_mfu_ghost   - frequently used, no longer in cache
 238  238   *      ARC_l2c_only    - exists in L2ARC but not other states
 239  239   * When there are no active references to the buffer, they are
 240  240   * linked onto a list in one of these arc states.  These are
 241  241   * the only buffers that can be evicted or deleted.  Within each
 242  242   * state there are multiple lists, one for meta-data and one for
 243  243   * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 244  244   * etc.) is tracked separately so that it can be managed more
 245  245   * explicitly: favored over data, limited explicitly.
 246  246   *
 247  247   * Anonymous buffers are buffers that are not associated with
 248  248   * a DVA.  These are buffers that hold dirty block copies
 249  249   * before they are written to stable storage.  By definition,
 250  250   * they are "ref'd" and are considered part of arc_mru
 251  251   * that cannot be freed.  Generally, they will acquire a DVA
 252  252   * as they are written and migrate onto the arc_mru list.
 253  253   *
 254  254   * The ARC_l2c_only state is for buffers that are in the second
 255  255   * level ARC but no longer in any of the ARC_m* lists.  The second
 256  256   * level ARC itself may also contain buffers that are in any of
 257  257   * the ARC_m* states - meaning that a buffer can exist in two
 258  258   * places.  The reason for the ARC_l2c_only state is to keep the
 259  259   * buffer header in the hash table, so that reads that hit the
 260  260   * second level ARC benefit from these fast lookups.
 261  261   */
 262  262  
 263  263  typedef struct arc_state {
 264  264          /*
 265  265           * list of evictable buffers
 266  266           */
 267  267          multilist_t arcs_list[ARC_BUFC_NUMTYPES];
 268  268          /*
 269  269           * total amount of evictable data in this state
 270  270           */
 271  271          uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
 272  272          /*
 273  273           * total amount of data in this state; this includes: evictable,
 274  274           * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
 275  275           */
 276  276          refcount_t arcs_size;
 277  277  } arc_state_t;
 278  278  
 279  279  /* The 6 states: */
 280  280  static arc_state_t ARC_anon;
 281  281  static arc_state_t ARC_mru;
 282  282  static arc_state_t ARC_mru_ghost;
 283  283  static arc_state_t ARC_mfu;
 284  284  static arc_state_t ARC_mfu_ghost;
 285  285  static arc_state_t ARC_l2c_only;
 286  286  
 287  287  typedef struct arc_stats {
 288  288          kstat_named_t arcstat_hits;
 289  289          kstat_named_t arcstat_misses;
 290  290          kstat_named_t arcstat_demand_data_hits;
 291  291          kstat_named_t arcstat_demand_data_misses;
 292  292          kstat_named_t arcstat_demand_metadata_hits;
 293  293          kstat_named_t arcstat_demand_metadata_misses;
 294  294          kstat_named_t arcstat_prefetch_data_hits;
 295  295          kstat_named_t arcstat_prefetch_data_misses;
 296  296          kstat_named_t arcstat_prefetch_metadata_hits;
 297  297          kstat_named_t arcstat_prefetch_metadata_misses;
 298  298          kstat_named_t arcstat_mru_hits;
 299  299          kstat_named_t arcstat_mru_ghost_hits;
 300  300          kstat_named_t arcstat_mfu_hits;
 301  301          kstat_named_t arcstat_mfu_ghost_hits;
 302  302          kstat_named_t arcstat_deleted;
 303  303          /*
 304  304           * Number of buffers that could not be evicted because the hash lock
 305  305           * was held by another thread.  The lock may not necessarily be held
 306  306           * by something using the same buffer, since hash locks are shared
 307  307           * by multiple buffers.
 308  308           */
 309  309          kstat_named_t arcstat_mutex_miss;
 310  310          /*
 311  311           * Number of buffers skipped because they have I/O in progress, are
 312  312           * indirect prefetch buffers that have not lived long enough, or are
 313  313           * not from the spa we're trying to evict from.
 314  314           */
 315  315          kstat_named_t arcstat_evict_skip;
 316  316          /*
 317  317           * Number of times arc_evict_state() was unable to evict enough
 318  318           * buffers to reach its target amount.
 319  319           */
 320  320          kstat_named_t arcstat_evict_not_enough;
 321  321          kstat_named_t arcstat_evict_l2_cached;
 322  322          kstat_named_t arcstat_evict_l2_eligible;
 323  323          kstat_named_t arcstat_evict_l2_ineligible;
 324  324          kstat_named_t arcstat_evict_l2_skip;
 325  325          kstat_named_t arcstat_hash_elements;
 326  326          kstat_named_t arcstat_hash_elements_max;
 327  327          kstat_named_t arcstat_hash_collisions;
 328  328          kstat_named_t arcstat_hash_chains;
 329  329          kstat_named_t arcstat_hash_chain_max;
 330  330          kstat_named_t arcstat_p;
 331  331          kstat_named_t arcstat_c;
 332  332          kstat_named_t arcstat_c_min;
 333  333          kstat_named_t arcstat_c_max;
 334  334          kstat_named_t arcstat_size;
 335  335          /*
 336  336           * Number of bytes consumed by internal ARC structures necessary
 337  337           * for tracking purposes; these structures are not actually
 338  338           * backed by ARC buffers. This includes arc_buf_hdr_t structures
 339  339           * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
 340  340           * caches), and arc_buf_t structures (allocated via arc_buf_t
 341  341           * cache).
 342  342           */
 343  343          kstat_named_t arcstat_hdr_size;
 344  344          /*
 345  345           * Number of bytes consumed by ARC buffers of type equal to
 346  346           * ARC_BUFC_DATA. This is generally consumed by buffers backing
 347  347           * on disk user data (e.g. plain file contents).
 348  348           */
 349  349          kstat_named_t arcstat_data_size;
 350  350          /*
 351  351           * Number of bytes consumed by ARC buffers of type equal to
 352  352           * ARC_BUFC_METADATA. This is generally consumed by buffers
 353  353           * backing on disk data that is used for internal ZFS
 354  354           * structures (e.g. ZAP, dnode, indirect blocks, etc).
 355  355           */
 356  356          kstat_named_t arcstat_metadata_size;
 357  357          /*
 358  358           * Number of bytes consumed by various buffers and structures
 359  359           * not actually backed with ARC buffers. This includes bonus
 360  360           * buffers (allocated directly via zio_buf_* functions),
 361  361           * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
 362  362           * cache), and dnode_t structures (allocated via dnode_t cache).
 363  363           */
 364  364          kstat_named_t arcstat_other_size;
 365  365          /*
 366  366           * Total number of bytes consumed by ARC buffers residing in the
 367  367           * arc_anon state. This includes *all* buffers in the arc_anon
 368  368           * state; e.g. data, metadata, evictable, and unevictable buffers
 369  369           * are all included in this value.
 370  370           */
 371  371          kstat_named_t arcstat_anon_size;
 372  372          /*
 373  373           * Number of bytes consumed by ARC buffers that meet the
 374  374           * following criteria: backing buffers of type ARC_BUFC_DATA,
 375  375           * residing in the arc_anon state, and are eligible for eviction
 376  376           * (e.g. have no outstanding holds on the buffer).
 377  377           */
 378  378          kstat_named_t arcstat_anon_evictable_data;
 379  379          /*
 380  380           * Number of bytes consumed by ARC buffers that meet the
 381  381           * following criteria: backing buffers of type ARC_BUFC_METADATA,
 382  382           * residing in the arc_anon state, and are eligible for eviction
 383  383           * (e.g. have no outstanding holds on the buffer).
 384  384           */
 385  385          kstat_named_t arcstat_anon_evictable_metadata;
 386  386          /*
 387  387           * Total number of bytes consumed by ARC buffers residing in the
 388  388           * arc_mru state. This includes *all* buffers in the arc_mru
 389  389           * state; e.g. data, metadata, evictable, and unevictable buffers
 390  390           * are all included in this value.
 391  391           */
 392  392          kstat_named_t arcstat_mru_size;
 393  393          /*
 394  394           * Number of bytes consumed by ARC buffers that meet the
 395  395           * following criteria: backing buffers of type ARC_BUFC_DATA,
 396  396           * residing in the arc_mru state, and are eligible for eviction
 397  397           * (e.g. have no outstanding holds on the buffer).
 398  398           */
 399  399          kstat_named_t arcstat_mru_evictable_data;
 400  400          /*
 401  401           * Number of bytes consumed by ARC buffers that meet the
 402  402           * following criteria: backing buffers of type ARC_BUFC_METADATA,
 403  403           * residing in the arc_mru state, and are eligible for eviction
 404  404           * (e.g. have no outstanding holds on the buffer).
 405  405           */
 406  406          kstat_named_t arcstat_mru_evictable_metadata;
 407  407          /*
 408  408           * Total number of bytes that *would have been* consumed by ARC
 409  409           * buffers in the arc_mru_ghost state. The key thing to note
 410  410           * here, is the fact that this size doesn't actually indicate
 411  411           * RAM consumption. The ghost lists only consist of headers and
 412  412           * don't actually have ARC buffers linked off of these headers.
 413  413           * Thus, *if* the headers had associated ARC buffers, these
 414  414           * buffers *would have* consumed this number of bytes.
 415  415           */
 416  416          kstat_named_t arcstat_mru_ghost_size;
 417  417          /*
 418  418           * Number of bytes that *would have been* consumed by ARC
 419  419           * buffers that are eligible for eviction, of type
 420  420           * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
 421  421           */
 422  422          kstat_named_t arcstat_mru_ghost_evictable_data;
 423  423          /*
 424  424           * Number of bytes that *would have been* consumed by ARC
 425  425           * buffers that are eligible for eviction, of type
 426  426           * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
 427  427           */
 428  428          kstat_named_t arcstat_mru_ghost_evictable_metadata;
 429  429          /*
 430  430           * Total number of bytes consumed by ARC buffers residing in the
 431  431           * arc_mfu state. This includes *all* buffers in the arc_mfu
 432  432           * state; e.g. data, metadata, evictable, and unevictable buffers
 433  433           * are all included in this value.
 434  434           */
 435  435          kstat_named_t arcstat_mfu_size;
 436  436          /*
 437  437           * Number of bytes consumed by ARC buffers that are eligible for
 438  438           * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
 439  439           * state.
 440  440           */
 441  441          kstat_named_t arcstat_mfu_evictable_data;
 442  442          /*
 443  443           * Number of bytes consumed by ARC buffers that are eligible for
 444  444           * eviction, of type ARC_BUFC_METADATA, and reside in the
 445  445           * arc_mfu state.
 446  446           */
 447  447          kstat_named_t arcstat_mfu_evictable_metadata;
 448  448          /*
 449  449           * Total number of bytes that *would have been* consumed by ARC
 450  450           * buffers in the arc_mfu_ghost state. See the comment above
 451  451           * arcstat_mru_ghost_size for more details.
 452  452           */
 453  453          kstat_named_t arcstat_mfu_ghost_size;
 454  454          /*
 455  455           * Number of bytes that *would have been* consumed by ARC
 456  456           * buffers that are eligible for eviction, of type
 457  457           * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
 458  458           */
 459  459          kstat_named_t arcstat_mfu_ghost_evictable_data;
 460  460          /*
 461  461           * Number of bytes that *would have been* consumed by ARC
 462  462           * buffers that are eligible for eviction, of type
 463  463           * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
 464  464           */
 465  465          kstat_named_t arcstat_mfu_ghost_evictable_metadata;
 466  466          kstat_named_t arcstat_l2_hits;
 467  467          kstat_named_t arcstat_l2_misses;
 468  468          kstat_named_t arcstat_l2_feeds;
 469  469          kstat_named_t arcstat_l2_rw_clash;
 470  470          kstat_named_t arcstat_l2_read_bytes;
 471  471          kstat_named_t arcstat_l2_write_bytes;
 472  472          kstat_named_t arcstat_l2_writes_sent;
 473  473          kstat_named_t arcstat_l2_writes_done;
 474  474          kstat_named_t arcstat_l2_writes_error;
 475  475          kstat_named_t arcstat_l2_writes_lock_retry;
 476  476          kstat_named_t arcstat_l2_evict_lock_retry;
 477  477          kstat_named_t arcstat_l2_evict_reading;
 478  478          kstat_named_t arcstat_l2_evict_l1cached;
 479  479          kstat_named_t arcstat_l2_free_on_write;
 480  480          kstat_named_t arcstat_l2_cdata_free_on_write;
 481  481          kstat_named_t arcstat_l2_abort_lowmem;
 482  482          kstat_named_t arcstat_l2_cksum_bad;
 483  483          kstat_named_t arcstat_l2_io_error;
 484  484          kstat_named_t arcstat_l2_size;
 485  485          kstat_named_t arcstat_l2_asize;
 486  486          kstat_named_t arcstat_l2_hdr_size;
 487  487          kstat_named_t arcstat_l2_compress_successes;
 488  488          kstat_named_t arcstat_l2_compress_zeros;
 489  489          kstat_named_t arcstat_l2_compress_failures;
 490  490          kstat_named_t arcstat_memory_throttle_count;
 491  491          kstat_named_t arcstat_duplicate_buffers;
 492  492          kstat_named_t arcstat_duplicate_buffers_size;
 493  493          kstat_named_t arcstat_duplicate_reads;
 494  494          kstat_named_t arcstat_meta_used;
 495  495          kstat_named_t arcstat_meta_limit;
 496  496          kstat_named_t arcstat_meta_max;
 497  497          kstat_named_t arcstat_meta_min;
 498  498  } arc_stats_t;
 499  499  
 500  500  static arc_stats_t arc_stats = {
 501  501          { "hits",                       KSTAT_DATA_UINT64 },
 502  502          { "misses",                     KSTAT_DATA_UINT64 },
 503  503          { "demand_data_hits",           KSTAT_DATA_UINT64 },
 504  504          { "demand_data_misses",         KSTAT_DATA_UINT64 },
 505  505          { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 506  506          { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 507  507          { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 508  508          { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 509  509          { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 510  510          { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 511  511          { "mru_hits",                   KSTAT_DATA_UINT64 },
 512  512          { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 513  513          { "mfu_hits",                   KSTAT_DATA_UINT64 },
 514  514          { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 515  515          { "deleted",                    KSTAT_DATA_UINT64 },
 516  516          { "mutex_miss",                 KSTAT_DATA_UINT64 },
 517  517          { "evict_skip",                 KSTAT_DATA_UINT64 },
 518  518          { "evict_not_enough",           KSTAT_DATA_UINT64 },
 519  519          { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 520  520          { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 521  521          { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 522  522          { "evict_l2_skip",              KSTAT_DATA_UINT64 },
 523  523          { "hash_elements",              KSTAT_DATA_UINT64 },
 524  524          { "hash_elements_max",          KSTAT_DATA_UINT64 },
 525  525          { "hash_collisions",            KSTAT_DATA_UINT64 },
 526  526          { "hash_chains",                KSTAT_DATA_UINT64 },
 527  527          { "hash_chain_max",             KSTAT_DATA_UINT64 },
 528  528          { "p",                          KSTAT_DATA_UINT64 },
 529  529          { "c",                          KSTAT_DATA_UINT64 },
 530  530          { "c_min",                      KSTAT_DATA_UINT64 },
 531  531          { "c_max",                      KSTAT_DATA_UINT64 },
 532  532          { "size",                       KSTAT_DATA_UINT64 },
 533  533          { "hdr_size",                   KSTAT_DATA_UINT64 },
 534  534          { "data_size",                  KSTAT_DATA_UINT64 },
 535  535          { "metadata_size",              KSTAT_DATA_UINT64 },
 536  536          { "other_size",                 KSTAT_DATA_UINT64 },
 537  537          { "anon_size",                  KSTAT_DATA_UINT64 },
 538  538          { "anon_evictable_data",        KSTAT_DATA_UINT64 },
 539  539          { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
 540  540          { "mru_size",                   KSTAT_DATA_UINT64 },
 541  541          { "mru_evictable_data",         KSTAT_DATA_UINT64 },
 542  542          { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
 543  543          { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 544  544          { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 545  545          { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 546  546          { "mfu_size",                   KSTAT_DATA_UINT64 },
 547  547          { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
 548  548          { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
 549  549          { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 550  550          { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 551  551          { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 552  552          { "l2_hits",                    KSTAT_DATA_UINT64 },
 553  553          { "l2_misses",                  KSTAT_DATA_UINT64 },
 554  554          { "l2_feeds",                   KSTAT_DATA_UINT64 },
 555  555          { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 556  556          { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 557  557          { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 558  558          { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 559  559          { "l2_writes_done",             KSTAT_DATA_UINT64 },
 560  560          { "l2_writes_error",            KSTAT_DATA_UINT64 },
 561  561          { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
 562  562          { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 563  563          { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 564  564          { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
 565  565          { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 566  566          { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
 567  567          { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 568  568          { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 569  569          { "l2_io_error",                KSTAT_DATA_UINT64 },
 570  570          { "l2_size",                    KSTAT_DATA_UINT64 },
 571  571          { "l2_asize",                   KSTAT_DATA_UINT64 },
 572  572          { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 573  573          { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 574  574          { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 575  575          { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 576  576          { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 577  577          { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 578  578          { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 579  579          { "duplicate_reads",            KSTAT_DATA_UINT64 },
 580  580          { "arc_meta_used",              KSTAT_DATA_UINT64 },
 581  581          { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 582  582          { "arc_meta_max",               KSTAT_DATA_UINT64 },
 583  583          { "arc_meta_min",               KSTAT_DATA_UINT64 }
 584  584  };
 585  585  
           /* The 64-bit value slot of one arc_stats entry; usable as an lvalue. */
 586  586  #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 587  587  
           /* Atomically add val (which may be negative) to a statistic. */
 588  588  #define ARCSTAT_INCR(stat, val) \
 589  589          atomic_add_64(&arc_stats.stat.value.ui64, (val))
 590  590  
           /* Atomically increment / decrement a statistic by one. */
 591  591  #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 592  592  #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 593  593  
           /*
            * Raise a statistic to val if val is larger.  The compare-and-swap is
            * retried until it succeeds or the stored value is no longer smaller
            * than val.
            *
            * NOTE(review): val is evaluated more than once, and the expansion is
            * a bare { } block rather than do { } while (0), so the caller's
            * trailing semicolon becomes a separate empty statement.
            */
 594  594  #define ARCSTAT_MAX(stat, val) {                                        \
 595  595          uint64_t m;                                                     \
 596  596          while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 597  597              (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 598  598                  continue;                                               \
 599  599  }
 600  600  
           /* Raise the companion "<stat>_max" statistic to the current <stat>. */
 601  601  #define ARCSTAT_MAXSTAT(stat) \
 602  602          ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 603  603  
 604  604  /*
 605  605   * We define a macro to allow ARC hits/misses to be easily broken down by
 606  606   * two separate conditions, giving a total of four different subtypes for
 607  607   * each of hits and misses (so eight statistics total).
            *
            * The statistic that gets bumped is arcstat_<a>_<b>_<stat>, where <a>
            * is stat1 when cond1 is true and notstat1 otherwise, and <b> is stat2
            * when cond2 is true and notstat2 otherwise.
            *
            * NOTE(review): the expansion is a bare if/else statement, not wrapped
            * in do { } while (0); it is only safe as a stand-alone statement (not
            * as the unbraced body of another if/else).
 608  608   */
 609  609  #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 610  610          if (cond1) {                                                    \
 611  611                  if (cond2) {                                            \
 612  612                          ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 613  613                  } else {                                                \
 614  614                          ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 615  615                  }                                                       \
 616  616          } else {                                                        \
 617  617                  if (cond2) {                                            \
 618  618                          ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 619  619                  } else {                                                \
 620  620                          ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 621  621                  }                                                       \
 622  622          }
 623  623  
           /*
            * The ARC kstat handle, followed by one pointer per ARC state.  The
            * mru_ghost, mfu_ghost and l2c_only states are the ones GHOST_STATE()
            * classifies as ghost states.
            */
 624  624  kstat_t                 *arc_ksp;
 625  625  static arc_state_t      *arc_anon;
 626  626  static arc_state_t      *arc_mru;
 627  627  static arc_state_t      *arc_mru_ghost;
 628  628  static arc_state_t      *arc_mfu;
 629  629  static arc_state_t      *arc_mfu_ghost;
 630  630  static arc_state_t      *arc_l2c_only;
 631  631  
 632  632  /*
 633  633   * There are several ARC variables that are critical to export as kstats --
 634  634   * but we don't want to have to grovel around in the kstat whenever we wish to
 635  635   * manipulate them.  For these variables, we therefore define them to be in
 636  636   * terms of the statistic variable.  This assures that we are not introducing
 637  637   * the possibility of inconsistency by having shadow copies of the variables,
 638  638   * while still allowing the code to be readable.
            *
            * Each name expands through ARCSTAT() to the statistic's 64-bit value
            * slot, so it can be both read and assigned like an ordinary variable.
 639  639   */
 640  640  #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 641  641  #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 642  642  #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 643  643  #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 644  644  #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 645  645  #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 646  646  #define arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
 647  647  #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 648  648  #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 649  649  
           /*
            * True when _c_ is one of the two compression settings this macro
            * accepts: ZIO_COMPRESS_LZ4 or ZIO_COMPRESS_EMPTY.
            */
 650  650  #define L2ARC_IS_VALID_COMPRESS(_c_) \
 651  651          ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 652  652  
 653  653  static int              arc_no_grow;    /* Don't try to grow cache size */
 654  654  static uint64_t         arc_tempreserve;
 655  655  static uint64_t         arc_loaned_bytes;
 656  656  
           /*
            * One completion callback record.  Records can be chained through
            * acb_next; l1arc_buf_hdr_t keeps a pointer to such a record in b_acb.
            */
 657  657  typedef struct arc_callback arc_callback_t;
 658  658  
 659  659  struct arc_callback {
 660  660          void                    *acb_private;
 661  661          arc_done_func_t         *acb_done;
 662  662          arc_buf_t               *acb_buf;
 663  663          zio_t                   *acb_zio_dummy;
 664  664          arc_callback_t          *acb_next;
 665  665  };
 666  666  
           /*
            * Callback record for a write: three arc_done_func_t hooks (ready,
            * physdone, done), the private argument for them, and the buffer
            * being written.
            * NOTE(review): the stage at which each hook fires is inferred from
            * its name only -- confirm against the write path.
            */
 667  667  typedef struct arc_write_callback arc_write_callback_t;
 668  668  
 669  669  struct arc_write_callback {
 670  670          void            *awcb_private;
 671  671          arc_done_func_t *awcb_ready;
 672  672          arc_done_func_t *awcb_physdone;
 673  673          arc_done_func_t *awcb_done;
 674  674          arc_buf_t       *awcb_buf;
 675  675  };
 676  676  
 677  677  /*
 678  678   * ARC buffers are separated into multiple structs as a memory saving measure:
 679  679   *   - Common fields struct, always defined, and embedded within it:
 680  680   *       - L2-only fields, always allocated but undefined when not in L2ARC
 681  681   *       - L1-only fields, only allocated when in L1ARC
 682  682   *
 683  683   *           Buffer in L1                     Buffer only in L2
 684  684   *    +------------------------+          +------------------------+
 685  685   *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 686  686   *    |                        |          |                        |
 687  687   *    |                        |          |                        |
 688  688   *    |                        |          |                        |
 689  689   *    +------------------------+          +------------------------+
 690  690   *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 691  691   *    | (undefined if L1-only) |          |                        |
 692  692   *    +------------------------+          +------------------------+
 693  693   *    | l1arc_buf_hdr_t        |
 694  694   *    |                        |
 695  695   *    |                        |
 696  696   *    |                        |
 697  697   *    |                        |
 698  698   *    +------------------------+
 699  699   *
 700  700   * Because it's possible for the L2ARC to become extremely large, we can wind
 701  701   * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 702  702   * is minimized by only allocating the fields necessary for an L1-cached buffer
 703  703   * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 704  704   * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 705  705   * words in pointers. arc_hdr_realloc() is used to switch a header between
 706  706   * these two allocation states.
 707  707   */
           /*
            * L1-only portion of an ARC buffer header.  It is embedded in
            * arc_buf_hdr_t as its final member, b_l1hdr.
            */
 708  708  typedef struct l1arc_buf_hdr {
 709  709          kmutex_t                b_freeze_lock;
 710  710  #ifdef ZFS_DEBUG
 711  711          /*
 712  712           * used for debugging with kmem_flags - by allocating and freeing
 713  713           * b_thawed when the buffer is thawed, we get a record of the stack
 714  714           * trace that thawed it.
 715  715           */
 716  716          void                    *b_thawed;
 717  717  #endif
 718  718  
 719  719          arc_buf_t               *b_buf;
 720  720          uint32_t                b_datacnt;
 721  721          /* for waiting on writes to complete */
 722  722          kcondvar_t              b_cv;
 723  723  
 724  724          /* protected by arc state mutex */
 725  725          arc_state_t             *b_state;
 726  726          multilist_node_t        b_arc_node;
 727  727  
 728  728          /* updated atomically */
 729  729          clock_t                 b_arc_access;
 730  730  
 731  731          /* self protecting */
 732  732          refcount_t              b_refcnt;
 733  733  
 734  734          arc_callback_t          *b_acb;
 735  735          /* temporary buffer holder for in-flight compressed data */
 736  736          void                    *b_tmp_cdata;
 737  737  } l1arc_buf_hdr_t;
 738  738  
 739  739  typedef struct l2arc_dev l2arc_dev_t;
 740  740  
           /*
            * L2ARC portion of an ARC buffer header.  It is embedded in
            * arc_buf_hdr_t as b_l2hdr.
            */
 741  741  typedef struct l2arc_buf_hdr {
 742  742          /* protected by arc_buf_hdr mutex */
 743  743          l2arc_dev_t             *b_dev;         /* L2ARC device */
 744  744          uint64_t                b_daddr;        /* disk address, offset byte */
 745  745          /* real alloc'd buffer size depending on b_compress applied */
 746  746          int32_t                 b_asize;
 747  747          uint8_t                 b_compress;
 748  748  
 749  749          list_node_t             b_l2node;
 750  750  } l2arc_buf_hdr_t;
 751  751  
           /*
            * The ARC buffer header: the common fields, then the L2ARC fields,
            * then the L1ARC fields.
            *
            * b_l1hdr must remain the last member: HDR_L2ONLY_SIZE is defined as
            * offsetof(arc_buf_hdr_t, b_l1hdr), so an L2-only header consists of
            * exactly the members that precede b_l1hdr.
            */
 752  752  struct arc_buf_hdr {
 753  753          /* protected by hash lock */
 754  754          dva_t                   b_dva;
 755  755          uint64_t                b_birth;
 756  756          /*
 757  757           * Even though this checksum is only set/verified when a buffer is in
 758  758           * the L1 cache, it needs to be in the set of common fields because it
 759  759           * must be preserved from the time before a buffer is written out to
 760  760           * L2ARC until after it is read back in.
 761  761           */
 762  762          zio_cksum_t             *b_freeze_cksum;
 763  763  
                   /* next header in the same hash table bucket */
 764  764          arc_buf_hdr_t           *b_hash_next;
 765  765          arc_flags_t             b_flags;
 766  766  
 767  767          /* immutable */
 768  768          int32_t                 b_size;
 769  769          uint64_t                b_spa;
 770  770  
 771  771          /* L2ARC fields. Undefined when not in L2ARC. */
 772  772          l2arc_buf_hdr_t         b_l2hdr;
 773  773          /* L1ARC fields. Undefined when in l2arc_only state */
 774  774          l1arc_buf_hdr_t         b_l1hdr;
 775  775  };
 776  776  
 777  777  static arc_buf_t *arc_eviction_list;
 778  778  static arc_buf_hdr_t arc_eviction_hdr;
 779  779  
           /* True when state is arc_mru_ghost, arc_mfu_ghost or arc_l2c_only. */
 780  780  #define GHOST_STATE(state)      \
 781  781          ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 782  782          (state) == arc_l2c_only)
 783  783  
           /*
            * Header flag predicates.  Each tests ARC_FLAG_* bits in hdr->b_flags
            * and, except where noted, yields the masked value (nonzero when the
            * flag is set) rather than a normalized 0/1.
            */
 784  784  #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 785  785  #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 786  786  #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 787  787  #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 788  788  #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
 789  789  #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
 790  790  
 791  791  #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 792  792  #define HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
           /* 1 when both IO_IN_PROGRESS and HAS_L2HDR are set, otherwise 0. */
 793  793  #define HDR_L2_READING(hdr)     \
 794  794              (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&      \
 795  795              ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 796  796  #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 797  797  #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 798  798  #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 799  799  
           /* A header is "data" exactly when its metadata flag is clear. */
 800  800  #define HDR_ISTYPE_METADATA(hdr)        \
 801  801              ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 802  802  #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 803  803  
           /* Which of the embedded sub-headers are currently valid. */
 804  804  #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 805  805  #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 806  806  
 807  807  /*
 808  808   * Other sizes
            *
            * HDR_FULL_SIZE is the size of a complete arc_buf_hdr_t.
            * HDR_L2ONLY_SIZE is the offset of b_l1hdr within it, i.e. the size of
            * a header without the trailing L1-only fields.  Both are cast to
            * int64_t.
 809  809   */
 810  810  
 811  811  #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 812  812  #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 813  813  
 814  814  /*
 815  815   * Hash table routines
 816  816   */
 817  817  
 818  818  #define HT_LOCK_PAD     64
 819  819  
           /*
            * One hash lock.  In kernel builds the structure is padded out to
            * HT_LOCK_PAD bytes.
            */
 820  820  struct ht_lock {
 821  821          kmutex_t        ht_lock;
 822  822  #ifdef _KERNEL
 823  823          unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 824  824  #endif
 825  825  };
 826  826  
           /*
            * The header hash table: ht_table holds (ht_mask + 1) bucket heads,
            * each a chain of headers linked through b_hash_next.  The buckets
            * share BUF_LOCKS locks; BUF_HASH_LOCK_NTRY() selects one by masking
            * the index with (BUF_LOCKS - 1), so BUF_LOCKS must be a power of two.
            */
 827  827  #define BUF_LOCKS 256
 828  828  typedef struct buf_hash_table {
 829  829          uint64_t ht_mask;
 830  830          arc_buf_hdr_t **ht_table;
 831  831          struct ht_lock ht_locks[BUF_LOCKS];
 832  832  } buf_hash_table_t;
 833  833  
 834  834  static buf_hash_table_t buf_hash_table;
 835  835  
 836  836  #define BUF_HASH_INDEX(spa, dva, birth) \
 837  837          (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 838  838  #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 839  839  #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 840  840  #define HDR_LOCK(hdr) \
 841  841          (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 842  842  
           /* CRC-64 lookup table: filled in by buf_init(), consumed by buf_hash(). */
 843  843  uint64_t zfs_crc64_table[256];
 844  844  
 845  845  /*
 846  846   * Level 2 ARC
 847  847   */
 848  848  
           /* Default values for the L2ARC tunables. */
 849  849  #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 850  850  #define L2ARC_HEADROOM          2                       /* num of writes */
 851  851  /*
 852  852   * If we discover during ARC scan any buffers to be compressed, we boost
 853  853   * our headroom for the next scanning cycle by this percentage multiple.
 854  854   */
 855  855  #define L2ARC_HEADROOM_BOOST    200
 856  856  #define L2ARC_FEED_SECS         1               /* caching interval secs */
 857  857  #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 858  858  
 859  859  /*
 860  860   * Used to distinguish headers that are being processed by
 861  861   * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
 862  862   * address. This can happen when the header is added to the l2arc's list
 863  863   * of buffers to write in the first stage of l2arc_write_buffers(), but
 864  864   * has not yet been written out which happens in the second stage of
 865  865   * l2arc_write_buffers().
 866  866   */
 867  867  #define L2ARC_ADDR_UNSET        ((uint64_t)(-1))
 868  868  
           /* Shorthands for the corresponding kstat counters. */
 869  869  #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 870  870  #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 871  871  
 872  872  /* L2ARC Performance Tunables */
 873  873  uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 874  874  uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 875  875  uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 876  876  uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;   /* boost, percent */
 877  877  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 878  878  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 879  879  boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 880  880  boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 881  881  boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 882  882  
 883  883  /*
 884  884   * L2ARC Internals
 885  885   */
           /*
            * State kept for one L2ARC device.  Headers reference their device
            * through l2arc_buf_hdr_t.b_dev.
            */
 886  886  struct l2arc_dev {
 887  887          vdev_t                  *l2ad_vdev;     /* vdev */
 888  888          spa_t                   *l2ad_spa;      /* spa */
 889  889          uint64_t                l2ad_hand;      /* next write location */
 890  890          uint64_t                l2ad_start;     /* first addr on device */
 891  891          uint64_t                l2ad_end;       /* last addr on device */
 892  892          boolean_t               l2ad_first;     /* first sweep through */
 893  893          boolean_t               l2ad_writing;   /* currently writing */
 894  894          kmutex_t                l2ad_mtx;       /* lock for buffer list */
 895  895          list_t                  l2ad_buflist;   /* buffer list */
 896  896          list_node_t             l2ad_node;      /* device list node */
 897  897          refcount_t              l2ad_alloc;     /* allocated bytes */
 898  898  };
 899  899  
           /*
            * Global L2ARC bookkeeping.  Each list exists twice: an upper-case
            * list_t that provides the storage and a lower-case pointer to a
            * list_t.
            * NOTE(review): presumably each pointer is set to the address of its
            * upper-case counterpart during L2ARC initialization -- confirm.
            */
 900  900  static list_t L2ARC_dev_list;                   /* device list */
 901  901  static list_t *l2arc_dev_list;                  /* device list pointer */
 902  902  static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 903  903  static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 904  904  static list_t L2ARC_free_on_write;              /* free after write buf list */
 905  905  static list_t *l2arc_free_on_write;             /* free after write list ptr */
 906  906  static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 907  907  static uint64_t l2arc_ndev;                     /* number of devices */
 908  908  
           /*
            * Context recorded for one L2ARC read: the buffer being read, the
            * parameters of the original request (spa, blkptr, bookmark, flags),
            * and the compression that was applied to the data.
            */
 909  909  typedef struct l2arc_read_callback {
 910  910          arc_buf_t               *l2rcb_buf;             /* read buffer */
 911  911          spa_t                   *l2rcb_spa;             /* spa */
 912  912          blkptr_t                l2rcb_bp;               /* original blkptr */
 913  913          zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 914  914          int                     l2rcb_flags;            /* original flags */
 915  915          enum zio_compress       l2rcb_compress;         /* applied compress */
 916  916  } l2arc_read_callback_t;
 917  917  
           /* Context recorded for one L2ARC write: target device and list head. */
 918  918  typedef struct l2arc_write_callback {
 919  919          l2arc_dev_t     *l2wcb_dev;             /* device info */
 920  920          arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 921  921  } l2arc_write_callback_t;
 922  922  
           /*
            * One list entry describing a deferred free: the data pointer, its
            * size, and the function to call with those two arguments.
            * NOTE(review): presumably these entries are queued on the
            * l2arc_free_on_write list -- confirm.
            */
 923  923  typedef struct l2arc_data_free {
 924  924          /* protected by l2arc_free_on_write_mtx */
 925  925          void            *l2df_data;
 926  926          size_t          l2df_size;
 927  927          void            (*l2df_func)(void *, size_t);
 928  928          list_node_t     l2df_list_node;
 929  929  } l2arc_data_free_t;
 930  930  
           /*
            * Lock, condition variable and exit flag for the L2ARC feed thread.
            * NOTE(review): the exact handshake (who sets l2arc_thread_exit, who
            * waits on the cv) is not visible here -- confirm against the thread.
            */
 931  931  static kmutex_t l2arc_feed_thr_lock;
 932  932  static kcondvar_t l2arc_feed_thr_cv;
 933  933  static uint8_t l2arc_thread_exit;
 934  934  
 935  935  static void arc_get_data_buf(arc_buf_t *);
 936  936  static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 937  937  static boolean_t arc_is_overflowing();
 938  938  static void arc_buf_watch(arc_buf_t *);
 939  939  
 940  940  static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 941  941  static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 942  942  
 943  943  static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 944  944  static void l2arc_read_done(zio_t *);
 945  945  
 946  946  static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
 947  947  static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
 948  948  static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 949  949  
           /*
            * Hash a (spa, dva, birth) identity to a 64-bit value.
            *
            * A table-driven CRC-64, seeded with all ones, is run over every byte
            * of *dva; the result is then XORed with (spa >> 8) and with birth.
            * Callers mask the result with buf_hash_table.ht_mask (see
            * BUF_HASH_INDEX) to obtain a bucket index.
            */
 950  950  static uint64_t
 951  951  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 952  952  {
 953  953          uint8_t *vdva = (uint8_t *)dva;
 954  954          uint64_t crc = -1ULL;
 955  955          int i;
 956  956  
                   /* Entry 128 equals the polynomial once buf_init() built the table. */
 957  957          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 958  958  
 959  959          for (i = 0; i < sizeof (dva_t); i++)
 960  960                  crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 961  961  
 962  962          crc ^= (spa>>8) ^ birth;
 963  963  
 964  964          return (crc);
 965  965  }
 966  966  
 967  967  #define BUF_EMPTY(buf)                                          \
 968  968          ((buf)->b_dva.dva_word[0] == 0 &&                       \
 969  969          (buf)->b_dva.dva_word[1] == 0)
 970  970  
 971  971  #define BUF_EQUAL(spa, dva, birth, buf)                         \
 972  972          ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 973  973          ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 974  974          ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 975  975  
           /*
            * Clear a header's identity: zero both DVA words and the birth txg.
            * Afterwards BUF_EMPTY(hdr) is true.  b_spa is left untouched.
            */
 976  976  static void
 977  977  buf_discard_identity(arc_buf_hdr_t *hdr)
 978  978  {
 979  979          hdr->b_dva.dva_word[0] = 0;
 980  980          hdr->b_dva.dva_word[1] = 0;
 981  981          hdr->b_birth = 0;
 982  982  }
 983  983  
           /*
            * Look up the header for block pointer bp of pool spa.
            *
            * On a hit the header is returned and its bucket's hash lock is
            * returned, still held, through *lockp.  On a miss the lock is
            * dropped, *lockp is set to NULL and NULL is returned.
            */
 984  984  static arc_buf_hdr_t *
 985  985  buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 986  986  {
 987  987          const dva_t *dva = BP_IDENTITY(bp);
 988  988          uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 989  989          uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 990  990          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 991  991          arc_buf_hdr_t *hdr;
 992  992  
 993  993          mutex_enter(hash_lock);
 994  994          for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 995  995              hdr = hdr->b_hash_next) {
 996  996                  if (BUF_EQUAL(spa, dva, birth, hdr)) {
                                   /* Return with hash_lock still held. */
 997  997                          *lockp = hash_lock;
 998  998                          return (hdr);
 999  999                  }
1000 1000          }
1001 1001          mutex_exit(hash_lock);
1002 1002          *lockp = NULL;
1003 1003          return (NULL);
1004 1004  }
1005 1005  
1006 1006  /*
1007 1007   * Insert an entry into the hash table.  If there is already an element
1008 1008   * equal to elem in the hash table, then the already existing element
1009 1009   * will be returned and the new element will not be inserted.
1010 1010   * Otherwise returns NULL.
1011 1011   * If lockp == NULL, the caller is assumed to already hold the hash lock.
            * If lockp != NULL, the hash lock is acquired here, stored in *lockp,
            * and is still held when this function returns, on both return paths.
1012 1012   */
1013 1013  static arc_buf_hdr_t *
1014 1014  buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1015 1015  {
1016 1016          uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1017 1017          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1018 1018          arc_buf_hdr_t *fhdr;
1019 1019          uint32_t i;
1020 1020  
1021 1021          ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1022 1022          ASSERT(hdr->b_birth != 0);
1023 1023          ASSERT(!HDR_IN_HASH_TABLE(hdr));
1024 1024  
1025 1025          if (lockp != NULL) {
1026 1026                  *lockp = hash_lock;
1027 1027                  mutex_enter(hash_lock);
1028 1028          } else {
1029 1029                  ASSERT(MUTEX_HELD(hash_lock));
1030 1030          }
1031 1031  
                   /* Walk the bucket; i ends up as the number of entries already in it. */
1032 1032          for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1033 1033              fhdr = fhdr->b_hash_next, i++) {
1034 1034                  if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1035 1035                          return (fhdr);
1036 1036          }
1037 1037  
                   /* Not present: push hdr onto the front of the bucket's chain. */
1038 1038          hdr->b_hash_next = buf_hash_table.ht_table[idx];
1039 1039          buf_hash_table.ht_table[idx] = hdr;
1040 1040          hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1041 1041  
1042 1042          /* collect some hash table performance data */
1043 1043          if (i > 0) {
1044 1044                  ARCSTAT_BUMP(arcstat_hash_collisions);
                           /* The bucket just went from one entry to two. */
1045 1045                  if (i == 1)
1046 1046                          ARCSTAT_BUMP(arcstat_hash_chains);
1047 1047  
1048 1048                  ARCSTAT_MAX(arcstat_hash_chain_max, i);
1049 1049          }
1050 1050  
1051 1051          ARCSTAT_BUMP(arcstat_hash_elements);
1052 1052          ARCSTAT_MAXSTAT(arcstat_hash_elements);
1053 1053  
1054 1054          return (NULL);
1055 1055  }
1056 1056  
           /*
            * Unlink hdr from its hash bucket.  The caller must hold the bucket's
            * hash lock and hdr must currently be in the table (both asserted).
            * Clears b_hash_next and ARC_FLAG_IN_HASH_TABLE and updates the hash
            * statistics.
            */
1057 1057  static void
1058 1058  buf_hash_remove(arc_buf_hdr_t *hdr)
1059 1059  {
1060 1060          arc_buf_hdr_t *fhdr, **hdrp;
1061 1061          uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1062 1062  
1063 1063          ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1064 1064          ASSERT(HDR_IN_HASH_TABLE(hdr));
1065 1065  
                   /* Find the link (bucket head or a b_hash_next) that points at hdr. */
1066 1066          hdrp = &buf_hash_table.ht_table[idx];
1067 1067          while ((fhdr = *hdrp) != hdr) {
1068 1068                  ASSERT(fhdr != NULL);
1069 1069                  hdrp = &fhdr->b_hash_next;
1070 1070          }
1071 1071          *hdrp = hdr->b_hash_next;
1072 1072          hdr->b_hash_next = NULL;
1073 1073          hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1074 1074  
1075 1075          /* collect some hash table performance data */
1076 1076          ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1077 1077  
                   /* Exactly one entry is left in the bucket: it is no longer a chain. */
1078 1078          if (buf_hash_table.ht_table[idx] &&
1079 1079              buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1080 1080                  ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1081 1081  }
1082 1082  
1083 1083  /*
1084 1084   * Global data structures and functions for the buf kmem cache.
            *
            * hdr_full_cache holds headers of HDR_FULL_SIZE, hdr_l2only_cache holds
            * headers of HDR_L2ONLY_SIZE, and buf_cache holds arc_buf_t objects.
            * All three are created in buf_init() and destroyed in buf_fini().
1085 1085   */
1086 1086  static kmem_cache_t *hdr_full_cache;
1087 1087  static kmem_cache_t *hdr_l2only_cache;
1088 1088  static kmem_cache_t *buf_cache;
1089 1089  
           /*
            * Tear down what buf_init() set up: free the bucket array, destroy
            * the hash locks, and destroy the three kmem caches.
            */
1090 1090  static void
1091 1091  buf_fini(void)
1092 1092  {
1093 1093          int i;
1094 1094  
                   /* The table was allocated with (ht_mask + 1) pointer-sized slots. */
1095 1095          kmem_free(buf_hash_table.ht_table,
1096 1096              (buf_hash_table.ht_mask + 1) * sizeof (void *));
1097 1097          for (i = 0; i < BUF_LOCKS; i++)
1098 1098                  mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1099 1099          kmem_cache_destroy(hdr_full_cache);
1100 1100          kmem_cache_destroy(hdr_l2only_cache);
1101 1101          kmem_cache_destroy(buf_cache);
1102 1102  }
1103 1103  
1104 1104  /*
1105 1105   * Constructor callback - called when the cache is empty
1106 1106   * and a new buf is requested.
            *
            * Constructor for hdr_full_cache: zeroes the whole header, initializes
            * the L1 condition variable, reference count, freeze lock and
            * multilist link, and charges HDR_FULL_SIZE to ARC_SPACE_HDRS.
            * Always returns 0.
1107 1107   */
1108 1108  /* ARGSUSED */
1109 1109  static int
1110 1110  hdr_full_cons(void *vbuf, void *unused, int kmflag)
1111 1111  {
1112 1112          arc_buf_hdr_t *hdr = vbuf;
1113 1113  
1114 1114          bzero(hdr, HDR_FULL_SIZE);
1115 1115          cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1116 1116          refcount_create(&hdr->b_l1hdr.b_refcnt);
1117 1117          mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1118 1118          multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1119 1119          arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1120 1120  
1121 1121          return (0);
1122 1122  }
1123 1123  
           /*
            * Constructor for hdr_l2only_cache: zeroes the first HDR_L2ONLY_SIZE
            * bytes (there are no L1 fields to set up) and charges that size to
            * ARC_SPACE_L2HDRS.  Always returns 0.
            */
1124 1124  /* ARGSUSED */
1125 1125  static int
1126 1126  hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1127 1127  {
1128 1128          arc_buf_hdr_t *hdr = vbuf;
1129 1129  
1130 1130          bzero(hdr, HDR_L2ONLY_SIZE);
1131 1131          arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1132 1132  
1133 1133          return (0);
1134 1134  }
1135 1135  
           /*
            * Constructor for buf_cache: zeroes the arc_buf_t, initializes its
            * eviction lock, and charges its size to ARC_SPACE_HDRS.
            * Always returns 0.
            */
1136 1136  /* ARGSUSED */
1137 1137  static int
1138 1138  buf_cons(void *vbuf, void *unused, int kmflag)
1139 1139  {
1140 1140          arc_buf_t *buf = vbuf;
1141 1141  
1142 1142          bzero(buf, sizeof (arc_buf_t));
1143 1143          mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1144 1144          arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1145 1145  
1146 1146          return (0);
1147 1147  }
1148 1148  
1149 1149  /*
1150 1150   * Destructor callback - called when a cached buf is
1151 1151   * no longer required.
            *
            * Destructor for hdr_full_cache, undoing hdr_full_cons(): destroys the
            * condition variable, reference count and freeze lock, and returns
            * HDR_FULL_SIZE to ARC_SPACE_HDRS.  The header must have no identity
            * and must not be linked on a multilist (both asserted).
1152 1152   */
1153 1153  /* ARGSUSED */
1154 1154  static void
1155 1155  hdr_full_dest(void *vbuf, void *unused)
1156 1156  {
1157 1157          arc_buf_hdr_t *hdr = vbuf;
1158 1158  
1159 1159          ASSERT(BUF_EMPTY(hdr));
1160 1160          cv_destroy(&hdr->b_l1hdr.b_cv);
1161 1161          refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1162 1162          mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1163 1163          ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1164 1164          arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1165 1165  }
1166 1166  
           /*
            * Destructor for hdr_l2only_cache: asserts the header has no identity
            * and returns HDR_L2ONLY_SIZE to ARC_SPACE_L2HDRS.
            */
1167 1167  /* ARGSUSED */
1168 1168  static void
1169 1169  hdr_l2only_dest(void *vbuf, void *unused)
1170 1170  {
1171 1171          arc_buf_hdr_t *hdr = vbuf;
1172 1172  
1173 1173          ASSERT(BUF_EMPTY(hdr));
1174 1174          arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1175 1175  }
1176 1176  
           /*
            * Destructor for buf_cache, undoing buf_cons(): destroys the eviction
            * lock and returns the arc_buf_t's size to ARC_SPACE_HDRS.
            */
1177 1177  /* ARGSUSED */
1178 1178  static void
1179 1179  buf_dest(void *vbuf, void *unused)
1180 1180  {
1181 1181          arc_buf_t *buf = vbuf;
1182 1182  
1183 1183          mutex_destroy(&buf->b_evict_lock);
1184 1184          arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1185 1185  }
1186 1186  
1187 1187  /*
1188 1188   * Reclaim callback -- invoked when memory is low.
            *
            * Registered as the reclaim function of both header caches in
            * buf_init().  It does no reclaiming itself; it only signals
            * arc_reclaim_thread_cv, and skips even that once arc_dead is set.
1189 1189   */
1190 1190  /* ARGSUSED */
1191 1191  static void
1192 1192  hdr_recl(void *unused)
1193 1193  {
1194 1194          dprintf("hdr_recl called\n");
1195 1195          /*
1196 1196           * umem calls the reclaim func when we destroy the buf cache,
1197 1197           * which is after we do arc_fini().
1198 1198           */
1199 1199          if (!arc_dead)
1200 1200                  cv_signal(&arc_reclaim_thread_cv);
1201 1201  }
1202 1202  
           /*
            * Set up the header hash table and the buf kmem caches: size and
            * allocate the bucket array, create the three kmem caches, build the
            * CRC-64 table used by buf_hash(), and initialize the hash locks.
            */
1203 1203  static void
1204 1204  buf_init(void)
1205 1205  {
1206 1206          uint64_t *ct;
1207 1207          uint64_t hsize = 1ULL << 12;
1208 1208          int i, j;
1209 1209  
1210 1210          /*
1211 1211           * The hash table is big enough to fill all of physical memory
1212 1212           * with an average block size of zfs_arc_average_blocksize (default 8K).
1213 1213           * By default, the table will take up
1214 1214           * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1215 1215           */
1216 1216          while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1217 1217                  hsize <<= 1;
                   /*
                    * The allocation must not sleep; on failure halve the table size
                    * and try again.  hsize stays a power of two, so hsize - 1 is a
                    * valid index mask.
                    */
1218 1218  retry:
1219 1219          buf_hash_table.ht_mask = hsize - 1;
1220 1220          buf_hash_table.ht_table =
1221 1221              kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1222 1222          if (buf_hash_table.ht_table == NULL) {
1223 1223                  ASSERT(hsize > (1ULL << 8));
1224 1224                  hsize >>= 1;
1225 1225                  goto retry;
1226 1226          }
1227 1227  
1228 1228          hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1229 1229              0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1230 1230          hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1231 1231              HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1232 1232              NULL, NULL, 0);
1233 1233          buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1234 1234              0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1235 1235  
                   /*
                    * Build the CRC-64 table: entry i starts as i and goes through
                    * eight shift-right steps, XORing in ZFS_CRC64_POLY whenever the
                    * bit shifted out is set.
                    */
1236 1236          for (i = 0; i < 256; i++)
1237 1237                  for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1238 1238                          *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1239 1239  
1240 1240          for (i = 0; i < BUF_LOCKS; i++) {
1241 1241                  mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1242 1242                      NULL, MUTEX_DEFAULT, NULL);
1243 1243          }
1244 1244  }
1245 1245  
1246 1246  /*
1247 1247   * Transition between the two allocation states for the arc_buf_hdr struct.
1248 1248   * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1249 1249   * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1250 1250   * version is used when a cache buffer is only in the L2ARC in order to reduce
1251 1251   * memory usage.
1252 1252   */
1253 1253  static arc_buf_hdr_t *
1254 1254  arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1255 1255  {
1256 1256          ASSERT(HDR_HAS_L2HDR(hdr));
1257 1257  
1258 1258          arc_buf_hdr_t *nhdr;
1259 1259          l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1260 1260  
1261 1261          ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1262 1262              (old == hdr_l2only_cache && new == hdr_full_cache));
1263 1263  
1264 1264          nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1265 1265  
1266 1266          ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1267 1267          buf_hash_remove(hdr);
1268 1268  
1269 1269          bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1270 1270  
1271 1271          if (new == hdr_full_cache) {
1272 1272                  nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1273 1273                  /*
1274 1274                   * arc_access and arc_change_state need to be aware that a
1275 1275                   * header has just come out of L2ARC, so we set its state to
1276 1276                   * l2c_only even though it's about to change.
1277 1277                   */
1278 1278                  nhdr->b_l1hdr.b_state = arc_l2c_only;
1279 1279  
1280 1280                  /* Verify previous threads set to NULL before freeing */
1281 1281                  ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1282 1282          } else {
1283 1283                  ASSERT(hdr->b_l1hdr.b_buf == NULL);
1284 1284                  ASSERT0(hdr->b_l1hdr.b_datacnt);
1285 1285  
1286 1286                  /*
1287 1287                   * If we've reached here, We must have been called from
1288 1288                   * arc_evict_hdr(), as such we should have already been
1289 1289                   * removed from any ghost list we were previously on
1290 1290                   * (which protects us from racing with arc_evict_state),
1291 1291                   * thus no locking is needed during this check.
1292 1292                   */
1293 1293                  ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

↓ open down ↓

1293 lines elided

↑ open up ↑

1294 1294  
1295 1295                  /*
1296 1296                   * A buffer must not be moved into the arc_l2c_only
1297 1297                   * state if it's not finished being written out to the
1298 1298                   * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1299 1299                   * might try to be accessed, even though it was removed.
1300 1300                   */
1301 1301                  VERIFY(!HDR_L2_WRITING(hdr));
1302 1302                  VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1303 1303  
     1304 +#ifdef ZFS_DEBUG
     1305 +                if (hdr->b_l1hdr.b_thawed != NULL) {
     1306 +                        kmem_free(hdr->b_l1hdr.b_thawed, 1);
     1307 +                        hdr->b_l1hdr.b_thawed = NULL;
     1308 +                }
     1309 +#endif
     1310 +
1304 1311                  nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1305 1312          }
1306 1313          /*
1307 1314           * The header has been reallocated so we need to re-insert it into any
1308 1315           * lists it was on.
1309 1316           */
1310 1317          (void) buf_hash_insert(nhdr, NULL);
1311 1318  
1312 1319          ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1313 1320

1314 1321          mutex_enter(&dev->l2ad_mtx);
1315 1322  
1316 1323          /*
1317 1324           * We must place the realloc'ed header back into the list at
1318 1325           * the same spot. Otherwise, if it's placed earlier in the list,
1319 1326           * l2arc_write_buffers() could find it during the function's
1320 1327           * write phase, and try to write it out to the l2arc.
1321 1328           */
1322 1329          list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1323 1330          list_remove(&dev->l2ad_buflist, hdr);
1324 1331  
1325 1332          mutex_exit(&dev->l2ad_mtx);
1326 1333  
1327 1334          /*
1328 1335           * Since we're using the pointer address as the tag when
1329 1336           * incrementing and decrementing the l2ad_alloc refcount, we
1330 1337           * must remove the old pointer (that we're about to destroy) and
1331 1338           * add the new pointer to the refcount. Otherwise we'd remove
1332 1339           * the wrong pointer address when calling arc_hdr_destroy() later.
1333 1340           */
1334 1341  
1335 1342          (void) refcount_remove_many(&dev->l2ad_alloc,
1336 1343              hdr->b_l2hdr.b_asize, hdr);
1337 1344  
1338 1345          (void) refcount_add_many(&dev->l2ad_alloc,
1339 1346              nhdr->b_l2hdr.b_asize, nhdr);
1340 1347  
1341 1348          buf_discard_identity(hdr);
1342 1349          hdr->b_freeze_cksum = NULL;
1343 1350          kmem_cache_free(old, hdr);
1344 1351  
1345 1352          return (nhdr);
1346 1353  }
1347 1354  
1348 1355  
1349 1356  #define ARC_MINTIME     (hz>>4) /* 62 ms */
1350 1357  
1351 1358  static void
1352 1359  arc_cksum_verify(arc_buf_t *buf)
1353 1360  {
1354 1361          zio_cksum_t zc;
1355 1362  
1356 1363          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1357 1364                  return;
1358 1365  
1359 1366          mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1360 1367          if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1361 1368                  mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1362 1369                  return;
1363 1370          }
1364 1371          fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1365 1372          if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1366 1373                  panic("buffer modified while frozen!");
1367 1374          mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1368 1375  }
1369 1376  
1370 1377  static int
1371 1378  arc_cksum_equal(arc_buf_t *buf)
1372 1379  {
1373 1380          zio_cksum_t zc;
1374 1381          int equal;
1375 1382  
1376 1383          mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1377 1384          fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1378 1385          equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1379 1386          mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1380 1387  
1381 1388          return (equal);
1382 1389  }
1383 1390  
1384 1391  static void
1385 1392  arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1386 1393  {
1387 1394          if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1388 1395                  return;
1389 1396  
1390 1397          mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1391 1398          if (buf->b_hdr->b_freeze_cksum != NULL) {
1392 1399                  mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1393 1400                  return;
1394 1401          }
1395 1402          buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1396 1403          fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1397 1404              buf->b_hdr->b_freeze_cksum);
1398 1405          mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1399 1406          arc_buf_watch(buf);
1400 1407  }
1401 1408  
1402 1409  #ifndef _KERNEL
1403 1410  typedef struct procctl {
1404 1411          long cmd;
1405 1412          prwatch_t prwatch;
1406 1413  } procctl_t;
1407 1414  #endif
1408 1415  
1409 1416  /* ARGSUSED */
1410 1417  static void
1411 1418  arc_buf_unwatch(arc_buf_t *buf)
1412 1419  {
1413 1420  #ifndef _KERNEL
1414 1421          if (arc_watch) {
1415 1422                  int result;
1416 1423                  procctl_t ctl;
1417 1424                  ctl.cmd = PCWATCH;
1418 1425                  ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1419 1426                  ctl.prwatch.pr_size = 0;
1420 1427                  ctl.prwatch.pr_wflags = 0;
1421 1428                  result = write(arc_procfd, &ctl, sizeof (ctl));
1422 1429                  ASSERT3U(result, ==, sizeof (ctl));
1423 1430          }
1424 1431  #endif
1425 1432  }
1426 1433  
1427 1434  /* ARGSUSED */
1428 1435  static void
1429 1436  arc_buf_watch(arc_buf_t *buf)
1430 1437  {
1431 1438  #ifndef _KERNEL
1432 1439          if (arc_watch) {
1433 1440                  int result;
1434 1441                  procctl_t ctl;
1435 1442                  ctl.cmd = PCWATCH;
1436 1443                  ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1437 1444                  ctl.prwatch.pr_size = buf->b_hdr->b_size;
1438 1445                  ctl.prwatch.pr_wflags = WA_WRITE;
1439 1446                  result = write(arc_procfd, &ctl, sizeof (ctl));
1440 1447                  ASSERT3U(result, ==, sizeof (ctl));
1441 1448          }
1442 1449  #endif
1443 1450  }
1444 1451  
1445 1452  static arc_buf_contents_t
1446 1453  arc_buf_type(arc_buf_hdr_t *hdr)
1447 1454  {
1448 1455          if (HDR_ISTYPE_METADATA(hdr)) {
1449 1456                  return (ARC_BUFC_METADATA);
1450 1457          } else {
1451 1458                  return (ARC_BUFC_DATA);
1452 1459          }
1453 1460  }
1454 1461  
1455 1462  static uint32_t
1456 1463  arc_bufc_to_flags(arc_buf_contents_t type)
1457 1464  {
1458 1465          switch (type) {
1459 1466          case ARC_BUFC_DATA:
1460 1467                  /* metadata field is 0 if buffer contains normal data */
1461 1468                  return (0);
1462 1469          case ARC_BUFC_METADATA:
1463 1470                  return (ARC_FLAG_BUFC_METADATA);
1464 1471          default:
1465 1472                  break;
1466 1473          }
1467 1474          panic("undefined ARC buffer type!");
1468 1475          return ((uint32_t)-1);
1469 1476  }
1470 1477  
1471 1478  void
1472 1479  arc_buf_thaw(arc_buf_t *buf)
1473 1480  {
1474 1481          if (zfs_flags & ZFS_DEBUG_MODIFY) {
1475 1482                  if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1476 1483                          panic("modifying non-anon buffer!");
1477 1484                  if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1478 1485                          panic("modifying buffer while i/o in progress!");
1479 1486                  arc_cksum_verify(buf);
1480 1487          }
1481 1488  
1482 1489          mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1483 1490          if (buf->b_hdr->b_freeze_cksum != NULL) {
1484 1491                  kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1485 1492                  buf->b_hdr->b_freeze_cksum = NULL;
1486 1493          }
1487 1494  
1488 1495  #ifdef ZFS_DEBUG
1489 1496          if (zfs_flags & ZFS_DEBUG_MODIFY) {
1490 1497                  if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1491 1498                          kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1492 1499                  buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1493 1500          }
1494 1501  #endif
1495 1502  
1496 1503          mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1497 1504  
1498 1505          arc_buf_unwatch(buf);
1499 1506  }
1500 1507  
1501 1508  void
1502 1509  arc_buf_freeze(arc_buf_t *buf)
1503 1510  {
1504 1511          kmutex_t *hash_lock;
1505 1512  
1506 1513          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1507 1514                  return;
1508 1515  
1509 1516          hash_lock = HDR_LOCK(buf->b_hdr);
1510 1517          mutex_enter(hash_lock);
1511 1518  
1512 1519          ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1513 1520              buf->b_hdr->b_l1hdr.b_state == arc_anon);
1514 1521          arc_cksum_compute(buf, B_FALSE);
1515 1522          mutex_exit(hash_lock);
1516 1523  
1517 1524  }
1518 1525  
1519 1526  static void
1520 1527  add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1521 1528  {
1522 1529          ASSERT(HDR_HAS_L1HDR(hdr));
1523 1530          ASSERT(MUTEX_HELD(hash_lock));
1524 1531          arc_state_t *state = hdr->b_l1hdr.b_state;
1525 1532  
1526 1533          if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1527 1534              (state != arc_anon)) {
1528 1535                  /* We don't use the L2-only state list. */
1529 1536                  if (state != arc_l2c_only) {
1530 1537                          arc_buf_contents_t type = arc_buf_type(hdr);
1531 1538                          uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1532 1539                          multilist_t *list = &state->arcs_list[type];
1533 1540                          uint64_t *size = &state->arcs_lsize[type];
1534 1541  
1535 1542                          multilist_remove(list, hdr);
1536 1543  
1537 1544                          if (GHOST_STATE(state)) {
1538 1545                                  ASSERT0(hdr->b_l1hdr.b_datacnt);
1539 1546                                  ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1540 1547                                  delta = hdr->b_size;
1541 1548                          }
1542 1549                          ASSERT(delta > 0);
1543 1550                          ASSERT3U(*size, >=, delta);
1544 1551                          atomic_add_64(size, -delta);
1545 1552                  }
1546 1553                  /* remove the prefetch flag if we get a reference */
1547 1554                  hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1548 1555          }
1549 1556  }
1550 1557  
1551 1558  static int
1552 1559  remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1553 1560  {
1554 1561          int cnt;
1555 1562          arc_state_t *state = hdr->b_l1hdr.b_state;
1556 1563  
1557 1564          ASSERT(HDR_HAS_L1HDR(hdr));
1558 1565          ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1559 1566          ASSERT(!GHOST_STATE(state));
1560 1567  
1561 1568          /*
1562 1569           * arc_l2c_only counts as a ghost state so we don't need to explicitly
1563 1570           * check to prevent usage of the arc_l2c_only list.
1564 1571           */
1565 1572          if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1566 1573              (state != arc_anon)) {
1567 1574                  arc_buf_contents_t type = arc_buf_type(hdr);
1568 1575                  multilist_t *list = &state->arcs_list[type];
1569 1576                  uint64_t *size = &state->arcs_lsize[type];
1570 1577  
1571 1578                  multilist_insert(list, hdr);
1572 1579  
1573 1580                  ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1574 1581                  atomic_add_64(size, hdr->b_size *
1575 1582                      hdr->b_l1hdr.b_datacnt);
1576 1583          }
1577 1584          return (cnt);
1578 1585  }
1579 1586  
1580 1587  /*
1581 1588   * Move the supplied buffer to the indicated state. The hash lock
1582 1589   * for the buffer must be held by the caller.
1583 1590   */
1584 1591  static void
1585 1592  arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1586 1593      kmutex_t *hash_lock)
1587 1594  {
1588 1595          arc_state_t *old_state;
1589 1596          int64_t refcnt;
1590 1597          uint32_t datacnt;
1591 1598          uint64_t from_delta, to_delta;
1592 1599          arc_buf_contents_t buftype = arc_buf_type(hdr);
1593 1600  
1594 1601          /*
1595 1602           * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1596 1603           * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1597 1604           * L1 hdr doesn't always exist when we change state to arc_anon before
1598 1605           * destroying a header, in which case reallocating to add the L1 hdr is
1599 1606           * pointless.
1600 1607           */
1601 1608          if (HDR_HAS_L1HDR(hdr)) {
1602 1609                  old_state = hdr->b_l1hdr.b_state;
1603 1610                  refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1604 1611                  datacnt = hdr->b_l1hdr.b_datacnt;
1605 1612          } else {
1606 1613                  old_state = arc_l2c_only;
1607 1614                  refcnt = 0;
1608 1615                  datacnt = 0;
1609 1616          }
1610 1617  
1611 1618          ASSERT(MUTEX_HELD(hash_lock));
1612 1619          ASSERT3P(new_state, !=, old_state);
1613 1620          ASSERT(refcnt == 0 || datacnt > 0);
1614 1621          ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1615 1622          ASSERT(old_state != arc_anon || datacnt <= 1);
1616 1623  
1617 1624          from_delta = to_delta = datacnt * hdr->b_size;
1618 1625  
1619 1626          /*
1620 1627           * If this buffer is evictable, transfer it from the
1621 1628           * old state list to the new state list.
1622 1629           */
1623 1630          if (refcnt == 0) {
1624 1631                  if (old_state != arc_anon && old_state != arc_l2c_only) {
1625 1632                          uint64_t *size = &old_state->arcs_lsize[buftype];
1626 1633  
1627 1634                          ASSERT(HDR_HAS_L1HDR(hdr));
1628 1635                          multilist_remove(&old_state->arcs_list[buftype], hdr);
1629 1636  
1630 1637                          /*
1631 1638                           * If prefetching out of the ghost cache,
1632 1639                           * we will have a non-zero datacnt.
1633 1640                           */
1634 1641                          if (GHOST_STATE(old_state) && datacnt == 0) {
1635 1642                                  /* ghost elements have a ghost size */
1636 1643                                  ASSERT(hdr->b_l1hdr.b_buf == NULL);
1637 1644                                  from_delta = hdr->b_size;
1638 1645                          }
1639 1646                          ASSERT3U(*size, >=, from_delta);
1640 1647                          atomic_add_64(size, -from_delta);
1641 1648                  }
1642 1649                  if (new_state != arc_anon && new_state != arc_l2c_only) {
1643 1650                          uint64_t *size = &new_state->arcs_lsize[buftype];
1644 1651  
1645 1652                          /*
1646 1653                           * An L1 header always exists here, since if we're
1647 1654                           * moving to some L1-cached state (i.e. not l2c_only or
1648 1655                           * anonymous), we realloc the header to add an L1hdr
1649 1656                           * beforehand.
1650 1657                           */
1651 1658                          ASSERT(HDR_HAS_L1HDR(hdr));
1652 1659                          multilist_insert(&new_state->arcs_list[buftype], hdr);
1653 1660  
1654 1661                          /* ghost elements have a ghost size */
1655 1662                          if (GHOST_STATE(new_state)) {
1656 1663                                  ASSERT0(datacnt);
1657 1664                                  ASSERT(hdr->b_l1hdr.b_buf == NULL);
1658 1665                                  to_delta = hdr->b_size;
1659 1666                          }
1660 1667                          atomic_add_64(size, to_delta);
1661 1668                  }
1662 1669          }
1663 1670  
1664 1671          ASSERT(!BUF_EMPTY(hdr));
1665 1672          if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1666 1673                  buf_hash_remove(hdr);
1667 1674  
1668 1675          /* adjust state sizes (ignore arc_l2c_only) */
1669 1676  
1670 1677          if (to_delta && new_state != arc_l2c_only) {
1671 1678                  ASSERT(HDR_HAS_L1HDR(hdr));
1672 1679                  if (GHOST_STATE(new_state)) {
1673 1680                          ASSERT0(datacnt);
1674 1681  
1675 1682                          /*
1676 1683                           * We moving a header to a ghost state, we first
1677 1684                           * remove all arc buffers. Thus, we'll have a
1678 1685                           * datacnt of zero, and no arc buffer to use for
1679 1686                           * the reference. As a result, we use the arc
1680 1687                           * header pointer for the reference.
1681 1688                           */
1682 1689                          (void) refcount_add_many(&new_state->arcs_size,
1683 1690                              hdr->b_size, hdr);
1684 1691                  } else {
1685 1692                          ASSERT3U(datacnt, !=, 0);
1686 1693  
1687 1694                          /*
1688 1695                           * Each individual buffer holds a unique reference,
1689 1696                           * thus we must remove each of these references one
1690 1697                           * at a time.
1691 1698                           */
1692 1699                          for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
1693 1700                              buf = buf->b_next) {
1694 1701                                  (void) refcount_add_many(&new_state->arcs_size,
1695 1702                                      hdr->b_size, buf);
1696 1703                          }
1697 1704                  }
1698 1705          }
1699 1706  
1700 1707          if (from_delta && old_state != arc_l2c_only) {
1701 1708                  ASSERT(HDR_HAS_L1HDR(hdr));
1702 1709                  if (GHOST_STATE(old_state)) {
1703 1710                          /*
1704 1711                           * When moving a header off of a ghost state,
1705 1712                           * there's the possibility for datacnt to be
1706 1713                           * non-zero. This is because we first add the
1707 1714                           * arc buffer to the header prior to changing
1708 1715                           * the header's state. Since we used the header
1709 1716                           * for the reference when putting the header on
1710 1717                           * the ghost state, we must balance that and use
1711 1718                           * the header when removing off the ghost state
1712 1719                           * (even though datacnt is non zero).
1713 1720                           */
1714 1721  
1715 1722                          IMPLY(datacnt == 0, new_state == arc_anon ||
1716 1723                              new_state == arc_l2c_only);
1717 1724  
1718 1725                          (void) refcount_remove_many(&old_state->arcs_size,
1719 1726                              hdr->b_size, hdr);
1720 1727                  } else {
1721 1728                          ASSERT3P(datacnt, !=, 0);
1722 1729  
1723 1730                          /*
1724 1731                           * Each individual buffer holds a unique reference,
1725 1732                           * thus we must remove each of these references one
1726 1733                           * at a time.
1727 1734                           */
1728 1735                          for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
1729 1736                              buf = buf->b_next) {
1730 1737                                  (void) refcount_remove_many(
1731 1738                                      &old_state->arcs_size, hdr->b_size, buf);
1732 1739                          }
1733 1740                  }
1734 1741          }
1735 1742  
1736 1743          if (HDR_HAS_L1HDR(hdr))
1737 1744                  hdr->b_l1hdr.b_state = new_state;
1738 1745  
1739 1746          /*
1740 1747           * L2 headers should never be on the L2 state list since they don't
1741 1748           * have L1 headers allocated.
1742 1749           */
1743 1750          ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1744 1751              multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1745 1752  }
1746 1753  
1747 1754  void
1748 1755  arc_space_consume(uint64_t space, arc_space_type_t type)
1749 1756  {
1750 1757          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1751 1758  
1752 1759          switch (type) {
1753 1760          case ARC_SPACE_DATA:
1754 1761                  ARCSTAT_INCR(arcstat_data_size, space);
1755 1762                  break;
1756 1763          case ARC_SPACE_META:
1757 1764                  ARCSTAT_INCR(arcstat_metadata_size, space);
1758 1765                  break;
1759 1766          case ARC_SPACE_OTHER:
1760 1767                  ARCSTAT_INCR(arcstat_other_size, space);
1761 1768                  break;
1762 1769          case ARC_SPACE_HDRS:
1763 1770                  ARCSTAT_INCR(arcstat_hdr_size, space);
1764 1771                  break;
1765 1772          case ARC_SPACE_L2HDRS:
1766 1773                  ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1767 1774                  break;
1768 1775          }
1769 1776  
1770 1777          if (type != ARC_SPACE_DATA)
1771 1778                  ARCSTAT_INCR(arcstat_meta_used, space);
1772 1779  
1773 1780          atomic_add_64(&arc_size, space);
1774 1781  }
1775 1782  
1776 1783  void
1777 1784  arc_space_return(uint64_t space, arc_space_type_t type)
1778 1785  {
1779 1786          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1780 1787  
1781 1788          switch (type) {
1782 1789          case ARC_SPACE_DATA:
1783 1790                  ARCSTAT_INCR(arcstat_data_size, -space);
1784 1791                  break;
1785 1792          case ARC_SPACE_META:
1786 1793                  ARCSTAT_INCR(arcstat_metadata_size, -space);
1787 1794                  break;
1788 1795          case ARC_SPACE_OTHER:
1789 1796                  ARCSTAT_INCR(arcstat_other_size, -space);
1790 1797                  break;
1791 1798          case ARC_SPACE_HDRS:
1792 1799                  ARCSTAT_INCR(arcstat_hdr_size, -space);
1793 1800                  break;
1794 1801          case ARC_SPACE_L2HDRS:
1795 1802                  ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1796 1803                  break;
1797 1804          }
1798 1805  
1799 1806          if (type != ARC_SPACE_DATA) {
1800 1807                  ASSERT(arc_meta_used >= space);
1801 1808                  if (arc_meta_max < arc_meta_used)
1802 1809                          arc_meta_max = arc_meta_used;
1803 1810                  ARCSTAT_INCR(arcstat_meta_used, -space);
1804 1811          }
1805 1812  
1806 1813          ASSERT(arc_size >= space);
1807 1814          atomic_add_64(&arc_size, -space);
1808 1815  }
1809 1816  
1810 1817  arc_buf_t *
1811 1818  arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
1812 1819  {
1813 1820          arc_buf_hdr_t *hdr;
1814 1821          arc_buf_t *buf;
1815 1822  
1816 1823          ASSERT3U(size, >, 0);
1817 1824          hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1818 1825          ASSERT(BUF_EMPTY(hdr));
1819 1826          ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1820 1827          hdr->b_size = size;
1821 1828          hdr->b_spa = spa_load_guid(spa);
1822 1829  
1823 1830          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1824 1831          buf->b_hdr = hdr;
1825 1832          buf->b_data = NULL;
1826 1833          buf->b_efunc = NULL;
1827 1834          buf->b_private = NULL;
1828 1835          buf->b_next = NULL;
1829 1836  
1830 1837          hdr->b_flags = arc_bufc_to_flags(type);
1831 1838          hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1832 1839  
1833 1840          hdr->b_l1hdr.b_buf = buf;
1834 1841          hdr->b_l1hdr.b_state = arc_anon;
1835 1842          hdr->b_l1hdr.b_arc_access = 0;
1836 1843          hdr->b_l1hdr.b_datacnt = 1;
1837 1844          hdr->b_l1hdr.b_tmp_cdata = NULL;
1838 1845  
1839 1846          arc_get_data_buf(buf);
1840 1847          ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1841 1848          (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1842 1849  
1843 1850          return (buf);
1844 1851  }
1845 1852  
1846 1853  static char *arc_onloan_tag = "onloan";
1847 1854  
1848 1855  /*
1849 1856   * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1850 1857   * flight data by arc_tempreserve_space() until they are "returned". Loaned
1851 1858   * buffers must be returned to the arc before they can be used by the DMU or
1852 1859   * freed.
1853 1860   */
1854 1861  arc_buf_t *
1855 1862  arc_loan_buf(spa_t *spa, int size)
1856 1863  {
1857 1864          arc_buf_t *buf;
1858 1865  
1859 1866          buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1860 1867  
1861 1868          atomic_add_64(&arc_loaned_bytes, size);
1862 1869          return (buf);
1863 1870  }
1864 1871  
1865 1872  /*
1866 1873   * Return a loaned arc buffer to the arc.
1867 1874   */
1868 1875  void
1869 1876  arc_return_buf(arc_buf_t *buf, void *tag)
1870 1877  {
1871 1878          arc_buf_hdr_t *hdr = buf->b_hdr;
1872 1879  
1873 1880          ASSERT(buf->b_data != NULL);
1874 1881          ASSERT(HDR_HAS_L1HDR(hdr));
1875 1882          (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1876 1883          (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1877 1884  
1878 1885          atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1879 1886  }
1880 1887  
1881 1888  /* Detach an arc_buf from a dbuf (tag) */
1882 1889  void
1883 1890  arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1884 1891  {
1885 1892          arc_buf_hdr_t *hdr = buf->b_hdr;
1886 1893  
1887 1894          ASSERT(buf->b_data != NULL);
1888 1895          ASSERT(HDR_HAS_L1HDR(hdr));
1889 1896          (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1890 1897          (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
1891 1898          buf->b_efunc = NULL;
1892 1899          buf->b_private = NULL;
1893 1900  
1894 1901          atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1895 1902  }
1896 1903  
1897 1904  static arc_buf_t *
1898 1905  arc_buf_clone(arc_buf_t *from)
1899 1906  {
1900 1907          arc_buf_t *buf;
1901 1908          arc_buf_hdr_t *hdr = from->b_hdr;
1902 1909          uint64_t size = hdr->b_size;
1903 1910  
1904 1911          ASSERT(HDR_HAS_L1HDR(hdr));
1905 1912          ASSERT(hdr->b_l1hdr.b_state != arc_anon);
1906 1913  
1907 1914          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1908 1915          buf->b_hdr = hdr;
1909 1916          buf->b_data = NULL;
1910 1917          buf->b_efunc = NULL;
1911 1918          buf->b_private = NULL;
1912 1919          buf->b_next = hdr->b_l1hdr.b_buf;
1913 1920          hdr->b_l1hdr.b_buf = buf;
1914 1921          arc_get_data_buf(buf);
1915 1922          bcopy(from->b_data, buf->b_data, size);
1916 1923  
1917 1924          /*
1918 1925           * This buffer already exists in the arc so create a duplicate
1919 1926           * copy for the caller.  If the buffer is associated with user data
1920 1927           * then track the size and number of duplicates.  These stats will be
1921 1928           * updated as duplicate buffers are created and destroyed.
1922 1929           */
1923 1930          if (HDR_ISTYPE_DATA(hdr)) {
1924 1931                  ARCSTAT_BUMP(arcstat_duplicate_buffers);
1925 1932                  ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1926 1933          }
1927 1934          hdr->b_l1hdr.b_datacnt += 1;
1928 1935          return (buf);
1929 1936  }
1930 1937  
1931 1938  void
1932 1939  arc_buf_add_ref(arc_buf_t *buf, void* tag)
1933 1940  {
1934 1941          arc_buf_hdr_t *hdr;
1935 1942          kmutex_t *hash_lock;
1936 1943  
1937 1944          /*
1938 1945           * Check to see if this buffer is evicted.  Callers
1939 1946           * must verify b_data != NULL to know if the add_ref
1940 1947           * was successful.
1941 1948           */
1942 1949          mutex_enter(&buf->b_evict_lock);
1943 1950          if (buf->b_data == NULL) {
1944 1951                  mutex_exit(&buf->b_evict_lock);
1945 1952                  return;
1946 1953          }
1947 1954          hash_lock = HDR_LOCK(buf->b_hdr);
1948 1955          mutex_enter(hash_lock);
1949 1956          hdr = buf->b_hdr;
1950 1957          ASSERT(HDR_HAS_L1HDR(hdr));
1951 1958          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1952 1959          mutex_exit(&buf->b_evict_lock);
1953 1960  
1954 1961          ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
1955 1962              hdr->b_l1hdr.b_state == arc_mfu);
1956 1963  
1957 1964          add_reference(hdr, hash_lock, tag);
1958 1965          DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1959 1966          arc_access(hdr, hash_lock);
1960 1967          mutex_exit(hash_lock);
1961 1968          ARCSTAT_BUMP(arcstat_hits);
1962 1969          ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
1963 1970              demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
1964 1971              data, metadata, hits);
1965 1972  }
1966 1973  
1967 1974  static void
1968 1975  arc_buf_free_on_write(void *data, size_t size,
1969 1976      void (*free_func)(void *, size_t))
1970 1977  {
1971 1978          l2arc_data_free_t *df;
1972 1979  
1973 1980          df = kmem_alloc(sizeof (*df), KM_SLEEP);
1974 1981          df->l2df_data = data;
1975 1982          df->l2df_size = size;
1976 1983          df->l2df_func = free_func;
1977 1984          mutex_enter(&l2arc_free_on_write_mtx);
1978 1985          list_insert_head(l2arc_free_on_write, df);
1979 1986          mutex_exit(&l2arc_free_on_write_mtx);
1980 1987  }
1981 1988  
1982 1989  /*
1983 1990   * Free the arc data buffer.  If it is an l2arc write in progress,
1984 1991   * the buffer is placed on l2arc_free_on_write to be freed later.
1985 1992   */
1986 1993  static void
1987 1994  arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1988 1995  {
1989 1996          arc_buf_hdr_t *hdr = buf->b_hdr;
1990 1997  
1991 1998          if (HDR_L2_WRITING(hdr)) {
1992 1999                  arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1993 2000                  ARCSTAT_BUMP(arcstat_l2_free_on_write);
1994 2001          } else {
1995 2002                  free_func(buf->b_data, hdr->b_size);
1996 2003          }
1997 2004  }
1998 2005  
1999 2006  static void
2000 2007  arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2001 2008  {
2002 2009          ASSERT(HDR_HAS_L2HDR(hdr));
2003 2010          ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2004 2011  
2005 2012          /*
2006 2013           * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2007 2014           * that doesn't exist, the header is in the arc_l2c_only state,
2008 2015           * and there isn't anything to free (it's already been freed).
2009 2016           */
2010 2017          if (!HDR_HAS_L1HDR(hdr))
2011 2018                  return;
2012 2019  
2013 2020          /*
2014 2021           * The header isn't being written to the l2arc device, thus it
2015 2022           * shouldn't have a b_tmp_cdata to free.
2016 2023           */
2017 2024          if (!HDR_L2_WRITING(hdr)) {
2018 2025                  ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2019 2026                  return;
2020 2027          }
2021 2028  
2022 2029          /*
2023 2030           * The header does not have compression enabled. This can be due
2024 2031           * to the buffer not being compressible, or because we're
2025 2032           * freeing the buffer before the second phase of
2026 2033           * l2arc_write_buffer() has started (which does the compression
2027 2034           * step). In either case, b_tmp_cdata does not point to a
2028 2035           * separately compressed buffer, so there's nothing to free (it
2029 2036           * points to the same buffer as the arc_buf_t's b_data field).
2030 2037           */
2031 2038          if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
2032 2039                  hdr->b_l1hdr.b_tmp_cdata = NULL;
2033 2040                  return;
2034 2041          }
2035 2042  
2036 2043          /*
2037 2044           * There's nothing to free since the buffer was all zero's and
2038 2045           * compressed to a zero length buffer.
2039 2046           */
2040 2047          if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2041 2048                  ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2042 2049                  return;
2043 2050          }
2044 2051  
2045 2052          ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
2046 2053  
2047 2054          arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
2048 2055              hdr->b_size, zio_data_buf_free);
2049 2056  
2050 2057          ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2051 2058          hdr->b_l1hdr.b_tmp_cdata = NULL;
2052 2059  }
2053 2060  
2054 2061  /*
2055 2062   * Free up buf->b_data and if 'remove' is set, then pull the
2056 2063   * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
2057 2064   */
2058 2065  static void
2059 2066  arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
2060 2067  {
2061 2068          arc_buf_t **bufp;
2062 2069  
2063 2070          /* free up data associated with the buf */
2064 2071          if (buf->b_data != NULL) {
2065 2072                  arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2066 2073                  uint64_t size = buf->b_hdr->b_size;
2067 2074                  arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2068 2075  
2069 2076                  arc_cksum_verify(buf);
2070 2077                  arc_buf_unwatch(buf);
2071 2078  
2072 2079                  if (type == ARC_BUFC_METADATA) {
2073 2080                          arc_buf_data_free(buf, zio_buf_free);
2074 2081                          arc_space_return(size, ARC_SPACE_META);
2075 2082                  } else {
2076 2083                          ASSERT(type == ARC_BUFC_DATA);
2077 2084                          arc_buf_data_free(buf, zio_data_buf_free);
2078 2085                          arc_space_return(size, ARC_SPACE_DATA);
2079 2086                  }
2080 2087  
2081 2088                  /* protected by hash lock, if in the hash table */
2082 2089                  if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2083 2090                          uint64_t *cnt = &state->arcs_lsize[type];
2084 2091  
2085 2092                          ASSERT(refcount_is_zero(
2086 2093                              &buf->b_hdr->b_l1hdr.b_refcnt));
2087 2094                          ASSERT(state != arc_anon && state != arc_l2c_only);
2088 2095  
2089 2096                          ASSERT3U(*cnt, >=, size);
2090 2097                          atomic_add_64(cnt, -size);
2091 2098                  }
2092 2099  
2093 2100                  (void) refcount_remove_many(&state->arcs_size, size, buf);
2094 2101                  buf->b_data = NULL;
2095 2102  
2096 2103                  /*
2097 2104                   * If we're destroying a duplicate buffer make sure
2098 2105                   * that the appropriate statistics are updated.
2099 2106                   */
2100 2107                  if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2101 2108                      HDR_ISTYPE_DATA(buf->b_hdr)) {
2102 2109                          ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2103 2110                          ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2104 2111                  }
2105 2112                  ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2106 2113                  buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2107 2114          }
2108 2115  
2109 2116          /* only remove the buf if requested */
2110 2117          if (!remove)
2111 2118                  return;
2112 2119  
2113 2120          /* remove the buf from the hdr list */
2114 2121          for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2115 2122              bufp = &(*bufp)->b_next)
2116 2123                  continue;
2117 2124          *bufp = buf->b_next;
2118 2125          buf->b_next = NULL;
2119 2126  
2120 2127          ASSERT(buf->b_efunc == NULL);
2121 2128  
2122 2129          /* clean up the buf */
2123 2130          buf->b_hdr = NULL;
2124 2131          kmem_cache_free(buf_cache, buf);
2125 2132  }
2126 2133  
2127 2134  static void
2128 2135  arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2129 2136  {
2130 2137          l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2131 2138          l2arc_dev_t *dev = l2hdr->b_dev;
2132 2139  
2133 2140          ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2134 2141          ASSERT(HDR_HAS_L2HDR(hdr));
2135 2142  
2136 2143          list_remove(&dev->l2ad_buflist, hdr);
2137 2144  
2138 2145          /*
2139 2146           * We don't want to leak the b_tmp_cdata buffer that was
2140 2147           * allocated in l2arc_write_buffers()
2141 2148           */
2142 2149          arc_buf_l2_cdata_free(hdr);
2143 2150  
2144 2151          /*
2145 2152           * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2146 2153           * this header is being processed by l2arc_write_buffers() (i.e.
2147 2154           * it's in the first stage of l2arc_write_buffers()).
2148 2155           * Re-affirming that truth here, just to serve as a reminder. If
2149 2156           * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2150 2157           * may not have its HDR_L2_WRITING flag set. (the write may have
2151 2158           * completed, in which case HDR_L2_WRITING will be false and the
2152 2159           * b_daddr field will point to the address of the buffer on disk).
2153 2160           */
2154 2161          IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2155 2162  
2156 2163          /*
2157 2164           * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2158 2165           * l2arc_write_buffers(). Since we've just removed this header
2159 2166           * from the l2arc buffer list, this header will never reach the
2160 2167           * second stage of l2arc_write_buffers(), which increments the
2161 2168           * accounting stats for this header. Thus, we must be careful
2162 2169           * not to decrement them for this header either.
2163 2170           */
2164 2171          if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2165 2172                  ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2166 2173                  ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2167 2174  
2168 2175                  vdev_space_update(dev->l2ad_vdev,
2169 2176                      -l2hdr->b_asize, 0, 0);
2170 2177  
2171 2178                  (void) refcount_remove_many(&dev->l2ad_alloc,
2172 2179                      l2hdr->b_asize, hdr);
2173 2180          }
2174 2181  
2175 2182          hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2176 2183  }
2177 2184  
2178 2185  static void
2179 2186  arc_hdr_destroy(arc_buf_hdr_t *hdr)
2180 2187  {
2181 2188          if (HDR_HAS_L1HDR(hdr)) {
2182 2189                  ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2183 2190                      hdr->b_l1hdr.b_datacnt > 0);
2184 2191                  ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2185 2192                  ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2186 2193          }
2187 2194          ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2188 2195          ASSERT(!HDR_IN_HASH_TABLE(hdr));
2189 2196  
2190 2197          if (HDR_HAS_L2HDR(hdr)) {
2191 2198                  l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2192 2199                  boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2193 2200  
2194 2201                  if (!buflist_held)
2195 2202                          mutex_enter(&dev->l2ad_mtx);
2196 2203  
2197 2204                  /*
2198 2205                   * Even though we checked this conditional above, we
2199 2206                   * need to check this again now that we have the
2200 2207                   * l2ad_mtx. This is because we could be racing with
2201 2208                   * another thread calling l2arc_evict() which might have
2202 2209                   * destroyed this header's L2 portion as we were waiting
2203 2210                   * to acquire the l2ad_mtx. If that happens, we don't
2204 2211                   * want to re-destroy the header's L2 portion.
2205 2212                   */
2206 2213                  if (HDR_HAS_L2HDR(hdr))
2207 2214                          arc_hdr_l2hdr_destroy(hdr);
2208 2215  
2209 2216                  if (!buflist_held)
2210 2217                          mutex_exit(&dev->l2ad_mtx);
2211 2218          }
2212 2219  
2213 2220          if (!BUF_EMPTY(hdr))
2214 2221                  buf_discard_identity(hdr);
2215 2222  
2216 2223          if (hdr->b_freeze_cksum != NULL) {
2217 2224                  kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2218 2225                  hdr->b_freeze_cksum = NULL;
2219 2226          }
2220 2227  
2221 2228          if (HDR_HAS_L1HDR(hdr)) {
2222 2229                  while (hdr->b_l1hdr.b_buf) {
2223 2230                          arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2224 2231  
2225 2232                          if (buf->b_efunc != NULL) {
2226 2233                                  mutex_enter(&arc_user_evicts_lock);
2227 2234                                  mutex_enter(&buf->b_evict_lock);
2228 2235                                  ASSERT(buf->b_hdr != NULL);
2229 2236                                  arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
2230 2237                                  hdr->b_l1hdr.b_buf = buf->b_next;
2231 2238                                  buf->b_hdr = &arc_eviction_hdr;
2232 2239                                  buf->b_next = arc_eviction_list;
2233 2240                                  arc_eviction_list = buf;
2234 2241                                  mutex_exit(&buf->b_evict_lock);
2235 2242                                  cv_signal(&arc_user_evicts_cv);
2236 2243                                  mutex_exit(&arc_user_evicts_lock);
2237 2244                          } else {
2238 2245                                  arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
2239 2246                          }
2240 2247                  }
2241 2248  #ifdef ZFS_DEBUG
2242 2249                  if (hdr->b_l1hdr.b_thawed != NULL) {
2243 2250                          kmem_free(hdr->b_l1hdr.b_thawed, 1);
2244 2251                          hdr->b_l1hdr.b_thawed = NULL;
2245 2252                  }
2246 2253  #endif
2247 2254          }
2248 2255  
2249 2256          ASSERT3P(hdr->b_hash_next, ==, NULL);
2250 2257          if (HDR_HAS_L1HDR(hdr)) {
2251 2258                  ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
2252 2259                  ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2253 2260                  kmem_cache_free(hdr_full_cache, hdr);
2254 2261          } else {
2255 2262                  kmem_cache_free(hdr_l2only_cache, hdr);
2256 2263          }
2257 2264  }
2258 2265  
2259 2266  void
2260 2267  arc_buf_free(arc_buf_t *buf, void *tag)
2261 2268  {
2262 2269          arc_buf_hdr_t *hdr = buf->b_hdr;
2263 2270          int hashed = hdr->b_l1hdr.b_state != arc_anon;
2264 2271  
2265 2272          ASSERT(buf->b_efunc == NULL);
2266 2273          ASSERT(buf->b_data != NULL);
2267 2274  
2268 2275          if (hashed) {
2269 2276                  kmutex_t *hash_lock = HDR_LOCK(hdr);
2270 2277  
2271 2278                  mutex_enter(hash_lock);
2272 2279                  hdr = buf->b_hdr;
2273 2280                  ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2274 2281  
2275 2282                  (void) remove_reference(hdr, hash_lock, tag);
2276 2283                  if (hdr->b_l1hdr.b_datacnt > 1) {
2277 2284                          arc_buf_destroy(buf, TRUE);
2278 2285                  } else {
2279 2286                          ASSERT(buf == hdr->b_l1hdr.b_buf);
2280 2287                          ASSERT(buf->b_efunc == NULL);
2281 2288                          hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2282 2289                  }
2283 2290                  mutex_exit(hash_lock);
2284 2291          } else if (HDR_IO_IN_PROGRESS(hdr)) {
2285 2292                  int destroy_hdr;
2286 2293                  /*
2287 2294                   * We are in the middle of an async write.  Don't destroy
2288 2295                   * this buffer unless the write completes before we finish
2289 2296                   * decrementing the reference count.
2290 2297                   */
2291 2298                  mutex_enter(&arc_user_evicts_lock);
2292 2299                  (void) remove_reference(hdr, NULL, tag);
2293 2300                  ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2294 2301                  destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2295 2302                  mutex_exit(&arc_user_evicts_lock);
2296 2303                  if (destroy_hdr)
2297 2304                          arc_hdr_destroy(hdr);
2298 2305          } else {
2299 2306                  if (remove_reference(hdr, NULL, tag) > 0)
2300 2307                          arc_buf_destroy(buf, TRUE);
2301 2308                  else
2302 2309                          arc_hdr_destroy(hdr);
2303 2310          }
2304 2311  }
2305 2312  
2306 2313  boolean_t
2307 2314  arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2308 2315  {
2309 2316          arc_buf_hdr_t *hdr = buf->b_hdr;
2310 2317          kmutex_t *hash_lock = HDR_LOCK(hdr);
2311 2318          boolean_t no_callback = (buf->b_efunc == NULL);
2312 2319  
2313 2320          if (hdr->b_l1hdr.b_state == arc_anon) {
2314 2321                  ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2315 2322                  arc_buf_free(buf, tag);
2316 2323                  return (no_callback);
2317 2324          }
2318 2325  
2319 2326          mutex_enter(hash_lock);
2320 2327          hdr = buf->b_hdr;
2321 2328          ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2322 2329          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2323 2330          ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2324 2331          ASSERT(buf->b_data != NULL);
2325 2332  
2326 2333          (void) remove_reference(hdr, hash_lock, tag);
2327 2334          if (hdr->b_l1hdr.b_datacnt > 1) {
2328 2335                  if (no_callback)
2329 2336                          arc_buf_destroy(buf, TRUE);
2330 2337          } else if (no_callback) {
2331 2338                  ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2332 2339                  ASSERT(buf->b_efunc == NULL);
2333 2340                  hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2334 2341          }
2335 2342          ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2336 2343              refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2337 2344          mutex_exit(hash_lock);
2338 2345          return (no_callback);
2339 2346  }
2340 2347  
2341 2348  int32_t
2342 2349  arc_buf_size(arc_buf_t *buf)
2343 2350  {
2344 2351          return (buf->b_hdr->b_size);
2345 2352  }
2346 2353  
2347 2354  /*
2348 2355   * Called from the DMU to determine if the current buffer should be
2349 2356   * evicted. In order to ensure proper locking, the eviction must be initiated
2350 2357   * from the DMU. Return true if the buffer is associated with user data and
2351 2358   * duplicate buffers still exist.
2352 2359   */
2353 2360  boolean_t
2354 2361  arc_buf_eviction_needed(arc_buf_t *buf)
2355 2362  {
2356 2363          arc_buf_hdr_t *hdr;
2357 2364          boolean_t evict_needed = B_FALSE;
2358 2365  
2359 2366          if (zfs_disable_dup_eviction)
2360 2367                  return (B_FALSE);
2361 2368  
2362 2369          mutex_enter(&buf->b_evict_lock);
2363 2370          hdr = buf->b_hdr;
2364 2371          if (hdr == NULL) {
2365 2372                  /*
2366 2373                   * We are in arc_do_user_evicts(); let that function
2367 2374                   * perform the eviction.
2368 2375                   */
2369 2376                  ASSERT(buf->b_data == NULL);
2370 2377                  mutex_exit(&buf->b_evict_lock);
2371 2378                  return (B_FALSE);
2372 2379          } else if (buf->b_data == NULL) {
2373 2380                  /*
2374 2381                   * We have already been added to the arc eviction list;
2375 2382                   * recommend eviction.
2376 2383                   */
2377 2384                  ASSERT3P(hdr, ==, &arc_eviction_hdr);
2378 2385                  mutex_exit(&buf->b_evict_lock);
2379 2386                  return (B_TRUE);
2380 2387          }
2381 2388  
2382 2389          if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2383 2390                  evict_needed = B_TRUE;
2384 2391  
2385 2392          mutex_exit(&buf->b_evict_lock);
2386 2393          return (evict_needed);
2387 2394  }
2388 2395  
2389 2396  /*
2390 2397   * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2391 2398   * state of the header is dependent on it's state prior to entering this
2392 2399   * function. The following transitions are possible:
2393 2400   *
2394 2401   *    - arc_mru -> arc_mru_ghost
2395 2402   *    - arc_mfu -> arc_mfu_ghost
2396 2403   *    - arc_mru_ghost -> arc_l2c_only
2397 2404   *    - arc_mru_ghost -> deleted
2398 2405   *    - arc_mfu_ghost -> arc_l2c_only
2399 2406   *    - arc_mfu_ghost -> deleted
2400 2407   */
2401 2408  static int64_t
2402 2409  arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2403 2410  {
2404 2411          arc_state_t *evicted_state, *state;
2405 2412          int64_t bytes_evicted = 0;
2406 2413  
2407 2414          ASSERT(MUTEX_HELD(hash_lock));
2408 2415          ASSERT(HDR_HAS_L1HDR(hdr));
2409 2416  
2410 2417          state = hdr->b_l1hdr.b_state;
2411 2418          if (GHOST_STATE(state)) {
2412 2419                  ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2413 2420                  ASSERT(hdr->b_l1hdr.b_buf == NULL);
2414 2421  
2415 2422                  /*
2416 2423                   * l2arc_write_buffers() relies on a header's L1 portion
2417 2424                   * (i.e. it's b_tmp_cdata field) during it's write phase.
2418 2425                   * Thus, we cannot push a header onto the arc_l2c_only
2419 2426                   * state (removing it's L1 piece) until the header is
2420 2427                   * done being written to the l2arc.
2421 2428                   */
2422 2429                  if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2423 2430                          ARCSTAT_BUMP(arcstat_evict_l2_skip);
2424 2431                          return (bytes_evicted);
2425 2432                  }
2426 2433  
2427 2434                  ARCSTAT_BUMP(arcstat_deleted);
2428 2435                  bytes_evicted += hdr->b_size;
2429 2436  
2430 2437                  DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2431 2438  
2432 2439                  if (HDR_HAS_L2HDR(hdr)) {
2433 2440                          /*
2434 2441                           * This buffer is cached on the 2nd Level ARC;
2435 2442                           * don't destroy the header.
2436 2443                           */
2437 2444                          arc_change_state(arc_l2c_only, hdr, hash_lock);
2438 2445                          /*
2439 2446                           * dropping from L1+L2 cached to L2-only,
2440 2447                           * realloc to remove the L1 header.
2441 2448                           */
2442 2449                          hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2443 2450                              hdr_l2only_cache);
2444 2451                  } else {
2445 2452                          arc_change_state(arc_anon, hdr, hash_lock);
2446 2453                          arc_hdr_destroy(hdr);
2447 2454                  }
2448 2455                  return (bytes_evicted);
2449 2456          }
2450 2457  
2451 2458          ASSERT(state == arc_mru || state == arc_mfu);
2452 2459          evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2453 2460  
2454 2461          /* prefetch buffers have a minimum lifespan */
2455 2462          if (HDR_IO_IN_PROGRESS(hdr) ||
2456 2463              ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2457 2464              ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2458 2465              arc_min_prefetch_lifespan)) {
2459 2466                  ARCSTAT_BUMP(arcstat_evict_skip);
2460 2467                  return (bytes_evicted);
2461 2468          }
2462 2469  
2463 2470          ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2464 2471          ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2465 2472          while (hdr->b_l1hdr.b_buf) {
2466 2473                  arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2467 2474                  if (!mutex_tryenter(&buf->b_evict_lock)) {
2468 2475                          ARCSTAT_BUMP(arcstat_mutex_miss);
2469 2476                          break;
2470 2477                  }
2471 2478                  if (buf->b_data != NULL)
2472 2479                          bytes_evicted += hdr->b_size;
2473 2480                  if (buf->b_efunc != NULL) {
2474 2481                          mutex_enter(&arc_user_evicts_lock);
2475 2482                          arc_buf_destroy(buf, FALSE);
2476 2483                          hdr->b_l1hdr.b_buf = buf->b_next;
2477 2484                          buf->b_hdr = &arc_eviction_hdr;
2478 2485                          buf->b_next = arc_eviction_list;
2479 2486                          arc_eviction_list = buf;
2480 2487                          cv_signal(&arc_user_evicts_cv);
2481 2488                          mutex_exit(&arc_user_evicts_lock);
2482 2489                          mutex_exit(&buf->b_evict_lock);
2483 2490                  } else {
2484 2491                          mutex_exit(&buf->b_evict_lock);
2485 2492                          arc_buf_destroy(buf, TRUE);
2486 2493                  }
2487 2494          }
2488 2495  
2489 2496          if (HDR_HAS_L2HDR(hdr)) {
2490 2497                  ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2491 2498          } else {
2492 2499                  if (l2arc_write_eligible(hdr->b_spa, hdr))
2493 2500                          ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2494 2501                  else
2495 2502                          ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2496 2503          }
2497 2504  
2498 2505          if (hdr->b_l1hdr.b_datacnt == 0) {
2499 2506                  arc_change_state(evicted_state, hdr, hash_lock);
2500 2507                  ASSERT(HDR_IN_HASH_TABLE(hdr));
2501 2508                  hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2502 2509                  hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2503 2510                  DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2504 2511          }
2505 2512  
2506 2513          return (bytes_evicted);
2507 2514  }
2508 2515  
2509 2516  static uint64_t
2510 2517  arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2511 2518      uint64_t spa, int64_t bytes)
2512 2519  {
2513 2520          multilist_sublist_t *mls;
2514 2521          uint64_t bytes_evicted = 0;
2515 2522          arc_buf_hdr_t *hdr;
2516 2523          kmutex_t *hash_lock;
2517 2524          int evict_count = 0;
2518 2525  
2519 2526          ASSERT3P(marker, !=, NULL);
2520 2527          IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
2521 2528  
2522 2529          mls = multilist_sublist_lock(ml, idx);
2523 2530  
2524 2531          for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2525 2532              hdr = multilist_sublist_prev(mls, marker)) {
2526 2533                  if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2527 2534                      (evict_count >= zfs_arc_evict_batch_limit))
2528 2535                          break;
2529 2536  
2530 2537                  /*
2531 2538                   * To keep our iteration location, move the marker
2532 2539                   * forward. Since we're not holding hdr's hash lock, we
2533 2540                   * must be very careful and not remove 'hdr' from the
2534 2541                   * sublist. Otherwise, other consumers might mistake the
2535 2542                   * 'hdr' as not being on a sublist when they call the
2536 2543                   * multilist_link_active() function (they all rely on
2537 2544                   * the hash lock protecting concurrent insertions and
2538 2545                   * removals). multilist_sublist_move_forward() was
2539 2546                   * specifically implemented to ensure this is the case
2540 2547                   * (only 'marker' will be removed and re-inserted).
2541 2548                   */
2542 2549                  multilist_sublist_move_forward(mls, marker);
2543 2550  
2544 2551                  /*
2545 2552                   * The only case where the b_spa field should ever be
2546 2553                   * zero, is the marker headers inserted by
2547 2554                   * arc_evict_state(). It's possible for multiple threads
2548 2555                   * to be calling arc_evict_state() concurrently (e.g.
2549 2556                   * dsl_pool_close() and zio_inject_fault()), so we must
2550 2557                   * skip any markers we see from these other threads.
2551 2558                   */
2552 2559                  if (hdr->b_spa == 0)
2553 2560                          continue;
2554 2561  
2555 2562                  /* we're only interested in evicting buffers of a certain spa */
2556 2563                  if (spa != 0 && hdr->b_spa != spa) {
2557 2564                          ARCSTAT_BUMP(arcstat_evict_skip);
2558 2565                          continue;
2559 2566                  }
2560 2567  
2561 2568                  hash_lock = HDR_LOCK(hdr);
2562 2569  
2563 2570                  /*
2564 2571                   * We aren't calling this function from any code path
2565 2572                   * that would already be holding a hash lock, so we're
2566 2573                   * asserting on this assumption to be defensive in case
2567 2574                   * this ever changes. Without this check, it would be
2568 2575                   * possible to incorrectly increment arcstat_mutex_miss
2569 2576                   * below (e.g. if the code changed such that we called
2570 2577                   * this function with a hash lock held).
2571 2578                   */
2572 2579                  ASSERT(!MUTEX_HELD(hash_lock));
2573 2580  
2574 2581                  if (mutex_tryenter(hash_lock)) {
2575 2582                          uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2576 2583                          mutex_exit(hash_lock);
2577 2584  
2578 2585                          bytes_evicted += evicted;
2579 2586  
2580 2587                          /*
2581 2588                           * If evicted is zero, arc_evict_hdr() must have
2582 2589                           * decided to skip this header, don't increment
2583 2590                           * evict_count in this case.
2584 2591                           */
2585 2592                          if (evicted != 0)
2586 2593                                  evict_count++;
2587 2594  
2588 2595                          /*
2589 2596                           * If arc_size isn't overflowing, signal any
2590 2597                           * threads that might happen to be waiting.
2591 2598                           *
2592 2599                           * For each header evicted, we wake up a single
2593 2600                           * thread. If we used cv_broadcast, we could
2594 2601                           * wake up "too many" threads causing arc_size
2595 2602                           * to significantly overflow arc_c; since
2596 2603                           * arc_get_data_buf() doesn't check for overflow
2597 2604                           * when it's woken up (it doesn't because it's
2598 2605                           * possible for the ARC to be overflowing while
2599 2606                           * full of un-evictable buffers, and the
2600 2607                           * function should proceed in this case).
2601 2608                           *
2602 2609                           * If threads are left sleeping, due to not
2603 2610                           * using cv_broadcast, they will be woken up
2604 2611                           * just before arc_reclaim_thread() sleeps.
2605 2612                           */
2606 2613                          mutex_enter(&arc_reclaim_lock);
2607 2614                          if (!arc_is_overflowing())
2608 2615                                  cv_signal(&arc_reclaim_waiters_cv);
2609 2616                          mutex_exit(&arc_reclaim_lock);
2610 2617                  } else {
2611 2618                          ARCSTAT_BUMP(arcstat_mutex_miss);
2612 2619                  }
2613 2620          }
2614 2621  
2615 2622          multilist_sublist_unlock(mls);
2616 2623  
2617 2624          return (bytes_evicted);
2618 2625  }
2619 2626  
2620 2627  /*
2621 2628   * Evict buffers from the given arc state, until we've removed the
2622 2629   * specified number of bytes. Move the removed buffers to the
2623 2630   * appropriate evict state.
2624 2631   *
2625 2632   * This function makes a "best effort". It skips over any buffers
2626 2633   * it can't get a hash_lock on, and so, may not catch all candidates.
2627 2634   * It may also return without evicting as much space as requested.
2628 2635   *
2629 2636   * If bytes is specified using the special value ARC_EVICT_ALL, this
2630 2637   * will evict all available (i.e. unlocked and evictable) buffers from
2631 2638   * the given arc state; which is used by arc_flush().
2632 2639   */
2633 2640  static uint64_t
2634 2641  arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
2635 2642      arc_buf_contents_t type)
2636 2643  {
2637 2644          uint64_t total_evicted = 0;
2638 2645          multilist_t *ml = &state->arcs_list[type];
2639 2646          int num_sublists;
2640 2647          arc_buf_hdr_t **markers;
2641 2648  
2642 2649          IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
2643 2650  
2644 2651          num_sublists = multilist_get_num_sublists(ml);
2645 2652  
2646 2653          /*
2647 2654           * If we've tried to evict from each sublist, made some
2648 2655           * progress, but still have not hit the target number of bytes
2649 2656           * to evict, we want to keep trying. The markers allow us to
2650 2657           * pick up where we left off for each individual sublist, rather
2651 2658           * than starting from the tail each time.
2652 2659           */
2653 2660          markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
2654 2661          for (int i = 0; i < num_sublists; i++) {
2655 2662                  markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
2656 2663  
2657 2664                  /*
2658 2665                   * A b_spa of 0 is used to indicate that this header is
2659 2666                   * a marker. This fact is used in arc_adjust_type() and
2660 2667                   * arc_evict_state_impl().
2661 2668                   */
2662 2669                  markers[i]->b_spa = 0;
2663 2670  
2664 2671                  multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2665 2672                  multilist_sublist_insert_tail(mls, markers[i]);
2666 2673                  multilist_sublist_unlock(mls);
2667 2674          }
2668 2675  
2669 2676          /*
2670 2677           * While we haven't hit our target number of bytes to evict, or
2671 2678           * we're evicting all available buffers.
2672 2679           */
2673 2680          while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
2674 2681                  /*
2675 2682                   * Start eviction using a randomly selected sublist,
2676 2683                   * this is to try and evenly balance eviction across all
2677 2684                   * sublists. Always starting at the same sublist
2678 2685                   * (e.g. index 0) would cause evictions to favor certain
2679 2686                   * sublists over others.
2680 2687                   */
2681 2688                  int sublist_idx = multilist_get_random_index(ml);
2682 2689                  uint64_t scan_evicted = 0;
2683 2690  
2684 2691                  for (int i = 0; i < num_sublists; i++) {
2685 2692                          uint64_t bytes_remaining;
2686 2693                          uint64_t bytes_evicted;
2687 2694  
2688 2695                          if (bytes == ARC_EVICT_ALL)
2689 2696                                  bytes_remaining = ARC_EVICT_ALL;
2690 2697                          else if (total_evicted < bytes)
2691 2698                                  bytes_remaining = bytes - total_evicted;
2692 2699                          else
2693 2700                                  break;
2694 2701  
2695 2702                          bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
2696 2703                              markers[sublist_idx], spa, bytes_remaining);
2697 2704  
2698 2705                          scan_evicted += bytes_evicted;
2699 2706                          total_evicted += bytes_evicted;
2700 2707  
2701 2708                          /* we've reached the end, wrap to the beginning */
2702 2709                          if (++sublist_idx >= num_sublists)
2703 2710                                  sublist_idx = 0;
2704 2711                  }
2705 2712  
2706 2713                  /*
2707 2714                   * If we didn't evict anything during this scan, we have
2708 2715                   * no reason to believe we'll evict more during another
2709 2716                   * scan, so break the loop.
2710 2717                   */
2711 2718                  if (scan_evicted == 0) {
2712 2719                          /* This isn't possible, let's make that obvious */
2713 2720                          ASSERT3S(bytes, !=, 0);
2714 2721  
2715 2722                          /*
2716 2723                           * When bytes is ARC_EVICT_ALL, the only way to
2717 2724                           * break the loop is when scan_evicted is zero.
2718 2725                           * In that case, we actually have evicted enough,
2719 2726                           * so we don't want to increment the kstat.
2720 2727                           */
2721 2728                          if (bytes != ARC_EVICT_ALL) {
2722 2729                                  ASSERT3S(total_evicted, <, bytes);
2723 2730                                  ARCSTAT_BUMP(arcstat_evict_not_enough);
2724 2731                          }
2725 2732  
2726 2733                          break;
2727 2734                  }
2728 2735          }
2729 2736  
2730 2737          for (int i = 0; i < num_sublists; i++) {
2731 2738                  multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2732 2739                  multilist_sublist_remove(mls, markers[i]);
2733 2740                  multilist_sublist_unlock(mls);
2734 2741  
2735 2742                  kmem_cache_free(hdr_full_cache, markers[i]);
2736 2743          }
2737 2744          kmem_free(markers, sizeof (*markers) * num_sublists);
2738 2745  
2739 2746          return (total_evicted);
2740 2747  }
2741 2748  
2742 2749  /*
2743 2750   * Flush all "evictable" data of the given type from the arc state
2744 2751   * specified. This will not evict any "active" buffers (i.e. referenced).
2745 2752   *
2746 2753   * When 'retry' is set to FALSE, the function will make a single pass
2747 2754   * over the state and evict any buffers that it can. Since it doesn't
2748 2755   * continually retry the eviction, it might end up leaving some buffers
2749 2756   * in the ARC due to lock misses.
2750 2757   *
2751 2758   * When 'retry' is set to TRUE, the function will continually retry the
2752 2759   * eviction until *all* evictable buffers have been removed from the
2753 2760   * state. As a result, if concurrent insertions into the state are
2754 2761   * allowed (e.g. if the ARC isn't shutting down), this function might
2755 2762   * wind up in an infinite loop, continually trying to evict buffers.
2756 2763   */
2757 2764  static uint64_t
2758 2765  arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
2759 2766      boolean_t retry)
2760 2767  {
2761 2768          uint64_t evicted = 0;
2762 2769  
2763 2770          while (state->arcs_lsize[type] != 0) {
2764 2771                  evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
2765 2772  
2766 2773                  if (!retry)
2767 2774                          break;
2768 2775          }
2769 2776  
2770 2777          return (evicted);
2771 2778  }
2772 2779  
2773 2780  /*
2774 2781   * Evict the specified number of bytes from the state specified,
2775 2782   * restricting eviction to the spa and type given. This function
2776 2783   * prevents us from trying to evict more from a state's list than
2777 2784   * is "evictable", and to skip evicting altogether when passed a
2778 2785   * negative value for "bytes". In contrast, arc_evict_state() will
2779 2786   * evict everything it can, when passed a negative value for "bytes".
2780 2787   */
2781 2788  static uint64_t
2782 2789  arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
2783 2790      arc_buf_contents_t type)
2784 2791  {
2785 2792          int64_t delta;
2786 2793  
2787 2794          if (bytes > 0 && state->arcs_lsize[type] > 0) {
2788 2795                  delta = MIN(state->arcs_lsize[type], bytes);
2789 2796                  return (arc_evict_state(state, spa, delta, type));
2790 2797          }
2791 2798  
2792 2799          return (0);
2793 2800  }
2794 2801  
2795 2802  /*
2796 2803   * Evict metadata buffers from the cache, such that arc_meta_used is
2797 2804   * capped by the arc_meta_limit tunable.
2798 2805   */
2799 2806  static uint64_t
2800 2807  arc_adjust_meta(void)
2801 2808  {
2802 2809          uint64_t total_evicted = 0;
2803 2810          int64_t target;
2804 2811  
2805 2812          /*
2806 2813           * If we're over the meta limit, we want to evict enough
2807 2814           * metadata to get back under the meta limit. We don't want to
2808 2815           * evict so much that we drop the MRU below arc_p, though. If
2809 2816           * we're over the meta limit more than we're over arc_p, we
2810 2817           * evict some from the MRU here, and some from the MFU below.
2811 2818           */
2812 2819          target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2813 2820              (int64_t)(refcount_count(&arc_anon->arcs_size) +
2814 2821              refcount_count(&arc_mru->arcs_size) - arc_p));
2815 2822  
2816 2823          total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2817 2824  
2818 2825          /*
2819 2826           * Similar to the above, we want to evict enough bytes to get us
2820 2827           * below the meta limit, but not so much as to drop us below the
2821 2828           * space alloted to the MFU (which is defined as arc_c - arc_p).
2822 2829           */
2823 2830          target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2824 2831              (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
2825 2832  
2826 2833          total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2827 2834  
2828 2835          return (total_evicted);
2829 2836  }
2830 2837  
2831 2838  /*
2832 2839   * Return the type of the oldest buffer in the given arc state
2833 2840   *
2834 2841   * This function will select a random sublist of type ARC_BUFC_DATA and
2835 2842   * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
2836 2843   * is compared, and the type which contains the "older" buffer will be
2837 2844   * returned.
2838 2845   */
2839 2846  static arc_buf_contents_t
2840 2847  arc_adjust_type(arc_state_t *state)
2841 2848  {
2842 2849          multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
2843 2850          multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
2844 2851          int data_idx = multilist_get_random_index(data_ml);
2845 2852          int meta_idx = multilist_get_random_index(meta_ml);
2846 2853          multilist_sublist_t *data_mls;
2847 2854          multilist_sublist_t *meta_mls;
2848 2855          arc_buf_contents_t type;
2849 2856          arc_buf_hdr_t *data_hdr;
2850 2857          arc_buf_hdr_t *meta_hdr;
2851 2858  
2852 2859          /*
2853 2860           * We keep the sublist lock until we're finished, to prevent
2854 2861           * the headers from being destroyed via arc_evict_state().
2855 2862           */
2856 2863          data_mls = multilist_sublist_lock(data_ml, data_idx);
2857 2864          meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
2858 2865  
2859 2866          /*
2860 2867           * These two loops are to ensure we skip any markers that
2861 2868           * might be at the tail of the lists due to arc_evict_state().
2862 2869           */
2863 2870  
2864 2871          for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
2865 2872              data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
2866 2873                  if (data_hdr->b_spa != 0)
2867 2874                          break;
2868 2875          }
2869 2876  
2870 2877          for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
2871 2878              meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
2872 2879                  if (meta_hdr->b_spa != 0)
2873 2880                          break;
2874 2881          }
2875 2882  
2876 2883          if (data_hdr == NULL && meta_hdr == NULL) {
2877 2884                  type = ARC_BUFC_DATA;
2878 2885          } else if (data_hdr == NULL) {
2879 2886                  ASSERT3P(meta_hdr, !=, NULL);
2880 2887                  type = ARC_BUFC_METADATA;
2881 2888          } else if (meta_hdr == NULL) {
2882 2889                  ASSERT3P(data_hdr, !=, NULL);
2883 2890                  type = ARC_BUFC_DATA;
2884 2891          } else {
2885 2892                  ASSERT3P(data_hdr, !=, NULL);
2886 2893                  ASSERT3P(meta_hdr, !=, NULL);
2887 2894  
2888 2895                  /* The headers can't be on the sublist without an L1 header */
2889 2896                  ASSERT(HDR_HAS_L1HDR(data_hdr));
2890 2897                  ASSERT(HDR_HAS_L1HDR(meta_hdr));
2891 2898  
2892 2899                  if (data_hdr->b_l1hdr.b_arc_access <
2893 2900                      meta_hdr->b_l1hdr.b_arc_access) {
2894 2901                          type = ARC_BUFC_DATA;
2895 2902                  } else {
2896 2903                          type = ARC_BUFC_METADATA;
2897 2904                  }
2898 2905          }
2899 2906  
2900 2907          multilist_sublist_unlock(meta_mls);
2901 2908          multilist_sublist_unlock(data_mls);
2902 2909  
2903 2910          return (type);
2904 2911  }
2905 2912  
2906 2913  /*
2907 2914   * Evict buffers from the cache, such that arc_size is capped by arc_c.
2908 2915   */
2909 2916  static uint64_t
2910 2917  arc_adjust(void)
2911 2918  {
2912 2919          uint64_t total_evicted = 0;
2913 2920          uint64_t bytes;
2914 2921          int64_t target;
2915 2922  
2916 2923          /*
2917 2924           * If we're over arc_meta_limit, we want to correct that before
2918 2925           * potentially evicting data buffers below.
2919 2926           */
2920 2927          total_evicted += arc_adjust_meta();
2921 2928  
2922 2929          /*
2923 2930           * Adjust MRU size
2924 2931           *
2925 2932           * If we're over the target cache size, we want to evict enough
2926 2933           * from the list to get back to our target size. We don't want
2927 2934           * to evict too much from the MRU, such that it drops below
2928 2935           * arc_p. So, if we're over our target cache size more than
2929 2936           * the MRU is over arc_p, we'll evict enough to get back to
2930 2937           * arc_p here, and then evict more from the MFU below.
2931 2938           */
2932 2939          target = MIN((int64_t)(arc_size - arc_c),
2933 2940              (int64_t)(refcount_count(&arc_anon->arcs_size) +
2934 2941              refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
2935 2942  
2936 2943          /*
2937 2944           * If we're below arc_meta_min, always prefer to evict data.
2938 2945           * Otherwise, try to satisfy the requested number of bytes to
2939 2946           * evict from the type which contains older buffers; in an
2940 2947           * effort to keep newer buffers in the cache regardless of their
2941 2948           * type. If we cannot satisfy the number of bytes from this
2942 2949           * type, spill over into the next type.
2943 2950           */
2944 2951          if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
2945 2952              arc_meta_used > arc_meta_min) {
2946 2953                  bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2947 2954                  total_evicted += bytes;
2948 2955  
2949 2956                  /*
2950 2957                   * If we couldn't evict our target number of bytes from
2951 2958                   * metadata, we try to get the rest from data.
2952 2959                   */
2953 2960                  target -= bytes;
2954 2961  
2955 2962                  total_evicted +=
2956 2963                      arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
2957 2964          } else {
2958 2965                  bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
2959 2966                  total_evicted += bytes;
2960 2967  
2961 2968                  /*
2962 2969                   * If we couldn't evict our target number of bytes from
2963 2970                   * data, we try to get the rest from metadata.
2964 2971                   */
2965 2972                  target -= bytes;
2966 2973  
2967 2974                  total_evicted +=
2968 2975                      arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2969 2976          }
2970 2977  
2971 2978          /*
2972 2979           * Adjust MFU size
2973 2980           *
2974 2981           * Now that we've tried to evict enough from the MRU to get its
2975 2982           * size back to arc_p, if we're still above the target cache
2976 2983           * size, we evict the rest from the MFU.
2977 2984           */
2978 2985          target = arc_size - arc_c;
2979 2986  
2980 2987          if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
2981 2988              arc_meta_used > arc_meta_min) {
2982 2989                  bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2983 2990                  total_evicted += bytes;
2984 2991  
2985 2992                  /*
2986 2993                   * If we couldn't evict our target number of bytes from
2987 2994                   * metadata, we try to get the rest from data.
2988 2995                   */
2989 2996                  target -= bytes;
2990 2997  
2991 2998                  total_evicted +=
2992 2999                      arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
2993 3000          } else {
2994 3001                  bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
2995 3002                  total_evicted += bytes;
2996 3003  
2997 3004                  /*
2998 3005                   * If we couldn't evict our target number of bytes from
2999 3006                   * data, we try to get the rest from data.
3000 3007                   */
3001 3008                  target -= bytes;
3002 3009  
3003 3010                  total_evicted +=
3004 3011                      arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3005 3012          }
3006 3013  
3007 3014          /*
3008 3015           * Adjust ghost lists
3009 3016           *
3010 3017           * In addition to the above, the ARC also defines target values
3011 3018           * for the ghost lists. The sum of the mru list and mru ghost
3012 3019           * list should never exceed the target size of the cache, and
3013 3020           * the sum of the mru list, mfu list, mru ghost list, and mfu
3014 3021           * ghost list should never exceed twice the target size of the
3015 3022           * cache. The following logic enforces these limits on the ghost
3016 3023           * caches, and evicts from them as needed.
3017 3024           */
3018 3025          target = refcount_count(&arc_mru->arcs_size) +
3019 3026              refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3020 3027  
3021 3028          bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3022 3029          total_evicted += bytes;
3023 3030  
3024 3031          target -= bytes;
3025 3032  
3026 3033          total_evicted +=
3027 3034              arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3028 3035  
3029 3036          /*
3030 3037           * We assume the sum of the mru list and mfu list is less than
3031 3038           * or equal to arc_c (we enforced this above), which means we
3032 3039           * can use the simpler of the two equations below:
3033 3040           *
3034 3041           *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3035 3042           *                  mru ghost + mfu ghost <= arc_c
3036 3043           */
3037 3044          target = refcount_count(&arc_mru_ghost->arcs_size) +
3038 3045              refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3039 3046  
3040 3047          bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3041 3048          total_evicted += bytes;
3042 3049  
3043 3050          target -= bytes;
3044 3051  
3045 3052          total_evicted +=
3046 3053              arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3047 3054  
3048 3055          return (total_evicted);
3049 3056  }
3050 3057  
3051 3058  static void
3052 3059  arc_do_user_evicts(void)
3053 3060  {
3054 3061          mutex_enter(&arc_user_evicts_lock);
3055 3062          while (arc_eviction_list != NULL) {
3056 3063                  arc_buf_t *buf = arc_eviction_list;
3057 3064                  arc_eviction_list = buf->b_next;
3058 3065                  mutex_enter(&buf->b_evict_lock);
3059 3066                  buf->b_hdr = NULL;
3060 3067                  mutex_exit(&buf->b_evict_lock);
3061 3068                  mutex_exit(&arc_user_evicts_lock);
3062 3069  
3063 3070                  if (buf->b_efunc != NULL)
3064 3071                          VERIFY0(buf->b_efunc(buf->b_private));
3065 3072  
3066 3073                  buf->b_efunc = NULL;
3067 3074                  buf->b_private = NULL;
3068 3075                  kmem_cache_free(buf_cache, buf);
3069 3076                  mutex_enter(&arc_user_evicts_lock);
3070 3077          }
3071 3078          mutex_exit(&arc_user_evicts_lock);
3072 3079  }
3073 3080  
3074 3081  void
3075 3082  arc_flush(spa_t *spa, boolean_t retry)
3076 3083  {
3077 3084          uint64_t guid = 0;
3078 3085  
3079 3086          /*
3080 3087           * If retry is TRUE, a spa must not be specified since we have
3081 3088           * no good way to determine if all of a spa's buffers have been
3082 3089           * evicted from an arc state.
3083 3090           */
3084 3091          ASSERT(!retry || spa == 0);
3085 3092  
3086 3093          if (spa != NULL)
3087 3094                  guid = spa_load_guid(spa);
3088 3095  
3089 3096          (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3090 3097          (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3091 3098  
3092 3099          (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3093 3100          (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3094 3101  
3095 3102          (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3096 3103          (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
3097 3104  
3098 3105          (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3099 3106          (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
3100 3107  
3101 3108          arc_do_user_evicts();
3102 3109          ASSERT(spa || arc_eviction_list == NULL);
3103 3110  }
3104 3111  
3105 3112  void
3106 3113  arc_shrink(int64_t to_free)
3107 3114  {
3108 3115          if (arc_c > arc_c_min) {
3109 3116  
3110 3117                  if (arc_c > arc_c_min + to_free)
3111 3118                          atomic_add_64(&arc_c, -to_free);
3112 3119                  else
3113 3120                          arc_c = arc_c_min;
3114 3121  
3115 3122                  atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3116 3123                  if (arc_c > arc_size)
3117 3124                          arc_c = MAX(arc_size, arc_c_min);
3118 3125                  if (arc_p > arc_c)
3119 3126                          arc_p = (arc_c >> 1);
3120 3127                  ASSERT(arc_c >= arc_c_min);
3121 3128                  ASSERT((int64_t)arc_p >= 0);
3122 3129          }
3123 3130  
3124 3131          if (arc_size > arc_c)
3125 3132                  (void) arc_adjust();
3126 3133  }
3127 3134  
/*
 * Identifies which of the checks in arc_available_memory() produced the
 * lowest (most constraining) amount of available memory.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN = 0,		/* no check lowered the value */
	FMR_NEEDFREE = 1,		/* needfree is non-zero */
	FMR_LOTSFREE = 2,		/* freemem vs. lotsfree check */
	FMR_SWAPFS_MINFREE = 3,		/* availrmem vs. swapfs check */
	FMR_PAGES_PP_MAXIMUM = 4,	/* availrmem vs. pages_pp_maximum */
	FMR_HEAP_ARENA = 5,		/* heap_arena free space (i386) */
	FMR_ZIO_ARENA = 6,		/* zio_arena free space */
} free_memory_reason_t;

/*
 * Result of the most recent arc_available_memory() call: the value it
 * returned and the check that determined it.
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional reserve of pages for pp_reserve.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 */
int64_t arc_swapfs_reserve = 64;
3150 3157  
3151 3158  /*
3152 3159   * Return the amount of memory that can be consumed before reclaim will be
3153 3160   * needed.  Positive if there is sufficient free memory, negative indicates
3154 3161   * the amount of memory that needs to be freed up.
3155 3162   */
3156 3163  static int64_t
3157 3164  arc_available_memory(void)
3158 3165  {
3159 3166          int64_t lowest = INT64_MAX;
3160 3167          int64_t n;
3161 3168          free_memory_reason_t r = FMR_UNKNOWN;
3162 3169  
3163 3170  #ifdef _KERNEL
3164 3171          if (needfree > 0) {
3165 3172                  n = PAGESIZE * (-needfree);
3166 3173                  if (n < lowest) {
3167 3174                          lowest = n;
3168 3175                          r = FMR_NEEDFREE;
3169 3176                  }
3170 3177          }
3171 3178  
3172 3179          /*
3173 3180           * check that we're out of range of the pageout scanner.  It starts to
3174 3181           * schedule paging if freemem is less than lotsfree and needfree.
3175 3182           * lotsfree is the high-water mark for pageout, and needfree is the
3176 3183           * number of needed free pages.  We add extra pages here to make sure
3177 3184           * the scanner doesn't start up while we're freeing memory.
3178 3185           */
3179 3186          n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3180 3187          if (n < lowest) {
3181 3188                  lowest = n;
3182 3189                  r = FMR_LOTSFREE;
3183 3190          }
3184 3191  
3185 3192          /*
3186 3193           * check to make sure that swapfs has enough space so that anon
3187 3194           * reservations can still succeed. anon_resvmem() checks that the
3188 3195           * availrmem is greater than swapfs_minfree, and the number of reserved
3189 3196           * swap pages.  We also add a bit of extra here just to prevent
3190 3197           * circumstances from getting really dire.
3191 3198           */
3192 3199          n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3193 3200              desfree - arc_swapfs_reserve);
3194 3201          if (n < lowest) {
3195 3202                  lowest = n;
3196 3203                  r = FMR_SWAPFS_MINFREE;
3197 3204          }
3198 3205  
3199 3206  
3200 3207          /*
3201 3208           * Check that we have enough availrmem that memory locking (e.g., via
3202 3209           * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3203 3210           * stores the number of pages that cannot be locked; when availrmem
3204 3211           * drops below pages_pp_maximum, page locking mechanisms such as
3205 3212           * page_pp_lock() will fail.)
3206 3213           */
3207 3214          n = PAGESIZE * (availrmem - pages_pp_maximum -
3208 3215              arc_pages_pp_reserve);
3209 3216          if (n < lowest) {
3210 3217                  lowest = n;
3211 3218                  r = FMR_PAGES_PP_MAXIMUM;
3212 3219          }
3213 3220  
3214 3221  #if defined(__i386)
3215 3222          /*
3216 3223           * If we're on an i386 platform, it's possible that we'll exhaust the
3217 3224           * kernel heap space before we ever run out of available physical
3218 3225           * memory.  Most checks of the size of the heap_area compare against
3219 3226           * tune.t_minarmem, which is the minimum available real memory that we
3220 3227           * can have in the system.  However, this is generally fixed at 25 pages
3221 3228           * which is so low that it's useless.  In this comparison, we seek to
3222 3229           * calculate the total heap-size, and reclaim if more than 3/4ths of the
3223 3230           * heap is allocated.  (Or, in the calculation, if less than 1/4th is
3224 3231           * free)
3225 3232           */
3226 3233          n = vmem_size(heap_arena, VMEM_FREE) -
3227 3234              (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3228 3235          if (n < lowest) {
3229 3236                  lowest = n;
3230 3237                  r = FMR_HEAP_ARENA;
3231 3238          }
3232 3239  #endif
3233 3240  
3234 3241          /*
3235 3242           * If zio data pages are being allocated out of a separate heap segment,
3236 3243           * then enforce that the size of available vmem for this arena remains
3237 3244           * above about 1/16th free.
3238 3245           *
3239 3246           * Note: The 1/16th arena free requirement was put in place
3240 3247           * to aggressively evict memory from the arc in order to avoid
3241 3248           * memory fragmentation issues.
3242 3249           */
3243 3250          if (zio_arena != NULL) {
3244 3251                  n = vmem_size(zio_arena, VMEM_FREE) -
3245 3252                      (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3246 3253                  if (n < lowest) {
3247 3254                          lowest = n;
3248 3255                          r = FMR_ZIO_ARENA;
3249 3256                  }
3250 3257          }
3251 3258  #else
3252 3259          /* Every 100 calls, free a small amount */
3253 3260          if (spa_get_random(100) == 0)
3254 3261                  lowest = -1024;
3255 3262  #endif
3256 3263  
3257 3264          last_free_memory = lowest;
3258 3265          last_free_reason = r;
3259 3266  
3260 3267          return (lowest);
3261 3268  }
3262 3269  
3263 3270  
3264 3271  /*
3265 3272   * Determine if the system is under memory pressure and is asking
3266 3273   * to reclaim memory. A return value of TRUE indicates that the system
3267 3274   * is under memory pressure and that the arc should adjust accordingly.
3268 3275   */
3269 3276  static boolean_t
3270 3277  arc_reclaim_needed(void)
3271 3278  {
3272 3279          return (arc_available_memory() < 0);
3273 3280  }
3274 3281  
3275 3282  static void
3276 3283  arc_kmem_reap_now(void)
3277 3284  {
                   /*
                    * Ask the kmem caches that back the ARC (the zio buffer caches,
                    * the arc buf/hdr caches and the range_seg cache) to reap now,
                    * and then ask the zio vmem arena, if there is one, to reap its
                    * quantum caches.  Takes no arguments and returns nothing.
                    */
3278 3285          size_t                  i;
3279 3286          kmem_cache_t            *prev_cache = NULL;
3280 3287          kmem_cache_t            *prev_data_cache = NULL;
3281 3288          extern kmem_cache_t     *zio_buf_cache[];
3282 3289          extern kmem_cache_t     *zio_data_buf_cache[];
3283 3290          extern kmem_cache_t     *range_seg_cache;
3284 3291  
3285 3292  #ifdef _KERNEL
3286 3293          if (arc_meta_used >= arc_meta_limit) {
3287 3294                  /*
3288 3295                   * We are exceeding our meta-data cache limit.
3289 3296                   * Purge some DNLC entries to release holds on meta-data.
3290 3297                   */
3291 3298                  dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
3292 3299          }
3293 3300  #if defined(__i386)
3294 3301          /*
3295 3302           * Reclaim unused memory from all kmem caches.
3296 3303           */
3297 3304          kmem_reap();
3298 3305  #endif
3299 3306  #endif
3300 3307  
                   /*
                    * Walk every slot of both zio buffer cache arrays.  A slot is
                    * reaped only when its cache pointer differs from the previous
                    * slot's, so a run of consecutive slots that refer to the same
                    * cache results in a single reap of that cache.
                    */
3301 3308          for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3302 3309                  if (zio_buf_cache[i] != prev_cache) {
3303 3310                          prev_cache = zio_buf_cache[i];
3304 3311                          kmem_cache_reap_now(zio_buf_cache[i]);
3305 3312                  }
3306 3313                  if (zio_data_buf_cache[i] != prev_data_cache) {
3307 3314                          prev_data_cache = zio_data_buf_cache[i];
3308 3315                          kmem_cache_reap_now(zio_data_buf_cache[i]);
3309 3316                  }
3310 3317          }
                   /* Reap the ARC's own caches and the range_seg cache. */
3311 3318          kmem_cache_reap_now(buf_cache);
3312 3319          kmem_cache_reap_now(hdr_full_cache);
3313 3320          kmem_cache_reap_now(hdr_l2only_cache);
3314 3321          kmem_cache_reap_now(range_seg_cache);
3315 3322  
3316 3323          if (zio_arena != NULL) {
3317 3324                  /*
3318 3325                   * Ask the vmem arena to reclaim unused memory from its
3319 3326                   * quantum caches.
3320 3327                   */
3321 3328                  vmem_qcache_reap(zio_arena);
3322 3329          }
3323 3330  }
3324 3331  
3325 3332  /*
            * Body of the ARC reclaim thread.  Each pass samples
            * arc_available_memory(); when the result is negative it reaps the
            * kmem caches and shrinks the ARC, otherwise it only updates
            * arc_no_grow.  It then runs arc_adjust() and, unless the ARC is
            * still over its target and the last pass evicted something, wakes
            * any waiters and sleeps for up to one second.  The loop ends when
            * arc_reclaim_thread_exit is set.
            *
3326 3333   * Threads can block in arc_get_data_buf() waiting for this thread to evict
3327 3334   * enough data and signal them to proceed. When this happens, the threads in
3328 3335   * arc_get_data_buf() are sleeping while holding the hash lock for their
3329 3336   * particular arc header. Thus, we must be careful to never sleep on a
3330 3337   * hash lock in this thread. This is to prevent the following deadlock:
3331 3338   *
3332 3339   *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3333 3340   *    waiting for the reclaim thread to signal it.
3334 3341   *
3335 3342   *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3336 3343   *    fails, and goes to sleep forever.
3337 3344   *
3338 3345   * This possible deadlock is avoided by always acquiring a hash lock
3339 3346   * using mutex_tryenter() from arc_reclaim_thread().
3340 3347   */
3341 3348  static void
3342 3349  arc_reclaim_thread(void)
3343 3350  {
                   /* lbolt value before which arc_no_grow is not cleared again */
3344 3351          clock_t                 growtime = 0;
3345 3352          callb_cpr_t             cpr;
3346 3353  
3347 3354          CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
3348 3355  
                   /*
                    * arc_reclaim_lock is held whenever the loop condition is
                    * tested and while waiting on the cv; it is dropped for the
                    * duration of the reclaim work itself.
                    */
3349 3356          mutex_enter(&arc_reclaim_lock);
3350 3357          while (!arc_reclaim_thread_exit) {
3351 3358                  int64_t free_memory = arc_available_memory();
3352 3359                  uint64_t evicted = 0;
3353 3360  
3354 3361                  mutex_exit(&arc_reclaim_lock);
3355 3362  
3356 3363                  if (free_memory < 0) {
3357 3364  
3358 3365                          arc_no_grow = B_TRUE;
3359 3366                          arc_warm = B_TRUE;
3360 3367  
3361 3368                          /*
3362 3369                           * Wait at least arc_grow_retry (default 60) seconds
3363 3370                           * before considering growing.
3364 3371                           */
3365 3372                          growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
3366 3373  
3367 3374                          arc_kmem_reap_now();
3368 3375  
3369 3376                          /*
3370 3377                           * If we are still low on memory, shrink the ARC
3371 3378                           * so that we have (arc_c >> arc_shrink_shift) bytes free.
3372 3379                           */
3373 3380                          free_memory = arc_available_memory();
3374 3381  
3375 3382                          int64_t to_free =
3376 3383                              (arc_c >> arc_shrink_shift) - free_memory;
3377 3384                          if (to_free > 0) {
3378 3385  #ifdef _KERNEL
                                           /* never free less than the needfree pages */
3379 3386                                  to_free = MAX(to_free, ptob(needfree));
3380 3387  #endif
3381 3388                                  arc_shrink(to_free);
3382 3389                          }
3383 3390                  } else if (free_memory < arc_c >> arc_no_grow_shift) {
                                   /* not short of memory, but too little to keep growing */
3384 3391                          arc_no_grow = B_TRUE;
3385 3392                  } else if (ddi_get_lbolt() >= growtime) {
                                   /* enough memory and the grow-retry deadline has passed */
3386 3393                          arc_no_grow = B_FALSE;
3387 3394                  }
3388 3395  
3389 3396                  evicted = arc_adjust();
3390 3397  
3391 3398                  mutex_enter(&arc_reclaim_lock);
3392 3399  
3393 3400                  /*
3394 3401                   * If evicted is zero, we couldn't evict anything via
3395 3402                   * arc_adjust(). This could be due to hash lock
3396 3403                   * collisions, but more likely due to the majority of
3397 3404                   * arc buffers being unevictable. Therefore, even if
3398 3405                   * arc_size is above arc_c, another pass is unlikely to
3399 3406                   * be helpful and could potentially cause us to enter an
3400 3407                   * infinite loop.
3401 3408                   */
3402 3409                  if (arc_size <= arc_c || evicted == 0) {
3403 3410                          /*
3404 3411                           * We're either no longer overflowing, or we
3405 3412                           * can't evict anything more, so we should wake
3406 3413                           * up any threads before we go to sleep.
3407 3414                           */
3408 3415                          cv_broadcast(&arc_reclaim_waiters_cv);
3409 3416  
3410 3417                          /*
3411 3418                           * Block until signaled, or after one second (we
3412 3419                           * might need to perform arc_kmem_reap_now()
3413 3420                           * even if we aren't being signalled)
3414 3421                           */
3415 3422                          CALLB_CPR_SAFE_BEGIN(&cpr);
3416 3423                          (void) cv_timedwait(&arc_reclaim_thread_cv,
3417 3424                              &arc_reclaim_lock, ddi_get_lbolt() + hz);
3418 3425                          CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3419 3426                  }
3420 3427          }
3421 3428  
                   /*
                    * Exit was requested: clear the flag and broadcast on
                    * arc_reclaim_thread_cv (presumably to wake the thread that
                    * asked for the exit and is waiting for the flag to clear).
                    */
3422 3429          arc_reclaim_thread_exit = FALSE;
3423 3430          cv_broadcast(&arc_reclaim_thread_cv);
3424 3431          CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_lock */
3425 3432          thread_exit();
3426 3433  }
3427 3434  
3428 3435  static void
3429 3436  arc_user_evicts_thread(void)
3430 3437  {
                   /*
                    * Body of the user-evicts thread.  Until
                    * arc_user_evicts_thread_exit is set, each pass calls
                    * arc_do_user_evicts(), refreshes the ARC kstats, and then
                    * waits on arc_user_evicts_cv for up to one second.
                    */
3431 3438          callb_cpr_t cpr;
3432 3439  
3433 3440          CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
3434 3441  
                   /*
                    * arc_user_evicts_lock is held when the exit flag is tested and
                    * while waiting; it is dropped around the eviction and kstat work.
                    */
3435 3442          mutex_enter(&arc_user_evicts_lock);
3436 3443          while (!arc_user_evicts_thread_exit) {
3437 3444                  mutex_exit(&arc_user_evicts_lock);
3438 3445  
3439 3446                  arc_do_user_evicts();
3440 3447  
3441 3448                  /*
3442 3449                   * This is necessary in order for the mdb ::arc dcmd to
3443 3450                   * show up to date information. Since the ::arc command
3444 3451                   * does not call the kstat's update function, without
3445 3452                   * this call, the command may show stale stats for the
3446 3453                   * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3447 3454                   * with this change, the data might be up to 1 second
3448 3455                   * out of date; but that should suffice. The arc_state_t
3449 3456                   * structures can be queried directly if more accurate
3450 3457                   * information is needed.
3451 3458                   */
3452 3459                  if (arc_ksp != NULL)
3453 3460                          arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3454 3461  
3455 3462                  mutex_enter(&arc_user_evicts_lock);
3456 3463  
3457 3464                  /*
3458 3465                   * Block until signaled, or after one second (we need to
3459 3466                   * call the arc's kstat update function regularly).
3460 3467                   */
3461 3468                  CALLB_CPR_SAFE_BEGIN(&cpr);
3462 3469                  (void) cv_timedwait(&arc_user_evicts_cv,
3463 3470                      &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3464 3471                  CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3465 3472          }
3466 3473  
                   /*
                    * Exit was requested: clear the flag and broadcast on
                    * arc_user_evicts_cv (presumably for the thread waiting on the
                    * exit to complete).
                    */
3467 3474          arc_user_evicts_thread_exit = FALSE;
3468 3475          cv_broadcast(&arc_user_evicts_cv);
3469 3476          CALLB_CPR_EXIT(&cpr);           /* drops arc_user_evicts_lock */
3470 3477          thread_exit();
3471 3478  }
3472 3479  
3473 3480  /*
3474 3481   * Adapt arc info given the number of bytes we are trying to add and
3475 3482   * the state that we are coming from.  This function is only called
3476 3483   * when we are adding new content to the cache.
            *
            * bytes: number of bytes being added; asserted to be > 0 (unless
            *        state is arc_l2c_only, in which case nothing is done).
            * state: the arc state of the header the data is being added for.
            *
            * Two targets may be updated: arc_p (target size of the MRU list),
            * which moves on ghost-list hits, and arc_c (target cache size),
            * which may grow by `bytes' when there is no memory pressure,
            * growth is not disabled, and arc_size is close to arc_c.
3477 3484   */
3478 3485  static void
3479 3486  arc_adapt(int bytes, arc_state_t *state)
3480 3487  {
3481 3488          int mult;
                   /* lower bound for arc_p, and (arc_c - arc_p_min) is its upper bound */
3482 3489          uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3483 3490          int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3484 3491          int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
3485 3492  
3486 3493          if (state == arc_l2c_only)
3487 3494                  return;
3488 3495  
3489 3496          ASSERT(bytes > 0);
3490 3497          /*
3491 3498           * Adapt the target size of the MRU list:
3492 3499           *      - if we just hit in the MRU ghost list, then increase
3493 3500           *        the target size of the MRU list.
3494 3501           *      - if we just hit in the MFU ghost list, then increase
3495 3502           *        the target size of the MFU list by decreasing the
3496 3503           *        target size of the MRU list.
                    *
                    * The step is `bytes' scaled by the ratio of the other ghost
                    * list's size to the size of the ghost list that was hit
                    * (1 when the hit list is at least as large), capped at 10.
                    *
                    * NOTE(review): the ratio divides by the size of the ghost
                    * list that was hit with no explicit zero check; this
                    * presumably relies on that list being non-empty whenever a
                    * hit in it is reported -- confirm against the callers.
3497 3504           */
3498 3505          if (state == arc_mru_ghost) {
3499 3506                  mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
3500 3507                  mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3501 3508  
3502 3509                  arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3503 3510          } else if (state == arc_mfu_ghost) {
3504 3511                  uint64_t delta;
3505 3512  
3506 3513                  mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
3507 3514                  mult = MIN(mult, 10);
3508 3515  
                           /* clamp delta to arc_p so the subtraction cannot wrap */
3509 3516                  delta = MIN(bytes * mult, arc_p);
3510 3517                  arc_p = MAX(arc_p_min, arc_p - delta);
3511 3518          }
3512 3519          ASSERT((int64_t)arc_p >= 0);
3513 3520  
                   /*
                    * Under memory pressure: wake the reclaim thread and leave the
                    * target cache size alone.
                    */
3514 3521          if (arc_reclaim_needed()) {
3515 3522                  cv_signal(&arc_reclaim_thread_cv);
3516 3523                  return;
3517 3524          }
3518 3525  
                   /* Growth is currently disabled, or arc_c is already at its cap. */
3519 3526          if (arc_no_grow)
3520 3527                  return;
3521 3528  
3522 3529          if (arc_c >= arc_c_max)
3523 3530                  return;
3524 3531  
3525 3532          /*
3526 3533           * If we're within (2 * maxblocksize) bytes of the target
3527 3534           * cache size, increment the target cache size
                    * by `bytes', capped at arc_c_max.  When the cap is not hit and
                    * the data is anonymous, arc_p grows by the same amount; arc_p
                    * is then clamped so that it never exceeds arc_c.
3528 3535           */
3529 3536          if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3530 3537                  atomic_add_64(&arc_c, (int64_t)bytes);
3531 3538                  if (arc_c > arc_c_max)
3532 3539                          arc_c = arc_c_max;
3533 3540                  else if (state == arc_anon)
3534 3541                          atomic_add_64(&arc_p, (int64_t)bytes);
3535 3542                  if (arc_p > arc_c)
3536 3543                          arc_p = arc_c;
3537 3544          }
3538 3545          ASSERT((int64_t)arc_p >= 0);
3539 3546  }
3540 3547  
3541 3548  /*
3542 3549   * Check if arc_size has grown past our upper threshold, determined by
3543 3550   * zfs_arc_overflow_shift.
            *
            * The allowed overflow is the larger of SPA_MAXBLOCKSIZE and
            * (arc_c >> zfs_arc_overflow_shift).  Returns true when arc_size is
            * at least arc_c plus that allowance.
3544 3551   */
3545 3552  static boolean_t
3546 3553  arc_is_overflowing(void)
3547 3554  {
3548 3555          /* Always allow at least one block of overflow */
3549 3556          uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3550 3557              arc_c >> zfs_arc_overflow_shift);
3551 3558  
3552 3559          return (arc_size >= arc_c + overflow);
3553 3560  }
3554 3561  
3555 3562  /*
3556 3563   * The buffer, supplied as the first argument, needs a data block. If we
3557 3564   * are hitting the hard limit for the cache size, we must sleep, waiting
3558 3565   * for the eviction thread to catch up. If we're past the target size
3559 3566   * but below the hard limit, we'll only signal the reclaim thread and
3560 3567   * continue on.
            *
            * buf: the arc buffer to populate.  On return buf->b_data points at
            *      a newly allocated block of buf->b_hdr->b_size bytes, taken
            *      from the metadata or data zio allocator according to the
            *      header's buffer type, and the ARC space and state-size
            *      accounting has been updated for it.
3561 3568   */
3562 3569  static void
3563 3570  arc_get_data_buf(arc_buf_t *buf)
3564 3571  {
3565 3572          arc_state_t             *state = buf->b_hdr->b_l1hdr.b_state;
3566 3573          uint64_t                size = buf->b_hdr->b_size;
3567 3574          arc_buf_contents_t      type = arc_buf_type(buf->b_hdr);
3568 3575  
                   /* let the targets adapt (and the reclaim thread be signalled) first */
3569 3576          arc_adapt(size, state);
3570 3577  
3571 3578          /*
3572 3579           * If arc_size is currently overflowing, and has grown past our
3573 3580           * upper limit, we must be adding data faster than the evict
3574 3581           * thread can evict. Thus, to ensure we don't compound the
3575 3582           * problem by adding more data and forcing arc_size to grow even
3576 3583           * further past its target size, we halt and wait for the
3577 3584           * eviction thread to catch up.
3578 3585           *
3579 3586           * It's also possible that the reclaim thread is unable to evict
3580 3587           * enough buffers to get arc_size below the overflow limit (e.g.
3581 3588           * due to buffers being un-evictable, or hash lock collisions).
3582 3589           * In this case, we want to proceed regardless if we're
3583 3590           * overflowing; thus we don't use a while loop here.
3584 3591           */
3585 3592          if (arc_is_overflowing()) {
3586 3593                  mutex_enter(&arc_reclaim_lock);
3587 3594  
3588 3595                  /*
3589 3596                   * Now that we've acquired the lock, we may no longer be
3590 3597                   * over the overflow limit, let's check.
3591 3598                   *
3592 3599                   * We're ignoring the case of spurious wake ups. If that
3593 3600                   * were to happen, it'd let this thread consume an ARC
3594 3601                   * buffer before it should have (i.e. before we're under
3595 3602                   * the overflow limit and were signalled by the reclaim
3596 3603                   * thread). As long as that is a rare occurrence, it
3597 3604                   * shouldn't cause any harm.
3598 3605                   */
3599 3606                  if (arc_is_overflowing()) {
3600 3607                          cv_signal(&arc_reclaim_thread_cv);
3601 3608                          cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
3602 3609                  }
3603 3610  
3604 3611                  mutex_exit(&arc_reclaim_lock);
3605 3612          }
3606 3613  
                   /* Allocate the block and charge it to the matching ARC space. */
3607 3614          if (type == ARC_BUFC_METADATA) {
3608 3615                  buf->b_data = zio_buf_alloc(size);
3609 3616                  arc_space_consume(size, ARC_SPACE_META);
3610 3617          } else {
3611 3618                  ASSERT(type == ARC_BUFC_DATA);
3612 3619                  buf->b_data = zio_data_buf_alloc(size);
3613 3620                  arc_space_consume(size, ARC_SPACE_DATA);
3614 3621          }
3615 3622  
3616 3623          /*
3617 3624           * Update the state size.  Note that ghost states have a
3618 3625           * "ghost size" and so don't need to be updated.
3619 3626           */
3620 3627          if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3621 3628                  arc_buf_hdr_t *hdr = buf->b_hdr;
                           /* re-read here; this shadows the function-scope `state' */
3622 3629                  arc_state_t *state = hdr->b_l1hdr.b_state;
3623 3630  
3624 3631                  (void) refcount_add_many(&state->arcs_size, size, buf);
3625 3632  
3626 3633                  /*
3627 3634                   * If this is reached via arc_read, the link is
3628 3635                   * protected by the hash lock. If reached via
3629 3636                   * arc_buf_alloc, the header should not be accessed by
3630 3637                   * any other thread. And, if reached via arc_read_done,
3631 3638                   * the hash lock will protect it if it's found in the
3632 3639                   * hash table; otherwise no other thread should be
3633 3640                   * trying to [add|remove]_reference it.
3634 3641                   */
3635 3642                  if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3636 3643                          ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3637 3644                          atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3638 3645                              size);
3639 3646                  }
3640 3647                  /*
3641 3648                   * If we are growing the cache, and we are adding anonymous
3642 3649                   * data, and we have outgrown arc_p, update arc_p
3643 3650                   */
3644 3651                  if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3645 3652                      (refcount_count(&arc_anon->arcs_size) +
3646 3653                      refcount_count(&arc_mru->arcs_size) > arc_p))
3647 3654                          arc_p = MIN(arc_c, arc_p + size);
3648 3655          }
3649 3656  }
3650 3657  
3651 3658  /*
3652 3659   * This routine is called whenever a buffer is accessed.
3653 3660   * NOTE: the hash lock is dropped in this function.
            *
            * NOTE(review): no mutex_exit() of hash_lock is visible in this
            * function body, and arc_read_done() releases hash_lock itself after
            * calling arc_access(); the "dropped" note above may be stale, unless
            * arc_change_state() drops the lock -- confirm.
            *
            * hdr:       header being accessed; must have an L1 header.
            * hash_lock: the hash lock for hdr; must be held by the caller.
            *
            * Depending on the header's current state this stamps
            * b_arc_access with the current lbolt, may move the header to the
            * MRU or MFU state via arc_change_state(), and bumps the matching
            * hit statistic.
3654 3661   */
3655 3662  static void
3656 3663  arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3657 3664  {
3658 3665          clock_t now;
3659 3666  
3660 3667          ASSERT(MUTEX_HELD(hash_lock));
3661 3668          ASSERT(HDR_HAS_L1HDR(hdr));
3662 3669  
3663 3670          if (hdr->b_l1hdr.b_state == arc_anon) {
3664 3671                  /*
3665 3672                   * This buffer is not in the cache, and does not
3666 3673                   * appear in our "ghost" list.  Add the new buffer
3667 3674                   * to the MRU state.
3668 3675                   */
3669 3676  
3670 3677                  ASSERT0(hdr->b_l1hdr.b_arc_access);
3671 3678                  hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3672 3679                  DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3673 3680                  arc_change_state(arc_mru, hdr, hash_lock);
3674 3681  
3675 3682          } else if (hdr->b_l1hdr.b_state == arc_mru) {
3676 3683                  now = ddi_get_lbolt();
3677 3684  
3678 3685                  /*
3679 3686                   * If this buffer is here because of a prefetch, then either:
3680 3687                   * - clear the flag if this is a "referencing" read
3681 3688                   *   (any subsequent access will bump this into the MFU state).
3682 3689                   * or
3683 3690                   * - move the buffer to the head of the list if this is
3684 3691                   *   another prefetch (to make it less likely to be evicted).
                            *
                            * In both cases only the access time is refreshed and the
                            * header stays in the MRU state.  For the "another prefetch"
                            * case the code below only asserts that the header is
                            * linked on a list; it does not itself move it.
3685 3692                   */
3686 3693                  if (HDR_PREFETCH(hdr)) {
3687 3694                          if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3688 3695                                  /* link protected by hash lock */
3689 3696                                  ASSERT(multilist_link_active(
3690 3697                                      &hdr->b_l1hdr.b_arc_node));
3691 3698                          } else {
3692 3699                                  hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3693 3700                                  ARCSTAT_BUMP(arcstat_mru_hits);
3694 3701                          }
3695 3702                          hdr->b_l1hdr.b_arc_access = now;
3696 3703                          return;
3697 3704                  }
3698 3705  
3699 3706                  /*
3700 3707                   * This buffer has been "accessed" only once so far,
3701 3708                   * but it is still in the cache. Move it to the MFU
3702 3709                   * state.
3703 3710                   */
3704 3711                  if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
3705 3712                          /*
3706 3713                           * More than 125ms have passed since we
3707 3714                           * instantiated this buffer.  Move it to the
3708 3715                           * most frequently used state.
                                    *
                                    * NOTE(review): the threshold actually used is
                                    * ARC_MINTIME; confirm that it equals 125ms.
3709 3716                           */
3710 3717                          hdr->b_l1hdr.b_arc_access = now;
3711 3718                          DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3712 3719                          arc_change_state(arc_mfu, hdr, hash_lock);
3713 3720                  }
3714 3721                  ARCSTAT_BUMP(arcstat_mru_hits);
3715 3722          } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3716 3723                  arc_state_t     *new_state;
3717 3724                  /*
3718 3725                   * This buffer has been "accessed" recently, but
3719 3726                   * was evicted from the cache.  Move it to the
3720 3727                   * MFU state.
                            *
                            * The exception is a prefetch access, which moves the
                            * header to the MRU state instead; the prefetch flag is
                            * cleared there only if the header holds references.
3721 3728                   */
3722 3729  
3723 3730                  if (HDR_PREFETCH(hdr)) {
3724 3731                          new_state = arc_mru;
3725 3732                          if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3726 3733                                  hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3727 3734                          DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3728 3735                  } else {
3729 3736                          new_state = arc_mfu;
3730 3737                          DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3731 3738                  }
3732 3739  
3733 3740                  hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3734 3741                  arc_change_state(new_state, hdr, hash_lock);
3735 3742  
3736 3743                  ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3737 3744          } else if (hdr->b_l1hdr.b_state == arc_mfu) {
3738 3745                  /*
3739 3746                   * This buffer has been accessed more than once and is
3740 3747                   * still in the cache.  Keep it in the MFU state.
3741 3748                   *
3742 3749                   * NOTE: an add_reference() that occurred when we did
3743 3750                   * the arc_read() will have kicked this off the list.
3744 3751                   * If it was a prefetch, we will explicitly move it to
3745 3752                   * the head of the list now.
3746 3753                   */
3747 3754                  if ((HDR_PREFETCH(hdr)) != 0) {
3748 3755                          ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3749 3756                          /* link protected by hash_lock */
3750 3757                          ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3751 3758                  }
3752 3759                  ARCSTAT_BUMP(arcstat_mfu_hits);
3753 3760                  hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3754 3761          } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
3755 3762                  arc_state_t     *new_state = arc_mfu;
3756 3763                  /*
3757 3764                   * This buffer has been accessed more than once but has
3758 3765                   * been evicted from the cache.  Move it back to the
3759 3766                   * MFU state.
3760 3767                   */
3761 3768  
3762 3769                  if (HDR_PREFETCH(hdr)) {
3763 3770                          /*
3764 3771                           * This is a prefetch access...
3765 3772                           * move this block back to the MRU state.
3766 3773                           */
3767 3774                          ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3768 3775                          new_state = arc_mru;
3769 3776                  }
3770 3777  
3771 3778                  hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
                           /*
                            * NOTE(review): this probe is named new_state__mfu but it
                            * also fires when new_state was set to arc_mru above.
                            */
3772 3779                  DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3773 3780                  arc_change_state(new_state, hdr, hash_lock);
3774 3781  
3775 3782                  ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3776 3783          } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
3777 3784                  /*
3778 3785                   * This buffer is on the 2nd Level ARC.
                            * It is moved directly to the MFU state.
3779 3786                   */
3780 3787  
3781 3788                  hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3782 3789                  DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3783 3790                  arc_change_state(arc_mfu, hdr, hash_lock);
3784 3791          } else {
3785 3792                  ASSERT(!"invalid arc state");
3786 3793          }
3787 3794  }
3788 3795  
3789 3796  /*
            * A generic arc_done_func_t which you can use.
            *
            * When there was no zio (zio == NULL) or the zio completed without
            * error, copies buf->b_hdr->b_size bytes from buf->b_data to the
            * memory at `arg'.  In every case it then calls
            * arc_buf_remove_ref(buf, arg) and VERIFYs that the call returned
            * non-zero.
            */
3790 3797  /* ARGSUSED */
3791 3798  void
3792 3799  arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3793 3800  {
3794 3801          if (zio == NULL || zio->io_error == 0)
3795 3802                  bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3796 3803          VERIFY(arc_buf_remove_ref(buf, arg));
3797 3804  }
3798 3805  
3799 3806  /*
            * A generic arc_done_func_t.
            *
            * `arg' is treated as an arc_buf_t ** through which the result is
            * returned.  If there was a zio and it failed, arc_buf_remove_ref(buf,
            * arg) is called (and VERIFYed) and *arg is set to NULL; otherwise
            * *arg is set to buf, which is asserted to have data.
            */
3800 3807  void
3801 3808  arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3802 3809  {
3803 3810          arc_buf_t **bufp = arg;
3804 3811          if (zio && zio->io_error) {
3805 3812                  VERIFY(arc_buf_remove_ref(buf, arg));
3806 3813                  *bufp = NULL;
3807 3814          } else {
3808 3815                  *bufp = buf;
3809 3816                  ASSERT(buf->b_data);
3810 3817          }
3811 3818  }
3812 3819  
3813 3820  static void
3814 3821  arc_read_done(zio_t *zio)
3815 3822  {
                   /*
                    * Completion handler for a read zio whose io_private is the
                    * arc_buf_t being filled.  It byteswaps the data if the block
                    * pointer requires it, computes the checksum, updates the
                    * header's state and flags, hands a buffer to every registered
                    * callback that has a done function, runs and frees the
                    * callbacks, and destroys the header if it ended up
                    * unreferenced after an error or a free-during-read.
                    */
3816 3823          arc_buf_hdr_t   *hdr;
3817 3824          arc_buf_t       *buf;
3818 3825          arc_buf_t       *abuf;  /* buffer we're assigning to callback */
3819 3826          kmutex_t        *hash_lock = NULL;
3820 3827          arc_callback_t  *callback_list, *acb;
3821 3828          int             freeable = FALSE;
3822 3829  
3823 3830          buf = zio->io_private;
3824 3831          hdr = buf->b_hdr;
3825 3832  
3826 3833          /*
3827 3834           * The hdr was inserted into hash-table and removed from lists
3828 3835           * prior to starting I/O.  We should find this header, since
3829 3836           * it's in the hash table, and it should be legit since it's
3830 3837           * not possible to evict it during the I/O.  The only possible
3831 3838           * reason for it not to be found is if we were freed during the
3832 3839           * read.
                    *
                    * hash_lock stays NULL when the header is not flagged as being
                    * in the hash table; otherwise buf_hash_find() fills it in
                    * (the ASSERT below expects it to remain NULL when nothing is
                    * found).
3833 3840           */
3834 3841          if (HDR_IN_HASH_TABLE(hdr)) {
3835 3842                  ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3836 3843                  ASSERT3U(hdr->b_dva.dva_word[0], ==,
3837 3844                      BP_IDENTITY(zio->io_bp)->dva_word[0]);
3838 3845                  ASSERT3U(hdr->b_dva.dva_word[1], ==,
3839 3846                      BP_IDENTITY(zio->io_bp)->dva_word[1]);
3840 3847  
3841 3848                  arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3842 3849                      &hash_lock);
3843 3850  
3844 3851                  ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3845 3852                      hash_lock == NULL) ||
3846 3853                      (found == hdr &&
3847 3854                      DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3848 3855                      (found == hdr && HDR_L2_READING(hdr)));
3849 3856          }
3850 3857  
3851 3858          hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
                   /* with l2arc_noprefetch set, prefetched headers lose L2CACHE */
3852 3859          if (l2arc_noprefetch && HDR_PREFETCH(hdr))
3853 3860                  hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3854 3861  
3855 3862          /* byteswap if necessary */
3856 3863          callback_list = hdr->b_l1hdr.b_acb;
3857 3864          ASSERT(callback_list != NULL);
3858 3865          if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3859 3866                  dmu_object_byteswap_t bswap =
3860 3867                      DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
                           /* blocks above level 0 are swapped as uint64 arrays */
3861 3868                  arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3862 3869                      byteswap_uint64_array :
3863 3870                      dmu_ot_byteswap[bswap].ob_func;
3864 3871                  func(buf->b_data, hdr->b_size);
3865 3872          }
3866 3873  
3867 3874          arc_cksum_compute(buf, B_FALSE);
3868 3875          arc_buf_watch(buf);
3869 3876  
3870 3877          if (hash_lock && zio->io_error == 0 &&
3871 3878              hdr->b_l1hdr.b_state == arc_anon) {
3872 3879                  /*
3873 3880                   * Only call arc_access on anonymous buffers.  This is because
3874 3881                   * if we've issued an I/O for an evicted buffer, we've already
3875 3882                   * called arc_access (to prevent any simultaneous readers from
3876 3883                   * getting confused).
3877 3884                   */
3878 3885                  arc_access(hdr, hash_lock);
3879 3886          }
3880 3887  
3881 3888          /* create copies of the data buffer for the callers */
                   /*
                    * The first callback with a done function receives `buf' itself;
                    * every later one receives a clone made by arc_buf_clone().
                    * abuf is reset to NULL once a buffer has been handed out, so
                    * abuf == buf after the loop means no callback took `buf'.
                    */
3882 3889          abuf = buf;
3883 3890          for (acb = callback_list; acb; acb = acb->acb_next) {
3884 3891                  if (acb->acb_done) {
3885 3892                          if (abuf == NULL) {
3886 3893                                  ARCSTAT_BUMP(arcstat_duplicate_reads);
3887 3894                                  abuf = arc_buf_clone(buf);
3888 3895                          }
3889 3896                          acb->acb_buf = abuf;
3890 3897                          abuf = NULL;
3891 3898                  }
3892 3899          }
3893 3900          hdr->b_l1hdr.b_acb = NULL;
3894 3901          hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3895 3902          ASSERT(!HDR_BUF_AVAILABLE(hdr));
                   /* nobody claimed the original buffer: mark it as available */
3896 3903          if (abuf == buf) {
3897 3904                  ASSERT(buf->b_efunc == NULL);
3898 3905                  ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3899 3906                  hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3900 3907          }
3901 3908  
3902 3909          ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3903 3910              callback_list != NULL);
3904 3911  
                   /*
                    * On I/O error: flag the header, move it to the anonymous
                    * state, take it out of the hash table, and remember whether it
                    * is unreferenced so that it can be destroyed at the end.
                    */
3905 3912          if (zio->io_error != 0) {
3906 3913                  hdr->b_flags |= ARC_FLAG_IO_ERROR;
3907 3914                  if (hdr->b_l1hdr.b_state != arc_anon)
3908 3915                          arc_change_state(arc_anon, hdr, hash_lock);
3909 3916                  if (HDR_IN_HASH_TABLE(hdr))
3910 3917                          buf_hash_remove(hdr);
3911 3918                  freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3912 3919          }
3913 3920  
3914 3921          /*
3915 3922           * Broadcast before we drop the hash_lock to avoid the possibility
3916 3923           * that the hdr (and hence the cv) might be freed before we get to
3917 3924           * the cv_broadcast().
3918 3925           */
3919 3926          cv_broadcast(&hdr->b_l1hdr.b_cv);
3920 3927  
3921 3928          if (hash_lock != NULL) {
3922 3929                  mutex_exit(hash_lock);
3923 3930          } else {
3924 3931                  /*
3925 3932                   * This block was freed while we waited for the read to
3926 3933                   * complete.  It has been removed from the hash table and
3927 3934                   * moved to the anonymous state (so that it won't show up
3928 3935                   * in the cache).
3929 3936                   */
3930 3937                  ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3931 3938                  freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3932 3939          }
3933 3940  
3934 3941          /* execute each callback and free its structure */
3935 3942          while ((acb = callback_list) != NULL) {
3936 3943                  if (acb->acb_done)
3937 3944                          acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3938 3945  
                           /* pass the read's error on to the dummy zio and issue it */
3939 3946                  if (acb->acb_zio_dummy != NULL) {
3940 3947                          acb->acb_zio_dummy->io_error = zio->io_error;
3941 3948                          zio_nowait(acb->acb_zio_dummy);
3942 3949                  }
3943 3950  
3944 3951                  callback_list = acb->acb_next;
3945 3952                  kmem_free(acb, sizeof (arc_callback_t));
3946 3953          }
3947 3954  
3948 3955          if (freeable)
3949 3956                  arc_hdr_destroy(hdr);
3950 3957  }
3951 3958  
3952 3959  /*
3953 3960   * "Read" the block at the specified DVA (in bp) via the
3954 3961   * cache.  If the block is found in the cache, invoke the provided
3955 3962   * callback immediately and return.  Note that the `zio' parameter
3956 3963   * in the callback will be NULL in this case, since no IO was
3957 3964   * required.  If the block is not in the cache pass the read request
3958 3965   * on to the spa with a substitute callback function, so that the
3959 3966   * requested block will be added to the cache.
3960 3967   *
3961 3968   * If a read request arrives for a block that has a read in-progress,
3962 3969   * either wait for the in-progress read to complete (and return the
3963 3970   * results); or, if this is a read with a "done" func, add a record
3964 3971   * to the read to invoke the "done" func when the read completes,
3965 3972   * and return; or just return.
3966 3973   *
3967 3974   * arc_read_done() will invoke all the requested "done" functions
3968 3975   * for readers of this block.
            *
            * ARC_FLAG_CACHED is OR-ed into *arc_flags whenever a header that
            * already has data buffers attached is found in the hash table.
            *
            * Return value: 0 on a cache hit, when the request is attached to (or
            * simply skips) an in-progress read, when the I/O is issued with
            * ARC_FLAG_NOWAIT, or when a waited-for L2ARC read succeeds.  For an
            * ARC_FLAG_WAIT miss served by zio_read(), the result of zio_wait()
            * is returned.
3969 3976   */
3970 3977  int
3971 3978  arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3972 3979      void *private, zio_priority_t priority, int zio_flags,
3973 3980      arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3974 3981  {
3975 3982          arc_buf_hdr_t *hdr = NULL;
3976 3983          arc_buf_t *buf = NULL;
3977 3984          kmutex_t *hash_lock = NULL;
3978 3985          zio_t *rzio;
3979 3986          uint64_t guid = spa_load_guid(spa);
3980 3987  
3981 3988          ASSERT(!BP_IS_EMBEDDED(bp) ||
3982 3989              BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3983 3990  
3984 3991  top:
3985 3992          if (!BP_IS_EMBEDDED(bp)) {
3986 3993                  /*
                            * Only non-embedded BPs are looked up in the hash table.
3987 3994                   * Embedded BP's have no DVA and require no I/O to "read".
3988 3995                   * Create an anonymous arc buf to back it.
                            * (For an embedded BP the lookup is skipped, hdr stays
                            * NULL, and the buf is allocated in the miss path below
                            * without being inserted into the hash table.)
3989 3996                   */
3990 3997                  hdr = buf_hash_find(guid, bp, &hash_lock);
3991 3998          }
3992 3999  
3993 4000          if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3994 4001  
3995 4002                  *arc_flags |= ARC_FLAG_CACHED;
3996 4003  
3997 4004                  if (HDR_IO_IN_PROGRESS(hdr)) {
3998 4005  
3999 4006                          if (*arc_flags & ARC_FLAG_WAIT) {
                                           /*
                                            * Block on the header's cv, then drop the
                                            * hash lock and redo the lookup from `top'.
                                            */
4000 4007                                  cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
4001 4008                                  mutex_exit(hash_lock);
4002 4009                                  goto top;
4003 4010                          }
4004 4011                          ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4005 4012  
4006 4013                          if (done) {
                                           /*
                                            * Push a callback record onto the front of
                                            * the in-progress read's b_acb list and take
                                            * a hold on the header for `private'.
                                            */
4007 4014                                  arc_callback_t  *acb = NULL;
4008 4015  
4009 4016                                  acb = kmem_zalloc(sizeof (arc_callback_t),
4010 4017                                      KM_SLEEP);
4011 4018                                  acb->acb_done = done;
4012 4019                                  acb->acb_private = private;
4013 4020                                  if (pio != NULL)
4014 4021                                          acb->acb_zio_dummy = zio_null(pio,
4015 4022                                              spa, NULL, NULL, NULL, zio_flags);
4016 4023  
4017 4024                                  ASSERT(acb->acb_done != NULL);
4018 4025                                  acb->acb_next = hdr->b_l1hdr.b_acb;
4019 4026                                  hdr->b_l1hdr.b_acb = acb;
4020 4027                                  add_reference(hdr, hash_lock, private);
4021 4028                                  mutex_exit(hash_lock);
4022 4029                                  return (0);
4023 4030                          }
                                   /* NOWAIT without a "done" func: nothing to record. */
4024 4031                          mutex_exit(hash_lock);
4025 4032                          return (0);
4026 4033                  }
4027 4034  
4028 4035                  ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4029 4036                      hdr->b_l1hdr.b_state == arc_mfu);
4030 4037  
4031 4038                  if (done) {
4032 4039                          add_reference(hdr, hash_lock, private);
4033 4040                          /*
4034 4041                           * If this block is already in use, create a new
4035 4042                           * copy of the data so that we will be guaranteed
4036 4043                           * that arc_release() will always succeed.
4037 4044                           */
4038 4045                          buf = hdr->b_l1hdr.b_buf;
4039 4046                          ASSERT(buf);
4040 4047                          ASSERT(buf->b_data);
4041 4048                          if (HDR_BUF_AVAILABLE(hdr)) {
4042 4049                                  ASSERT(buf->b_efunc == NULL);
4043 4050                                  hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4044 4051                          } else {
4045 4052                                  buf = arc_buf_clone(buf);
4046 4053                          }
4047 4054  
4048 4055                  } else if (*arc_flags & ARC_FLAG_PREFETCH &&
4049 4056                      refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4050 4057                          hdr->b_flags |= ARC_FLAG_PREFETCH;
4051 4058                  }
4052 4059                  DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4053 4060                  arc_access(hdr, hash_lock);
4054 4061                  if (*arc_flags & ARC_FLAG_L2CACHE)
4055 4062                          hdr->b_flags |= ARC_FLAG_L2CACHE;
4056 4063                  if (*arc_flags & ARC_FLAG_L2COMPRESS)
4057 4064                          hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4058 4065                  mutex_exit(hash_lock);
4059 4066                  ARCSTAT_BUMP(arcstat_hits);
4060 4067                  ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4061 4068                      demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4062 4069                      data, metadata, hits);
4063 4070  
                           /* The callback runs after the hash lock has been dropped. */
4064 4071                  if (done)
4065 4072                          done(NULL, buf, private);
4066 4073          } else {
4067 4074                  uint64_t size = BP_GET_LSIZE(bp);
4068 4075                  arc_callback_t *acb;
4069 4076                  vdev_t *vd = NULL;
4070 4077                  uint64_t addr = 0;
4071 4078                  boolean_t devw = B_FALSE;
4072 4079                  enum zio_compress b_compress = ZIO_COMPRESS_OFF;
4073 4080                  int32_t b_asize = 0;
4074 4081  
4075 4082                  if (hdr == NULL) {
4076 4083                          /* this block is not in the cache */
4077 4084                          arc_buf_hdr_t *exists = NULL;
4078 4085                          arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4079 4086                          buf = arc_buf_alloc(spa, size, private, type);
4080 4087                          hdr = buf->b_hdr;
4081 4088                          if (!BP_IS_EMBEDDED(bp)) {
4082 4089                                  hdr->b_dva = *BP_IDENTITY(bp);
4083 4090                                  hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4084 4091                                  exists = buf_hash_insert(hdr, &hash_lock);
4085 4092                          }
4086 4093                          if (exists != NULL) {
4087 4094                                  /* somebody beat us to the hash insert */
4088 4095                                  mutex_exit(hash_lock);
4089 4096                                  buf_discard_identity(hdr);
4090 4097                                  (void) arc_buf_remove_ref(buf, private);
4091 4098                                  goto top; /* restart the IO request */
4092 4099                          }
4093 4100  
4094 4101                          /* if this is a prefetch, we don't have a reference */
4095 4102                          if (*arc_flags & ARC_FLAG_PREFETCH) {
4096 4103                                  (void) remove_reference(hdr, hash_lock,
4097 4104                                      private);
4098 4105                                  hdr->b_flags |= ARC_FLAG_PREFETCH;
4099 4106                          }
4100 4107                          if (*arc_flags & ARC_FLAG_L2CACHE)
4101 4108                                  hdr->b_flags |= ARC_FLAG_L2CACHE;
4102 4109                          if (*arc_flags & ARC_FLAG_L2COMPRESS)
4103 4110                                  hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4104 4111                          if (BP_GET_LEVEL(bp) > 0)
4105 4112                                  hdr->b_flags |= ARC_FLAG_INDIRECT;
4106 4113                  } else {
4107 4114                          /*
4108 4115                           * This block is in the ghost cache. If it was L2-only
4109 4116                           * (and thus didn't have an L1 hdr), we realloc the
4110 4117                           * header to add an L1 hdr.
4111 4118                           */
4112 4119                          if (!HDR_HAS_L1HDR(hdr)) {
4113 4120                                  hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4114 4121                                      hdr_full_cache);
4115 4122                          }
4116 4123  
4117 4124                          ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4118 4125                          ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4119 4126                          ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4120 4127                          ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
4121 4128  
4122 4129                          /* if this is a prefetch, we don't have a reference */
4123 4130                          if (*arc_flags & ARC_FLAG_PREFETCH)
4124 4131                                  hdr->b_flags |= ARC_FLAG_PREFETCH;
4125 4132                          else
4126 4133                                  add_reference(hdr, hash_lock, private);
4127 4134                          if (*arc_flags & ARC_FLAG_L2CACHE)
4128 4135                                  hdr->b_flags |= ARC_FLAG_L2CACHE;
4129 4136                          if (*arc_flags & ARC_FLAG_L2COMPRESS)
4130 4137                                  hdr->b_flags |= ARC_FLAG_L2COMPRESS;
                                   /*
                                    * Hand-build a fresh arc_buf_t, make it the header's
                                    * only buf, and then obtain a data buffer for it.
                                    */
4131 4138                          buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4132 4139                          buf->b_hdr = hdr;
4133 4140                          buf->b_data = NULL;
4134 4141                          buf->b_efunc = NULL;
4135 4142                          buf->b_private = NULL;
4136 4143                          buf->b_next = NULL;
4137 4144                          hdr->b_l1hdr.b_buf = buf;
4138 4145                          ASSERT0(hdr->b_l1hdr.b_datacnt);
4139 4146                          hdr->b_l1hdr.b_datacnt = 1;
4140 4147                          arc_get_data_buf(buf);
4141 4148                          arc_access(hdr, hash_lock);
4142 4149                  }
4143 4150  
4144 4151                  ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4145 4152  
                           /*
                            * Record the caller's callback as the only entry on b_acb
                            * and flag the header as having I/O in progress.
                            */
4146 4153                  acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4147 4154                  acb->acb_done = done;
4148 4155                  acb->acb_private = private;
4149 4156  
4150 4157                  ASSERT(hdr->b_l1hdr.b_acb == NULL);
4151 4158                  hdr->b_l1hdr.b_acb = acb;
4152 4159                  hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4153 4160  
                           /*
                            * Copy the L2ARC location details into locals before the
                            * hash lock (if any) is dropped below.
                            */
4154 4161                  if (HDR_HAS_L2HDR(hdr) &&
4155 4162                      (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4156 4163                          devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4157 4164                          addr = hdr->b_l2hdr.b_daddr;
4158 4165                          b_compress = hdr->b_l2hdr.b_compress;
4159 4166                          b_asize = hdr->b_l2hdr.b_asize;
4160 4167                          /*
4161 4168                           * Lock out device removal.
4162 4169                           */
4163 4170                          if (vdev_is_dead(vd) ||
4164 4171                              !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4165 4172                                  vd = NULL;
4166 4173                  }
4167 4174  
                           /* hash_lock is still NULL when no hash lookup/insert was done. */
4168 4175                  if (hash_lock != NULL)
4169 4176                          mutex_exit(hash_lock);
4170 4177  
4171 4178                  /*
4172 4179                   * At this point, we have a level 1 cache miss.  Try again in
4173 4180                   * L2ARC if possible.
4174 4181                   */
4175 4182                  ASSERT3U(hdr->b_size, ==, size);
4176 4183                  DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4177 4184                      uint64_t, size, zbookmark_phys_t *, zb);
4178 4185                  ARCSTAT_BUMP(arcstat_misses);
4179 4186                  ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4180 4187                      demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4181 4188                      data, metadata, misses);
4182 4189  
4183 4190                  if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4184 4191                          /*
4185 4192                           * Read from the L2ARC if the following are true:
4186 4193                           * 1. The L2ARC vdev was previously cached.
4187 4194                           * 2. This buffer still has L2ARC metadata.
4188 4195                           * 3. This buffer isn't currently writing to the L2ARC.
4189 4196                           * 4. The L2ARC entry wasn't evicted, which may
4190 4197                           *    also have invalidated the vdev.
4191 4198                           * 5. This isn't prefetch and l2arc_noprefetch is set.
4192 4199                           */
4193 4200                          if (HDR_HAS_L2HDR(hdr) &&
4194 4201                              !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4195 4202                              !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4196 4203                                  l2arc_read_callback_t *cb;
4197 4204  
4198 4205                                  DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4199 4206                                  ARCSTAT_BUMP(arcstat_l2_hits);
4200 4207  
                                           /*
                                            * NOTE(review): *zb is dereferenced
                                            * unconditionally here; this assumes callers
                                            * never pass a NULL zb on this path -- confirm.
                                            */
4201 4208                                  cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4202 4209                                      KM_SLEEP);
4203 4210                                  cb->l2rcb_buf = buf;
4204 4211                                  cb->l2rcb_spa = spa;
4205 4212                                  cb->l2rcb_bp = *bp;
4206 4213                                  cb->l2rcb_zb = *zb;
4207 4214                                  cb->l2rcb_flags = zio_flags;
4208 4215                                  cb->l2rcb_compress = b_compress;
4209 4216  
4210 4217                                  ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4211 4218                                      addr + size < vd->vdev_psize -
4212 4219                                      VDEV_LABEL_END_SIZE);
4213 4220  
4214 4221                                  /*
4215 4222                                   * l2arc read.  The SCL_L2ARC lock will be
4216 4223                                   * released by l2arc_read_done().
4217 4224                                   * Issue a null zio if the underlying buffer
4218 4225                                   * was squashed to zero size by compression.
4219 4226                                   */
4220 4227                                  if (b_compress == ZIO_COMPRESS_EMPTY) {
4221 4228                                          rzio = zio_null(pio, spa, vd,
4222 4229                                              l2arc_read_done, cb,
4223 4230                                              zio_flags | ZIO_FLAG_DONT_CACHE |
4224 4231                                              ZIO_FLAG_CANFAIL |
4225 4232                                              ZIO_FLAG_DONT_PROPAGATE |
4226 4233                                              ZIO_FLAG_DONT_RETRY);
4227 4234                                  } else {
4228 4235                                          rzio = zio_read_phys(pio, vd, addr,
4229 4236                                              b_asize, buf->b_data,
4230 4237                                              ZIO_CHECKSUM_OFF,
4231 4238                                              l2arc_read_done, cb, priority,
4232 4239                                              zio_flags | ZIO_FLAG_DONT_CACHE |
4233 4240                                              ZIO_FLAG_CANFAIL |
4234 4241                                              ZIO_FLAG_DONT_PROPAGATE |
4235 4242                                              ZIO_FLAG_DONT_RETRY, B_FALSE);
4236 4243                                  }
4237 4244                                  DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4238 4245                                      zio_t *, rzio);
4239 4246                                  ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4240 4247  
4241 4248                                  if (*arc_flags & ARC_FLAG_NOWAIT) {
4242 4249                                          zio_nowait(rzio);
4243 4250                                          return (0);
4244 4251                                  }
4245 4252  
4246 4253                                  ASSERT(*arc_flags & ARC_FLAG_WAIT);
4247 4254                                  if (zio_wait(rzio) == 0)
4248 4255                                          return (0);
4249 4256  
4250 4257                                  /* l2arc read error; goto zio_read() */
4251 4258                          } else {
4252 4259                                  DTRACE_PROBE1(l2arc__miss,
4253 4260                                      arc_buf_hdr_t *, hdr);
4254 4261                                  ARCSTAT_BUMP(arcstat_l2_misses);
4255 4262                                  if (HDR_L2_WRITING(hdr))
4256 4263                                          ARCSTAT_BUMP(arcstat_l2_rw_clash);
4257 4264                                  spa_config_exit(spa, SCL_L2ARC, vd);
4258 4265                          }
4259 4266                  } else {
                                   /* Not reading from L2ARC: drop SCL_L2ARC if it was taken. */
4260 4267                          if (vd != NULL)
4261 4268                                  spa_config_exit(spa, SCL_L2ARC, vd);
4262 4269                          if (l2arc_ndev != 0) {
4263 4270                                  DTRACE_PROBE1(l2arc__miss,
4264 4271                                      arc_buf_hdr_t *, hdr);
4265 4272                                  ARCSTAT_BUMP(arcstat_l2_misses);
4266 4273                          }
4267 4274                  }
4268 4275  
                           /*
                            * Issue the read through zio_read(), with arc_read_done() as
                            * the completion callback and buf as its private argument.
                            */
4269 4276                  rzio = zio_read(pio, spa, bp, buf->b_data, size,
4270 4277                      arc_read_done, buf, priority, zio_flags, zb);
4271 4278  
4272 4279                  if (*arc_flags & ARC_FLAG_WAIT)
4273 4280                          return (zio_wait(rzio));
4274 4281  
4275 4282                  ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4276 4283                  zio_nowait(rzio);
4277 4284          }
4278 4285          return (0);
4279 4286  }
4280 4287  
           /*
            * Attach an eviction callback to buf: func is stored in buf->b_efunc
            * and private in buf->b_private.
            *
            * The ASSERTs require that buf has a header which is not in the
            * arc_anon state, that no callback is already set on buf, that the
            * header is not flagged ARC_FLAG_BUF_AVAILABLE, and that a non-NULL
            * func is only installed while the header's refcount is non-zero.
            */
4281 4288  void
4282 4289  arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4283 4290  {
4284 4291          ASSERT(buf->b_hdr != NULL);
4285 4292          ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4286 4293          ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4287 4294              func == NULL);
4288 4295          ASSERT(buf->b_efunc == NULL);
4289 4296          ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4290 4297  
4291 4298          buf->b_efunc = func;
4292 4299          buf->b_private = private;
4293 4300  }
4294 4301  
4295 4302  /*
4296 4303   * Notify the arc that a block was freed, and thus will never be used again.
            *
            * The header is looked up by (load guid of spa, bp).  If none is found
            * nothing is done.  If the header's buf is flagged as available, a
            * temporary hold tagged FTAG is taken, the flag is cleared, and the buf
            * is passed to arc_release() and then arc_buf_remove_ref().  Otherwise
            * only the hash lock returned by the lookup is dropped.
            *
            * bp must not be an embedded block pointer (asserted).
4297 4304   */
4298 4305  void
4299 4306  arc_freed(spa_t *spa, const blkptr_t *bp)
4300 4307  {
4301 4308          arc_buf_hdr_t *hdr;
4302 4309          kmutex_t *hash_lock;
4303 4310          uint64_t guid = spa_load_guid(spa);
4304 4311  
4305 4312          ASSERT(!BP_IS_EMBEDDED(bp));
4306 4313  
4307 4314          hdr = buf_hash_find(guid, bp, &hash_lock);
4308 4315          if (hdr == NULL)
4309 4316                  return;
4310 4317          if (HDR_BUF_AVAILABLE(hdr)) {
4311 4318                  arc_buf_t *buf = hdr->b_l1hdr.b_buf;
                           /* Claim the available buf before dropping the hash lock. */
4312 4319                  add_reference(hdr, hash_lock, FTAG);
4313 4320                  hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4314 4321                  mutex_exit(hash_lock);
4315 4322  
                           /* Release and unreference the buf with no hash lock held. */
4316 4323                  arc_release(buf, FTAG);
4317 4324                  (void) arc_buf_remove_ref(buf, FTAG);
4318 4325          } else {
4319 4326                  mutex_exit(hash_lock);
4320 4327          }
4321 4328  
4322 4329  }
4323 4330  
4324 4331  /*
4325 4332   * Clear the user eviction callback set by arc_set_callback(), first calling
4326 4333   * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4327 4334   * clearing the callback may result in the arc_buf being destroyed.  However,
4328 4335   * it will not result in the *last* arc_buf being destroyed, hence the data
4329 4336   * will remain cached in the ARC. We make a copy of the arc buffer here so
4330 4337   * that we can process the callback without holding any locks.
4331 4338   *
4332 4339   * It's possible that the callback is already in the process of being cleared
4333 4340   * by another thread.  In this case we can not clear the callback.
4334 4341   *
4335 4342   * Returns B_TRUE if the callback was successfully called and cleared.
            * Returns B_FALSE when buf->b_hdr is already NULL.  The callback's
            * return value is checked with VERIFY0().
4336 4343   */
4337 4344  boolean_t
4338 4345  arc_clear_callback(arc_buf_t *buf)
4339 4346  {
4340 4347          arc_buf_hdr_t *hdr;
4341 4348          kmutex_t *hash_lock;
                   /*
                    * NOTE(review): efunc and private are sampled before b_evict_lock
                    * is taken -- confirm callers guarantee they cannot change
                    * concurrently.
                    */
4342 4349          arc_evict_func_t *efunc = buf->b_efunc;
4343 4350          void *private = buf->b_private;
4344 4351  
4345 4352          mutex_enter(&buf->b_evict_lock);
4346 4353          hdr = buf->b_hdr;
4347 4354          if (hdr == NULL) {
4348 4355                  /*
4349 4356                   * We are in arc_do_user_evicts().
4350 4357                   */
4351 4358                  ASSERT(buf->b_data == NULL);
4352 4359                  mutex_exit(&buf->b_evict_lock);
4353 4360                  return (B_FALSE);
4354 4361          } else if (buf->b_data == NULL) {
4355 4362                  /*
4356 4363                   * We are on the eviction list; process this buffer now
4357 4364                   * but let arc_do_user_evicts() do the reaping.
4358 4365                   */
4359 4366                  buf->b_efunc = NULL;
4360 4367                  mutex_exit(&buf->b_evict_lock);
4361 4368                  VERIFY0(efunc(private));
4362 4369                  return (B_TRUE);
4363 4370          }
                   /*
                    * Take the header's hash lock (b_evict_lock is still held), then
                    * re-read b_hdr and assert that the lock still matches it.
                    */
4364 4371          hash_lock = HDR_LOCK(hdr);
4365 4372          mutex_enter(hash_lock);
4366 4373          hdr = buf->b_hdr;
4367 4374          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4368 4375  
4369 4376          ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4370 4377              hdr->b_l1hdr.b_datacnt);
4371 4378          ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4372 4379              hdr->b_l1hdr.b_state == arc_mfu);
4373 4380  
4374 4381          buf->b_efunc = NULL;
4375 4382          buf->b_private = NULL;
4376 4383  
                   /*
                    * If the header has other bufs, destroy this one; otherwise it is
                    * the header's only buf, so keep it and flag it as available.
                    */
4377 4384          if (hdr->b_l1hdr.b_datacnt > 1) {
4378 4385                  mutex_exit(&buf->b_evict_lock);
4379 4386                  arc_buf_destroy(buf, TRUE);
4380 4387          } else {
4381 4388                  ASSERT(buf == hdr->b_l1hdr.b_buf);
4382 4389                  hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4383 4390                  mutex_exit(&buf->b_evict_lock);
4384 4391          }
4385 4392  
                   /* Run the saved callback only after both locks have been dropped. */
4386 4393          mutex_exit(hash_lock);
4387 4394          VERIFY0(efunc(private));
4388 4395          return (B_TRUE);
4389 4396  }
4390 4397  
4391 4398  /*
4392 4399   * Release this buffer from the cache, making it an anonymous buffer.  This
4393 4400   * must be done after a read and prior to modifying the buffer contents.
4394 4401   * If the buffer has more than one reference, we must make
4395 4402   * a new hdr for the buffer.
            *
            * tag identifies the caller's hold: when a new hdr is made, the hold is
            * removed from the old hdr and added to the new one under the same tag.
            * On return buf->b_efunc and buf->b_private are NULL, except on the
            * early-return path for an already anonymous buffer, where they are
            * asserted to be NULL.
4396 4403   */
4397 4404  void
4398 4405  arc_release(arc_buf_t *buf, void *tag)
4399 4406  {
4400 4407          arc_buf_hdr_t *hdr = buf->b_hdr;
4401 4408  
4402 4409          /*
4403 4410           * It would be nice to assert that if it's DMU metadata (level >
4404 4411           * 0 || it's the dnode file), then it must be syncing context.
4405 4412           * But we don't know that information at this level.
4406 4413           */
4407 4414  
4408 4415          mutex_enter(&buf->b_evict_lock);
4409 4416  
4410 4417          ASSERT(HDR_HAS_L1HDR(hdr));
4411 4418  
4412 4419          /*
4413 4420           * We don't grab the hash lock prior to this check, because if
4414 4421           * the buffer's header is in the arc_anon state, it won't be
4415 4422           * linked into the hash table.
4416 4423           */
4417 4424          if (hdr->b_l1hdr.b_state == arc_anon) {
4418 4425                  mutex_exit(&buf->b_evict_lock);
4419 4426                  ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4420 4427                  ASSERT(!HDR_IN_HASH_TABLE(hdr));
4421 4428                  ASSERT(!HDR_HAS_L2HDR(hdr));
4422 4429                  ASSERT(BUF_EMPTY(hdr));
4423 4430  
4424 4431                  ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4425 4432                  ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
                           /*
                            * NOTE(review): this path checks the node with
                            * list_link_active() while the non-anon path below uses
                            * multilist_link_active() -- confirm which is intended.
                            */
4426 4433                  ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4427 4434  
4428 4435                  ASSERT3P(buf->b_efunc, ==, NULL);
4429 4436                  ASSERT3P(buf->b_private, ==, NULL);
4430 4437  
                           /* Already anonymous: reset the access time and thaw the buf. */
4431 4438                  hdr->b_l1hdr.b_arc_access = 0;
4432 4439                  arc_buf_thaw(buf);
4433 4440  
4434 4441                  return;
4435 4442          }
4436 4443  
4437 4444          kmutex_t *hash_lock = HDR_LOCK(hdr);
4438 4445          mutex_enter(hash_lock);
4439 4446  
4440 4447          /*
4441 4448           * This assignment is only valid as long as the hash_lock is
4442 4449           * held, we must be careful not to reference state or the
4443 4450           * b_state field after dropping the lock.
4444 4451           */
4445 4452          arc_state_t *state = hdr->b_l1hdr.b_state;
4446 4453          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4447 4454          ASSERT3P(state, !=, arc_anon);
4448 4455  
4449 4456          /* this buffer is not on any list */
4450 4457          ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4451 4458  
4452 4459          if (HDR_HAS_L2HDR(hdr)) {
4453 4460                  mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4454 4461  
4455 4462                  /*
4456 4463                   * We have to recheck this conditional again now that
4457 4464                   * we're holding the l2ad_mtx to prevent a race with
4458 4465                   * another thread which might be concurrently calling
4459 4466                   * l2arc_evict(). In that case, l2arc_evict() might have
4460 4467                   * destroyed the header's L2 portion as we were waiting
4461 4468                   * to acquire the l2ad_mtx.
4462 4469                   */
4463 4470                  if (HDR_HAS_L2HDR(hdr))
4464 4471                          arc_hdr_l2hdr_destroy(hdr);
4465 4472  
                           /*
                            * NOTE(review): b_l2hdr.b_dev is read again here, after the
                            * L2 portion may have been destroyed; this relies on b_dev
                            * still being valid at that point -- confirm.
                            */
4466 4473                  mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4467 4474          }
4468 4475  
4469 4476          /*
4470 4477           * Do we have more than one buf?
4471 4478           */
4472 4479          if (hdr->b_l1hdr.b_datacnt > 1) {
4473 4480                  arc_buf_hdr_t *nhdr;
4474 4481                  arc_buf_t **bufp;
                           /*
                            * Copy the fields needed for the new hdr into locals, since
                            * the hash lock is dropped before nhdr is filled in.
                            */
4475 4482                  uint64_t blksz = hdr->b_size;
4476 4483                  uint64_t spa = hdr->b_spa;
4477 4484                  arc_buf_contents_t type = arc_buf_type(hdr);
4478 4485                  uint32_t flags = hdr->b_flags;
4479 4486  
4480 4487                  ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4481 4488                  /*
4482 4489                   * Pull the data off of this hdr and attach it to
4483 4490                   * a new anonymous hdr.
4484 4491                   */
4485 4492                  (void) remove_reference(hdr, hash_lock, tag);
                           /* Unlink buf from the hdr's singly linked b_buf list. */
4486 4493                  bufp = &hdr->b_l1hdr.b_buf;
4487 4494                  while (*bufp != buf)
4488 4495                          bufp = &(*bufp)->b_next;
4489 4496                  *bufp = buf->b_next;
4490 4497                  buf->b_next = NULL;
4491 4498  
4492 4499                  ASSERT3P(state, !=, arc_l2c_only);
4493 4500  
                           /* Remove this buf's bytes from the old state's size. */
4494 4501                  (void) refcount_remove_many(
4495 4502                      &state->arcs_size, hdr->b_size, buf);
4496 4503  
                           /*
                            * If that was the last hold on the old hdr, also subtract
                            * the buf's bytes from the state's per-type arcs_lsize.
                            */
4497 4504                  if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4498 4505                          ASSERT3P(state, !=, arc_l2c_only);
4499 4506                          uint64_t *size = &state->arcs_lsize[type];
4500 4507                          ASSERT3U(*size, >=, hdr->b_size);
4501 4508                          atomic_add_64(size, -hdr->b_size);
4502 4509                  }
4503 4510  
4504 4511                  /*
4505 4512                   * We're releasing a duplicate user data buffer, update
4506 4513                   * our statistics accordingly.
4507 4514                   */
4508 4515                  if (HDR_ISTYPE_DATA(hdr)) {
4509 4516                          ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4510 4517                          ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4511 4518                              -hdr->b_size);
4512 4519                  }
4513 4520                  hdr->b_l1hdr.b_datacnt -= 1;
4514 4521                  arc_cksum_verify(buf);
4515 4522                  arc_buf_unwatch(buf);
4516 4523  
4517 4524                  mutex_exit(hash_lock);
4518 4525  
                           /*
                            * Build the replacement anonymous hdr with the hash lock
                            * dropped; b_evict_lock on buf is still held.
                            */
4519 4526                  nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4520 4527                  nhdr->b_size = blksz;
4521 4528                  nhdr->b_spa = spa;
4522 4529  
                           /* Of the old flags, only the L2_WRITING bit is carried over. */
4523 4530                  nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4524 4531                  nhdr->b_flags |= arc_bufc_to_flags(type);
4525 4532                  nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4526 4533  
4527 4534                  nhdr->b_l1hdr.b_buf = buf;
4528 4535                  nhdr->b_l1hdr.b_datacnt = 1;
4529 4536                  nhdr->b_l1hdr.b_state = arc_anon;
4530 4537                  nhdr->b_l1hdr.b_arc_access = 0;
4531 4538                  nhdr->b_l1hdr.b_tmp_cdata = NULL;
4532 4539                  nhdr->b_freeze_cksum = NULL;
4533 4540  
                           /* Re-add the caller's hold on the new hdr and account its size. */
4534 4541                  (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4535 4542                  buf->b_hdr = nhdr;
4536 4543                  mutex_exit(&buf->b_evict_lock);
4537 4544                  (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
4538 4545          } else {
                           /*
                            * Sole buf on the hdr: move the existing hdr to arc_anon in
                            * place, then discard its identity and thaw the buf.
                            */
4539 4546                  mutex_exit(&buf->b_evict_lock);
4540 4547                  ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4541 4548                  /* protected by hash lock, or hdr is on arc_anon */
4542 4549                  ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4543 4550                  ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4544 4551                  arc_change_state(arc_anon, hdr, hash_lock);
4545 4552                  hdr->b_l1hdr.b_arc_access = 0;
4546 4553                  mutex_exit(hash_lock);
4547 4554  
4548 4555                  buf_discard_identity(hdr);
4549 4556                  arc_buf_thaw(buf);
4550 4557          }
                   /* Both branches end with no eviction callback attached to buf. */
4551 4558          buf->b_efunc = NULL;
4552 4559          buf->b_private = NULL;
4553 4560  }
4554 4561  
4555 4562  int
4556 4563  arc_released(arc_buf_t *buf)
4557 4564  {
4558 4565          int released;
4559 4566  
4560 4567          mutex_enter(&buf->b_evict_lock);
4561 4568          released = (buf->b_data != NULL &&
4562 4569              buf->b_hdr->b_l1hdr.b_state == arc_anon);
4563 4570          mutex_exit(&buf->b_evict_lock);
4564 4571          return (released);
4565 4572  }
4566 4573  
4567 4574  #ifdef ZFS_DEBUG
4568 4575  int
4569 4576  arc_referenced(arc_buf_t *buf)
4570 4577  {
4571 4578          int referenced;
4572 4579  
4573 4580          mutex_enter(&buf->b_evict_lock);
4574 4581          referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4575 4582          mutex_exit(&buf->b_evict_lock);
4576 4583          return (referenced);
4577 4584  }
4578 4585  #endif
4579 4586  
4580 4587  static void
4581 4588  arc_write_ready(zio_t *zio)
4582 4589  {
4583 4590          arc_write_callback_t *callback = zio->io_private;
4584 4591          arc_buf_t *buf = callback->awcb_buf;
4585 4592          arc_buf_hdr_t *hdr = buf->b_hdr;
4586 4593  
4587 4594          ASSERT(HDR_HAS_L1HDR(hdr));
4588 4595          ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4589 4596          ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4590 4597          callback->awcb_ready(zio, buf, callback->awcb_private);
4591 4598  
4592 4599          /*
4593 4600           * If the IO is already in progress, then this is a re-write
4594 4601           * attempt, so we need to thaw and re-compute the cksum.
4595 4602           * It is the responsibility of the callback to handle the
4596 4603           * accounting for any re-write attempt.
4597 4604           */
4598 4605          if (HDR_IO_IN_PROGRESS(hdr)) {
4599 4606                  mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4600 4607                  if (hdr->b_freeze_cksum != NULL) {
4601 4608                          kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4602 4609                          hdr->b_freeze_cksum = NULL;
4603 4610                  }
4604 4611                  mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4605 4612          }
4606 4613          arc_cksum_compute(buf, B_FALSE);
4607 4614          hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4608 4615  }
4609 4616  
4610 4617  /*
4611 4618   * The SPA calls this callback for each physical write that happens on behalf
4612 4619   * of a logical write.  See the comment in dbuf_write_physdone() for details.
4613 4620   */
4614 4621  static void
4615 4622  arc_write_physdone(zio_t *zio)
4616 4623  {
4617 4624          arc_write_callback_t *cb = zio->io_private;
4618 4625          if (cb->awcb_physdone != NULL)
4619 4626                  cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4620 4627  }
4621 4628  
4622 4629  static void
4623 4630  arc_write_done(zio_t *zio)
4624 4631  {
4625 4632          arc_write_callback_t *callback = zio->io_private;
4626 4633          arc_buf_t *buf = callback->awcb_buf;
4627 4634          arc_buf_hdr_t *hdr = buf->b_hdr;
4628 4635  
4629 4636          ASSERT(hdr->b_l1hdr.b_acb == NULL);
4630 4637  
4631 4638          if (zio->io_error == 0) {
4632 4639                  if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4633 4640                          buf_discard_identity(hdr);
4634 4641                  } else {
4635 4642                          hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4636 4643                          hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4637 4644                  }
4638 4645          } else {
4639 4646                  ASSERT(BUF_EMPTY(hdr));
4640 4647          }
4641 4648  
4642 4649          /*
4643 4650           * If the block to be written was all-zero or compressed enough to be
4644 4651           * embedded in the BP, no write was performed so there will be no
4645 4652           * dva/birth/checksum.  The buffer must therefore remain anonymous
4646 4653           * (and uncached).
4647 4654           */
4648 4655          if (!BUF_EMPTY(hdr)) {
4649 4656                  arc_buf_hdr_t *exists;
4650 4657                  kmutex_t *hash_lock;
4651 4658  
4652 4659                  ASSERT(zio->io_error == 0);
4653 4660  
4654 4661                  arc_cksum_verify(buf);
4655 4662  
4656 4663                  exists = buf_hash_insert(hdr, &hash_lock);
4657 4664                  if (exists != NULL) {
4658 4665                          /*
4659 4666                           * This can only happen if we overwrite for
4660 4667                           * sync-to-convergence, because we remove
4661 4668                           * buffers from the hash table when we arc_free().
4662 4669                           */
4663 4670                          if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4664 4671                                  if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4665 4672                                          panic("bad overwrite, hdr=%p exists=%p",
4666 4673                                              (void *)hdr, (void *)exists);
4667 4674                                  ASSERT(refcount_is_zero(
4668 4675                                      &exists->b_l1hdr.b_refcnt));
4669 4676                                  arc_change_state(arc_anon, exists, hash_lock);
4670 4677                                  mutex_exit(hash_lock);
4671 4678                                  arc_hdr_destroy(exists);
4672 4679                                  exists = buf_hash_insert(hdr, &hash_lock);
4673 4680                                  ASSERT3P(exists, ==, NULL);
4674 4681                          } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4675 4682                                  /* nopwrite */
4676 4683                                  ASSERT(zio->io_prop.zp_nopwrite);
4677 4684                                  if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4678 4685                                          panic("bad nopwrite, hdr=%p exists=%p",
4679 4686                                              (void *)hdr, (void *)exists);
4680 4687                          } else {
4681 4688                                  /* Dedup */
4682 4689                                  ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4683 4690                                  ASSERT(hdr->b_l1hdr.b_state == arc_anon);
4684 4691                                  ASSERT(BP_GET_DEDUP(zio->io_bp));
4685 4692                                  ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
4686 4693                          }
4687 4694                  }
4688 4695                  hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4689 4696                  /* if it's not anon, we are doing a scrub */
4690 4697                  if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
4691 4698                          arc_access(hdr, hash_lock);
4692 4699                  mutex_exit(hash_lock);
4693 4700          } else {
4694 4701                  hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4695 4702          }
4696 4703  
4697 4704          ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4698 4705          callback->awcb_done(zio, buf, callback->awcb_private);
4699 4706  
4700 4707          kmem_free(callback, sizeof (arc_write_callback_t));
4701 4708  }
4702 4709  
4703 4710  zio_t *
4704 4711  arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4705 4712      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4706 4713      const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4707 4714      arc_done_func_t *done, void *private, zio_priority_t priority,
4708 4715      int zio_flags, const zbookmark_phys_t *zb)
4709 4716  {
4710 4717          arc_buf_hdr_t *hdr = buf->b_hdr;
4711 4718          arc_write_callback_t *callback;
4712 4719          zio_t *zio;
4713 4720  
4714 4721          ASSERT(ready != NULL);
4715 4722          ASSERT(done != NULL);
4716 4723          ASSERT(!HDR_IO_ERROR(hdr));
4717 4724          ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4718 4725          ASSERT(hdr->b_l1hdr.b_acb == NULL);
4719 4726          ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4720 4727          if (l2arc)
4721 4728                  hdr->b_flags |= ARC_FLAG_L2CACHE;
4722 4729          if (l2arc_compress)
4723 4730                  hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4724 4731          callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4725 4732          callback->awcb_ready = ready;
4726 4733          callback->awcb_physdone = physdone;
4727 4734          callback->awcb_done = done;
4728 4735          callback->awcb_private = private;
4729 4736          callback->awcb_buf = buf;
4730 4737  
4731 4738          zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4732 4739              arc_write_ready, arc_write_physdone, arc_write_done, callback,
4733 4740              priority, zio_flags, zb);
4734 4741  
4735 4742          return (zio);
4736 4743  }
4737 4744  
/*
 * Decide whether a reservation of `reserve` bytes made in transaction group
 * `txg` has to be throttled because free memory is running low.
 *
 * Returns 0 when the reservation may proceed, ERESTART when the pageout
 * process has already accumulated too much load for this txg, and EAGAIN
 * when load has accumulated and arc_reclaim_needed() reports that memory
 * is low.  Outside the kernel this always returns 0.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
	/* Load accumulated by pageout, reset whenever a newer txg shows up. */
	static uint64_t newest_txg = 0;
	static uint64_t pageout_load = 0;
	uint64_t avail = ptob(freemem);

#if defined(__i386)
	avail = MIN(avail, vmem_size(heap_arena, VMEM_FREE));
#endif

	/* Plenty of free pages: nothing to throttle. */
	if (freemem > physmem * arc_lotsfree_percent / 100)
		return (0);

	if (txg > newest_txg) {
		newest_txg = txg;
		pageout_load = 0;
	}

	/*
	 * When called from pageout, memory is known to be tight already
	 * and the arc is going to be evicting anyway, so page writes are
	 * allowed to proceed as quickly as possible, up to a limit.
	 */
	if (curproc == proc_pageout) {
		if (pageout_load > MAX(ptob(minfree), avail) / 4)
			return (SET_ERROR(ERESTART));
		/* Note: reserve is inflated, so we deflate */
		pageout_load += reserve / 8;
		return (0);
	}

	if (pageout_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (SET_ERROR(EAGAIN));
	}
	pageout_load = 0;
#endif
	return (0);
}
4778 4785  
4779 4786  void
4780 4787  arc_tempreserve_clear(uint64_t reserve)
4781 4788  {
4782 4789          atomic_add_64(&arc_tempreserve, -reserve);
4783 4790          ASSERT((int64_t)arc_tempreserve >= 0);
4784 4791  }
4785 4792  
4786 4793  int
4787 4794  arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4788 4795  {
4789 4796          int error;
4790 4797          uint64_t anon_size;
4791 4798  
4792 4799          if (reserve > arc_c/4 && !arc_no_grow)
4793 4800                  arc_c = MIN(arc_c_max, reserve * 4);
4794 4801          if (reserve > arc_c)
4795 4802                  return (SET_ERROR(ENOMEM));
4796 4803  
4797 4804          /*
4798 4805           * Don't count loaned bufs as in flight dirty data to prevent long
4799 4806           * network delays from blocking transactions that are ready to be
4800 4807           * assigned to a txg.
4801 4808           */
4802 4809          anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
4803 4810              arc_loaned_bytes), 0);
4804 4811  
4805 4812          /*
4806 4813           * Writes will, almost always, require additional memory allocations
4807 4814           * in order to compress/encrypt/etc the data.  We therefore need to
4808 4815           * make sure that there is sufficient available memory for this.
4809 4816           */
4810 4817          error = arc_memory_throttle(reserve, txg);
4811 4818          if (error != 0)
4812 4819                  return (error);
4813 4820  
4814 4821          /*
4815 4822           * Throttle writes when the amount of dirty data in the cache
4816 4823           * gets too large.  We try to keep the cache less than half full
4817 4824           * of dirty blocks so that our sync times don't grow too large.
4818 4825           * Note: if two requests come in concurrently, we might let them
4819 4826           * both succeed, when one of them should fail.  Not a huge deal.
4820 4827           */
4821 4828  
4822 4829          if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4823 4830              anon_size > arc_c / 4) {
4824 4831                  dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4825 4832                      "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4826 4833                      arc_tempreserve>>10,
4827 4834                      arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4828 4835                      arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4829 4836                      reserve>>10, arc_c>>10);
4830 4837                  return (SET_ERROR(ERESTART));
4831 4838          }
4832 4839          atomic_add_64(&arc_tempreserve, reserve);
4833 4840          return (0);
4834 4841  }
4835 4842  
4836 4843  static void
4837 4844  arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4838 4845      kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4839 4846  {
4840 4847          size->value.ui64 = refcount_count(&state->arcs_size);
4841 4848          evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4842 4849          evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4843 4850  }
4844 4851  
4845 4852  static int
4846 4853  arc_kstat_update(kstat_t *ksp, int rw)
4847 4854  {
4848 4855          arc_stats_t *as = ksp->ks_data;
4849 4856  
4850 4857          if (rw == KSTAT_WRITE) {
4851 4858                  return (EACCES);
4852 4859          } else {
4853 4860                  arc_kstat_update_state(arc_anon,
4854 4861                      &as->arcstat_anon_size,
4855 4862                      &as->arcstat_anon_evictable_data,
4856 4863                      &as->arcstat_anon_evictable_metadata);
4857 4864                  arc_kstat_update_state(arc_mru,
4858 4865                      &as->arcstat_mru_size,
4859 4866                      &as->arcstat_mru_evictable_data,
4860 4867                      &as->arcstat_mru_evictable_metadata);
4861 4868                  arc_kstat_update_state(arc_mru_ghost,
4862 4869                      &as->arcstat_mru_ghost_size,
4863 4870                      &as->arcstat_mru_ghost_evictable_data,
4864 4871                      &as->arcstat_mru_ghost_evictable_metadata);
4865 4872                  arc_kstat_update_state(arc_mfu,
4866 4873                      &as->arcstat_mfu_size,
4867 4874                      &as->arcstat_mfu_evictable_data,
4868 4875                      &as->arcstat_mfu_evictable_metadata);
4869 4876                  arc_kstat_update_state(arc_mfu_ghost,
4870 4877                      &as->arcstat_mfu_ghost_size,
4871 4878                      &as->arcstat_mfu_ghost_evictable_data,
4872 4879                      &as->arcstat_mfu_ghost_evictable_metadata);
4873 4880          }
4874 4881  
4875 4882          return (0);
4876 4883  }
4877 4884  
4878 4885  /*
4879 4886   * This function *must* return indices evenly distributed between all
4880 4887   * sublists of the multilist. This is needed due to how the ARC eviction
4881 4888   * code is laid out; arc_evict_state() assumes ARC buffers are evenly
4882 4889   * distributed between all sublists and uses this assumption when
4883 4890   * deciding which sublist to evict from and how much to evict from it.
4884 4891   */
4885 4892  unsigned int
4886 4893  arc_state_multilist_index_func(multilist_t *ml, void *obj)
4887 4894  {
4888 4895          arc_buf_hdr_t *hdr = obj;
4889 4896  
4890 4897          /*
4891 4898           * We rely on b_dva to generate evenly distributed index
4892 4899           * numbers using buf_hash below. So, as an added precaution,
4893 4900           * let's make sure we never add empty buffers to the arc lists.
4894 4901           */
4895 4902          ASSERT(!BUF_EMPTY(hdr));
4896 4903  
4897 4904          /*
4898 4905           * The assumption here, is the hash value for a given
4899 4906           * arc_buf_hdr_t will remain constant throughout it's lifetime
4900 4907           * (i.e. it's b_spa, b_dva, and b_birth fields don't change).
4901 4908           * Thus, we don't need to store the header's sublist index
4902 4909           * on insertion, as this index can be recalculated on removal.
4903 4910           *
4904 4911           * Also, the low order bits of the hash value are thought to be
4905 4912           * distributed evenly. Otherwise, in the case that the multilist
4906 4913           * has a power of two number of sublists, each sublists' usage
4907 4914           * would not be evenly distributed.
4908 4915           */
4909 4916          return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
4910 4917              multilist_get_num_sublists(ml));
4911 4918  }
4912 4919  
4913 4920  void
4914 4921  arc_init(void)
4915 4922  {
4916 4923          /*
4917 4924           * allmem is "all memory that we could possibly use".
4918 4925           */
4919 4926  #ifdef _KERNEL
4920 4927          uint64_t allmem = ptob(physmem - swapfs_minfree);
4921 4928  #else
4922 4929          uint64_t allmem = (physmem * PAGESIZE) / 2;
4923 4930  #endif
4924 4931  
4925 4932          mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
4926 4933          cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
4927 4934          cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
4928 4935  
4929 4936          mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
4930 4937          cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
4931 4938  
4932 4939          /* Convert seconds to clock ticks */
4933 4940          arc_min_prefetch_lifespan = 1 * hz;
4934 4941  
4935 4942          /* Start out with 1/8 of all memory */
4936 4943          arc_c = allmem / 8;
4937 4944  
4938 4945  #ifdef _KERNEL
4939 4946          /*
4940 4947           * On architectures where the physical memory can be larger
4941 4948           * than the addressable space (intel in 32-bit mode), we may
4942 4949           * need to limit the cache to 1/8 of VM size.
4943 4950           */
4944 4951          arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4945 4952  #endif
4946 4953  
4947 4954          /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
4948 4955          arc_c_min = MAX(allmem / 32, 64 << 20);
4949 4956          /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
4950 4957          if (allmem >= 1 << 30)
4951 4958                  arc_c_max = allmem - (1 << 30);
4952 4959          else
4953 4960                  arc_c_max = arc_c_min;
4954 4961          arc_c_max = MAX(allmem * 3 / 4, arc_c_max);
4955 4962  
4956 4963          /*
4957 4964           * Allow the tunables to override our calculations if they are
4958 4965           * reasonable (ie. over 64MB)
4959 4966           */
4960 4967          if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem)
4961 4968                  arc_c_max = zfs_arc_max;
4962 4969          if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max)
4963 4970                  arc_c_min = zfs_arc_min;
4964 4971  
4965 4972          arc_c = arc_c_max;
4966 4973          arc_p = (arc_c >> 1);
4967 4974  
4968 4975          /* limit meta-data to 1/4 of the arc capacity */
4969 4976          arc_meta_limit = arc_c_max / 4;
4970 4977  
4971 4978          /* Allow the tunable to override if it is reasonable */
4972 4979          if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4973 4980                  arc_meta_limit = zfs_arc_meta_limit;
4974 4981  
4975 4982          if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4976 4983                  arc_c_min = arc_meta_limit / 2;
4977 4984  
4978 4985          if (zfs_arc_meta_min > 0) {
4979 4986                  arc_meta_min = zfs_arc_meta_min;
4980 4987          } else {
4981 4988                  arc_meta_min = arc_c_min / 2;
4982 4989          }
4983 4990  
4984 4991          if (zfs_arc_grow_retry > 0)
4985 4992                  arc_grow_retry = zfs_arc_grow_retry;
4986 4993  
4987 4994          if (zfs_arc_shrink_shift > 0)
4988 4995                  arc_shrink_shift = zfs_arc_shrink_shift;
4989 4996  
4990 4997          /*
4991 4998           * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
4992 4999           */
4993 5000          if (arc_no_grow_shift >= arc_shrink_shift)
4994 5001                  arc_no_grow_shift = arc_shrink_shift - 1;
4995 5002  
4996 5003          if (zfs_arc_p_min_shift > 0)
4997 5004                  arc_p_min_shift = zfs_arc_p_min_shift;
4998 5005  
4999 5006          if (zfs_arc_num_sublists_per_state < 1)
5000 5007                  zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
5001 5008  
5002 5009          /* if kmem_flags are set, lets try to use less memory */
5003 5010          if (kmem_debugging())
5004 5011                  arc_c = arc_c / 2;
5005 5012          if (arc_c < arc_c_min)
5006 5013                  arc_c = arc_c_min;
5007 5014  
5008 5015          arc_anon = &ARC_anon;
5009 5016          arc_mru = &ARC_mru;
5010 5017          arc_mru_ghost = &ARC_mru_ghost;
5011 5018          arc_mfu = &ARC_mfu;
5012 5019          arc_mfu_ghost = &ARC_mfu_ghost;
5013 5020          arc_l2c_only = &ARC_l2c_only;
5014 5021          arc_size = 0;
5015 5022  
5016 5023          multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
5017 5024              sizeof (arc_buf_hdr_t),
5018 5025              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5019 5026              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5020 5027          multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
5021 5028              sizeof (arc_buf_hdr_t),
5022 5029              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5023 5030              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5024 5031          multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
5025 5032              sizeof (arc_buf_hdr_t),
5026 5033              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5027 5034              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5028 5035          multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
5029 5036              sizeof (arc_buf_hdr_t),
5030 5037              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5031 5038              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5032 5039          multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
5033 5040              sizeof (arc_buf_hdr_t),
5034 5041              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5035 5042              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5036 5043          multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
5037 5044              sizeof (arc_buf_hdr_t),
5038 5045              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5039 5046              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5040 5047          multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
5041 5048              sizeof (arc_buf_hdr_t),
5042 5049              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5043 5050              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5044 5051          multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
5045 5052              sizeof (arc_buf_hdr_t),
5046 5053              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5047 5054              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5048 5055          multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
5049 5056              sizeof (arc_buf_hdr_t),
5050 5057              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5051 5058              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5052 5059          multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
5053 5060              sizeof (arc_buf_hdr_t),
5054 5061              offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5055 5062              zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5056 5063  
5057 5064          refcount_create(&arc_anon->arcs_size);
5058 5065          refcount_create(&arc_mru->arcs_size);
5059 5066          refcount_create(&arc_mru_ghost->arcs_size);
5060 5067          refcount_create(&arc_mfu->arcs_size);
5061 5068          refcount_create(&arc_mfu_ghost->arcs_size);
5062 5069          refcount_create(&arc_l2c_only->arcs_size);
5063 5070  
5064 5071          buf_init();
5065 5072  
5066 5073          arc_reclaim_thread_exit = FALSE;
5067 5074          arc_user_evicts_thread_exit = FALSE;
5068 5075          arc_eviction_list = NULL;
5069 5076          bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
5070 5077  
5071 5078          arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
5072 5079              sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
5073 5080  
5074 5081          if (arc_ksp != NULL) {
5075 5082                  arc_ksp->ks_data = &arc_stats;
5076 5083                  arc_ksp->ks_update = arc_kstat_update;
5077 5084                  kstat_install(arc_ksp);
5078 5085          }
5079 5086  
5080 5087          (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
5081 5088              TS_RUN, minclsyspri);
5082 5089  
5083 5090          (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
5084 5091              TS_RUN, minclsyspri);
5085 5092  
5086 5093          arc_dead = FALSE;
5087 5094          arc_warm = B_FALSE;
5088 5095  
5089 5096          /*
5090 5097           * Calculate maximum amount of dirty data per pool.
5091 5098           *
5092 5099           * If it has been set by /etc/system, take that.
5093 5100           * Otherwise, use a percentage of physical memory defined by
5094 5101           * zfs_dirty_data_max_percent (default 10%) with a cap at
5095 5102           * zfs_dirty_data_max_max (default 4GB).
5096 5103           */
5097 5104          if (zfs_dirty_data_max == 0) {
5098 5105                  zfs_dirty_data_max = physmem * PAGESIZE *
5099 5106                      zfs_dirty_data_max_percent / 100;
5100 5107                  zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5101 5108                      zfs_dirty_data_max_max);
5102 5109          }
5103 5110  }
5104 5111  
5105 5112  void
5106 5113  arc_fini(void)
5107 5114  {
5108 5115          mutex_enter(&arc_reclaim_lock);
5109 5116          arc_reclaim_thread_exit = TRUE;
5110 5117          /*
5111 5118           * The reclaim thread will set arc_reclaim_thread_exit back to
5112 5119           * FALSE when it is finished exiting; we're waiting for that.
5113 5120           */
5114 5121          while (arc_reclaim_thread_exit) {
5115 5122                  cv_signal(&arc_reclaim_thread_cv);
5116 5123                  cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5117 5124          }
5118 5125          mutex_exit(&arc_reclaim_lock);
5119 5126  
5120 5127          mutex_enter(&arc_user_evicts_lock);
5121 5128          arc_user_evicts_thread_exit = TRUE;
5122 5129          /*
5123 5130           * The user evicts thread will set arc_user_evicts_thread_exit
5124 5131           * to FALSE when it is finished exiting; we're waiting for that.
5125 5132           */
5126 5133          while (arc_user_evicts_thread_exit) {
5127 5134                  cv_signal(&arc_user_evicts_cv);
5128 5135                  cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5129 5136          }
5130 5137          mutex_exit(&arc_user_evicts_lock);
5131 5138  
5132 5139          /* Use TRUE to ensure *all* buffers are evicted */
5133 5140          arc_flush(NULL, TRUE);
5134 5141  
5135 5142          arc_dead = TRUE;
5136 5143  
5137 5144          if (arc_ksp != NULL) {
5138 5145                  kstat_delete(arc_ksp);
5139 5146                  arc_ksp = NULL;
5140 5147          }
5141 5148  
5142 5149          mutex_destroy(&arc_reclaim_lock);
5143 5150          cv_destroy(&arc_reclaim_thread_cv);
5144 5151          cv_destroy(&arc_reclaim_waiters_cv);
5145 5152  
5146 5153          mutex_destroy(&arc_user_evicts_lock);
5147 5154          cv_destroy(&arc_user_evicts_cv);
5148 5155  
5149 5156          refcount_destroy(&arc_anon->arcs_size);
5150 5157          refcount_destroy(&arc_mru->arcs_size);
5151 5158          refcount_destroy(&arc_mru_ghost->arcs_size);
5152 5159          refcount_destroy(&arc_mfu->arcs_size);
5153 5160          refcount_destroy(&arc_mfu_ghost->arcs_size);
5154 5161          refcount_destroy(&arc_l2c_only->arcs_size);
5155 5162  
5156 5163          multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5157 5164          multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5158 5165          multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5159 5166          multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5160 5167          multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5161 5168          multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5162 5169          multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5163 5170          multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5164 5171  
5165 5172          buf_fini();
5166 5173  
5167 5174          ASSERT0(arc_loaned_bytes);
5168 5175  }
5169 5176  
5170 5177  /*
5171 5178   * Level 2 ARC
5172 5179   *
5173 5180   * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5174 5181   * It uses dedicated storage devices to hold cached data, which are populated
5175 5182   * using large infrequent writes.  The main role of this cache is to boost
5176 5183   * the performance of random read workloads.  The intended L2ARC devices
5177 5184   * include short-stroked disks, solid state disks, and other media with
5178 5185   * substantially faster read latency than disk.
5179 5186   *
5180 5187   *                 +-----------------------+
5181 5188   *                 |         ARC           |
5182 5189   *                 +-----------------------+
5183 5190   *                    |         ^     ^
5184 5191   *                    |         |     |
5185 5192   *      l2arc_feed_thread()    arc_read()
5186 5193   *                    |         |     |
5187 5194   *                    |  l2arc read   |
5188 5195   *                    V         |     |
5189 5196   *               +---------------+    |
5190 5197   *               |     L2ARC     |    |
5191 5198   *               +---------------+    |
5192 5199   *                   |    ^           |
5193 5200   *          l2arc_write() |           |
5194 5201   *                   |    |           |
5195 5202   *                   V    |           |
5196 5203   *                 +-------+      +-------+
5197 5204   *                 | vdev  |      | vdev  |
5198 5205   *                 | cache |      | cache |
5199 5206   *                 +-------+      +-------+
5200 5207   *                 +=========+     .-----.
5201 5208   *                 :  L2ARC  :    |-_____-|
5202 5209   *                 : devices :    | Disks |
5203 5210   *                 +=========+    `-_____-'
5204 5211   *
5205 5212   * Read requests are satisfied from the following sources, in order:
5206 5213   *
5207 5214   *      1) ARC
5208 5215   *      2) vdev cache of L2ARC devices
5209 5216   *      3) L2ARC devices
5210 5217   *      4) vdev cache of disks
5211 5218   *      5) disks
5212 5219   *
5213 5220   * Some L2ARC device types exhibit extremely slow write performance.
5214 5221   * To accommodate this, there are some significant differences between
5215 5222   * the L2ARC and traditional cache design:
5216 5223   *
5217 5224   * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5218 5225   * the ARC behave as usual, freeing buffers and placing headers on ghost
5219 5226   * lists.  The ARC does not send buffers to the L2ARC during eviction as
5220 5227   * this would add inflated write latencies for all ARC memory pressure.
5221 5228   *
5222 5229   * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5223 5230   * It does this by periodically scanning buffers from the eviction-end of
5224 5231   * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5225 5232   * not already there. It scans until a headroom of buffers is satisfied,
5226 5233   * which itself is a buffer for ARC eviction. If a compressible buffer is
5227 5234   * found during scanning and selected for writing to an L2ARC device, we
5228 5235   * temporarily boost scanning headroom during the next scan cycle to make
5229 5236   * sure we adapt to compression effects (which might significantly reduce
5230 5237   * the data volume we write to L2ARC). The thread that does this is
5231 5238   * l2arc_feed_thread(), illustrated below; example sizes are included to
5232 5239   * provide a better sense of ratio than this diagram:
5233 5240   *
5234 5241   *             head -->                        tail
5235 5242   *              +---------------------+----------+
5236 5243   *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5237 5244   *              +---------------------+----------+   |   o L2ARC eligible
5238 5245   *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5239 5246   *              +---------------------+----------+   |
5240 5247   *                   15.9 Gbytes      ^ 32 Mbytes    |
5241 5248   *                                 headroom          |
5242 5249   *                                            l2arc_feed_thread()
5243 5250   *                                                   |
5244 5251   *                       l2arc write hand <--[oooo]--'
5245 5252   *                               |           8 Mbyte
5246 5253   *                               |          write max
5247 5254   *                               V
5248 5255   *                +==============================+
5249 5256   *      L2ARC dev |####|#|###|###|    |####| ... |
5250 5257   *                +==============================+
5251 5258   *                           32 Gbytes
5252 5259   *
5253 5260   * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5254 5261   * evicted, then the L2ARC has cached a buffer much sooner than it probably
5255 5262   * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5256 5263   * safe to say that this is an uncommon case, since buffers at the end of
5257 5264   * the ARC lists have moved there due to inactivity.
5258 5265   *
5259 5266   * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5260 5267   * then the L2ARC simply misses copying some buffers.  This serves as a
5261 5268   * pressure valve to prevent heavy read workloads from both stalling the ARC
5262 5269   * with waits and clogging the L2ARC with writes.  This also helps prevent
5263 5270   * the potential for the L2ARC to churn if it attempts to cache content too
5264 5271   * quickly, such as during backups of the entire pool.
5265 5272   *
5266 5273   * 5. After system boot and before the ARC has filled main memory, there are
5267 5274   * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5268 5275   * lists can remain mostly static.  Instead of searching from tail of these
5269 5276   * lists as pictured, the l2arc_feed_thread() will search from the list heads
5270 5277   * for eligible buffers, greatly increasing its chance of finding them.
5271 5278   *
5272 5279   * The L2ARC device write speed is also boosted during this time so that
5273 5280   * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5274 5281   * there are no L2ARC reads, and no fear of degrading read performance
5275 5282   * through increased writes.
5276 5283   *
5277 5284   * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5278 5285   * the vdev queue can aggregate them into larger and fewer writes.  Each
5279 5286   * device is written to in a rotor fashion, sweeping writes through
5280 5287   * available space then repeating.
5281 5288   *
5282 5289   * 7. The L2ARC does not store dirty content.  It never needs to flush
5283 5290   * write buffers back to disk based storage.
5284 5291   *
5285 5292   * 8. If an ARC buffer is written (and dirtied) which also exists in the
5286 5293   * L2ARC, the now stale L2ARC buffer is immediately dropped.
5287 5294   *
5288 5295   * The performance of the L2ARC can be tweaked by a number of tunables, which
5289 5296   * may be necessary for different workloads:
5290 5297   *
5291 5298   *      l2arc_write_max         max write bytes per interval
5292 5299   *      l2arc_write_boost       extra write bytes during device warmup
5293 5300   *      l2arc_noprefetch        skip caching prefetched buffers
5294 5301   *      l2arc_headroom          number of max device writes to precache
5295 5302   *      l2arc_headroom_boost    when we find compressed buffers during ARC
5296 5303   *                              scanning, we multiply headroom by this
5297 5304   *                              percentage factor for the next scan cycle,
5298 5305   *                              since more compressed buffers are likely to
5299 5306   *                              be present
5300 5307   *      l2arc_feed_secs         seconds between L2ARC writing
5301 5308   *
5302 5309   * Tunables may be removed or added as future performance improvements are
5303 5310   * integrated, and also may become zpool properties.
5304 5311   *
5305 5312   * There are three key functions that control how the L2ARC warms up:
5306 5313   *
5307 5314   *      l2arc_write_eligible()  check if a buffer is eligible to cache
5308 5315   *      l2arc_write_size()      calculate how much to write
5309 5316   *      l2arc_write_interval()  calculate sleep delay between writes
5310 5317   *
5311 5318   * These three functions determine what to write, how much, and how quickly
5312 5319   * to send writes.
5313 5320   */
5314 5321  
5315 5322  static boolean_t
5316 5323  l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5317 5324  {
5318 5325          /*
5319 5326           * A buffer is *not* eligible for the L2ARC if it:
5320 5327           * 1. belongs to a different spa.
5321 5328           * 2. is already cached on the L2ARC.
5322 5329           * 3. has an I/O in progress (it may be an incomplete read).
5323 5330           * 4. is flagged not eligible (zfs property).
5324 5331           */
5325 5332          if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
5326 5333              HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
5327 5334                  return (B_FALSE);
5328 5335  
5329 5336          return (B_TRUE);
5330 5337  }
5331 5338  
5332 5339  static uint64_t
5333 5340  l2arc_write_size(void)
5334 5341  {
5335 5342          uint64_t size;
5336 5343  
5337 5344          /*
5338 5345           * Make sure our globals have meaningful values in case the user
5339 5346           * altered them.
5340 5347           */
5341 5348          size = l2arc_write_max;
5342 5349          if (size == 0) {
5343 5350                  cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5344 5351                      "be greater than zero, resetting it to the default (%d)",
5345 5352                      L2ARC_WRITE_SIZE);
5346 5353                  size = l2arc_write_max = L2ARC_WRITE_SIZE;
5347 5354          }
5348 5355  
5349 5356          if (arc_warm == B_FALSE)
5350 5357                  size += l2arc_write_boost;
5351 5358  
5352 5359          return (size);
5353 5360  
5354 5361  }
5355 5362  
5356 5363  static clock_t
5357 5364  l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5358 5365  {
5359 5366          clock_t interval, next, now;
5360 5367  
5361 5368          /*
5362 5369           * If the ARC lists are busy, increase our write rate; if the
5363 5370           * lists are stale, idle back.  This is achieved by checking
5364 5371           * how much we previously wrote - if it was more than half of
5365 5372           * what we wanted, schedule the next write much sooner.
5366 5373           */
5367 5374          if (l2arc_feed_again && wrote > (wanted / 2))
5368 5375                  interval = (hz * l2arc_feed_min_ms) / 1000;
5369 5376          else
5370 5377                  interval = hz * l2arc_feed_secs;
5371 5378  
5372 5379          now = ddi_get_lbolt();
5373 5380          next = MAX(now, MIN(now + interval, began + interval));
5374 5381  
5375 5382          return (next);
5376 5383  }
5377 5384  
5378 5385  /*
5379 5386   * Cycle through L2ARC devices.  This is how L2ARC load balances.
5380 5387   * If a device is returned, this also returns holding the spa config lock.
5381 5388   */
5382 5389  static l2arc_dev_t *
5383 5390  l2arc_dev_get_next(void)
5384 5391  {
5385 5392          l2arc_dev_t *first, *next = NULL;
5386 5393  
5387 5394          /*
5388 5395           * Lock out the removal of spas (spa_namespace_lock), then removal
5389 5396           * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5390 5397           * both locks will be dropped and a spa config lock held instead.
5391 5398           */
5392 5399          mutex_enter(&spa_namespace_lock);
5393 5400          mutex_enter(&l2arc_dev_mtx);
5394 5401  
5395 5402          /* if there are no vdevs, there is nothing to do */
5396 5403          if (l2arc_ndev == 0)
5397 5404                  goto out;
5398 5405  
5399 5406          first = NULL;
5400 5407          next = l2arc_dev_last;
5401 5408          do {
5402 5409                  /* loop around the list looking for a non-faulted vdev */
5403 5410                  if (next == NULL) {
5404 5411                          next = list_head(l2arc_dev_list);
5405 5412                  } else {
5406 5413                          next = list_next(l2arc_dev_list, next);
5407 5414                          if (next == NULL)
5408 5415                                  next = list_head(l2arc_dev_list);
5409 5416                  }
5410 5417  
5411 5418                  /* if we have come back to the start, bail out */
5412 5419                  if (first == NULL)
5413 5420                          first = next;
5414 5421                  else if (next == first)
5415 5422                          break;
5416 5423  
5417 5424          } while (vdev_is_dead(next->l2ad_vdev));
5418 5425  
5419 5426          /* if we were unable to find any usable vdevs, return NULL */
5420 5427          if (vdev_is_dead(next->l2ad_vdev))
5421 5428                  next = NULL;
5422 5429  
5423 5430          l2arc_dev_last = next;
5424 5431  
5425 5432  out:
5426 5433          mutex_exit(&l2arc_dev_mtx);
5427 5434  
5428 5435          /*
5429 5436           * Grab the config lock to prevent the 'next' device from being
5430 5437           * removed while we are writing to it.
5431 5438           */
5432 5439          if (next != NULL)
5433 5440                  spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5434 5441          mutex_exit(&spa_namespace_lock);
5435 5442  
5436 5443          return (next);
5437 5444  }
5438 5445  
5439 5446  /*
5440 5447   * Free buffers that were tagged for destruction.
5441 5448   */
5442 5449  static void
5443 5450  l2arc_do_free_on_write()
5444 5451  {
5445 5452          list_t *buflist;
5446 5453          l2arc_data_free_t *df, *df_prev;
5447 5454  
5448 5455          mutex_enter(&l2arc_free_on_write_mtx);
5449 5456          buflist = l2arc_free_on_write;
5450 5457  
5451 5458          for (df = list_tail(buflist); df; df = df_prev) {
5452 5459                  df_prev = list_prev(buflist, df);
5453 5460                  ASSERT(df->l2df_data != NULL);
5454 5461                  ASSERT(df->l2df_func != NULL);
5455 5462                  df->l2df_func(df->l2df_data, df->l2df_size);
5456 5463                  list_remove(buflist, df);
5457 5464                  kmem_free(df, sizeof (l2arc_data_free_t));
5458 5465          }
5459 5466  
5460 5467          mutex_exit(&l2arc_free_on_write_mtx);
5461 5468  }
5462 5469  
5463 5470  /*
5464 5471   * A write to a cache device has completed.  Update all headers to allow
5465 5472   * reads from these buffers to begin.
5466 5473   */
5467 5474  static void
5468 5475  l2arc_write_done(zio_t *zio)
5469 5476  {
5470 5477          l2arc_write_callback_t *cb;
5471 5478          l2arc_dev_t *dev;
5472 5479          list_t *buflist;
5473 5480          arc_buf_hdr_t *head, *hdr, *hdr_prev;
5474 5481          kmutex_t *hash_lock;
5475 5482          int64_t bytes_dropped = 0;
5476 5483  
5477 5484          cb = zio->io_private;
5478 5485          ASSERT(cb != NULL);
5479 5486          dev = cb->l2wcb_dev;
5480 5487          ASSERT(dev != NULL);
5481 5488          head = cb->l2wcb_head;
5482 5489          ASSERT(head != NULL);
5483 5490          buflist = &dev->l2ad_buflist;
5484 5491          ASSERT(buflist != NULL);
5485 5492          DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5486 5493              l2arc_write_callback_t *, cb);
5487 5494  
5488 5495          if (zio->io_error != 0)
5489 5496                  ARCSTAT_BUMP(arcstat_l2_writes_error);
5490 5497  
5491 5498          /*
5492 5499           * All writes completed, or an error was hit.
5493 5500           */
5494 5501  top:
5495 5502          mutex_enter(&dev->l2ad_mtx);
5496 5503          for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5497 5504                  hdr_prev = list_prev(buflist, hdr);
5498 5505  
5499 5506                  hash_lock = HDR_LOCK(hdr);
5500 5507  
5501 5508                  /*
5502 5509                   * We cannot use mutex_enter or else we can deadlock
5503 5510                   * with l2arc_write_buffers (due to swapping the order
5504 5511                   * the hash lock and l2ad_mtx are taken).
5505 5512                   */
5506 5513                  if (!mutex_tryenter(hash_lock)) {
5507 5514                          /*
5508 5515                           * Missed the hash lock. We must retry so we
5509 5516                           * don't leave the ARC_FLAG_L2_WRITING bit set.
5510 5517                           */
5511 5518                          ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
5512 5519  
5513 5520                          /*
5514 5521                           * We don't want to rescan the headers we've
5515 5522                           * already marked as having been written out, so
5516 5523                           * we reinsert the head node so we can pick up
5517 5524                           * where we left off.
5518 5525                           */
5519 5526                          list_remove(buflist, head);
5520 5527                          list_insert_after(buflist, hdr, head);
5521 5528  
5522 5529                          mutex_exit(&dev->l2ad_mtx);
5523 5530  
5524 5531                          /*
5525 5532                           * We wait for the hash lock to become available
5526 5533                           * to try and prevent busy waiting, and increase
5527 5534                           * the chance we'll be able to acquire the lock
5528 5535                           * the next time around.
5529 5536                           */
5530 5537                          mutex_enter(hash_lock);
5531 5538                          mutex_exit(hash_lock);
5532 5539                          goto top;
5533 5540                  }
5534 5541  
5535 5542                  /*
5536 5543                   * We could not have been moved into the arc_l2c_only
5537 5544                   * state while in-flight due to our ARC_FLAG_L2_WRITING
5538 5545                   * bit being set. Let's just ensure that's being enforced.
5539 5546                   */
5540 5547                  ASSERT(HDR_HAS_L1HDR(hdr));
5541 5548  
5542 5549                  /*
5543 5550                   * We may have allocated a buffer for L2ARC compression,
5544 5551                   * we must release it to avoid leaking this data.
5545 5552                   */
5546 5553                  l2arc_release_cdata_buf(hdr);
5547 5554  
5548 5555                  if (zio->io_error != 0) {
5549 5556                          /*
5550 5557                           * Error - drop L2ARC entry.
5551 5558                           */
5552 5559                          list_remove(buflist, hdr);
5553 5560                          hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5554 5561  
5555 5562                          ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5556 5563                          ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5557 5564  
5558 5565                          bytes_dropped += hdr->b_l2hdr.b_asize;
5559 5566                          (void) refcount_remove_many(&dev->l2ad_alloc,
5560 5567                              hdr->b_l2hdr.b_asize, hdr);
5561 5568                  }
5562 5569  
5563 5570                  /*
5564 5571                   * Allow ARC to begin reads and ghost list evictions to
5565 5572                   * this L2ARC entry.
5566 5573                   */
5567 5574                  hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5568 5575  
5569 5576                  mutex_exit(hash_lock);
5570 5577          }
5571 5578  
5572 5579          atomic_inc_64(&l2arc_writes_done);
5573 5580          list_remove(buflist, head);
5574 5581          ASSERT(!HDR_HAS_L1HDR(head));
5575 5582          kmem_cache_free(hdr_l2only_cache, head);
5576 5583          mutex_exit(&dev->l2ad_mtx);
5577 5584  
5578 5585          vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5579 5586  
5580 5587          l2arc_do_free_on_write();
5581 5588  
5582 5589          kmem_free(cb, sizeof (l2arc_write_callback_t));
5583 5590  }
5584 5591  
5585 5592  /*
5586 5593   * A read to a cache device completed.  Validate buffer contents before
5587 5594   * handing over to the regular ARC routines.
5588 5595   */
5589 5596  static void
5590 5597  l2arc_read_done(zio_t *zio)
5591 5598  {
5592 5599          l2arc_read_callback_t *cb;
5593 5600          arc_buf_hdr_t *hdr;
5594 5601          arc_buf_t *buf;
5595 5602          kmutex_t *hash_lock;
5596 5603          int equal;
5597 5604  
5598 5605          ASSERT(zio->io_vd != NULL);
5599 5606          ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5600 5607  
5601 5608          spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5602 5609  
5603 5610          cb = zio->io_private;
5604 5611          ASSERT(cb != NULL);
5605 5612          buf = cb->l2rcb_buf;
5606 5613          ASSERT(buf != NULL);
5607 5614  
5608 5615          hash_lock = HDR_LOCK(buf->b_hdr);
5609 5616          mutex_enter(hash_lock);
5610 5617          hdr = buf->b_hdr;
5611 5618          ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5612 5619  
5613 5620          /*
5614 5621           * If the buffer was compressed, decompress it first.
5615 5622           */
5616 5623          if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5617 5624                  l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5618 5625          ASSERT(zio->io_data != NULL);
5619 5626          ASSERT3U(zio->io_size, ==, hdr->b_size);
5620 5627          ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
5621 5628  
5622 5629          /*
5623 5630           * Check this survived the L2ARC journey.
5624 5631           */
5625 5632          equal = arc_cksum_equal(buf);
5626 5633          if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5627 5634                  mutex_exit(hash_lock);
5628 5635                  zio->io_private = buf;
5629 5636                  zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
5630 5637                  zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
5631 5638                  arc_read_done(zio);
5632 5639          } else {
5633 5640                  mutex_exit(hash_lock);
5634 5641                  /*
5635 5642                   * Buffer didn't survive caching.  Increment stats and
5636 5643                   * reissue to the original storage device.
5637 5644                   */
5638 5645                  if (zio->io_error != 0) {
5639 5646                          ARCSTAT_BUMP(arcstat_l2_io_error);
5640 5647                  } else {
5641 5648                          zio->io_error = SET_ERROR(EIO);
5642 5649                  }
5643 5650                  if (!equal)
5644 5651                          ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5645 5652  
5646 5653                  /*
5647 5654                   * If there's no waiter, issue an async i/o to the primary
5648 5655                   * storage now.  If there *is* a waiter, the caller must
5649 5656                   * issue the i/o in a context where it's OK to block.
5650 5657                   */
5651 5658                  if (zio->io_waiter == NULL) {
5652 5659                          zio_t *pio = zio_unique_parent(zio);
5653 5660  
5654 5661                          ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5655 5662  
5656 5663                          zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5657 5664                              buf->b_data, hdr->b_size, arc_read_done, buf,
5658 5665                              zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5659 5666                  }
5660 5667          }
5661 5668  
5662 5669          kmem_free(cb, sizeof (l2arc_read_callback_t));
5663 5670  }
5664 5671  
5665 5672  /*
5666 5673   * This is the list priority from which the L2ARC will search for pages to
5667 5674   * cache.  This is used within loops (0..3) to cycle through lists in the
5668 5675   * desired order.  This order can have a significant effect on cache
5669 5676   * performance.
5670 5677   *
5671 5678   * Currently the metadata lists are hit first, MFU then MRU, followed by
5672 5679   * the data lists.  This function returns a locked list, and also returns
5673 5680   * the lock pointer.
5674 5681   */
5675 5682  static multilist_sublist_t *
5676 5683  l2arc_sublist_lock(int list_num)
5677 5684  {
5678 5685          multilist_t *ml = NULL;
5679 5686          unsigned int idx;
5680 5687  
5681 5688          ASSERT(list_num >= 0 && list_num <= 3);
5682 5689  
5683 5690          switch (list_num) {
5684 5691          case 0:
5685 5692                  ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
5686 5693                  break;
5687 5694          case 1:
5688 5695                  ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
5689 5696                  break;
5690 5697          case 2:
5691 5698                  ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
5692 5699                  break;
5693 5700          case 3:
5694 5701                  ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
5695 5702                  break;
5696 5703          }
5697 5704  
5698 5705          /*
5699 5706           * Return a randomly-selected sublist. This is acceptable
5700 5707           * because the caller feeds only a little bit of data for each
5701 5708           * call (8MB). Subsequent calls will result in different
5702 5709           * sublists being selected.
5703 5710           */
5704 5711          idx = multilist_get_random_index(ml);
5705 5712          return (multilist_sublist_lock(ml, idx));
5706 5713  }
5707 5714  
5708 5715  /*
5709 5716   * Evict buffers from the device write hand to the distance specified in
5710 5717   * bytes.  This distance may span populated buffers, it may span nothing.
5711 5718   * This is clearing a region on the L2ARC device ready for writing.
5712 5719   * If the 'all' boolean is set, every buffer is evicted.
5713 5720   */
5714 5721  static void
5715 5722  l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
5716 5723  {
5717 5724          list_t *buflist;
5718 5725          arc_buf_hdr_t *hdr, *hdr_prev;
5719 5726          kmutex_t *hash_lock;
5720 5727          uint64_t taddr;
5721 5728  
5722 5729          buflist = &dev->l2ad_buflist;
5723 5730  
5724 5731          if (!all && dev->l2ad_first) {
5725 5732                  /*
5726 5733                   * This is the first sweep through the device.  There is
5727 5734                   * nothing to evict.
5728 5735                   */
5729 5736                  return;
5730 5737          }
5731 5738  
5732 5739          if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
5733 5740                  /*
5734 5741                   * When nearing the end of the device, evict to the end
5735 5742                   * before the device write hand jumps to the start.
5736 5743                   */
5737 5744                  taddr = dev->l2ad_end;
5738 5745          } else {
5739 5746                  taddr = dev->l2ad_hand + distance;
5740 5747          }
5741 5748          DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
5742 5749              uint64_t, taddr, boolean_t, all);
5743 5750  
5744 5751  top:
5745 5752          mutex_enter(&dev->l2ad_mtx);
5746 5753          for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
5747 5754                  hdr_prev = list_prev(buflist, hdr);
5748 5755  
5749 5756                  hash_lock = HDR_LOCK(hdr);
5750 5757  
5751 5758                  /*
5752 5759                   * We cannot use mutex_enter or else we can deadlock
5753 5760                   * with l2arc_write_buffers (due to swapping the order
5754 5761                   * the hash lock and l2ad_mtx are taken).
5755 5762                   */
5756 5763                  if (!mutex_tryenter(hash_lock)) {
5757 5764                          /*
5758 5765                           * Missed the hash lock.  Retry.
5759 5766                           */
5760 5767                          ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
5761 5768                          mutex_exit(&dev->l2ad_mtx);
5762 5769                          mutex_enter(hash_lock);
5763 5770                          mutex_exit(hash_lock);
5764 5771                          goto top;
5765 5772                  }
5766 5773  
5767 5774                  if (HDR_L2_WRITE_HEAD(hdr)) {
5768 5775                          /*
5769 5776                           * We hit a write head node.  Leave it for
5770 5777                           * l2arc_write_done().
5771 5778                           */
5772 5779                          list_remove(buflist, hdr);
5773 5780                          mutex_exit(hash_lock);
5774 5781                          continue;
5775 5782                  }
5776 5783  
5777 5784                  if (!all && HDR_HAS_L2HDR(hdr) &&
5778 5785                      (hdr->b_l2hdr.b_daddr > taddr ||
5779 5786                      hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
5780 5787                          /*
5781 5788                           * We've evicted to the target address,
5782 5789                           * or the end of the device.
5783 5790                           */
5784 5791                          mutex_exit(hash_lock);
5785 5792                          break;
5786 5793                  }
5787 5794  
5788 5795                  ASSERT(HDR_HAS_L2HDR(hdr));
5789 5796                  if (!HDR_HAS_L1HDR(hdr)) {
5790 5797                          ASSERT(!HDR_L2_READING(hdr));
5791 5798                          /*
5792 5799                           * This doesn't exist in the ARC.  Destroy.
5793 5800                           * arc_hdr_destroy() will call list_remove()
5794 5801                           * and decrement arcstat_l2_size.
5795 5802                           */
5796 5803                          arc_change_state(arc_anon, hdr, hash_lock);
5797 5804                          arc_hdr_destroy(hdr);
5798 5805                  } else {
5799 5806                          ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
5800 5807                          ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
5801 5808                          /*
5802 5809                           * Invalidate issued or about to be issued
5803 5810                           * reads, since we may be about to write
5804 5811                           * over this location.
5805 5812                           */
5806 5813                          if (HDR_L2_READING(hdr)) {
5807 5814                                  ARCSTAT_BUMP(arcstat_l2_evict_reading);
5808 5815                                  hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5809 5816                          }
5810 5817  
5811 5818                          /* Ensure this header has finished being written */
5812 5819                          ASSERT(!HDR_L2_WRITING(hdr));
5813 5820                          ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
5814 5821  
5815 5822                          arc_hdr_l2hdr_destroy(hdr);
5816 5823                  }
5817 5824                  mutex_exit(hash_lock);
5818 5825          }
5819 5826          mutex_exit(&dev->l2ad_mtx);
5820 5827  }
5821 5828  
5822 5829  /*
5823 5830   * Find and write ARC buffers to the L2ARC device.
5824 5831   *
5825 5832   * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5826 5833   * for reading until they have completed writing.
5827 5834   * The headroom_boost is an in-out parameter used to maintain headroom boost
5828 5835   * state between calls to this function.
5829 5836   *
5830 5837   * Returns the number of bytes actually written (which may be smaller than
5831 5838   * the delta by which the device hand has changed due to alignment).
            *
            * Parameters:
            *   spa            - pool being fed from; supplies the load guid used
            *                    by the eligibility check and is passed to
            *                    zio_root() for the parent zio.
            *   dev            - L2ARC device to write to; its buflist, write hand
            *                    and l2ad_alloc refcount are updated here.
            *   target_sz      - cap on the summed b_size of the selected buffers,
            *                    and the base from which the headroom is derived.
            *   headroom_boost - read on entry to decide whether to scale the
            *                    headroom by l2arc_headroom_boost, then cleared;
            *                    set to B_TRUE again if l2arc_compress_buf()
            *                    reports success for at least one buffer.
            *
            * The function works in two stages: a selection loop that walks the
            * ARC sublists under the per-header hash locks and queues headers on
            * dev->l2ad_buflist, followed by a write loop that runs under l2ad_mtx
            * and issues one child zio per non-empty buffer.  When nothing is
            * selected, 0 is returned and no I/O is issued; otherwise the function
            * blocks in zio_wait() on the parent zio before returning.
5832 5839   */
5833 5840  static uint64_t
5834 5841  l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5835 5842      boolean_t *headroom_boost)
5836 5843  {
5837 5844          arc_buf_hdr_t *hdr, *hdr_prev, *head;
5838 5845          uint64_t write_asize, write_psize, write_sz, headroom,
5839 5846              buf_compress_minsz;
5840 5847          void *buf_data;
5841 5848          boolean_t full;
5842 5849          l2arc_write_callback_t *cb;
5843 5850          zio_t *pio, *wzio;
5844 5851          uint64_t guid = spa_load_guid(spa);
5845 5852          const boolean_t do_headroom_boost = *headroom_boost;
5846 5853  
5847 5854          ASSERT(dev->l2ad_vdev != NULL);
5848 5855  
5849 5856          /* Lower the flag now, we might want to raise it again later. */
5850 5857          *headroom_boost = B_FALSE;
5851 5858  
5852 5859          pio = NULL;
5853 5860          write_sz = write_asize = write_psize = 0;
5854 5861          full = B_FALSE;
5855 5862          head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5856 5863          head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5857 5864          head->b_flags |= ARC_FLAG_HAS_L2HDR;
5858 5865  
5859 5866          /*
5860 5867           * We will want to try to compress buffers that are at least 2x the
5861 5868           * device sector size.
5862 5869           */
5863 5870          buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5864 5871  
5865 5872          /*
5866 5873           * Copy buffers for L2ARC writing.
5867 5874           */
5868 5875          for (int try = 0; try <= 3; try++) {
5869 5876                  multilist_sublist_t *mls = l2arc_sublist_lock(try);
5870 5877                  uint64_t passed_sz = 0;
5871 5878  
5872 5879                  /*
5873 5880                   * L2ARC fast warmup.
5874 5881                   *
5875 5882                   * Until the ARC is warm and starts to evict, read from the
5876 5883                   * head of the ARC lists rather than the tail.
5877 5884                   */
5878 5885                  if (arc_warm == B_FALSE)
5879 5886                          hdr = multilist_sublist_head(mls);
5880 5887                  else
5881 5888                          hdr = multilist_sublist_tail(mls);
5882 5889  
5883 5890                  headroom = target_sz * l2arc_headroom;
5884 5891                  if (do_headroom_boost)
5885 5892                          headroom = (headroom * l2arc_headroom_boost) / 100;
5886 5893  
5887 5894                  for (; hdr; hdr = hdr_prev) {
5888 5895                          kmutex_t *hash_lock;
5889 5896                          uint64_t buf_sz;
5890 5897  
                                   /*
                                    * Pick up the successor first so that every
                                    * `continue' below still advances the walk; the
                                    * direction matches the end we started from.
                                    */
5891 5898                          if (arc_warm == B_FALSE)
5892 5899                                  hdr_prev = multilist_sublist_next(mls, hdr);
5893 5900                          else
5894 5901                                  hdr_prev = multilist_sublist_prev(mls, hdr);
5895 5902  
5896 5903                          hash_lock = HDR_LOCK(hdr);
5897 5904                          if (!mutex_tryenter(hash_lock)) {
5898 5905                                  /*
5899 5906                                   * Skip this buffer rather than waiting.
5900 5907                                   */
5901 5908                                  continue;
5902 5909                          }
5903 5910  
5904 5911                          passed_sz += hdr->b_size;
5905 5912                          if (passed_sz > headroom) {
5906 5913                                  /*
5907 5914                                   * Searched too far.
5908 5915                                   */
5909 5916                                  mutex_exit(hash_lock);
5910 5917                                  break;
5911 5918                          }
5912 5919  
5913 5920                          if (!l2arc_write_eligible(guid, hdr)) {
5914 5921                                  mutex_exit(hash_lock);
5915 5922                                  continue;
5916 5923                          }
5917 5924  
5918 5925                          if ((write_sz + hdr->b_size) > target_sz) {
5919 5926                                  full = B_TRUE;
5920 5927                                  mutex_exit(hash_lock);
5921 5928                                  break;
5922 5929                          }
5923 5930  
5924 5931                          if (pio == NULL) {
5925 5932                                  /*
5926 5933                                   * Insert a dummy header on the buflist so
5927 5934                                   * l2arc_write_done() can find where the
5928 5935                                   * write buffers begin without searching.
5929 5936                                   */
5930 5937                                  mutex_enter(&dev->l2ad_mtx);
5931 5938                                  list_insert_head(&dev->l2ad_buflist, head);
5932 5939                                  mutex_exit(&dev->l2ad_mtx);
5933 5940  
5934 5941                                  cb = kmem_alloc(
5935 5942                                      sizeof (l2arc_write_callback_t), KM_SLEEP);
5936 5943                                  cb->l2wcb_dev = dev;
5937 5944                                  cb->l2wcb_head = head;
5938 5945                                  pio = zio_root(spa, l2arc_write_done, cb,
5939 5946                                      ZIO_FLAG_CANFAIL);
5940 5947                          }
5941 5948  
5942 5949                          /*
5943 5950                           * Create and add a new L2ARC header.
5944 5951                           */
5945 5952                          hdr->b_l2hdr.b_dev = dev;
5946 5953                          hdr->b_flags |= ARC_FLAG_L2_WRITING;
5947 5954                          /*
5948 5955                           * Temporarily stash the data buffer in b_tmp_cdata.
5949 5956                           * The subsequent write step will pick it up from
5950 5957                           * there. This is because can't access b_l1hdr.b_buf
5951 5958                           * without holding the hash_lock, which we in turn
5952 5959                           * can't access without holding the ARC list locks
5953 5960                           * (which we want to avoid during compression/writing).
5954 5961                           */
5955 5962                          hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
5956 5963                          hdr->b_l2hdr.b_asize = hdr->b_size;
5957 5964                          hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5958 5965  
5959 5966                          /*
5960 5967                           * Explicitly set the b_daddr field to a known
5961 5968                           * value which means "invalid address". This
5962 5969                           * enables us to differentiate which stage of
5963 5970                           * l2arc_write_buffers() the particular header
5964 5971                           * is in (e.g. this loop, or the one below).
5965 5972                           * ARC_FLAG_L2_WRITING is not enough to make
5966 5973                           * this distinction, and we need to know in
5967 5974                           * order to do proper l2arc vdev accounting in
5968 5975                           * arc_release() and arc_hdr_destroy().
5969 5976                           *
5970 5977                           * Note, we can't use a new flag to distinguish
5971 5978                           * the two stages because we don't hold the
5972 5979                           * header's hash_lock below, in the second stage
5973 5980                           * of this function. Thus, we can't simply
5974 5981                           * change the b_flags field to denote that the
5975 5982                           * IO has been sent. We can change the b_daddr
5976 5983                           * field of the L2 portion, though, since we'll
5977 5984                           * be holding the l2ad_mtx; which is why we're
5978 5985                           * using it to denote the header's state change.
5979 5986                           */
5980 5987                          hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5981 5988  
5982 5989                          buf_sz = hdr->b_size;
5983 5990                          hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5984 5991  
5985 5992                          mutex_enter(&dev->l2ad_mtx);
5986 5993                          list_insert_head(&dev->l2ad_buflist, hdr);
5987 5994                          mutex_exit(&dev->l2ad_mtx);
5988 5995  
5989 5996                          /*
5990 5997                           * Compute and store the buffer cksum before
5991 5998                           * writing.  On debug the cksum is verified first.
5992 5999                           */
5993 6000                          arc_cksum_verify(hdr->b_l1hdr.b_buf);
5994 6001                          arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5995 6002  
5996 6003                          mutex_exit(hash_lock);
5997 6004  
5998 6005                          write_sz += buf_sz;
5999 6006                  }
6000 6007  
6001 6008                  multilist_sublist_unlock(mls);
6002 6009  
6003 6010                  if (full == B_TRUE)
6004 6011                          break;
6005 6012          }
6006 6013  
6007 6014          /* No buffers selected for writing? */
6008 6015          if (pio == NULL) {
6009 6016                  ASSERT0(write_sz);
6010 6017                  ASSERT(!HDR_HAS_L1HDR(head));
6011 6018                  kmem_cache_free(hdr_l2only_cache, head);
6012 6019                  return (0);
6013 6020          }
6014 6021  
6015 6022          mutex_enter(&dev->l2ad_mtx);
6016 6023  
6017 6024          /*
6018 6025           * Now start writing the buffers. We're starting at the write head
6019 6026           * and work backwards, retracing the course of the buffer selector
6020 6027           * loop above.
6021 6028           */
6022 6029          for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6023 6030              hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6024 6031                  uint64_t buf_sz;
6025 6032  
6026 6033                  /*
6027 6034                   * We rely on the L1 portion of the header below, so
6028 6035                   * it's invalid for this header to have been evicted out
6029 6036                   * of the ghost cache, prior to being written out. The
6030 6037                   * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6031 6038                   */
6032 6039                  ASSERT(HDR_HAS_L1HDR(hdr));
6033 6040  
6034 6041                  /*
6035 6042                   * We shouldn't need to lock the buffer here, since we flagged
6036 6043                   * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6037 6044                   * take care to only access its L2 cache parameters. In
6038 6045                   * particular, hdr->l1hdr.b_buf may be invalid by now due to
6039 6046                   * ARC eviction.
6040 6047                   */
6041 6048                  hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6042 6049  
6043 6050                  if ((HDR_L2COMPRESS(hdr)) &&
6044 6051                      hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6045 6052                          if (l2arc_compress_buf(hdr)) {
6046 6053                                  /*
6047 6054                                   * If compression succeeded, enable headroom
6048 6055                                   * boost on the next scan cycle.
6049 6056                                   */
6050 6057                                  *headroom_boost = B_TRUE;
6051 6058                          }
6052 6059                  }
6053 6060  
6054 6061                  /*
6055 6062                   * Pick up the buffer data we had previously stashed away
6056 6063                   * (and now potentially also compressed).
6057 6064                   */
6058 6065                  buf_data = hdr->b_l1hdr.b_tmp_cdata;
6059 6066                  buf_sz = hdr->b_l2hdr.b_asize;
6060 6067  
6061 6068                  /*
6062 6069                   * We need to do this regardless if buf_sz is zero or
6063 6070                   * not, otherwise, when this l2hdr is evicted we'll
6064 6071                   * remove a reference that was never added.
6065 6072                   */
6066 6073                  (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6067 6074  
6068 6075                  /* Compression may have squashed the buffer to zero length. */
6069 6076                  if (buf_sz != 0) {
6070 6077                          uint64_t buf_p_sz;
6071 6078  
6072 6079                          wzio = zio_write_phys(pio, dev->l2ad_vdev,
6073 6080                              dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6074 6081                              NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6075 6082                              ZIO_FLAG_CANFAIL, B_FALSE);
6076 6083  
6077 6084                          DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6078 6085                              zio_t *, wzio);
6079 6086                          (void) zio_nowait(wzio);
6080 6087  
6081 6088                          write_asize += buf_sz;
6082 6089  
6083 6090                          /*
6084 6091                           * Keep the clock hand suitably device-aligned.
6085 6092                           */
6086 6093                          buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6087 6094                          write_psize += buf_p_sz;
6088 6095                          dev->l2ad_hand += buf_p_sz;
6089 6096                  }
6090 6097          }
6091 6098  
6092 6099          mutex_exit(&dev->l2ad_mtx);
6093 6100  
                   /*
                    * write_sz is the sum of the selected buffers' b_size, whereas
                    * write_asize is the sum of the b_asize values actually issued
                    * above (i.e. after any compression).
                    */
6094 6101          ASSERT3U(write_asize, <=, target_sz);
6095 6102          ARCSTAT_BUMP(arcstat_l2_writes_sent);
6096 6103          ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6097 6104          ARCSTAT_INCR(arcstat_l2_size, write_sz);
6098 6105          ARCSTAT_INCR(arcstat_l2_asize, write_asize);
6099 6106          vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
6100 6107  
6101 6108          /*
6102 6109           * Bump device hand to the device start if it is approaching the end.
6103 6110           * l2arc_evict() will already have evicted ahead for this case.
6104 6111           */
6105 6112          if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6106 6113                  dev->l2ad_hand = dev->l2ad_start;
6107 6114                  dev->l2ad_first = B_FALSE;
6108 6115          }
6109 6116  
                   /*
                    * l2ad_writing is raised only while we wait for the parent zio,
                    * under which all of the child writes above were issued.
                    */
6110 6117          dev->l2ad_writing = B_TRUE;
6111 6118          (void) zio_wait(pio);
6112 6119          dev->l2ad_writing = B_FALSE;
6113 6120  
6114 6121          return (write_asize);
6115 6122  }
6116 6123  
6117 6124  /*
6118 6125   * Compresses an L2ARC buffer.
6119 6126   * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
6120 6127   * size in l2hdr->b_asize. This routine tries to compress the data and
6121 6128   * depending on the compression result there are three possible outcomes:
6122 6129   * *) The buffer was incompressible. The original l2hdr contents were left
6123 6130   *    untouched and are ready for writing to an L2 device.
6124 6131   * *) The buffer was all-zeros, so there is no need to write it to an L2
6125 6132   *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6126 6133   *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6127 6134   * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6128 6135   *    data buffer which holds the compressed data to be written, and b_asize
6129 6136   *    tells us how much data there is. b_compress is set to the appropriate
6130 6137   *    compression algorithm. Once writing is done, invoke
6131 6138   *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6132 6139   *
6133 6140   * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6134 6141   * buffer was incompressible).
            * Note that the all-zeros outcome also returns B_TRUE.
            *
            * The header must carry both an L1 and an L2 portion and must not have
            * been compressed yet (b_compress == ZIO_COMPRESS_OFF); this is only
            * checked by ASSERTs.
6135 6142   */
6136 6143  static boolean_t
6137 6144  l2arc_compress_buf(arc_buf_hdr_t *hdr)
6138 6145  {
6139 6146          void *cdata;
6140 6147          size_t csize, len, rounded;
6141 6148          ASSERT(HDR_HAS_L2HDR(hdr));
6142 6149          l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6143 6150  
6144 6151          ASSERT(HDR_HAS_L1HDR(hdr));
6145 6152          ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
6146 6153          ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6147 6154  
                   /*
                    * The scratch buffer is as large as the uncompressed input; `len'
                    * is remembered so that the same size is passed when freeing it.
                    */
6148 6155          len = l2hdr->b_asize;
6149 6156          cdata = zio_data_buf_alloc(len);
6150 6157          ASSERT3P(cdata, !=, NULL);
6151 6158          csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6152 6159              cdata, l2hdr->b_asize);
6153 6160  
                   /*
                    * Round the compressed size up to a multiple of SPA_MINBLOCKSIZE,
                    * zero-filling the padding bytes in cdata.
                    */
6154 6161          rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6155 6162          if (rounded > csize) {
6156 6163                  bzero((char *)cdata + csize, rounded - csize);
6157 6164                  csize = rounded;
6158 6165          }
6159 6166  
6160 6167          if (csize == 0) {
6161 6168                  /* zero block, indicate that there's nothing to write */
6162 6169                  zio_data_buf_free(cdata, len);
6163 6170                  l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
6164 6171                  l2hdr->b_asize = 0;
6165 6172                  hdr->b_l1hdr.b_tmp_cdata = NULL;
6166 6173                  ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6167 6174                  return (B_TRUE);
6168 6175          } else if (csize > 0 && csize < len) {
6169 6176                  /*
6170 6177                   * Compression succeeded, we'll keep the cdata around for
6171 6178                   * writing and release it afterwards.
6172 6179                   */
6173 6180                  l2hdr->b_compress = ZIO_COMPRESS_LZ4;
6174 6181                  l2hdr->b_asize = csize;
6175 6182                  hdr->b_l1hdr.b_tmp_cdata = cdata;
6176 6183                  ARCSTAT_BUMP(arcstat_l2_compress_successes);
6177 6184                  return (B_TRUE);
6178 6185          } else {
6179 6186                  /*
6180 6187                   * Compression failed, release the compressed buffer.
6181 6188                   * l2hdr will be left unmodified.
6182 6189                   */
6183 6190                  zio_data_buf_free(cdata, len);
6184 6191                  ARCSTAT_BUMP(arcstat_l2_compress_failures);
6185 6192                  return (B_FALSE);
6186 6193          }
6187 6194  }
6188 6195  
6189 6196  /*
6190 6197   * Decompresses a zio read back from an l2arc device. On success, the
6191 6198   * underlying zio's io_data buffer is overwritten by the uncompressed
6192 6199   * version. On decompression error (corrupt compressed stream), the
6193 6200   * zio->io_error value is set to signal an I/O error.
6194 6201   *
6195 6202   * Please note that the compressed data stream is not checksummed, so
6196 6203   * if the underlying device is experiencing data corruption, we may feed
6197 6204   * corrupt data to the decompressor, so the decompressor needs to be
6198 6205   * able to handle this situation (LZ4 does).
            *
            * Parameters:
            *   zio - the completed read; its io_data, io_size, io_orig_size and
            *         possibly io_orig_data / io_error are updated here.
            *   hdr - header of the buffer being read; hdr->b_size is the
            *         uncompressed size that the zio is restored to.
            *   c   - compression type recorded for the buffer; must satisfy
            *         L2ARC_IS_VALID_COMPRESS() (checked by ASSERT only).
            *
            * If the zio already carries an error, nothing is decompressed: only
            * the I/O sizes are reset to hdr->b_size.
6199 6206   */
6200 6207  static void
6201 6208  l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6202 6209  {
6203 6210          ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6204 6211  
6205 6212          if (zio->io_error != 0) {
6206 6213                  /*
6207 6214                   * An io error has occurred, just restore the original io
6208 6215                   * size in preparation for a main pool read.
6209 6216                   */
6210 6217                  zio->io_orig_size = zio->io_size = hdr->b_size;
6211 6218                  return;
6212 6219          }
6213 6220  
6214 6221          if (c == ZIO_COMPRESS_EMPTY) {
6215 6222                  /*
6216 6223                   * An empty buffer results in a null zio, which means we
6217 6224                   * need to fill its io_data after we're done restoring the
6218 6225                   * buffer's contents.
6219 6226                   */
6220 6227                  ASSERT(hdr->b_l1hdr.b_buf != NULL);
6221 6228                  bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6222 6229                  zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6223 6230          } else {
6224 6231                  ASSERT(zio->io_data != NULL);
6225 6232                  /*
6226 6233                   * We copy the compressed data from the start of the arc buffer
6227 6234                   * (the zio_read will have pulled in only what we need, the
6228 6235                   * rest is garbage which we will overwrite at decompression)
6229 6236                   * and then decompress back to the ARC data buffer. This way we
6230 6237                   * can minimize copying by simply decompressing back over the
6231 6238                   * original compressed data (rather than decompressing to an
6232 6239                   * aux buffer and then copying back the uncompressed buffer,
6233 6240                   * which is likely to be much larger).
6234 6241                   */
6235 6242                  uint64_t csize;
6236 6243                  void *cdata;
6237 6244  
6238 6245                  csize = zio->io_size;
6239 6246                  cdata = zio_data_buf_alloc(csize);
6240 6247                  bcopy(zio->io_data, cdata, csize);
6241 6248                  if (zio_decompress_data(c, cdata, zio->io_data, csize,
6242 6249                      hdr->b_size) != 0)
6243 6250                          zio->io_error = EIO;
                           /* The scratch copy is released on success and failure alike. */
6244 6251                  zio_data_buf_free(cdata, csize);
6245 6252          }
6246 6253  
6247 6254          /* Restore the expected uncompressed IO size. */
6248 6255          zio->io_orig_size = zio->io_size = hdr->b_size;
6249 6256  }
6250 6257  
6251 6258  /*
6252 6259   * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6253 6260   * This buffer serves as a temporary holder of compressed data while
6254 6261   * the buffer entry is being written to an l2arc device. Once that is
6255 6262   * done, we can dispose of it.
            *
            * What happens depends on hdr->b_l2hdr.b_compress: for ZIO_COMPRESS_OFF
            * the pointer is merely cleared, for ZIO_COMPRESS_EMPTY there is nothing
            * to do, and for any other valid type the temporary buffer is freed and
            * the pointer cleared.  The header must have both an L1 and an L2
            * portion (checked by ASSERT only).
6256 6263   */
6257 6264  static void
6258 6265  l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6259 6266  {
6260 6267          ASSERT(HDR_HAS_L2HDR(hdr));
6261 6268          enum zio_compress comp = hdr->b_l2hdr.b_compress;
6262 6269  
6263 6270          ASSERT(HDR_HAS_L1HDR(hdr));
6264 6271          ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6265 6272  
6266 6273          if (comp == ZIO_COMPRESS_OFF) {
6267 6274                  /*
6268 6275                   * In this case, b_tmp_cdata points to the same buffer
6269 6276                   * as the arc_buf_t's b_data field. We don't want to
6270 6277                   * free it, since the arc_buf_t will handle that.
6271 6278                   */
6272 6279                  hdr->b_l1hdr.b_tmp_cdata = NULL;
6273 6280          } else if (comp == ZIO_COMPRESS_EMPTY) {
6274 6281                  /*
6275 6282                   * In this case, b_tmp_cdata was compressed to an empty
6276 6283                   * buffer, thus there's nothing to free and b_tmp_cdata
6277 6284                   * should have been set to NULL by l2arc_compress_buf()
                            * (called from l2arc_write_buffers()).
6278 6285                   */
6279 6286                  ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6280 6287          } else {
6281 6288                  /*
6282 6289                   * If the data was compressed, then we've allocated a
6283 6290                   * temporary buffer for it, so now we need to release it.
                            *
                            * l2arc_compress_buf() allocated it with the
                            * pre-compression b_asize, which l2arc_write_buffers()
                            * initialises to hdr->b_size; hence the size used here
                            * (this assumes b_size has not changed in between).
6284 6291                   */
6285 6292                  ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6286 6293                  zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6287 6294                      hdr->b_size);
6288 6295                  hdr->b_l1hdr.b_tmp_cdata = NULL;
6289 6296          }
6290 6297  
6291 6298  }
6292 6299  
6293 6300  /*
6294 6301   * This thread feeds the L2ARC at regular intervals.  This is the beating
6295 6302   * heart of the L2ARC.
            *
            * Each iteration sleeps on l2arc_feed_thr_cv until `next', picks a
            * device via l2arc_dev_get_next(), evicts the region that is about to
            * be overwritten and then writes new buffers to it.  An iteration is
            * cut short when there are no devices, when the pool is not writeable,
            * or when arc_reclaim_needed() reports memory pressure.
            *
            * The loop ends once l2arc_thread_exit becomes non-zero; the thread
            * then clears the flag and broadcasts on l2arc_feed_thr_cv before
            * exiting (presumably to wake a waiter in l2arc_stop() -- confirm
            * there).
6296 6303   */
6297 6304  static void
6298 6305  l2arc_feed_thread(void)
6299 6306  {
6300 6307          callb_cpr_t cpr;
6301 6308          l2arc_dev_t *dev;
6302 6309          spa_t *spa;
6303 6310          uint64_t size, wrote;
6304 6311          clock_t begin, next = ddi_get_lbolt();
6305 6312          boolean_t headroom_boost = B_FALSE;
6306 6313  
6307 6314          CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6308 6315  
6309 6316          mutex_enter(&l2arc_feed_thr_lock);
6310 6317  
6311 6318          while (l2arc_thread_exit == 0) {
6312 6319                  CALLB_CPR_SAFE_BEGIN(&cpr);
6313 6320                  (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
6314 6321                      next);
6315 6322                  CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
                           /*
                            * Default wakeup is hz ticks from now; the paths below
                            * may replace it before the next wait.
                            */
6316 6323                  next = ddi_get_lbolt() + hz;
6317 6324  
6318 6325                  /*
6319 6326                   * Quick check for L2ARC devices.
6320 6327                   */
6321 6328                  mutex_enter(&l2arc_dev_mtx);
6322 6329                  if (l2arc_ndev == 0) {
6323 6330                          mutex_exit(&l2arc_dev_mtx);
6324 6331                          continue;
6325 6332                  }
6326 6333                  mutex_exit(&l2arc_dev_mtx);
6327 6334                  begin = ddi_get_lbolt();
6328 6335  
6329 6336                  /*
6330 6337                   * This selects the next l2arc device to write to, and in
6331 6338                   * doing so the next spa to feed from: dev->l2ad_spa.   This
6332 6339                   * will return NULL if there are now no l2arc devices or if
6333 6340                   * they are all faulted.
6334 6341                   *
6335 6342                   * If a device is returned, its spa's config lock is also
6336 6343                   * held to prevent device removal.  l2arc_dev_get_next()
6337 6344                   * will grab and release l2arc_dev_mtx.
6338 6345                   */
6339 6346                  if ((dev = l2arc_dev_get_next()) == NULL)
6340 6347                          continue;
6341 6348  
6342 6349                  spa = dev->l2ad_spa;
6343 6350                  ASSERT(spa != NULL);
6344 6351  
6345 6352                  /*
6346 6353                   * If the pool is read-only then force the feed thread to
6347 6354                   * sleep a little longer.
6348 6355                   */
6349 6356                  if (!spa_writeable(spa)) {
6350 6357                          next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6351 6358                          spa_config_exit(spa, SCL_L2ARC, dev);
6352 6359                          continue;
6353 6360                  }
6354 6361  
6355 6362                  /*
6356 6363                   * Avoid contributing to memory pressure.
6357 6364                   */
6358 6365                  if (arc_reclaim_needed()) {
6359 6366                          ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6360 6367                          spa_config_exit(spa, SCL_L2ARC, dev);
6361 6368                          continue;
6362 6369                  }
6363 6370  
6364 6371                  ARCSTAT_BUMP(arcstat_l2_feeds);
6365 6372  
6366 6373                  size = l2arc_write_size();
6367 6374  
6368 6375                  /*
6369 6376                   * Evict L2ARC buffers that will be overwritten.
6370 6377                   */
6371 6378                  l2arc_evict(dev, size, B_FALSE);
6372 6379  
6373 6380                  /*
6374 6381                   * Write ARC buffers.
6375 6382                   */
6376 6383                  wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6377 6384  
6378 6385                  /*
6379 6386                   * Calculate interval between writes.
6380 6387                   */
6381 6388                  next = l2arc_write_interval(begin, size, wrote);
6382 6389                  spa_config_exit(spa, SCL_L2ARC, dev);
6383 6390          }
6384 6391  
6385 6392          l2arc_thread_exit = 0;
6386 6393          cv_broadcast(&l2arc_feed_thr_cv);
6387 6394          CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
6388 6395          thread_exit();
6389 6396  }
6390 6397  
6391 6398  boolean_t
6392 6399  l2arc_vdev_present(vdev_t *vd)
6393 6400  {
6394 6401          l2arc_dev_t *dev;
6395 6402  
                   /*
                    * Report whether vd is currently on the global L2ARC device list.
                    * The list is scanned linearly under l2arc_dev_mtx; dev is left
                    * non-NULL only if an entry whose l2ad_vdev equals vd was found.
                    * The mutex is dropped before returning, so the answer is a
                    * snapshot.
                    */
6396 6403          mutex_enter(&l2arc_dev_mtx);
6397 6404          for (dev = list_head(l2arc_dev_list); dev != NULL;
6398 6405              dev = list_next(l2arc_dev_list, dev)) {
6399 6406                  if (dev->l2ad_vdev == vd)
6400 6407                          break;
6401 6408          }
6402 6409          mutex_exit(&l2arc_dev_mtx);
6403 6410  
6404 6411          return (dev != NULL);
6405 6412  }
6406 6413  
6407 6414  /*
6408 6415   * Add a vdev for use by the L2ARC.  By this point the spa has already
6409 6416   * validated the vdev and opened it.
            *
            * The usable region of the device starts at VDEV_LABEL_START_SIZE and
            * extends for vdev_get_min_asize(vd); the write hand starts at the
            * beginning of that region and l2ad_first is set.  The new entry is
            * put on l2arc_dev_list last, once it is fully initialised.  The vdev
            * must not already be registered (checked by ASSERT only).
6410 6417   */
6411 6418  void
6412 6419  l2arc_add_vdev(spa_t *spa, vdev_t *vd)
6413 6420  {
6414 6421          l2arc_dev_t *adddev;
6415 6422  
6416 6423          ASSERT(!l2arc_vdev_present(vd));
6417 6424  
6418 6425          /*
6419 6426           * Create a new l2arc device entry.
6420 6427           */
6421 6428          adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6422 6429          adddev->l2ad_spa = spa;
6423 6430          adddev->l2ad_vdev = vd;
6424 6431          adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6425 6432          adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
6426 6433          adddev->l2ad_hand = adddev->l2ad_start;
6427 6434          adddev->l2ad_first = B_TRUE;
6428 6435          adddev->l2ad_writing = B_FALSE;
6429 6436  
6430 6437          mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
6431 6438          /*
6432 6439           * This is a list of all ARC buffers that are still valid on the
6433 6440           * device.
6434 6441           */
6435 6442          list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6436 6443              offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
6437 6444  
                   /*
                    * Account for the usable region on the vdev (presumably the last
                    * argument is the total-space delta -- see vdev_space_update()).
                    */
6438 6445          vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
6439 6446          refcount_create(&adddev->l2ad_alloc);
6440 6447  
6441 6448          /*
6442 6449           * Add device to global list
6443 6450           */
6444 6451          mutex_enter(&l2arc_dev_mtx);
6445 6452          list_insert_head(l2arc_dev_list, adddev);
6446 6453          atomic_inc_64(&l2arc_ndev);
6447 6454          mutex_exit(&l2arc_dev_mtx);
6448 6455  }
6449 6456  
6450 6457  /*
6451 6458   * Remove a vdev from the L2ARC.
            *
            * The matching device entry is first unlinked from l2arc_dev_list under
            * l2arc_dev_mtx; afterwards all of its buffers are evicted and the
            * entry's list, mutex, refcount and memory are released.  vd must
            * currently be registered: this is only checked by an ASSERT, and the
            * code that follows uses the entry unconditionally.
6452 6459   */
6453 6460  void
6454 6461  l2arc_remove_vdev(vdev_t *vd)
6455 6462  {
6456 6463          l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6457 6464  
6458 6465          /*
6459 6466           * Find the device by vdev
6460 6467           */
6461 6468          mutex_enter(&l2arc_dev_mtx);
6462 6469          for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6463 6470                  nextdev = list_next(l2arc_dev_list, dev);
6464 6471                  if (vd == dev->l2ad_vdev) {
6465 6472                          remdev = dev;
6466 6473                          break;
6467 6474                  }
6468 6475          }
6469 6476          ASSERT(remdev != NULL);
6470 6477  
6471 6478          /*
6472 6479           * Remove device from global list
6473 6480           */
6474 6481          list_remove(l2arc_dev_list, remdev);
6475 6482          l2arc_dev_last = NULL;          /* may have been invalidated */
6476 6483          atomic_dec_64(&l2arc_ndev);
6477 6484          mutex_exit(&l2arc_dev_mtx);
6478 6485  
6479 6486          /*
6480 6487           * Clear all buflists and ARC references.  L2ARC device flush.
6481 6488           */
6482 6489          l2arc_evict(remdev, 0, B_TRUE);
6483 6490          list_destroy(&remdev->l2ad_buflist);
6484 6491          mutex_destroy(&remdev->l2ad_mtx);
6485 6492          refcount_destroy(&remdev->l2ad_alloc);
6486 6493          kmem_free(remdev, sizeof (l2arc_dev_t));
6487 6494  }
6488 6495  
6489 6496  void
6490 6497  l2arc_init(void)
6491 6498  {
                   /*
                    * Reset the L2ARC global state, then set up the locks, the feed
                    * thread's condition variable and the two global lists: the
                    * registered devices (l2arc_dev_t) and the free-on-write entries
                    * (l2arc_data_free_t).
                    */
6492 6499          l2arc_thread_exit = 0;
6493 6500          l2arc_ndev = 0;
6494 6501          l2arc_writes_sent = 0;
6495 6502          l2arc_writes_done = 0;
6496 6503  
6497 6504          mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6498 6505          cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6499 6506          mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6500 6507          mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6501 6508  
                   /* The list heads point at the static L2ARC_* list objects. */
6502 6509          l2arc_dev_list = &L2ARC_dev_list;
6503 6510          l2arc_free_on_write = &L2ARC_free_on_write;
6504 6511          list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6505 6512              offsetof(l2arc_dev_t, l2ad_node));
6506 6513          list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6507 6514              offsetof(l2arc_data_free_t, l2df_list_node));
6508 6515  }
6509 6516  
6510 6517  void
6511 6518  l2arc_fini(void)
6512 6519  {
6513 6520          /*
6514 6521           * This is called from dmu_fini(), which is called from spa_fini();
6515 6522           * Because of this, we can assume that all l2arc devices have
6516 6523           * already been removed when the pools themselves were removed.
6517 6524           */
6518 6525  
6519 6526          l2arc_do_free_on_write();
6520 6527  
6521 6528          mutex_destroy(&l2arc_feed_thr_lock);
6522 6529          cv_destroy(&l2arc_feed_thr_cv);
6523 6530          mutex_destroy(&l2arc_dev_mtx);
6524 6531          mutex_destroy(&l2arc_free_on_write_mtx);
6525 6532  
6526 6533          list_destroy(l2arc_dev_list);
6527 6534          list_destroy(l2arc_free_on_write);
6528 6535  }
6529 6536  
6530 6537  void
6531 6538  l2arc_start(void)
6532 6539  {
6533 6540          if (!(spa_mode_global & FWRITE))
6534 6541                  return;
6535 6542  
6536 6543          (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6537 6544              TS_RUN, minclsyspri);
6538 6545  }
6539 6546  
6540 6547  void
6541 6548  l2arc_stop(void)
6542 6549  {
6543 6550          if (!(spa_mode_global & FWRITE))
6544 6551                  return;
6545 6552  
6546 6553          mutex_enter(&l2arc_feed_thr_lock);
6547 6554          cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
6548 6555          l2arc_thread_exit = 1;
6549 6556          while (l2arc_thread_exit != 0)
6550 6557                  cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6551 6558          mutex_exit(&l2arc_feed_thr_lock);
6552 6559  }

↓ open down ↓

5239 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX