3525 Persistent L2ARC
--- old/usr/src/uts/common/fs/zfs/arc.c
+++ new/usr/src/uts/common/fs/zfs/arc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * DVA-based Adjustable Replacement Cache
30 30 *
31 31 * While much of the theory of operation used here is
32 32 * based on the self-tuning, low overhead replacement cache
33 33 * presented by Megiddo and Modha at FAST 2003, there are some
34 34 * significant differences:
35 35 *
36 36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 37 * Pages in its cache cannot be "locked" into memory. This makes
38 38 * the eviction algorithm simple: evict the last page in the list.
39 39 * the eviction algorithm simple: evict the last page in the list.
40 40 * about. Our cache is not so simple. At any given moment, some
41 41 * subset of the blocks in the cache are un-evictable because we
42 42 * have handed out a reference to them. Blocks are only evictable
43 43 * when there are no external references active. This makes
44 44 * eviction far more problematic: we choose to evict the evictable
45 45 * blocks that are the "lowest" in the list.
46 46 *
47 47 * There are times when it is not possible to evict the requested
48 48 * space. In these circumstances we are unable to adjust the cache
49 49 * size. To prevent the cache growing unbounded at these times we
50 50 * implement a "cache throttle" that slows the flow of new data
51 51 * into the cache until we can make space available.
52 52 *
53 53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 54 * Pages are evicted when the cache is full and there is a cache
55 55 * miss. Our model has a variable sized cache. It grows with
56 56 * high use, but also tries to react to memory pressure from the
57 57 * operating system: decreasing its size when system memory is
58 58 * tight.
59 59 *
60 60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 61 * elements of the cache are therefore exactly the same size. So
62 62 * when adjusting the cache size following a cache miss, it's simply
63 63 * a matter of choosing a single page to evict. In our model, we
64 64 * have variable sized cache blocks (ranging from 512 bytes to
65 65 * 128K bytes). We therefore choose a set of blocks to evict to make
66 66 * space for a cache miss that approximates as closely as possible
67 67 * the space used by the new block.
68 68 *
69 69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 70 * by N. Megiddo & D. Modha, FAST 2003
71 71 */
72 72
73 73 /*
74 74 * The locking model:
75 75 *
76 76 * A new reference to a cache buffer can be obtained in two
77 77 * ways: 1) via a hash table lookup using the DVA as a key,
78 78 * or 2) via one of the ARC lists. The arc_read() interface
79 79 * uses method 1, while the internal arc algorithms for
80 80 * adjusting the cache use method 2. We therefore provide two
81 81 * types of locks: 1) the hash table lock array, and 2) the
82 82 * arc list locks.
83 83 *
84 84 * Buffers do not have their own mutexes, rather they rely on the
85 85 * hash table mutexes for the bulk of their protection (i.e. most
86 86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 87 *
88 88 * buf_hash_find() returns the appropriate mutex (held) when it
89 89 * locates the requested buffer in the hash table. It returns
90 90 * NULL for the mutex if the buffer was not in the table.
91 91 *
92 92 * buf_hash_remove() expects the appropriate hash mutex to be
93 93 * already held before it is invoked.
94 94 *
95 95 * Each arc state also has a mutex which is used to protect the
96 96 * buffer list associated with the state. When attempting to
97 97 * obtain a hash table lock while holding an arc list lock you
98 98 * must use mutex_tryenter() to avoid deadlock. Also note that
99 99 * the active state mutex must be held before the ghost state mutex.
100 100 *
101 101 * Arc buffers may have an associated eviction callback function.
102 102 * This function will be invoked prior to removing the buffer (e.g.
103 103 * in arc_do_user_evicts()). Note however that the data associated
104 104 * with the buffer may be evicted prior to the callback. The callback
105 105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 106 * the users of callbacks must ensure that their private data is
107 107 * protected from simultaneous callbacks from arc_buf_evict()
108 108 * and arc_do_user_evicts().
109 109 *
110 110 * Note that the majority of the performance stats are manipulated
111 111 * with atomic operations.
112 112 *
113 113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 114 *
115 115 * - L2ARC buflist creation
116 116 * - L2ARC buflist eviction
117 117 * - L2ARC write completion, which walks L2ARC buflists
118 118 * - ARC header destruction, as it removes from L2ARC buflists
119 119 * - ARC header release, as it removes from L2ARC buflists
120 120 */
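Editorial note: a minimal sketch (not part of this webrev) of the lock-ordering rule described above, loosely modeled on the ARC eviction path. While an arc list lock is held, a hash table lock may only be taken with mutex_tryenter(); on failure the buffer is skipped rather than risking a deadlock. Names match the surrounding file, but the fragment is illustrative only.

	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(list); ab != NULL; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		hash_lock = HDR_LOCK(ab);	/* hash lock covering this header */
		if (!mutex_tryenter(hash_lock)) {
			/* never block on a hash lock while holding arcs_mtx */
			ARCSTAT_BUMP(arcstat_mutex_miss);
			continue;
		}
		/* ... evict or recycle the buffer under both locks ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&state->arcs_mtx);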
121 121
122 122 #include <sys/spa.h>
123 123 #include <sys/zio.h>
124 124 #include <sys/zio_compress.h>
125 125 #include <sys/zfs_context.h>
126 126 #include <sys/arc.h>
127 127 #include <sys/refcount.h>
128 128 #include <sys/vdev.h>
129 129 #include <sys/vdev_impl.h>
130 130 #ifdef _KERNEL
131 131 #include <sys/vmsystm.h>
132 132 #include <vm/anon.h>
133 133 #include <sys/fs/swapnode.h>
134 134 #include <sys/dnlc.h>
135 135 #endif
136 136 #include <sys/callb.h>
137 137 #include <sys/kstat.h>
138 138 #include <zfs_fletcher.h>
139 +#include <sys/byteorder.h>
139 140
140 141 #ifndef _KERNEL
141 142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
142 143 boolean_t arc_watch = B_FALSE;
143 144 int arc_procfd;
144 145 #endif
145 146
146 147 static kmutex_t arc_reclaim_thr_lock;
147 148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
148 149 static uint8_t arc_thread_exit;
149 150
150 151 extern int zfs_write_limit_shift;
151 152 extern uint64_t zfs_write_limit_max;
152 153 extern kmutex_t zfs_write_limit_lock;
153 154
154 155 #define ARC_REDUCE_DNLC_PERCENT 3
155 156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156 157
157 158 typedef enum arc_reclaim_strategy {
158 159 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 160 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160 161 } arc_reclaim_strategy_t;
161 162
162 163 /* number of seconds before growing cache again */
163 164 static int arc_grow_retry = 60;
164 165
165 166 /* shift of arc_c for calculating both min and max arc_p */
166 167 static int arc_p_min_shift = 4;
167 168
168 169 /* log2(fraction of arc to reclaim) */
169 170 static int arc_shrink_shift = 5;
170 171
171 172 /*
172 173 * minimum lifespan of a prefetch block in clock ticks
173 174 * (initialized in arc_init())
174 175 */
175 176 static int arc_min_prefetch_lifespan;
176 177
177 178 static int arc_dead;
178 179
179 180 /*
180 181 * The arc has filled available memory and has now warmed up.
181 182 */
182 183 static boolean_t arc_warm;
183 184
184 185 /*
185 186 * These tunables are for performance analysis.
186 187 */
187 188 uint64_t zfs_arc_max;
188 189 uint64_t zfs_arc_min;
189 190 uint64_t zfs_arc_meta_limit = 0;
190 191 int zfs_arc_grow_retry = 0;
191 192 int zfs_arc_shrink_shift = 0;
192 193 int zfs_arc_p_min_shift = 0;
193 194 int zfs_disable_dup_eviction = 0;
194 195
195 196 /*
196 197 * Note that buffers can be in one of 6 states:
197 198 * ARC_anon - anonymous (discussed below)
198 199 * ARC_mru - recently used, currently cached
199 200 * ARC_mru_ghost - recently used, no longer in cache
200 201 * ARC_mfu - frequently used, currently cached
201 202 * ARC_mfu_ghost - frequently used, no longer in cache
202 203 * ARC_l2c_only - exists in L2ARC but not other states
203 204 * When there are no active references to the buffer, they
204 205 * are linked onto a list in one of these arc states. These are
205 206 * the only buffers that can be evicted or deleted. Within each
206 207 * state there are multiple lists, one for meta-data and one for
207 208 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
208 209 * etc.) is tracked separately so that it can be managed more
209 210 * explicitly: favored over data, limited explicitly.
210 211 *
211 212 * Anonymous buffers are buffers that are not associated with
212 213 * a DVA. These are buffers that hold dirty block copies
213 214 * before they are written to stable storage. By definition,
214 215 * they are "ref'd" and are considered part of arc_mru
215 216 * that cannot be freed. Generally, they will acquire a DVA
216 217 * as they are written and migrate onto the arc_mru list.
217 218 *
218 219 * The ARC_l2c_only state is for buffers that are in the second
219 220 * level ARC but no longer in any of the ARC_m* lists. The second
220 221 * level ARC itself may also contain buffers that are in any of
221 222 * the ARC_m* states - meaning that a buffer can exist in two
222 223 * places. The reason for the ARC_l2c_only state is to keep the
223 224 * buffer header in the hash table, so that reads that hit the
224 225 * second level ARC benefit from these fast lookups.
225 226 */
226 227
227 228 typedef struct arc_state {
228 229 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
229 230 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
230 231 uint64_t arcs_size; /* total amount of data in this state */
231 232 kmutex_t arcs_mtx;
232 233 } arc_state_t;
233 234
234 235 /* The 6 states: */
235 236 static arc_state_t ARC_anon;
236 237 static arc_state_t ARC_mru;
237 238 static arc_state_t ARC_mru_ghost;
238 239 static arc_state_t ARC_mfu;
239 240 static arc_state_t ARC_mfu_ghost;
240 241 static arc_state_t ARC_l2c_only;
241 242
242 243 typedef struct arc_stats {
243 244 kstat_named_t arcstat_hits;
244 245 kstat_named_t arcstat_misses;
245 246 kstat_named_t arcstat_demand_data_hits;
246 247 kstat_named_t arcstat_demand_data_misses;
247 248 kstat_named_t arcstat_demand_metadata_hits;
248 249 kstat_named_t arcstat_demand_metadata_misses;
249 250 kstat_named_t arcstat_prefetch_data_hits;
250 251 kstat_named_t arcstat_prefetch_data_misses;
251 252 kstat_named_t arcstat_prefetch_metadata_hits;
252 253 kstat_named_t arcstat_prefetch_metadata_misses;
253 254 kstat_named_t arcstat_mru_hits;
254 255 kstat_named_t arcstat_mru_ghost_hits;
255 256 kstat_named_t arcstat_mfu_hits;
256 257 kstat_named_t arcstat_mfu_ghost_hits;
257 258 kstat_named_t arcstat_deleted;
258 259 kstat_named_t arcstat_recycle_miss;
259 260 /*
260 261 * Number of buffers that could not be evicted because the hash lock
261 262 * was held by another thread. The lock may not necessarily be held
262 263 * by something using the same buffer, since hash locks are shared
263 264 * by multiple buffers.
264 265 */
265 266 kstat_named_t arcstat_mutex_miss;
266 267 /*
267 268 * Number of buffers skipped because they have I/O in progress, are
268 269 * indirect prefetch buffers that have not lived long enough, or are
269 270 * not from the spa we're trying to evict from.
270 271 */
271 272 kstat_named_t arcstat_evict_skip;
272 273 kstat_named_t arcstat_evict_l2_cached;
273 274 kstat_named_t arcstat_evict_l2_eligible;
274 275 kstat_named_t arcstat_evict_l2_ineligible;
275 276 kstat_named_t arcstat_hash_elements;
276 277 kstat_named_t arcstat_hash_elements_max;
277 278 kstat_named_t arcstat_hash_collisions;
278 279 kstat_named_t arcstat_hash_chains;
279 280 kstat_named_t arcstat_hash_chain_max;
280 281 kstat_named_t arcstat_p;
281 282 kstat_named_t arcstat_c;
282 283 kstat_named_t arcstat_c_min;
283 284 kstat_named_t arcstat_c_max;
284 285 kstat_named_t arcstat_size;
285 286 kstat_named_t arcstat_hdr_size;
286 287 kstat_named_t arcstat_data_size;
287 288 kstat_named_t arcstat_other_size;
288 289 kstat_named_t arcstat_l2_hits;
289 290 kstat_named_t arcstat_l2_misses;
290 291 kstat_named_t arcstat_l2_feeds;
291 292 kstat_named_t arcstat_l2_rw_clash;
292 293 kstat_named_t arcstat_l2_read_bytes;
293 294 kstat_named_t arcstat_l2_write_bytes;
294 295 kstat_named_t arcstat_l2_writes_sent;
295 296 kstat_named_t arcstat_l2_writes_done;
296 297 kstat_named_t arcstat_l2_writes_error;
297 298 kstat_named_t arcstat_l2_writes_hdr_miss;
298 299 kstat_named_t arcstat_l2_evict_lock_retry;
299 300 kstat_named_t arcstat_l2_evict_reading;
300 301 kstat_named_t arcstat_l2_free_on_write;
301 302 kstat_named_t arcstat_l2_abort_lowmem;
302 303 kstat_named_t arcstat_l2_cksum_bad;
303 304 kstat_named_t arcstat_l2_io_error;
304 305 kstat_named_t arcstat_l2_size;
305 306 kstat_named_t arcstat_l2_asize;
306 307 kstat_named_t arcstat_l2_hdr_size;
307 308 kstat_named_t arcstat_l2_compress_successes;
308 309 kstat_named_t arcstat_l2_compress_zeros;
309 310 kstat_named_t arcstat_l2_compress_failures;
311 + kstat_named_t arcstat_l2_meta_writes;
312 + kstat_named_t arcstat_l2_meta_avg_size;
313 + kstat_named_t arcstat_l2_meta_avg_asize;
314 + kstat_named_t arcstat_l2_asize_to_meta_ratio;
315 + kstat_named_t arcstat_l2_rebuild_attempts;
316 + kstat_named_t arcstat_l2_rebuild_successes;
317 + kstat_named_t arcstat_l2_rebuild_unsupported;
318 + kstat_named_t arcstat_l2_rebuild_timeout;
319 + kstat_named_t arcstat_l2_rebuild_arc_bytes;
320 + kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
321 + kstat_named_t arcstat_l2_rebuild_bufs;
322 + kstat_named_t arcstat_l2_rebuild_bufs_precached;
323 + kstat_named_t arcstat_l2_rebuild_metabufs;
324 + kstat_named_t arcstat_l2_rebuild_uberblk_errors;
325 + kstat_named_t arcstat_l2_rebuild_io_errors;
326 + kstat_named_t arcstat_l2_rebuild_cksum_errors;
327 + kstat_named_t arcstat_l2_rebuild_loop_errors;
328 + kstat_named_t arcstat_l2_rebuild_abort_lowmem;
310 329 kstat_named_t arcstat_memory_throttle_count;
311 330 kstat_named_t arcstat_duplicate_buffers;
312 331 kstat_named_t arcstat_duplicate_buffers_size;
313 332 kstat_named_t arcstat_duplicate_reads;
314 333 kstat_named_t arcstat_meta_used;
315 334 kstat_named_t arcstat_meta_limit;
316 335 kstat_named_t arcstat_meta_max;
317 336 } arc_stats_t;
318 337
319 338 static arc_stats_t arc_stats = {
320 339 { "hits", KSTAT_DATA_UINT64 },
321 340 { "misses", KSTAT_DATA_UINT64 },
322 341 { "demand_data_hits", KSTAT_DATA_UINT64 },
323 342 { "demand_data_misses", KSTAT_DATA_UINT64 },
324 343 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
325 344 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
326 345 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
327 346 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
328 347 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
329 348 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
330 349 { "mru_hits", KSTAT_DATA_UINT64 },
331 350 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
332 351 { "mfu_hits", KSTAT_DATA_UINT64 },
333 352 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
334 353 { "deleted", KSTAT_DATA_UINT64 },
335 354 { "recycle_miss", KSTAT_DATA_UINT64 },
336 355 { "mutex_miss", KSTAT_DATA_UINT64 },
337 356 { "evict_skip", KSTAT_DATA_UINT64 },
338 357 { "evict_l2_cached", KSTAT_DATA_UINT64 },
339 358 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
340 359 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
341 360 { "hash_elements", KSTAT_DATA_UINT64 },
342 361 { "hash_elements_max", KSTAT_DATA_UINT64 },
343 362 { "hash_collisions", KSTAT_DATA_UINT64 },
344 363 { "hash_chains", KSTAT_DATA_UINT64 },
345 364 { "hash_chain_max", KSTAT_DATA_UINT64 },
346 365 { "p", KSTAT_DATA_UINT64 },
347 366 { "c", KSTAT_DATA_UINT64 },
348 367 { "c_min", KSTAT_DATA_UINT64 },
349 368 { "c_max", KSTAT_DATA_UINT64 },
350 369 { "size", KSTAT_DATA_UINT64 },
351 370 { "hdr_size", KSTAT_DATA_UINT64 },
352 371 { "data_size", KSTAT_DATA_UINT64 },
353 372 { "other_size", KSTAT_DATA_UINT64 },
354 373 { "l2_hits", KSTAT_DATA_UINT64 },
355 374 { "l2_misses", KSTAT_DATA_UINT64 },
356 375 { "l2_feeds", KSTAT_DATA_UINT64 },
357 376 { "l2_rw_clash", KSTAT_DATA_UINT64 },
358 377 { "l2_read_bytes", KSTAT_DATA_UINT64 },
359 378 { "l2_write_bytes", KSTAT_DATA_UINT64 },
360 379 { "l2_writes_sent", KSTAT_DATA_UINT64 },
361 380 { "l2_writes_done", KSTAT_DATA_UINT64 },
362 381 { "l2_writes_error", KSTAT_DATA_UINT64 },
363 382 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
364 383 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
365 384 { "l2_evict_reading", KSTAT_DATA_UINT64 },
366 385 { "l2_free_on_write", KSTAT_DATA_UINT64 },
367 386 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
368 387 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
369 388 { "l2_io_error", KSTAT_DATA_UINT64 },
370 389 { "l2_size", KSTAT_DATA_UINT64 },
371 390 { "l2_asize", KSTAT_DATA_UINT64 },
372 391 { "l2_hdr_size", KSTAT_DATA_UINT64 },
373 392 { "l2_compress_successes", KSTAT_DATA_UINT64 },
374 393 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
375 394 { "l2_compress_failures", KSTAT_DATA_UINT64 },
395 + { "l2_meta_writes", KSTAT_DATA_UINT64 },
396 + { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
397 + { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
398 + { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
399 + { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
400 + { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
401 + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
402 + { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
403 + { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
404 + { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
405 + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
406 + { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
407 + { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
408 + { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
409 + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
410 + { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
411 + { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
412 + { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
376 413 { "memory_throttle_count", KSTAT_DATA_UINT64 },
377 414 { "duplicate_buffers", KSTAT_DATA_UINT64 },
378 415 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
379 416 { "duplicate_reads", KSTAT_DATA_UINT64 },
380 417 { "arc_meta_used", KSTAT_DATA_UINT64 },
381 418 { "arc_meta_limit", KSTAT_DATA_UINT64 },
382 419 { "arc_meta_max", KSTAT_DATA_UINT64 }
383 420 };
384 421
385 422 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
386 423
387 424 #define ARCSTAT_INCR(stat, val) \
388 425 atomic_add_64(&arc_stats.stat.value.ui64, (val))
389 426
390 427 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
391 428 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
392 429
393 430 #define ARCSTAT_MAX(stat, val) { \
394 431 uint64_t m; \
395 432 while ((val) > (m = arc_stats.stat.value.ui64) && \
396 433 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
397 434 continue; \
398 435 }
399 436
400 437 #define ARCSTAT_MAXSTAT(stat) \
401 438 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
402 439
403 440 /*
404 441 * We define a macro to allow ARC hits/misses to be easily broken down by
405 442 * two separate conditions, giving a total of four different subtypes for
406 443 * each of hits and misses (so eight statistics total).
407 444 */
408 445 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
409 446 if (cond1) { \
410 447 if (cond2) { \
411 448 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
412 449 } else { \
413 450 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
414 451 } \
415 452 } else { \
416 453 if (cond2) { \
417 454 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
418 455 } else { \
419 456 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
420 457 } \
421 458 }
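For illustration (an editorial addition, not part of the diff): ARCSTAT_CONDSTAT is typically invoked along these lines to split a single event into its demand/prefetch and data/metadata sub-counters, bumping exactly one of the four corresponding kstats (e.g. arcstat_demand_data_hits):

	ARCSTAT_CONDSTAT(!(ab->b_flags & ARC_PREFETCH),
	    demand, prefetch, ab->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);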
422 459
460 +/*
461 + * This macro allows us to use kstats as floating averages. Each time we
462 + * update this kstat, we first factor it and the update value by
463 + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
464 + * average. This macro assumes that integer loads and stores are atomic, but
465 + * is not safe for multiple writers updating the kstat in parallel (only the
466 + * last writer's update will remain).
467 + */
468 +#define ARCSTAT_F_AVG_FACTOR 3
469 +#define ARCSTAT_F_AVG(stat, value) \
470 + do { \
471 + uint64_t x = ARCSTAT(stat); \
472 + x = x - x / ARCSTAT_F_AVG_FACTOR + \
473 + (value) / ARCSTAT_F_AVG_FACTOR; \
474 + ARCSTAT(stat) = x; \
475 + _NOTE(NOTREACHED) \
476 + _NOTE(CONSTCOND) \
477 + } while (0)
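Editorial worked example (not part of the diff): with ARCSTAT_F_AVG_FACTOR == 3 the update above computes x <- x - x/3 + value/3, an integer exponential moving average that weights the stored value by roughly 2/3 and the new sample by 1/3; a stored average of 900 and a new sample of 300 yields 900 - 300 + 100 = 700. A hypothetical call site might look like:

	/* pbuf_size is a placeholder for the metadata buffer's encoded size */
	ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, pbuf_size);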
478 +
423 479 kstat_t *arc_ksp;
424 480 static arc_state_t *arc_anon;
425 481 static arc_state_t *arc_mru;
426 482 static arc_state_t *arc_mru_ghost;
427 483 static arc_state_t *arc_mfu;
428 484 static arc_state_t *arc_mfu_ghost;
429 485 static arc_state_t *arc_l2c_only;
430 486
431 487 /*
432 488 * There are several ARC variables that are critical to export as kstats --
433 489 * but we don't want to have to grovel around in the kstat whenever we wish to
434 490 * manipulate them. For these variables, we therefore define them to be in
435 491 * terms of the statistic variable. This assures that we are not introducing
436 492 * the possibility of inconsistency by having shadow copies of the variables,
437 493 * while still allowing the code to be readable.
438 494 */
439 495 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
440 496 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
441 497 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
442 498 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
443 499 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
444 500 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
445 501 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
446 502 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
447 503
448 504 #define L2ARC_IS_VALID_COMPRESS(_c_) \
449 505 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
450 506
451 507 static int arc_no_grow; /* Don't try to grow cache size */
452 508 static uint64_t arc_tempreserve;
453 509 static uint64_t arc_loaned_bytes;
454 510
455 511 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
456 512
457 513 typedef struct arc_callback arc_callback_t;
458 514
459 515 struct arc_callback {
460 516 void *acb_private;
461 517 arc_done_func_t *acb_done;
462 518 arc_buf_t *acb_buf;
463 519 zio_t *acb_zio_dummy;
464 520 arc_callback_t *acb_next;
465 521 };
466 522
467 523 typedef struct arc_write_callback arc_write_callback_t;
468 524
469 525 struct arc_write_callback {
470 526 void *awcb_private;
471 527 arc_done_func_t *awcb_ready;
472 528 arc_done_func_t *awcb_done;
473 529 arc_buf_t *awcb_buf;
474 530 };
475 531
476 532 struct arc_buf_hdr {
477 533 /* protected by hash lock */
478 534 dva_t b_dva;
479 535 uint64_t b_birth;
480 536 uint64_t b_cksum0;
481 537
482 538 kmutex_t b_freeze_lock;
483 539 zio_cksum_t *b_freeze_cksum;
484 540 void *b_thawed;
485 541
486 542 arc_buf_hdr_t *b_hash_next;
487 543 arc_buf_t *b_buf;
488 544 uint32_t b_flags;
489 545 uint32_t b_datacnt;
490 546
491 547 arc_callback_t *b_acb;
492 548 kcondvar_t b_cv;
493 549
494 550 /* immutable */
495 551 arc_buf_contents_t b_type;
496 552 uint64_t b_size;
497 553 uint64_t b_spa;
498 554
499 555 /* protected by arc state mutex */
500 556 arc_state_t *b_state;
501 557 list_node_t b_arc_node;
502 558
503 559 /* updated atomically */
504 560 clock_t b_arc_access;
505 561
506 562 /* self protecting */
507 563 refcount_t b_refcnt;
508 564
509 565 l2arc_buf_hdr_t *b_l2hdr;
510 566 list_node_t b_l2node;
511 567 };
512 568
513 569 static arc_buf_t *arc_eviction_list;
514 570 static kmutex_t arc_eviction_mtx;
515 571 static arc_buf_hdr_t arc_eviction_hdr;
516 572 static void arc_get_data_buf(arc_buf_t *buf);
517 573 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
518 574 static int arc_evict_needed(arc_buf_contents_t type);
519 575 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
520 576 static void arc_buf_watch(arc_buf_t *buf);
521 577
522 578 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
523 579
524 580 #define GHOST_STATE(state) \
525 581 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
526 582 (state) == arc_l2c_only)
527 583
528 584 /*
529 585 * Private ARC flags. These flags are private ARC only flags that will show up
530 586 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can
531 587 * be passed in as arc_flags in things like arc_read. However, these flags
532 588 * should never be passed and should only be set by ARC code. When adding new
533 589 * public flags, make sure not to smash the private ones.
534 590 */
535 591
536 592 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
537 593 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
538 594 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
539 595 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
540 596 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
541 597 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
542 598 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
543 599 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
544 600 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
545 601 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
546 602
547 603 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
548 604 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
549 605 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
550 606 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
551 607 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
552 608 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
553 609 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
554 610 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
555 611 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
556 612 (hdr)->b_l2hdr != NULL)
557 613 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
558 614 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
559 615 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
560 616
561 617 /*
562 618 * Other sizes
563 619 */
564 620
565 621 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
566 622 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
567 623
568 624 /*
569 625 * Hash table routines
570 626 */
571 627
572 628 #define HT_LOCK_PAD 64
573 629
574 630 struct ht_lock {
575 631 kmutex_t ht_lock;
576 632 #ifdef _KERNEL
577 633 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
578 634 #endif
579 635 };
580 636
581 637 #define BUF_LOCKS 256
582 638 typedef struct buf_hash_table {
583 639 uint64_t ht_mask;
584 640 arc_buf_hdr_t **ht_table;
585 641 struct ht_lock ht_locks[BUF_LOCKS];
586 642 } buf_hash_table_t;
587 643
588 644 static buf_hash_table_t buf_hash_table;
589 645
590 646 #define BUF_HASH_INDEX(spa, dva, birth) \
591 647 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
592 648 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
593 649 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
594 650 #define HDR_LOCK(hdr) \
595 651 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
596 652
597 653 uint64_t zfs_crc64_table[256];
598 654
599 655 /*
600 656 * Level 2 ARC
601 657 */
602 658
603 659 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
604 660 #define L2ARC_HEADROOM 2 /* num of writes */
605 661 /*
606 662 * If we discover during ARC scan any buffers to be compressed, we boost
607 663 * our headroom for the next scanning cycle by this percentage multiple.
608 664 */
609 665 #define L2ARC_HEADROOM_BOOST 200
610 666 #define L2ARC_FEED_SECS 1 /* caching interval secs */
611 667 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
612 668
613 669 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
614 670 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
615 671
616 672 /* L2ARC Performance Tunables */
617 673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
618 674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
619 675 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
620 676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
621 677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
622 678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
623 679 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
624 680 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
625 681 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
626 682
627 683 /*
628 684 * L2ARC Internals
629 685 */
630 -typedef struct l2arc_dev {
631 - vdev_t *l2ad_vdev; /* vdev */
632 - spa_t *l2ad_spa; /* spa */
633 - uint64_t l2ad_hand; /* next write location */
634 - uint64_t l2ad_start; /* first addr on device */
635 - uint64_t l2ad_end; /* last addr on device */
636 - uint64_t l2ad_evict; /* last addr eviction reached */
637 - boolean_t l2ad_first; /* first sweep through */
638 - boolean_t l2ad_writing; /* currently writing */
639 - list_t *l2ad_buflist; /* buffer list */
640 - list_node_t l2ad_node; /* device list node */
641 -} l2arc_dev_t;
642 -
686 +typedef struct l2arc_dev l2arc_dev_t;
643 687 static list_t L2ARC_dev_list; /* device list */
644 688 static list_t *l2arc_dev_list; /* device list pointer */
645 689 static kmutex_t l2arc_dev_mtx; /* device list mutex */
646 690 static l2arc_dev_t *l2arc_dev_last; /* last device used */
647 691 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
648 692 static list_t L2ARC_free_on_write; /* free after write buf list */
649 693 static list_t *l2arc_free_on_write; /* free after write list ptr */
650 694 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
651 695 static uint64_t l2arc_ndev; /* number of devices */
652 696
653 697 typedef struct l2arc_read_callback {
654 698 arc_buf_t *l2rcb_buf; /* read buffer */
655 699 spa_t *l2rcb_spa; /* spa */
656 700 blkptr_t l2rcb_bp; /* original blkptr */
657 701 zbookmark_t l2rcb_zb; /* original bookmark */
658 702 int l2rcb_flags; /* original flags */
659 703 enum zio_compress l2rcb_compress; /* applied compress */
660 704 } l2arc_read_callback_t;
661 705
662 706 typedef struct l2arc_write_callback {
663 707 l2arc_dev_t *l2wcb_dev; /* device info */
664 708 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
709 + uint8_t *l2wcb_pbuf; /* pbuf sent in this write */
710 + uint32_t l2wcb_pbuf_size; /* size of committed pbuf */
711 + uint8_t *l2wcb_ub_buf; /* uberblock in this write */
665 712 } l2arc_write_callback_t;
666 713
667 714 struct l2arc_buf_hdr {
668 715 /* protected by arc_buf_hdr mutex */
669 716 l2arc_dev_t *b_dev; /* L2ARC device */
670 717 uint64_t b_daddr; /* disk address, offset byte */
671 718 /* compression applied to buffer data */
672 719 enum zio_compress b_compress;
673 720 /* real alloc'd buffer size depending on b_compress applied */
674 721 int b_asize;
675 722 /* temporary buffer holder for in-flight compressed data */
676 723 void *b_tmp_cdata;
677 724 };
678 725
679 726 typedef struct l2arc_data_free {
680 727 /* protected by l2arc_free_on_write_mtx */
681 728 void *l2df_data;
682 729 size_t l2df_size;
683 730 void (*l2df_func)(void *, size_t);
684 731 list_node_t l2df_list_node;
685 732 } l2arc_data_free_t;
686 733
687 734 static kmutex_t l2arc_feed_thr_lock;
688 735 static kcondvar_t l2arc_feed_thr_cv;
689 736 static uint8_t l2arc_thread_exit;
690 737
691 738 static void l2arc_read_done(zio_t *zio);
692 -static void l2arc_hdr_stat_add(void);
739 +static void l2arc_hdr_stat_add(boolean_t from_arc);
693 740 static void l2arc_hdr_stat_remove(void);
694 741
695 742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
696 743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
697 744 enum zio_compress c);
698 745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
699 746
747 +typedef enum {
748 + L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
749 + L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */
750 +} l2uberblock_flags_t;
751 +
752 +typedef struct l2uberblock {
753 + uint32_t ub_magic;
754 + uint8_t ub_version;
755 + l2uberblock_flags_t ub_flags;
756 +
757 + uint64_t ub_spa_guid;
758 + uint64_t ub_birth;
759 + uint64_t ub_evict_tail; /* current evict pointer */
760 + uint64_t ub_alloc_space; /* vdev space alloc status */
761 + uint64_t ub_pbuf_daddr; /* address of newest pbuf */
762 + uint32_t ub_pbuf_asize; /* size of newest pbuf */
763 + zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */
764 +
765 + zio_cksum_t ub_cksum; /* cksum of uberblock */
766 +} l2uberblock_t;
767 +
768 +typedef enum {
769 + L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
770 + L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */
771 +} l2pbuf_flags_t;
772 +
773 +typedef struct l2pbuf {
774 + uint32_t pb_magic;
775 + unsigned int pb_version;
776 + l2pbuf_flags_t pb_flags;
777 +
778 + uint64_t pb_prev_daddr; /* address of previous pbuf */
779 + uint32_t pb_prev_asize; /* size of previous pbuf */
780 + zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */
781 +
782 + /*
783 + * This is a set of item lists that are contained in this pbuf. Each
784 + * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
785 + * This serves as a soft timeout feature - once the limit of the
786 + * number of item lists that a pbuf can hold is reached, the pbuf is
787 + * flushed to stable storage, regardless of its total size.
788 + */
789 + list_t *pb_buflists_list;
790 +
791 + /*
792 + * Number of compressed bytes referenced by items in this pbuf and
793 + * the number of lists present.
794 + * This is not actually written to storage, it is only used by
795 + * internal algorithms which check for when a pbuf reaches a
796 + * certain size limit, after which it is flushed in a write.
797 + */
798 + uint64_t pb_payload_asz;
799 + /* Same thing for number of buflists */
800 + int pb_nbuflists;
801 +
802 + /*
803 + * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
804 + * This is then used by l2arc_pbuf_restore to update used space
805 + * on the L2ARC vdev.
806 + */
807 + size_t pb_asize;
808 +} l2pbuf_t;
809 +
810 +typedef struct l2pbuf_buf l2pbuf_buf_t;
811 +typedef struct l2pbuf_buflist {
812 + uint32_t l2pbl_nbufs;
813 + l2pbuf_buf_t *l2pbl_bufs;
814 + list_node_t l2pbl_node;
815 +} l2pbuf_buflist_t;
816 +
817 +struct l2pbuf_buf {
818 + dva_t b_dva; /* dva of buffer */
819 + uint64_t b_birth; /* birth txg of buffer */
820 + uint64_t b_cksum0;
821 + zio_cksum_t b_freeze_cksum;
822 + uint32_t b_size; /* uncompressed buf size */
823 + uint64_t b_l2daddr; /* buf location on l2dev */
824 + uint32_t b_l2asize; /* actual buf data size */
825 + enum zio_compress b_l2compress; /* compression applied */
826 + uint16_t b_contents_type;
827 + uint32_t b_flags;
828 +};
829 +
830 +struct l2arc_dev {
831 + vdev_t *l2ad_vdev; /* vdev */
832 + spa_t *l2ad_spa; /* spa */
833 + uint64_t l2ad_hand; /* next write location */
834 + uint64_t l2ad_start; /* first addr on device */
835 + uint64_t l2ad_end; /* last addr on device */
836 + uint64_t l2ad_evict; /* last addr eviction reached */
837 + boolean_t l2ad_first; /* first sweep through */
838 + boolean_t l2ad_writing; /* currently writing */
839 + list_t *l2ad_buflist; /* buffer list */
840 + list_node_t l2ad_node; /* device list node */
841 + l2pbuf_t l2ad_pbuf; /* currently open pbuf */
842 + uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */
843 + uint64_t l2ad_pbuf_asize; /* prev pbuf asize */
844 + zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */
845 + /* uberblock birth counter - incremented for each committed uberblk */
846 + uint64_t l2ad_uberblock_birth;
847 + /* flag indicating whether a rebuild is currently going on */
848 + boolean_t l2ad_rebuilding;
849 +};
850 +
851 +/* Stores information about an L2ARC prefetch zio */
852 +typedef struct l2arc_prefetch_info {
853 + uint8_t *pi_buf; /* where the zio writes to */
854 + uint64_t pi_buflen; /* length of `buf' */
855 + zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */
856 +} l2arc_prefetch_info_t;
857 +
858 +/* 256 x 4k of l2uberblocks */
859 +#define L2UBERBLOCK_SIZE 4096
860 +#define L2UBERBLOCK_MAGIC 0x12bab10c
861 +#define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */
862 +#define L2PBUF_MAGIC 0xdb0faba6
863 +#define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */
864 +#define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */
865 +#define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */
866 +#define L2PBUF_ENCODED_SIZE(_pb) \
867 + (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
868 +/*
869 + * Allocation limit for the payload of a pbuf. This also fundamentally
870 + * limits the number of bufs we can reference in a pbuf.
871 + */
872 +#define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
873 +#define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
874 +#define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */
875 +#define L2PBUF_MAXSZ (100 * 1024 * 1024) /* maximum pbuf size */
876 +#define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */
877 +#define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */
878 +#define L2PBUF_IS_FULL(_pb) \
879 + ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
880 + (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
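Editorial sketch (not in the diff) of the intended use of L2PBUF_IS_FULL in the write path, assuming the helpers declared further below: once the currently open pbuf exceeds its payload or buflist limits, it is committed to the device along with a fresh uberblock, then reinitialized for subsequent feed cycles.

	if (L2PBUF_IS_FULL(&dev->l2ad_pbuf)) {
		l2arc_pbuf_commit(dev, pio, cb);	/* write out the open pbuf */
		l2arc_uberblock_update(dev, pio, cb);	/* point the uberblock at it */
		l2arc_pbuf_destroy(&dev->l2ad_pbuf);
		l2arc_pbuf_init(&dev->l2ad_pbuf);	/* start a new open pbuf */
	}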
881 +/*
882 + * These are the flags we allow to persist in L2ARC pbufs. The other flags
883 + * of an ARC buffer pertain to the buffer's runtime behavior.
884 + */
885 +#define L2ARC_PERSIST_FLAGS \
886 + (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
887 +
888 +/*
889 + * Used during L2ARC rebuild after each read operation to check whether we
890 + * haven't exceeded the rebuild timeout value.
891 + */
892 +#define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
893 + do { \
894 + if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
895 + __VA_ARGS__; \
896 + ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
897 + cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
898 + "dropping remaining L2ARC metadata."); \
899 + return; \
900 + } \
901 + _NOTE(NOTREACHED) \
902 + _NOTE(CONSTCOND) \
903 + } while (0)
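Editorial sketch (not in the diff) of how the timeout check above might be driven from l2arc_rebuild(): a deadline is computed once from the l2arc_rebuild_timeout tunable, then re-checked after each metadata read, with cleanup actions passed as the variadic arguments.

	int64_t deadline = ddi_get_lbolt64() + l2arc_rebuild_timeout * hz;
	...
	/* after each pbuf read */
	L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));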
904 +
905 +/*
906 + * Performance tuning of L2ARC persistency:
907 + *
908 + * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
909 + * compressing it.
910 + * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
911 + * referenced from a pbuf. Once a pbuf reaches this size, it is
912 + * committed to stable storage. Ideally, there should be approx.
913 + * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
914 + * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
915 + * be buffered in a pbuf before it is committed to L2ARC. This
916 + * puts a soft temporal upper bound on pbuf commit intervals.
917 + * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
918 + * pool import or when adding one manually later) will attempt
919 + * to rebuild L2ARC buffer contents. In special circumstances,
920 + * the administrator may want to set this to B_FALSE, if they
921 + * are having trouble importing a pool or attaching an L2ARC
922 + * device (e.g. the L2ARC device is slow to read in stored pbuf
923 + * metadata, or the metadata has become somehow
924 + * fragmented/unusable).
925 + * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
926 + * avoid a slow L2ARC device from preventing pool import. If we
927 + * are not done rebuilding an L2ARC device by this time, we
928 + * stop the rebuild and return immediately.
929 + */
930 +uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
931 +uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
932 +uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
933 +boolean_t l2arc_rebuild_enabled = B_TRUE;
934 +uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
935 +
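As with other ZFS kernel tunables on illumos, these variables could presumably be adjusted from /etc/system (an assumed administrative example, not documented in this webrev), e.g. to disable rebuilds while diagnosing a slow cache device:

	set zfs:l2arc_rebuild_enabled = 0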
936 +static void l2arc_rebuild_start(l2arc_dev_t *dev);
937 +static void l2arc_rebuild(l2arc_dev_t *dev);
938 +static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
939 +static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
940 + uint64_t guid);
941 +
942 +static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
943 +static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
944 + zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
945 +static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
946 + uint32_t asize);
947 +static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
948 +static void l2arc_pbuf_prefetch_abort(zio_t *zio);
949 +
950 +static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
951 +static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
952 +static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
953 + uint64_t guid);
954 +static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
955 + l2arc_write_callback_t *cb);
956 +
957 +static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
958 +static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
959 + l2pbuf_t *pbuf);
960 +static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
961 + uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
962 +static void l2arc_pbuf_init(l2pbuf_t *pb);
963 +static void l2arc_pbuf_destroy(l2pbuf_t *pb);
964 +static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
965 + l2arc_write_callback_t *cb);
966 +static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
967 +static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
968 + const arc_buf_hdr_t *ab, int index);
969 +static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
970 +
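Editorial sketch (not part of the patch) of how the declarations above are intended to fit together at rebuild time: the device's uberblock names the newest pbuf, and each pbuf names its predecessor, so the rebuild can walk the chain backwards and re-create ARC headers until the chain ends, an error occurs, or the timeout expires.

	l2uberblock_t ub;
	l2pbuf_t pb;

	if (l2arc_uberblock_find(dev, &ub) == 0) {
		uint64_t daddr = ub.ub_pbuf_daddr;
		uint32_t asize = ub.ub_pbuf_asize;
		zio_cksum_t cksum = ub.ub_pbuf_cksum;
		zio_t *this_io = NULL, *next_io = NULL;

		while (l2arc_pbuf_read(dev, daddr, asize, cksum, &pb,
		    this_io, &next_io) == 0) {
			daddr = pb.pb_prev_daddr;	/* follow the chain backwards */
			asize = pb.pb_prev_asize;
			cksum = pb.pb_prev_cksum;
			l2arc_pbuf_restore(dev, &pb);	/* re-insert ARC headers */
			this_io = next_io;
		}
	}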
700 971 static uint64_t
701 972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
702 973 {
703 974 uint8_t *vdva = (uint8_t *)dva;
704 975 uint64_t crc = -1ULL;
705 976 int i;
706 977
707 978 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
708 979
709 980 for (i = 0; i < sizeof (dva_t); i++)
710 981 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
711 982
712 983 crc ^= (spa>>8) ^ birth;
713 984
714 985 return (crc);
715 986 }
716 987
717 988 #define BUF_EMPTY(buf) \
718 989 ((buf)->b_dva.dva_word[0] == 0 && \
719 990 (buf)->b_dva.dva_word[1] == 0 && \
720 991 (buf)->b_birth == 0)
721 992
722 993 #define BUF_EQUAL(spa, dva, birth, buf) \
723 994 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
724 995 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
725 996 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
726 997
727 998 static void
728 999 buf_discard_identity(arc_buf_hdr_t *hdr)
729 1000 {
730 1001 hdr->b_dva.dva_word[0] = 0;
731 1002 hdr->b_dva.dva_word[1] = 0;
732 1003 hdr->b_birth = 0;
733 1004 hdr->b_cksum0 = 0;
734 1005 }
735 1006
736 1007 static arc_buf_hdr_t *
737 1008 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
738 1009 {
739 1010 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
740 1011 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
741 1012 arc_buf_hdr_t *buf;
742 1013
743 1014 mutex_enter(hash_lock);
744 1015 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
745 1016 buf = buf->b_hash_next) {
746 1017 if (BUF_EQUAL(spa, dva, birth, buf)) {
747 1018 *lockp = hash_lock;
748 1019 return (buf);
749 1020 }
750 1021 }
751 1022 mutex_exit(hash_lock);
752 1023 *lockp = NULL;
753 1024 return (NULL);
754 1025 }
755 1026
756 1027 /*
757 1028 * Insert an entry into the hash table. If there is already an element
758 1029 * equal to elem in the hash table, then the already existing element
759 1030 * will be returned and the new element will not be inserted.
760 1031 * Otherwise returns NULL.
761 1032 */
762 1033 static arc_buf_hdr_t *
763 1034 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
764 1035 {
765 1036 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
766 1037 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
767 1038 arc_buf_hdr_t *fbuf;
768 1039 uint32_t i;
769 1040
770 1041 ASSERT(!HDR_IN_HASH_TABLE(buf));
771 1042 *lockp = hash_lock;
772 1043 mutex_enter(hash_lock);
773 1044 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
774 1045 fbuf = fbuf->b_hash_next, i++) {
775 1046 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
776 1047 return (fbuf);
777 1048 }
778 1049
779 1050 buf->b_hash_next = buf_hash_table.ht_table[idx];
780 1051 buf_hash_table.ht_table[idx] = buf;
781 1052 buf->b_flags |= ARC_IN_HASH_TABLE;
782 1053
783 1054 /* collect some hash table performance data */
784 1055 if (i > 0) {
785 1056 ARCSTAT_BUMP(arcstat_hash_collisions);
786 1057 if (i == 1)
787 1058 ARCSTAT_BUMP(arcstat_hash_chains);
788 1059
789 1060 ARCSTAT_MAX(arcstat_hash_chain_max, i);
790 1061 }
791 1062
792 1063 ARCSTAT_BUMP(arcstat_hash_elements);
793 1064 ARCSTAT_MAXSTAT(arcstat_hash_elements);
794 1065
795 1066 return (NULL);
796 1067 }
797 1068
798 1069 static void
799 1070 buf_hash_remove(arc_buf_hdr_t *buf)
800 1071 {
801 1072 arc_buf_hdr_t *fbuf, **bufp;
802 1073 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
803 1074
804 1075 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
805 1076 ASSERT(HDR_IN_HASH_TABLE(buf));
806 1077
807 1078 bufp = &buf_hash_table.ht_table[idx];
808 1079 while ((fbuf = *bufp) != buf) {
809 1080 ASSERT(fbuf != NULL);
810 1081 bufp = &fbuf->b_hash_next;
811 1082 }
812 1083 *bufp = buf->b_hash_next;
813 1084 buf->b_hash_next = NULL;
814 1085 buf->b_flags &= ~ARC_IN_HASH_TABLE;
815 1086
816 1087 /* collect some hash table performance data */
817 1088 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
818 1089
819 1090 if (buf_hash_table.ht_table[idx] &&
820 1091 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
821 1092 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
822 1093 }
823 1094
824 1095 /*
825 1096 * Global data structures and functions for the buf kmem cache.
826 1097 */
827 1098 static kmem_cache_t *hdr_cache;
828 1099 static kmem_cache_t *buf_cache;
829 1100
830 1101 static void
831 1102 buf_fini(void)
832 1103 {
833 1104 int i;
834 1105
835 1106 kmem_free(buf_hash_table.ht_table,
836 1107 (buf_hash_table.ht_mask + 1) * sizeof (void *));
837 1108 for (i = 0; i < BUF_LOCKS; i++)
838 1109 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
839 1110 kmem_cache_destroy(hdr_cache);
840 1111 kmem_cache_destroy(buf_cache);
841 1112 }
842 1113
843 1114 /*
844 1115 * Constructor callback - called when the cache is empty
845 1116 * and a new buf is requested.
846 1117 */
847 1118 /* ARGSUSED */
848 1119 static int
849 1120 hdr_cons(void *vbuf, void *unused, int kmflag)
850 1121 {
851 1122 arc_buf_hdr_t *buf = vbuf;
852 1123
853 1124 bzero(buf, sizeof (arc_buf_hdr_t));
854 1125 refcount_create(&buf->b_refcnt);
855 1126 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
856 1127 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
857 1128 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
858 1129
859 1130 return (0);
860 1131 }
861 1132
862 1133 /* ARGSUSED */
863 1134 static int
864 1135 buf_cons(void *vbuf, void *unused, int kmflag)
865 1136 {
866 1137 arc_buf_t *buf = vbuf;
867 1138
868 1139 bzero(buf, sizeof (arc_buf_t));
869 1140 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
870 1141 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
871 1142
872 1143 return (0);
873 1144 }
874 1145
875 1146 /*
876 1147 * Destructor callback - called when a cached buf is
877 1148 * no longer required.
878 1149 */
879 1150 /* ARGSUSED */
880 1151 static void
881 1152 hdr_dest(void *vbuf, void *unused)
882 1153 {
883 1154 arc_buf_hdr_t *buf = vbuf;
884 1155
885 1156 ASSERT(BUF_EMPTY(buf));
886 1157 refcount_destroy(&buf->b_refcnt);
887 1158 cv_destroy(&buf->b_cv);
888 1159 mutex_destroy(&buf->b_freeze_lock);
889 1160 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
890 1161 }
891 1162
892 1163 /* ARGSUSED */
893 1164 static void
894 1165 buf_dest(void *vbuf, void *unused)
895 1166 {
896 1167 arc_buf_t *buf = vbuf;
897 1168
898 1169 mutex_destroy(&buf->b_evict_lock);
899 1170 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
900 1171 }
901 1172
902 1173 /*
903 1174 * Reclaim callback -- invoked when memory is low.
904 1175 */
905 1176 /* ARGSUSED */
906 1177 static void
907 1178 hdr_recl(void *unused)
908 1179 {
909 1180 dprintf("hdr_recl called\n");
910 1181 /*
911 1182 * umem calls the reclaim func when we destroy the buf cache,
912 1183 * which is after we do arc_fini().
913 1184 */
914 1185 if (!arc_dead)
915 1186 cv_signal(&arc_reclaim_thr_cv);
916 1187 }
917 1188
918 1189 static void
919 1190 buf_init(void)
920 1191 {
921 1192 uint64_t *ct;
922 1193 uint64_t hsize = 1ULL << 12;
923 1194 int i, j;
924 1195
925 1196 /*
926 1197 * The hash table is big enough to fill all of physical memory
927 1198 * with an average 64K block size. The table will take up
928 1199 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
929 1200 */
930 1201 while (hsize * 65536 < physmem * PAGESIZE)
931 1202 hsize <<= 1;
932 1203 retry:
933 1204 buf_hash_table.ht_mask = hsize - 1;
934 1205 buf_hash_table.ht_table =
935 1206 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
936 1207 if (buf_hash_table.ht_table == NULL) {
937 1208 ASSERT(hsize > (1ULL << 8));
938 1209 hsize >>= 1;
939 1210 goto retry;
940 1211 }
941 1212
942 1213 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
943 1214 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
944 1215 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
945 1216 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
946 1217
947 1218 for (i = 0; i < 256; i++)
948 1219 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
949 1220 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
950 1221
951 1222 for (i = 0; i < BUF_LOCKS; i++) {
952 1223 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
953 1224 NULL, MUTEX_DEFAULT, NULL);
954 1225 }
955 1226 }
956 1227
957 1228 #define ARC_MINTIME (hz>>4) /* 62 ms */
958 1229
959 1230 static void
960 1231 arc_cksum_verify(arc_buf_t *buf)
961 1232 {
962 1233 zio_cksum_t zc;
963 1234
964 1235 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
965 1236 return;
966 1237
967 1238 mutex_enter(&buf->b_hdr->b_freeze_lock);
968 1239 if (buf->b_hdr->b_freeze_cksum == NULL ||
969 1240 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
970 1241 mutex_exit(&buf->b_hdr->b_freeze_lock);
971 1242 return;
972 1243 }
973 1244 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
974 1245 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
975 1246 panic("buffer modified while frozen!");
976 1247 mutex_exit(&buf->b_hdr->b_freeze_lock);
977 1248 }
978 1249
979 1250 static int
980 1251 arc_cksum_equal(arc_buf_t *buf)
981 1252 {
982 1253 zio_cksum_t zc;
983 1254 int equal;
984 1255
985 1256 mutex_enter(&buf->b_hdr->b_freeze_lock);
986 1257 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
987 1258 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
988 1259 mutex_exit(&buf->b_hdr->b_freeze_lock);
989 1260
990 1261 return (equal);
991 1262 }
992 1263
993 1264 static void
994 1265 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
995 1266 {
996 1267 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
997 1268 return;
998 1269
999 1270 mutex_enter(&buf->b_hdr->b_freeze_lock);
1000 1271 if (buf->b_hdr->b_freeze_cksum != NULL) {
1001 1272 mutex_exit(&buf->b_hdr->b_freeze_lock);
1002 1273 return;
1003 1274 }
1004 1275 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1005 1276 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1006 1277 buf->b_hdr->b_freeze_cksum);
1007 1278 mutex_exit(&buf->b_hdr->b_freeze_lock);
1008 1279 arc_buf_watch(buf);
1009 1280 }
1010 1281
1011 1282 #ifndef _KERNEL
1012 1283 typedef struct procctl {
1013 1284 long cmd;
1014 1285 prwatch_t prwatch;
1015 1286 } procctl_t;
1016 1287 #endif
1017 1288
1018 1289 /* ARGSUSED */
1019 1290 static void
1020 1291 arc_buf_unwatch(arc_buf_t *buf)
1021 1292 {
1022 1293 #ifndef _KERNEL
1023 1294 if (arc_watch) {
1024 1295 int result;
1025 1296 procctl_t ctl;
1026 1297 ctl.cmd = PCWATCH;
1027 1298 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1028 1299 ctl.prwatch.pr_size = 0;
1029 1300 ctl.prwatch.pr_wflags = 0;
1030 1301 result = write(arc_procfd, &ctl, sizeof (ctl));
1031 1302 ASSERT3U(result, ==, sizeof (ctl));
1032 1303 }
1033 1304 #endif
1034 1305 }
1035 1306
1036 1307 /* ARGSUSED */
1037 1308 static void
1038 1309 arc_buf_watch(arc_buf_t *buf)
1039 1310 {
1040 1311 #ifndef _KERNEL
1041 1312 if (arc_watch) {
1042 1313 int result;
1043 1314 procctl_t ctl;
1044 1315 ctl.cmd = PCWATCH;
1045 1316 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1046 1317 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1047 1318 ctl.prwatch.pr_wflags = WA_WRITE;
1048 1319 result = write(arc_procfd, &ctl, sizeof (ctl));
1049 1320 ASSERT3U(result, ==, sizeof (ctl));
1050 1321 }
1051 1322 #endif
1052 1323 }
1053 1324
1054 1325 void
1055 1326 arc_buf_thaw(arc_buf_t *buf)
1056 1327 {
1057 1328 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1058 1329 if (buf->b_hdr->b_state != arc_anon)
1059 1330 panic("modifying non-anon buffer!");
1060 1331 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1061 1332 panic("modifying buffer while i/o in progress!");
1062 1333 arc_cksum_verify(buf);
1063 1334 }
1064 1335
1065 1336 mutex_enter(&buf->b_hdr->b_freeze_lock);
1066 1337 if (buf->b_hdr->b_freeze_cksum != NULL) {
1067 1338 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1068 1339 buf->b_hdr->b_freeze_cksum = NULL;
1069 1340 }
1070 1341
1071 1342 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1072 1343 if (buf->b_hdr->b_thawed)
1073 1344 kmem_free(buf->b_hdr->b_thawed, 1);
1074 1345 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1075 1346 }
1076 1347
1077 1348 mutex_exit(&buf->b_hdr->b_freeze_lock);
1078 1349
1079 1350 arc_buf_unwatch(buf);
1080 1351 }
1081 1352
1082 1353 void
1083 1354 arc_buf_freeze(arc_buf_t *buf)
1084 1355 {
1085 1356 kmutex_t *hash_lock;
1086 1357
1087 1358 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1088 1359 return;
1089 1360
1090 1361 hash_lock = HDR_LOCK(buf->b_hdr);
1091 1362 mutex_enter(hash_lock);
1092 1363
1093 1364 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1094 1365 buf->b_hdr->b_state == arc_anon);
1095 1366 arc_cksum_compute(buf, B_FALSE);
1096 1367 mutex_exit(hash_lock);
1097 1368
1098 1369 }
1099 1370
1100 1371 static void
1101 1372 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1102 1373 {
1103 1374 ASSERT(MUTEX_HELD(hash_lock));
1104 1375
1105 1376 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1106 1377 (ab->b_state != arc_anon)) {
1107 1378 uint64_t delta = ab->b_size * ab->b_datacnt;
1108 1379 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1109 1380 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1110 1381
1111 1382 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1112 1383 mutex_enter(&ab->b_state->arcs_mtx);
1113 1384 ASSERT(list_link_active(&ab->b_arc_node));
1114 1385 list_remove(list, ab);
1115 1386 if (GHOST_STATE(ab->b_state)) {
1116 1387 ASSERT0(ab->b_datacnt);
1117 1388 ASSERT3P(ab->b_buf, ==, NULL);
1118 1389 delta = ab->b_size;
1119 1390 }
1120 1391 ASSERT(delta > 0);
1121 1392 ASSERT3U(*size, >=, delta);
1122 1393 atomic_add_64(size, -delta);
1123 1394 mutex_exit(&ab->b_state->arcs_mtx);
1124 1395 /* remove the prefetch flag if we get a reference */
1125 1396 if (ab->b_flags & ARC_PREFETCH)
1126 1397 ab->b_flags &= ~ARC_PREFETCH;
1127 1398 }
1128 1399 }
1129 1400
1130 1401 static int
1131 1402 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1132 1403 {
1133 1404 int cnt;
1134 1405 arc_state_t *state = ab->b_state;
1135 1406
1136 1407 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1137 1408 ASSERT(!GHOST_STATE(state));
1138 1409
1139 1410 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1140 1411 (state != arc_anon)) {
1141 1412 uint64_t *size = &state->arcs_lsize[ab->b_type];
1142 1413
1143 1414 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1144 1415 mutex_enter(&state->arcs_mtx);
1145 1416 ASSERT(!list_link_active(&ab->b_arc_node));
1146 1417 list_insert_head(&state->arcs_list[ab->b_type], ab);
1147 1418 ASSERT(ab->b_datacnt > 0);
1148 1419 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1149 1420 mutex_exit(&state->arcs_mtx);
1150 1421 }
1151 1422 return (cnt);
1152 1423 }
1153 1424
1154 1425 /*
1155 1426 * Move the supplied buffer to the indicated state. The mutex
1156 1427 * for the buffer must be held by the caller.
1157 1428 */
1158 1429 static void
1159 1430 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1160 1431 {
1161 1432 arc_state_t *old_state = ab->b_state;
1162 1433 int64_t refcnt = refcount_count(&ab->b_refcnt);
1163 1434 uint64_t from_delta, to_delta;
1164 1435
1165 1436 ASSERT(MUTEX_HELD(hash_lock));
1166 1437 ASSERT(new_state != old_state);
1167 1438 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1168 1439 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1169 1440 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1170 1441
1171 1442 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1172 1443
1173 1444 /*
1174 1445 * If this buffer is evictable, transfer it from the
1175 1446 * old state list to the new state list.
1176 1447 */
1177 1448 if (refcnt == 0) {
1178 1449 if (old_state != arc_anon) {
1179 1450 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1180 1451 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1181 1452
1182 1453 if (use_mutex)
1183 1454 mutex_enter(&old_state->arcs_mtx);
1184 1455
1185 1456 ASSERT(list_link_active(&ab->b_arc_node));
1186 1457 list_remove(&old_state->arcs_list[ab->b_type], ab);
1187 1458
1188 1459 /*
1189 1460 * If prefetching out of the ghost cache,
1190 1461 * we will have a non-zero datacnt.
1191 1462 */
1192 1463 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1193 1464 /* ghost elements have a ghost size */
1194 1465 ASSERT(ab->b_buf == NULL);
1195 1466 from_delta = ab->b_size;
1196 1467 }
1197 1468 ASSERT3U(*size, >=, from_delta);
1198 1469 atomic_add_64(size, -from_delta);
1199 1470
1200 1471 if (use_mutex)
1201 1472 mutex_exit(&old_state->arcs_mtx);
1202 1473 }
1203 1474 if (new_state != arc_anon) {
1204 1475 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1205 1476 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1206 1477
1207 1478 if (use_mutex)
1208 1479 mutex_enter(&new_state->arcs_mtx);
1209 1480
1210 1481 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1211 1482
1212 1483 /* ghost elements have a ghost size */
1213 1484 if (GHOST_STATE(new_state)) {
1214 1485 ASSERT(ab->b_datacnt == 0);
1215 1486 ASSERT(ab->b_buf == NULL);
1216 1487 to_delta = ab->b_size;
1217 1488 }
1218 1489 atomic_add_64(size, to_delta);
1219 1490
1220 1491 if (use_mutex)
1221 1492 mutex_exit(&new_state->arcs_mtx);
1222 1493 }
1223 1494 }
1224 1495
1225 1496 ASSERT(!BUF_EMPTY(ab));
1226 1497 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1227 1498 buf_hash_remove(ab);
1228 1499
1229 1500 /* adjust state sizes */
1230 1501 if (to_delta)
1231 1502 atomic_add_64(&new_state->arcs_size, to_delta);
1232 1503 if (from_delta) {
1233 1504 ASSERT3U(old_state->arcs_size, >=, from_delta);
1234 1505 atomic_add_64(&old_state->arcs_size, -from_delta);
1235 1506 }
1236 1507 ab->b_state = new_state;
1237 1508
1238 1509 /* adjust l2arc hdr stats */
1239 1510 if (new_state == arc_l2c_only)
1240 - l2arc_hdr_stat_add();
1511 + l2arc_hdr_stat_add(old_state != arc_anon);
1241 1512 else if (old_state == arc_l2c_only)
1242 1513 l2arc_hdr_stat_remove();
1243 1514 }
1244 1515
1245 1516 void
1246 1517 arc_space_consume(uint64_t space, arc_space_type_t type)
1247 1518 {
1248 1519 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1249 1520
1250 1521 switch (type) {
1251 1522 case ARC_SPACE_DATA:
1252 1523 ARCSTAT_INCR(arcstat_data_size, space);
1253 1524 break;
1254 1525 case ARC_SPACE_OTHER:
1255 1526 ARCSTAT_INCR(arcstat_other_size, space);
1256 1527 break;
1257 1528 case ARC_SPACE_HDRS:
1258 1529 ARCSTAT_INCR(arcstat_hdr_size, space);
1259 1530 break;
1260 1531 case ARC_SPACE_L2HDRS:
1261 1532 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1262 1533 break;
1263 1534 }
1264 1535
1265 1536 ARCSTAT_INCR(arcstat_meta_used, space);
1266 1537 atomic_add_64(&arc_size, space);
1267 1538 }
1268 1539
1269 1540 void
1270 1541 arc_space_return(uint64_t space, arc_space_type_t type)
1271 1542 {
1272 1543 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1273 1544
1274 1545 switch (type) {
1275 1546 case ARC_SPACE_DATA:
1276 1547 ARCSTAT_INCR(arcstat_data_size, -space);
1277 1548 break;
1278 1549 case ARC_SPACE_OTHER:
1279 1550 ARCSTAT_INCR(arcstat_other_size, -space);
1280 1551 break;
1281 1552 case ARC_SPACE_HDRS:
1282 1553 ARCSTAT_INCR(arcstat_hdr_size, -space);
1283 1554 break;
1284 1555 case ARC_SPACE_L2HDRS:
1285 1556 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1286 1557 break;
1287 1558 }
1288 1559
1289 1560 ASSERT(arc_meta_used >= space);
1290 1561 if (arc_meta_max < arc_meta_used)
1291 1562 arc_meta_max = arc_meta_used;
1292 1563 ARCSTAT_INCR(arcstat_meta_used, -space);
1293 1564 ASSERT(arc_size >= space);
1294 1565 atomic_add_64(&arc_size, -space);
1295 1566 }
1296 1567
1297 1568 void *
1298 1569 arc_data_buf_alloc(uint64_t size)
1299 1570 {
1300 1571 if (arc_evict_needed(ARC_BUFC_DATA))
1301 1572 cv_signal(&arc_reclaim_thr_cv);
1302 1573 atomic_add_64(&arc_size, size);
1303 1574 return (zio_data_buf_alloc(size));
1304 1575 }
1305 1576
1306 1577 void
1307 1578 arc_data_buf_free(void *buf, uint64_t size)
1308 1579 {
1309 1580 zio_data_buf_free(buf, size);
1310 1581 ASSERT(arc_size >= size);
1311 1582 atomic_add_64(&arc_size, -size);
1312 1583 }
1313 1584
1314 1585 arc_buf_t *
1315 1586 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1316 1587 {
1317 1588 arc_buf_hdr_t *hdr;
1318 1589 arc_buf_t *buf;
1319 1590
1320 1591 ASSERT3U(size, >, 0);
1321 1592 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1322 1593 ASSERT(BUF_EMPTY(hdr));
1323 1594 hdr->b_size = size;
1324 1595 hdr->b_type = type;
1325 1596 hdr->b_spa = spa_load_guid(spa);
1326 1597 hdr->b_state = arc_anon;
1327 1598 hdr->b_arc_access = 0;
1328 1599 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1329 1600 buf->b_hdr = hdr;
1330 1601 buf->b_data = NULL;
1331 1602 buf->b_efunc = NULL;
1332 1603 buf->b_private = NULL;
1333 1604 buf->b_next = NULL;
1334 1605 hdr->b_buf = buf;
1335 1606 arc_get_data_buf(buf);
1336 1607 hdr->b_datacnt = 1;
1337 1608 hdr->b_flags = 0;
1338 1609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1339 1610 (void) refcount_add(&hdr->b_refcnt, tag);
1340 1611
1341 1612 return (buf);
1342 1613 }
1343 1614
1615 +/*
1616 + * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1617 + * This is used during l2arc reconstruction to make empty ARC buffers
1618 + * which circumvent the regular disk->arc->l2arc path and instead come
1619 + * into being in the reverse order, i.e. l2arc->arc->(disk).
1620 + */
1621 +arc_buf_hdr_t *
1622 +arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1623 +{
1624 + arc_buf_hdr_t *hdr;
1625 +
1626 + ASSERT3U(size, >, 0);
1627 + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1628 + ASSERT(BUF_EMPTY(hdr));
1629 + hdr->b_size = size;
1630 + hdr->b_type = type;
1631 + hdr->b_spa = guid;
1632 + hdr->b_state = arc_anon;
1633 + hdr->b_arc_access = 0;
1634 + hdr->b_buf = NULL;
1635 + hdr->b_datacnt = 0;
1636 + hdr->b_flags = 0;
1637 + ASSERT(refcount_is_zero(&hdr->b_refcnt));
1638 +
1639 + return (hdr);
1640 +}
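
The following sketch is editorial and not part of the patch: it illustrates how a persistent L2ARC rebuild loop might consume the new allocator above. The l2rebuild_ent_t type, its fields, and l2rebuild_make_hdr() are hypothetical names invented for this example; only arc_buf_hdr_alloc() itself comes from the change.

/*
 * Editorial sketch (hypothetical types and helper): create an empty
 * header for one decoded log entry.  The caller would still have to
 * attach an l2arc_buf_hdr_t describing the on-device location and
 * insert the header into the hash table; those steps are omitted.
 */
typedef struct l2rebuild_ent {
	uint64_t		le_guid;	/* pool load guid */
	int			le_size;	/* logical buffer size */
	arc_buf_contents_t	le_type;	/* data vs. metadata */
} l2rebuild_ent_t;

static arc_buf_hdr_t *
l2rebuild_make_hdr(const l2rebuild_ent_t *le)
{
	/* empty header: b_buf == NULL, b_datacnt == 0, not yet hashed */
	return (arc_buf_hdr_alloc(le->le_guid, le->le_size, le->le_type));
}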
1641 +
1344 1642 static char *arc_onloan_tag = "onloan";
1345 1643
1346 1644 /*
1347 1645 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1348 1646 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1349 1647 * buffers must be returned to the arc before they can be used by the DMU or
1350 1648 * freed.
1351 1649 */
1352 1650 arc_buf_t *
1353 1651 arc_loan_buf(spa_t *spa, int size)
1354 1652 {
1355 1653 arc_buf_t *buf;
1356 1654
1357 1655 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1358 1656
1359 1657 atomic_add_64(&arc_loaned_bytes, size);
1360 1658 return (buf);
1361 1659 }
1362 1660
1363 1661 /*
1364 1662 * Return a loaned arc buffer to the arc.
1365 1663 */
1366 1664 void
1367 1665 arc_return_buf(arc_buf_t *buf, void *tag)
1368 1666 {
1369 1667 arc_buf_hdr_t *hdr = buf->b_hdr;
1370 1668
1371 1669 ASSERT(buf->b_data != NULL);
1372 1670 (void) refcount_add(&hdr->b_refcnt, tag);
1373 1671 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1374 1672
1375 1673 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1376 1674 }
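
A brief hedged usage sketch, not part of the patch, showing how the loan API above pairs up; example_loan_cycle() and its arguments are placeholders, and error handling is omitted.

/*
 * Editorial sketch: borrow an anonymous buffer, fill it, and return
 * it to the ARC under the caller's tag.
 */
static void
example_loan_cycle(spa_t *spa, const void *src, int size, void *tag)
{
	arc_buf_t *buf;

	/* counted in arc_loaned_bytes while on loan */
	buf = arc_loan_buf(spa, size);

	/* the caller owns b_data until the buffer is returned */
	bcopy(src, buf->b_data, size);

	/* swaps the onloan tag for the caller's tag */
	arc_return_buf(buf, tag);
}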
1377 1675
1378 1676 /* Detach an arc_buf from a dbuf (tag) */
1379 1677 void
1380 1678 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1381 1679 {
1382 1680 arc_buf_hdr_t *hdr;
1383 1681
1384 1682 ASSERT(buf->b_data != NULL);
1385 1683 hdr = buf->b_hdr;
1386 1684 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1387 1685 (void) refcount_remove(&hdr->b_refcnt, tag);
1388 1686 buf->b_efunc = NULL;
1389 1687 buf->b_private = NULL;
1390 1688
1391 1689 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1392 1690 }
1393 1691
1394 1692 static arc_buf_t *
1395 1693 arc_buf_clone(arc_buf_t *from)
1396 1694 {
1397 1695 arc_buf_t *buf;
1398 1696 arc_buf_hdr_t *hdr = from->b_hdr;
1399 1697 uint64_t size = hdr->b_size;
1400 1698
1401 1699 ASSERT(hdr->b_state != arc_anon);
1402 1700
1403 1701 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1404 1702 buf->b_hdr = hdr;
1405 1703 buf->b_data = NULL;
1406 1704 buf->b_efunc = NULL;
1407 1705 buf->b_private = NULL;
1408 1706 buf->b_next = hdr->b_buf;
1409 1707 hdr->b_buf = buf;
1410 1708 arc_get_data_buf(buf);
1411 1709 bcopy(from->b_data, buf->b_data, size);
1412 1710
1413 1711 /*
1414 1712 * This buffer already exists in the arc so create a duplicate
1415 1713 * copy for the caller. If the buffer is associated with user data
1416 1714 * then track the size and number of duplicates. These stats will be
1417 1715 * updated as duplicate buffers are created and destroyed.
1418 1716 */
1419 1717 if (hdr->b_type == ARC_BUFC_DATA) {
1420 1718 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1421 1719 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1422 1720 }
1423 1721 hdr->b_datacnt += 1;
1424 1722 return (buf);
1425 1723 }
1426 1724
1427 1725 void
1428 1726 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1429 1727 {
1430 1728 arc_buf_hdr_t *hdr;
1431 1729 kmutex_t *hash_lock;
1432 1730
1433 1731 /*
1434 1732 * Check to see if this buffer is evicted. Callers
1435 1733 * must verify b_data != NULL to know if the add_ref
1436 1734 * was successful.
1437 1735 */
1438 1736 mutex_enter(&buf->b_evict_lock);
1439 1737 if (buf->b_data == NULL) {
1440 1738 mutex_exit(&buf->b_evict_lock);
1441 1739 return;
1442 1740 }
1443 1741 hash_lock = HDR_LOCK(buf->b_hdr);
1444 1742 mutex_enter(hash_lock);
1445 1743 hdr = buf->b_hdr;
1446 1744 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1447 1745 mutex_exit(&buf->b_evict_lock);
1448 1746
1449 1747 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1450 1748 add_reference(hdr, hash_lock, tag);
1451 1749 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1452 1750 arc_access(hdr, hash_lock);
1453 1751 mutex_exit(hash_lock);
1454 1752 ARCSTAT_BUMP(arcstat_hits);
1455 1753 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1456 1754 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1457 1755 data, metadata, hits);
1458 1756 }
1459 1757
1460 1758 /*
1461 1759 * Free the arc data buffer. If it is an l2arc write in progress,
1462 1760 * the buffer is placed on l2arc_free_on_write to be freed later.
1463 1761 */
1464 1762 static void
1465 1763 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1466 1764 {
1467 1765 arc_buf_hdr_t *hdr = buf->b_hdr;
1468 1766
1469 1767 if (HDR_L2_WRITING(hdr)) {
1470 1768 l2arc_data_free_t *df;
1471 1769 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1472 1770 df->l2df_data = buf->b_data;
1473 1771 df->l2df_size = hdr->b_size;
1474 1772 df->l2df_func = free_func;
1475 1773 mutex_enter(&l2arc_free_on_write_mtx);
1476 1774 list_insert_head(l2arc_free_on_write, df);
1477 1775 mutex_exit(&l2arc_free_on_write_mtx);
1478 1776 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1479 1777 } else {
1480 1778 free_func(buf->b_data, hdr->b_size);
1481 1779 }
1482 1780 }
1483 1781
1484 1782 static void
1485 1783 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1486 1784 {
1487 1785 arc_buf_t **bufp;
1488 1786
1489 1787 /* free up data associated with the buf */
1490 1788 if (buf->b_data) {
1491 1789 arc_state_t *state = buf->b_hdr->b_state;
1492 1790 uint64_t size = buf->b_hdr->b_size;
1493 1791 arc_buf_contents_t type = buf->b_hdr->b_type;
1494 1792
1495 1793 arc_cksum_verify(buf);
1496 1794 arc_buf_unwatch(buf);
1497 1795
1498 1796 if (!recycle) {
1499 1797 if (type == ARC_BUFC_METADATA) {
1500 1798 arc_buf_data_free(buf, zio_buf_free);
1501 1799 arc_space_return(size, ARC_SPACE_DATA);
1502 1800 } else {
1503 1801 ASSERT(type == ARC_BUFC_DATA);
1504 1802 arc_buf_data_free(buf, zio_data_buf_free);
1505 1803 ARCSTAT_INCR(arcstat_data_size, -size);
1506 1804 atomic_add_64(&arc_size, -size);
1507 1805 }
1508 1806 }
1509 1807 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1510 1808 uint64_t *cnt = &state->arcs_lsize[type];
1511 1809
1512 1810 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1513 1811 ASSERT(state != arc_anon);
1514 1812
1515 1813 ASSERT3U(*cnt, >=, size);
1516 1814 atomic_add_64(cnt, -size);
1517 1815 }
1518 1816 ASSERT3U(state->arcs_size, >=, size);
1519 1817 atomic_add_64(&state->arcs_size, -size);
1520 1818 buf->b_data = NULL;
1521 1819
1522 1820 /*
1523 1821 * If we're destroying a duplicate buffer make sure
1524 1822 * that the appropriate statistics are updated.
1525 1823 */
1526 1824 if (buf->b_hdr->b_datacnt > 1 &&
1527 1825 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1528 1826 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1529 1827 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1530 1828 }
1531 1829 ASSERT(buf->b_hdr->b_datacnt > 0);
1532 1830 buf->b_hdr->b_datacnt -= 1;
1533 1831 }
1534 1832
1535 1833 /* only remove the buf if requested */
1536 1834 if (!all)
1537 1835 return;
1538 1836
1539 1837 /* remove the buf from the hdr list */
1540 1838 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1541 1839 continue;
1542 1840 *bufp = buf->b_next;
1543 1841 buf->b_next = NULL;
1544 1842
1545 1843 ASSERT(buf->b_efunc == NULL);
1546 1844
1547 1845 /* clean up the buf */
1548 1846 buf->b_hdr = NULL;
1549 1847 kmem_cache_free(buf_cache, buf);
1550 1848 }
1551 1849
1552 1850 static void
1553 1851 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1554 1852 {
1555 1853 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1556 1854 ASSERT3P(hdr->b_state, ==, arc_anon);
1557 1855 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1558 1856 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1559 1857
1560 1858 if (l2hdr != NULL) {
1561 1859 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1562 1860 /*
1563 1861 * To prevent arc_free() and l2arc_evict() from
1564 1862 * attempting to free the same buffer at the same time,
1565 1863 * a FREE_IN_PROGRESS flag is given to arc_free() to
1566 1864 * give it priority. l2arc_evict() can't destroy this
1567 1865 * header while we are waiting on l2arc_buflist_mtx.
1568 1866 *
1569 1867 * The hdr may be removed from l2ad_buflist before we
1570 1868 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1571 1869 */
1572 1870 if (!buflist_held) {
1573 1871 mutex_enter(&l2arc_buflist_mtx);
1574 1872 l2hdr = hdr->b_l2hdr;
1575 1873 }
1576 1874
1577 1875 if (l2hdr != NULL) {
1578 1876 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1579 1877 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1580 1878 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1581 1879 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1582 1880 if (hdr->b_state == arc_l2c_only)
1583 1881 l2arc_hdr_stat_remove();
1584 1882 hdr->b_l2hdr = NULL;
1585 1883 }
1586 1884
1587 1885 if (!buflist_held)
1588 1886 mutex_exit(&l2arc_buflist_mtx);
1589 1887 }
1590 1888
1591 1889 if (!BUF_EMPTY(hdr)) {
1592 1890 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1593 1891 buf_discard_identity(hdr);
1594 1892 }
1595 1893 while (hdr->b_buf) {
1596 1894 arc_buf_t *buf = hdr->b_buf;
1597 1895
1598 1896 if (buf->b_efunc) {
1599 1897 mutex_enter(&arc_eviction_mtx);
1600 1898 mutex_enter(&buf->b_evict_lock);
1601 1899 ASSERT(buf->b_hdr != NULL);
1602 1900 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1603 1901 hdr->b_buf = buf->b_next;
1604 1902 buf->b_hdr = &arc_eviction_hdr;
1605 1903 buf->b_next = arc_eviction_list;
1606 1904 arc_eviction_list = buf;
1607 1905 mutex_exit(&buf->b_evict_lock);
1608 1906 mutex_exit(&arc_eviction_mtx);
1609 1907 } else {
1610 1908 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1611 1909 }
1612 1910 }
1613 1911 if (hdr->b_freeze_cksum != NULL) {
1614 1912 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1615 1913 hdr->b_freeze_cksum = NULL;
1616 1914 }
1617 1915 if (hdr->b_thawed) {
1618 1916 kmem_free(hdr->b_thawed, 1);
1619 1917 hdr->b_thawed = NULL;
1620 1918 }
1621 1919
1622 1920 ASSERT(!list_link_active(&hdr->b_arc_node));
1623 1921 ASSERT3P(hdr->b_hash_next, ==, NULL);
1624 1922 ASSERT3P(hdr->b_acb, ==, NULL);
1625 1923 kmem_cache_free(hdr_cache, hdr);
1626 1924 }
1627 1925
1628 1926 void
1629 1927 arc_buf_free(arc_buf_t *buf, void *tag)
1630 1928 {
1631 1929 arc_buf_hdr_t *hdr = buf->b_hdr;
1632 1930 int hashed = hdr->b_state != arc_anon;
1633 1931
1634 1932 ASSERT(buf->b_efunc == NULL);
1635 1933 ASSERT(buf->b_data != NULL);
1636 1934
1637 1935 if (hashed) {
1638 1936 kmutex_t *hash_lock = HDR_LOCK(hdr);
1639 1937
1640 1938 mutex_enter(hash_lock);
1641 1939 hdr = buf->b_hdr;
1642 1940 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1643 1941
1644 1942 (void) remove_reference(hdr, hash_lock, tag);
1645 1943 if (hdr->b_datacnt > 1) {
1646 1944 arc_buf_destroy(buf, FALSE, TRUE);
1647 1945 } else {
1648 1946 ASSERT(buf == hdr->b_buf);
1649 1947 ASSERT(buf->b_efunc == NULL);
1650 1948 hdr->b_flags |= ARC_BUF_AVAILABLE;
1651 1949 }
1652 1950 mutex_exit(hash_lock);
1653 1951 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1654 1952 int destroy_hdr;
1655 1953 /*
1656 1954 * We are in the middle of an async write. Don't destroy
1657 1955 * this buffer unless the write completes before we finish
1658 1956 * decrementing the reference count.
1659 1957 */
1660 1958 mutex_enter(&arc_eviction_mtx);
1661 1959 (void) remove_reference(hdr, NULL, tag);
1662 1960 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1663 1961 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1664 1962 mutex_exit(&arc_eviction_mtx);
1665 1963 if (destroy_hdr)
1666 1964 arc_hdr_destroy(hdr);
1667 1965 } else {
1668 1966 if (remove_reference(hdr, NULL, tag) > 0)
1669 1967 arc_buf_destroy(buf, FALSE, TRUE);
1670 1968 else
1671 1969 arc_hdr_destroy(hdr);
1672 1970 }
1673 1971 }
1674 1972
1675 1973 boolean_t
1676 1974 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1677 1975 {
1678 1976 arc_buf_hdr_t *hdr = buf->b_hdr;
1679 1977 kmutex_t *hash_lock = HDR_LOCK(hdr);
1680 1978 boolean_t no_callback = (buf->b_efunc == NULL);
1681 1979
1682 1980 if (hdr->b_state == arc_anon) {
1683 1981 ASSERT(hdr->b_datacnt == 1);
1684 1982 arc_buf_free(buf, tag);
1685 1983 return (no_callback);
1686 1984 }
1687 1985
1688 1986 mutex_enter(hash_lock);
1689 1987 hdr = buf->b_hdr;
1690 1988 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1691 1989 ASSERT(hdr->b_state != arc_anon);
1692 1990 ASSERT(buf->b_data != NULL);
1693 1991
1694 1992 (void) remove_reference(hdr, hash_lock, tag);
1695 1993 if (hdr->b_datacnt > 1) {
1696 1994 if (no_callback)
1697 1995 arc_buf_destroy(buf, FALSE, TRUE);
1698 1996 } else if (no_callback) {
1699 1997 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1700 1998 ASSERT(buf->b_efunc == NULL);
1701 1999 hdr->b_flags |= ARC_BUF_AVAILABLE;
1702 2000 }
1703 2001 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1704 2002 refcount_is_zero(&hdr->b_refcnt));
1705 2003 mutex_exit(hash_lock);
1706 2004 return (no_callback);
1707 2005 }
1708 2006
1709 2007 int
1710 2008 arc_buf_size(arc_buf_t *buf)
1711 2009 {
1712 2010 return (buf->b_hdr->b_size);
1713 2011 }
1714 2012
1715 2013 /*
1716 2014 * Called from the DMU to determine if the current buffer should be
1717 2015 * evicted. In order to ensure proper locking, the eviction must be initiated
1718 2016 * from the DMU. Return true if the buffer is associated with user data and
1719 2017 * duplicate buffers still exist.
1720 2018 */
1721 2019 boolean_t
1722 2020 arc_buf_eviction_needed(arc_buf_t *buf)
1723 2021 {
1724 2022 arc_buf_hdr_t *hdr;
1725 2023 boolean_t evict_needed = B_FALSE;
1726 2024
1727 2025 if (zfs_disable_dup_eviction)
1728 2026 return (B_FALSE);
1729 2027
1730 2028 mutex_enter(&buf->b_evict_lock);
1731 2029 hdr = buf->b_hdr;
1732 2030 if (hdr == NULL) {
1733 2031 /*
1734 2032 * We are in arc_do_user_evicts(); let that function
1735 2033 * perform the eviction.
1736 2034 */
1737 2035 ASSERT(buf->b_data == NULL);
1738 2036 mutex_exit(&buf->b_evict_lock);
1739 2037 return (B_FALSE);
1740 2038 } else if (buf->b_data == NULL) {
1741 2039 /*
1742 2040 * We have already been added to the arc eviction list;
1743 2041 * recommend eviction.
1744 2042 */
1745 2043 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1746 2044 mutex_exit(&buf->b_evict_lock);
1747 2045 return (B_TRUE);
1748 2046 }
1749 2047
1750 2048 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1751 2049 evict_needed = B_TRUE;
1752 2050
1753 2051 mutex_exit(&buf->b_evict_lock);
1754 2052 return (evict_needed);
1755 2053 }
1756 2054
1757 2055 /*
1758 2056 * Evict buffers from list until we've removed the specified number of
1759 2057 * bytes. Move the removed buffers to the appropriate evict state.
1760 2058 * If the recycle flag is set, then attempt to "recycle" a buffer:
1761 2059 * - look for a buffer to evict that is `bytes' long.
1762 2060 * - return the data block from this buffer rather than freeing it.
1763 2061 * This flag is used by callers that are trying to make space for a
1764 2062 * new buffer in a full arc cache.
1765 2063 *
1766 2064 * This function makes a "best effort". It skips over any buffers
1767 2065 * it can't get a hash_lock on, and so may not catch all candidates.
1768 2066 * It may also return without evicting as much space as requested.
1769 2067 */
1770 2068 static void *
1771 2069 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1772 2070 arc_buf_contents_t type)
1773 2071 {
1774 2072 arc_state_t *evicted_state;
1775 2073 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1776 2074 arc_buf_hdr_t *ab, *ab_prev = NULL;
1777 2075 list_t *list = &state->arcs_list[type];
1778 2076 kmutex_t *hash_lock;
1779 2077 boolean_t have_lock;
1780 2078 void *stolen = NULL;
1781 2079
1782 2080 ASSERT(state == arc_mru || state == arc_mfu);
1783 2081
1784 2082 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1785 2083
1786 2084 mutex_enter(&state->arcs_mtx);
1787 2085 mutex_enter(&evicted_state->arcs_mtx);
1788 2086
1789 2087 for (ab = list_tail(list); ab; ab = ab_prev) {
1790 2088 ab_prev = list_prev(list, ab);
1791 2089 /* prefetch buffers have a minimum lifespan */
1792 2090 if (HDR_IO_IN_PROGRESS(ab) ||
1793 2091 (spa && ab->b_spa != spa) ||
1794 2092 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1795 2093 ddi_get_lbolt() - ab->b_arc_access <
1796 2094 arc_min_prefetch_lifespan)) {
1797 2095 skipped++;
1798 2096 continue;
1799 2097 }
1800 2098 /* "lookahead" for better eviction candidate */
1801 2099 if (recycle && ab->b_size != bytes &&
1802 2100 ab_prev && ab_prev->b_size == bytes)
1803 2101 continue;
1804 2102 hash_lock = HDR_LOCK(ab);
1805 2103 have_lock = MUTEX_HELD(hash_lock);
1806 2104 if (have_lock || mutex_tryenter(hash_lock)) {
1807 2105 ASSERT0(refcount_count(&ab->b_refcnt));
1808 2106 ASSERT(ab->b_datacnt > 0);
1809 2107 while (ab->b_buf) {
1810 2108 arc_buf_t *buf = ab->b_buf;
1811 2109 if (!mutex_tryenter(&buf->b_evict_lock)) {
1812 2110 missed += 1;
1813 2111 break;
1814 2112 }
1815 2113 if (buf->b_data) {
1816 2114 bytes_evicted += ab->b_size;
1817 2115 if (recycle && ab->b_type == type &&
1818 2116 ab->b_size == bytes &&
1819 2117 !HDR_L2_WRITING(ab)) {
1820 2118 stolen = buf->b_data;
1821 2119 recycle = FALSE;
1822 2120 }
1823 2121 }
1824 2122 if (buf->b_efunc) {
1825 2123 mutex_enter(&arc_eviction_mtx);
1826 2124 arc_buf_destroy(buf,
1827 2125 buf->b_data == stolen, FALSE);
1828 2126 ab->b_buf = buf->b_next;
1829 2127 buf->b_hdr = &arc_eviction_hdr;
1830 2128 buf->b_next = arc_eviction_list;
1831 2129 arc_eviction_list = buf;
1832 2130 mutex_exit(&arc_eviction_mtx);
1833 2131 mutex_exit(&buf->b_evict_lock);
1834 2132 } else {
1835 2133 mutex_exit(&buf->b_evict_lock);
1836 2134 arc_buf_destroy(buf,
1837 2135 buf->b_data == stolen, TRUE);
1838 2136 }
1839 2137 }
1840 2138
1841 2139 if (ab->b_l2hdr) {
1842 2140 ARCSTAT_INCR(arcstat_evict_l2_cached,
1843 2141 ab->b_size);
1844 2142 } else {
1845 2143 if (l2arc_write_eligible(ab->b_spa, ab)) {
1846 2144 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1847 2145 ab->b_size);
1848 2146 } else {
1849 2147 ARCSTAT_INCR(
1850 2148 arcstat_evict_l2_ineligible,
1851 2149 ab->b_size);
1852 2150 }
1853 2151 }
1854 2152
1855 2153 if (ab->b_datacnt == 0) {
1856 2154 arc_change_state(evicted_state, ab, hash_lock);
1857 2155 ASSERT(HDR_IN_HASH_TABLE(ab));
1858 2156 ab->b_flags |= ARC_IN_HASH_TABLE;
1859 2157 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1860 2158 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1861 2159 }
1862 2160 if (!have_lock)
1863 2161 mutex_exit(hash_lock);
1864 2162 if (bytes >= 0 && bytes_evicted >= bytes)
1865 2163 break;
1866 2164 } else {
1867 2165 missed += 1;
1868 2166 }
1869 2167 }
1870 2168
1871 2169 mutex_exit(&evicted_state->arcs_mtx);
1872 2170 mutex_exit(&state->arcs_mtx);
1873 2171
1874 2172 if (bytes_evicted < bytes)
1875 2173 dprintf("only evicted %lld bytes from %x",
1876 2174 (longlong_t)bytes_evicted, state);
1877 2175
1878 2176 if (skipped)
1879 2177 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1880 2178
1881 2179 if (missed)
1882 2180 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1883 2181
1884 2182 /*
1885 2183 * We have just evicted some data into the ghost state, make
1886 2184 * sure we also adjust the ghost state size if necessary.
1887 2185 */
1888 2186 if (arc_no_grow &&
1889 2187 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1890 2188 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1891 2189 arc_mru_ghost->arcs_size - arc_c;
1892 2190
1893 2191 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1894 2192 int64_t todelete =
1895 2193 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1896 2194 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1897 2195 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1898 2196 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1899 2197 arc_mru_ghost->arcs_size +
1900 2198 arc_mfu_ghost->arcs_size - arc_c);
1901 2199 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1902 2200 }
1903 2201 }
1904 2202
1905 2203 return (stolen);
1906 2204 }
1907 2205
1908 2206 /*
1909 2207 * Remove buffers from list until we've removed the specified number of
1910 2208 * bytes. Destroy the buffers that are removed.
1911 2209 */
1912 2210 static void
1913 2211 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1914 2212 {
1915 2213 arc_buf_hdr_t *ab, *ab_prev;
1916 2214 arc_buf_hdr_t marker = { 0 };
1917 2215 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1918 2216 kmutex_t *hash_lock;
1919 2217 uint64_t bytes_deleted = 0;
1920 2218 uint64_t bufs_skipped = 0;
1921 2219
1922 2220 ASSERT(GHOST_STATE(state));
1923 2221 top:
1924 2222 mutex_enter(&state->arcs_mtx);
1925 2223 for (ab = list_tail(list); ab; ab = ab_prev) {
1926 2224 ab_prev = list_prev(list, ab);
1927 2225 if (spa && ab->b_spa != spa)
1928 2226 continue;
1929 2227
1930 2228 /* ignore markers */
1931 2229 if (ab->b_spa == 0)
1932 2230 continue;
1933 2231
1934 2232 hash_lock = HDR_LOCK(ab);
1935 2233 /* caller may be trying to modify this buffer, skip it */
1936 2234 if (MUTEX_HELD(hash_lock))
1937 2235 continue;
1938 2236 if (mutex_tryenter(hash_lock)) {
1939 2237 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1940 2238 ASSERT(ab->b_buf == NULL);
1941 2239 ARCSTAT_BUMP(arcstat_deleted);
1942 2240 bytes_deleted += ab->b_size;
1943 2241
1944 2242 if (ab->b_l2hdr != NULL) {
1945 2243 /*
1946 2244 * This buffer is cached on the 2nd Level ARC;
1947 2245 * don't destroy the header.
1948 2246 */
1949 2247 arc_change_state(arc_l2c_only, ab, hash_lock);
1950 2248 mutex_exit(hash_lock);
1951 2249 } else {
1952 2250 arc_change_state(arc_anon, ab, hash_lock);
1953 2251 mutex_exit(hash_lock);
1954 2252 arc_hdr_destroy(ab);
1955 2253 }
1956 2254
1957 2255 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1958 2256 if (bytes >= 0 && bytes_deleted >= bytes)
1959 2257 break;
1960 2258 } else if (bytes < 0) {
1961 2259 /*
1962 2260 * Insert a list marker and then wait for the
1963 2261  * hash lock to become available. Once it's
1964 2262 * available, restart from where we left off.
1965 2263 */
1966 2264 list_insert_after(list, ab, &marker);
1967 2265 mutex_exit(&state->arcs_mtx);
1968 2266 mutex_enter(hash_lock);
1969 2267 mutex_exit(hash_lock);
1970 2268 mutex_enter(&state->arcs_mtx);
1971 2269 ab_prev = list_prev(list, &marker);
1972 2270 list_remove(list, &marker);
1973 2271 } else
1974 2272 bufs_skipped += 1;
1975 2273 }
1976 2274 mutex_exit(&state->arcs_mtx);
1977 2275
1978 2276 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1979 2277 (bytes < 0 || bytes_deleted < bytes)) {
1980 2278 list = &state->arcs_list[ARC_BUFC_METADATA];
1981 2279 goto top;
1982 2280 }
1983 2281
1984 2282 if (bufs_skipped) {
1985 2283 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1986 2284 ASSERT(bytes >= 0);
1987 2285 }
1988 2286
1989 2287 if (bytes_deleted < bytes)
1990 2288 dprintf("only deleted %lld bytes from %p",
1991 2289 (longlong_t)bytes_deleted, state);
1992 2290 }
1993 2291
1994 2292 static void
1995 2293 arc_adjust(void)
1996 2294 {
1997 2295 int64_t adjustment, delta;
1998 2296
1999 2297 /*
2000 2298 * Adjust MRU size
2001 2299 */
2002 2300
2003 2301 adjustment = MIN((int64_t)(arc_size - arc_c),
2004 2302 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2005 2303 arc_p));
2006 2304
2007 2305 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2008 2306 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2009 2307 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2010 2308 adjustment -= delta;
2011 2309 }
2012 2310
2013 2311 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2014 2312 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2015 2313 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2016 2314 ARC_BUFC_METADATA);
2017 2315 }
2018 2316
2019 2317 /*
2020 2318 * Adjust MFU size
2021 2319 */
2022 2320
2023 2321 adjustment = arc_size - arc_c;
2024 2322
2025 2323 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2026 2324 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2027 2325 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2028 2326 adjustment -= delta;
2029 2327 }
2030 2328
2031 2329 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2032 2330 int64_t delta = MIN(adjustment,
2033 2331 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2034 2332 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2035 2333 ARC_BUFC_METADATA);
2036 2334 }
2037 2335
2038 2336 /*
2039 2337 * Adjust ghost lists
2040 2338 */
2041 2339
2042 2340 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2043 2341
2044 2342 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2045 2343 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2046 2344 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2047 2345 }
2048 2346
2049 2347 adjustment =
2050 2348 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2051 2349
2052 2350 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2053 2351 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2054 2352 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2055 2353 }
2056 2354 }
2057 2355
2058 2356 static void
2059 2357 arc_do_user_evicts(void)
2060 2358 {
2061 2359 mutex_enter(&arc_eviction_mtx);
2062 2360 while (arc_eviction_list != NULL) {
2063 2361 arc_buf_t *buf = arc_eviction_list;
2064 2362 arc_eviction_list = buf->b_next;
2065 2363 mutex_enter(&buf->b_evict_lock);
2066 2364 buf->b_hdr = NULL;
2067 2365 mutex_exit(&buf->b_evict_lock);
2068 2366 mutex_exit(&arc_eviction_mtx);
2069 2367
2070 2368 if (buf->b_efunc != NULL)
2071 2369 VERIFY(buf->b_efunc(buf) == 0);
2072 2370
2073 2371 buf->b_efunc = NULL;
2074 2372 buf->b_private = NULL;
2075 2373 kmem_cache_free(buf_cache, buf);
2076 2374 mutex_enter(&arc_eviction_mtx);
2077 2375 }
2078 2376 mutex_exit(&arc_eviction_mtx);
2079 2377 }
2080 2378
2081 2379 /*
2082 2380 * Flush all *evictable* data from the cache for the given spa.
2083 2381 * NOTE: this will not touch "active" (i.e. referenced) data.
2084 2382 */
2085 2383 void
2086 2384 arc_flush(spa_t *spa)
2087 2385 {
2088 2386 uint64_t guid = 0;
2089 2387
2090 2388 if (spa)
2091 2389 guid = spa_load_guid(spa);
2092 2390
2093 2391 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2094 2392 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2095 2393 if (spa)
2096 2394 break;
2097 2395 }
2098 2396 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2099 2397 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2100 2398 if (spa)
2101 2399 break;
2102 2400 }
2103 2401 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2104 2402 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2105 2403 if (spa)
2106 2404 break;
2107 2405 }
2108 2406 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2109 2407 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2110 2408 if (spa)
2111 2409 break;
2112 2410 }
2113 2411
2114 2412 arc_evict_ghost(arc_mru_ghost, guid, -1);
2115 2413 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2116 2414
2117 2415 mutex_enter(&arc_reclaim_thr_lock);
2118 2416 arc_do_user_evicts();
2119 2417 mutex_exit(&arc_reclaim_thr_lock);
2120 2418 ASSERT(spa || arc_eviction_list == NULL);
2121 2419 }
2122 2420
2123 2421 void
2124 2422 arc_shrink(void)
2125 2423 {
2126 2424 if (arc_c > arc_c_min) {
2127 2425 uint64_t to_free;
2128 2426
2129 2427 #ifdef _KERNEL
2130 2428 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2131 2429 #else
2132 2430 to_free = arc_c >> arc_shrink_shift;
2133 2431 #endif
2134 2432 if (arc_c > arc_c_min + to_free)
2135 2433 atomic_add_64(&arc_c, -to_free);
2136 2434 else
2137 2435 arc_c = arc_c_min;
2138 2436
2139 2437 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2140 2438 if (arc_c > arc_size)
2141 2439 arc_c = MAX(arc_size, arc_c_min);
2142 2440 if (arc_p > arc_c)
2143 2441 arc_p = (arc_c >> 1);
2144 2442 ASSERT(arc_c >= arc_c_min);
2145 2443 ASSERT((int64_t)arc_p >= 0);
2146 2444 }
2147 2445
2148 2446 if (arc_size > arc_c)
2149 2447 arc_adjust();
2150 2448 }
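
A worked example of the shrink arithmetic above, written as a standalone helper with made-up numbers; it is illustrative only and ignores the needfree term used in the kernel build.

/*
 * Editorial sketch: with c = 8G and shrink_shift = 5, to_free is
 * 256M, so the target drops to 7.75G (but never below c_min).
 */
static uint64_t
example_shrink_target(uint64_t c, uint64_t c_min, int shrink_shift)
{
	uint64_t to_free = c >> shrink_shift;

	return ((c > c_min + to_free) ? c - to_free : c_min);
}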
2151 2449
2152 2450 /*
2153 2451 * Determine if the system is under memory pressure and is asking
2154 2452 * to reclaim memory. A return value of 1 indicates that the system
2155 2453 * is under memory pressure and that the arc should adjust accordingly.
2156 2454 */
2157 2455 static int
2158 2456 arc_reclaim_needed(void)
2159 2457 {
2160 2458 uint64_t extra;
2161 2459
2162 2460 #ifdef _KERNEL
2163 2461
2164 2462 if (needfree)
2165 2463 return (1);
2166 2464
2167 2465 /*
2168 2466 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2169 2467 */
2170 2468 extra = desfree;
2171 2469
2172 2470 /*
2173 2471 * check that we're out of range of the pageout scanner. It starts to
2174 2472 * schedule paging if freemem is less than lotsfree and needfree.
2175 2473 * lotsfree is the high-water mark for pageout, and needfree is the
2176 2474 * number of needed free pages. We add extra pages here to make sure
2177 2475 * the scanner doesn't start up while we're freeing memory.
2178 2476 */
2179 2477 if (freemem < lotsfree + needfree + extra)
2180 2478 return (1);
2181 2479
2182 2480 /*
2183 2481 * check to make sure that swapfs has enough space so that anon
2184 2482 * reservations can still succeed. anon_resvmem() checks that the
2185 2483 * availrmem is greater than swapfs_minfree, and the number of reserved
2186 2484 * swap pages. We also add a bit of extra here just to prevent
2187 2485 * circumstances from getting really dire.
2188 2486 */
2189 2487 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2190 2488 return (1);
2191 2489
2192 2490 #if defined(__i386)
2193 2491 /*
2194 2492 * If we're on an i386 platform, it's possible that we'll exhaust the
2195 2493 * kernel heap space before we ever run out of available physical
2196 2494 * memory. Most checks of the size of the heap_area compare against
2197 2495 * tune.t_minarmem, which is the minimum available real memory that we
2198 2496 * can have in the system. However, this is generally fixed at 25 pages
2199 2497 * which is so low that it's useless. In this comparison, we seek to
2200 2498 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2201 2499 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2202 2500 * free)
2203 2501 */
2204 2502 if (vmem_size(heap_arena, VMEM_FREE) <
2205 2503 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2206 2504 return (1);
2207 2505 #endif
2208 2506
2209 2507 /*
2210 2508 * If zio data pages are being allocated out of a separate heap segment,
2211 2509 * then enforce that the size of available vmem for this arena remains
2212 2510 * above about 1/16th free.
2213 2511 *
2214 2512 * Note: The 1/16th arena free requirement was put in place
2215 2513 * to aggressively evict memory from the arc in order to avoid
2216 2514 * memory fragmentation issues.
2217 2515 */
2218 2516 if (zio_arena != NULL &&
2219 2517 vmem_size(zio_arena, VMEM_FREE) <
2220 2518 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2221 2519 return (1);
2222 2520 #else
2223 2521 if (spa_get_random(100) == 0)
2224 2522 return (1);
2225 2523 #endif
2226 2524 return (0);
2227 2525 }
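
The pageout-scanner headroom test at the top of the kernel path above can be read in isolation; the sketch below is editorial, uses plain integers, and the page counts in its comment are invented for illustration.

/*
 * Editorial sketch: with lotsfree = 4096, needfree = 0 and
 * extra (desfree) = 2048 pages, reclaim starts once freemem falls
 * below 6144 pages, i.e. before the pageout scanner would kick in.
 */
static boolean_t
example_pageout_headroom(uint64_t freemem_pages, uint64_t lotsfree_pages,
    uint64_t needfree_pages, uint64_t extra_pages)
{
	return (freemem_pages < lotsfree_pages + needfree_pages +
	    extra_pages);
}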
2228 2526
2229 2527 static void
2230 2528 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2231 2529 {
2232 2530 size_t i;
2233 2531 kmem_cache_t *prev_cache = NULL;
2234 2532 kmem_cache_t *prev_data_cache = NULL;
2235 2533 extern kmem_cache_t *zio_buf_cache[];
2236 2534 extern kmem_cache_t *zio_data_buf_cache[];
2237 2535
2238 2536 #ifdef _KERNEL
2239 2537 if (arc_meta_used >= arc_meta_limit) {
2240 2538 /*
2241 2539 * We are exceeding our meta-data cache limit.
2242 2540 * Purge some DNLC entries to release holds on meta-data.
2243 2541 */
2244 2542 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2245 2543 }
2246 2544 #if defined(__i386)
2247 2545 /*
2248 2546 * Reclaim unused memory from all kmem caches.
2249 2547 */
2250 2548 kmem_reap();
2251 2549 #endif
2252 2550 #endif
2253 2551
2254 2552 /*
2255 2553 * An aggressive reclamation will shrink the cache size as well as
2256 2554 * reap free buffers from the arc kmem caches.
2257 2555 */
2258 2556 if (strat == ARC_RECLAIM_AGGR)
2259 2557 arc_shrink();
2260 2558
2261 2559 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2262 2560 if (zio_buf_cache[i] != prev_cache) {
2263 2561 prev_cache = zio_buf_cache[i];
2264 2562 kmem_cache_reap_now(zio_buf_cache[i]);
2265 2563 }
2266 2564 if (zio_data_buf_cache[i] != prev_data_cache) {
2267 2565 prev_data_cache = zio_data_buf_cache[i];
2268 2566 kmem_cache_reap_now(zio_data_buf_cache[i]);
2269 2567 }
2270 2568 }
2271 2569 kmem_cache_reap_now(buf_cache);
2272 2570 kmem_cache_reap_now(hdr_cache);
2273 2571
2274 2572 /*
2275 2573  * Ask the vmem arena to reclaim unused memory from its
2276 2574 * quantum caches.
2277 2575 */
2278 2576 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2279 2577 vmem_qcache_reap(zio_arena);
2280 2578 }
2281 2579
2282 2580 static void
2283 2581 arc_reclaim_thread(void)
2284 2582 {
2285 2583 clock_t growtime = 0;
2286 2584 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2287 2585 callb_cpr_t cpr;
2288 2586
2289 2587 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2290 2588
2291 2589 mutex_enter(&arc_reclaim_thr_lock);
2292 2590 while (arc_thread_exit == 0) {
2293 2591 if (arc_reclaim_needed()) {
2294 2592
2295 2593 if (arc_no_grow) {
2296 2594 if (last_reclaim == ARC_RECLAIM_CONS) {
2297 2595 last_reclaim = ARC_RECLAIM_AGGR;
2298 2596 } else {
2299 2597 last_reclaim = ARC_RECLAIM_CONS;
2300 2598 }
2301 2599 } else {
2302 2600 arc_no_grow = TRUE;
2303 2601 last_reclaim = ARC_RECLAIM_AGGR;
2304 2602 membar_producer();
2305 2603 }
2306 2604
2307 2605 /* reset the growth delay for every reclaim */
2308 2606 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2309 2607
2310 2608 arc_kmem_reap_now(last_reclaim);
2311 2609 arc_warm = B_TRUE;
2312 2610
2313 2611 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2314 2612 arc_no_grow = FALSE;
2315 2613 }
2316 2614
2317 2615 arc_adjust();
2318 2616
2319 2617 if (arc_eviction_list != NULL)
2320 2618 arc_do_user_evicts();
2321 2619
2322 2620 /* block until needed, or one second, whichever is shorter */
2323 2621 CALLB_CPR_SAFE_BEGIN(&cpr);
2324 2622 (void) cv_timedwait(&arc_reclaim_thr_cv,
2325 2623 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2326 2624 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2327 2625 }
2328 2626
2329 2627 arc_thread_exit = 0;
2330 2628 cv_broadcast(&arc_reclaim_thr_cv);
2331 2629 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2332 2630 thread_exit();
2333 2631 }
2334 2632
2335 2633 /*
2336 2634 * Adapt arc info given the number of bytes we are trying to add and
2337 2635  * the state that we are coming from. This function is only called
2338 2636 * when we are adding new content to the cache.
2339 2637 */
2340 2638 static void
2341 2639 arc_adapt(int bytes, arc_state_t *state)
2342 2640 {
2343 2641 int mult;
2344 2642 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2345 2643
2346 2644 if (state == arc_l2c_only)
2347 2645 return;
2348 2646
2349 2647 ASSERT(bytes > 0);
2350 2648 /*
2351 2649 * Adapt the target size of the MRU list:
2352 2650 * - if we just hit in the MRU ghost list, then increase
2353 2651 * the target size of the MRU list.
2354 2652 * - if we just hit in the MFU ghost list, then increase
2355 2653 * the target size of the MFU list by decreasing the
2356 2654 * target size of the MRU list.
2357 2655 */
2358 2656 if (state == arc_mru_ghost) {
2359 2657 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2360 2658 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2361 2659 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2362 2660
2363 2661 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2364 2662 } else if (state == arc_mfu_ghost) {
2365 2663 uint64_t delta;
2366 2664
2367 2665 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2368 2666 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2369 2667 mult = MIN(mult, 10);
2370 2668
2371 2669 delta = MIN(bytes * mult, arc_p);
2372 2670 arc_p = MAX(arc_p_min, arc_p - delta);
2373 2671 }
2374 2672 ASSERT((int64_t)arc_p >= 0);
2375 2673
2376 2674 if (arc_reclaim_needed()) {
2377 2675 cv_signal(&arc_reclaim_thr_cv);
2378 2676 return;
2379 2677 }
2380 2678
2381 2679 if (arc_no_grow)
2382 2680 return;
2383 2681
2384 2682 if (arc_c >= arc_c_max)
2385 2683 return;
2386 2684
2387 2685 /*
2388 2686 * If we're within (2 * maxblocksize) bytes of the target
2389 2687 * cache size, increment the target cache size
2390 2688 */
2391 2689 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2392 2690 atomic_add_64(&arc_c, (int64_t)bytes);
2393 2691 if (arc_c > arc_c_max)
2394 2692 arc_c = arc_c_max;
2395 2693 else if (state == arc_anon)
2396 2694 atomic_add_64(&arc_p, (int64_t)bytes);
2397 2695 if (arc_p > arc_c)
2398 2696 arc_p = arc_c;
2399 2697 }
2400 2698 ASSERT((int64_t)arc_p >= 0);
2401 2699 }
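
To make the ghost-hit adaptation above easier to trace, here is an editorial sketch of the MRU-ghost case over plain numbers; the inputs are snapshots rather than live arc state, mru_ghost is assumed non-zero (as it is at a ghost hit), and the figures in the comment are invented.

/*
 * Editorial sketch: with mru_ghost = 1G, mfu_ghost = 3G and a 128K
 * miss, mult = 3, so arc_p grows by 384K, clamped to c - p_min.
 */
static uint64_t
example_mru_ghost_bump(uint64_t p, uint64_t c, uint64_t p_min,
    uint64_t mru_ghost, uint64_t mfu_ghost, int bytes)
{
	int mult;

	/* favor the list whose ghost is seeing more hits */
	mult = (mru_ghost >= mfu_ghost) ? 1 : (int)(mfu_ghost / mru_ghost);
	mult = MIN(mult, 10);		/* avoid wild arc_p adjustment */

	return (MIN(c - p_min, p + (uint64_t)bytes * mult));
}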
2402 2700
2403 2701 /*
2404 2702 * Check if the cache has reached its limits and eviction is required
2405 2703 * prior to insert.
2406 2704 */
2407 2705 static int
2408 2706 arc_evict_needed(arc_buf_contents_t type)
2409 2707 {
2410 2708 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2411 2709 return (1);
2412 2710
2413 2711 if (arc_reclaim_needed())
2414 2712 return (1);
2415 2713
2416 2714 return (arc_size > arc_c);
2417 2715 }
2418 2716
2419 2717 /*
2420 2718 * The buffer, supplied as the first argument, needs a data block.
2421 2719 * So, if we are at cache max, determine which cache should be victimized.
2422 2720 * We have the following cases:
2423 2721 *
2424 2722 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2425 2723 * In this situation if we're out of space, but the resident size of the MFU is
2426 2724 * under the limit, victimize the MFU cache to satisfy this insertion request.
2427 2725 *
2428 2726 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2429 2727 * Here, we've used up all of the available space for the MRU, so we need to
2430 2728 * evict from our own cache instead. Evict from the set of resident MRU
2431 2729 * entries.
2432 2730 *
2433 2731 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2434 2732 * c minus p represents the MFU space in the cache, since p is the size of the
2435 2733 * cache that is dedicated to the MRU. In this situation there's still space on
2436 2734 * the MFU side, so the MRU side needs to be victimized.
2437 2735 *
2438 2736 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2439 2737 * MFU's resident set is consuming more space than it has been allotted. In
2440 2738 * this situation, we must victimize our own cache, the MFU, for this insertion.
2441 2739 */
2442 2740 static void
2443 2741 arc_get_data_buf(arc_buf_t *buf)
2444 2742 {
2445 2743 arc_state_t *state = buf->b_hdr->b_state;
2446 2744 uint64_t size = buf->b_hdr->b_size;
2447 2745 arc_buf_contents_t type = buf->b_hdr->b_type;
2448 2746
2449 2747 arc_adapt(size, state);
2450 2748
2451 2749 /*
2452 2750 * We have not yet reached cache maximum size,
2453 2751 * just allocate a new buffer.
2454 2752 */
2455 2753 if (!arc_evict_needed(type)) {
2456 2754 if (type == ARC_BUFC_METADATA) {
2457 2755 buf->b_data = zio_buf_alloc(size);
2458 2756 arc_space_consume(size, ARC_SPACE_DATA);
2459 2757 } else {
2460 2758 ASSERT(type == ARC_BUFC_DATA);
2461 2759 buf->b_data = zio_data_buf_alloc(size);
2462 2760 ARCSTAT_INCR(arcstat_data_size, size);
2463 2761 atomic_add_64(&arc_size, size);
2464 2762 }
2465 2763 goto out;
2466 2764 }
2467 2765
2468 2766 /*
2469 2767 * If we are prefetching from the mfu ghost list, this buffer
2470 2768 * will end up on the mru list; so steal space from there.
2471 2769 */
2472 2770 if (state == arc_mfu_ghost)
2473 2771 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2474 2772 else if (state == arc_mru_ghost)
2475 2773 state = arc_mru;
2476 2774
2477 2775 if (state == arc_mru || state == arc_anon) {
2478 2776 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2479 2777 state = (arc_mfu->arcs_lsize[type] >= size &&
2480 2778 arc_p > mru_used) ? arc_mfu : arc_mru;
2481 2779 } else {
2482 2780 /* MFU cases */
2483 2781 uint64_t mfu_space = arc_c - arc_p;
2484 2782 state = (arc_mru->arcs_lsize[type] >= size &&
2485 2783 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2486 2784 }
2487 2785 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2488 2786 if (type == ARC_BUFC_METADATA) {
2489 2787 buf->b_data = zio_buf_alloc(size);
2490 2788 arc_space_consume(size, ARC_SPACE_DATA);
2491 2789 } else {
2492 2790 ASSERT(type == ARC_BUFC_DATA);
2493 2791 buf->b_data = zio_data_buf_alloc(size);
2494 2792 ARCSTAT_INCR(arcstat_data_size, size);
2495 2793 atomic_add_64(&arc_size, size);
2496 2794 }
2497 2795 ARCSTAT_BUMP(arcstat_recycle_miss);
2498 2796 }
2499 2797 ASSERT(buf->b_data != NULL);
2500 2798 out:
2501 2799 /*
2502 2800 * Update the state size. Note that ghost states have a
2503 2801 * "ghost size" and so don't need to be updated.
2504 2802 */
2505 2803 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2506 2804 arc_buf_hdr_t *hdr = buf->b_hdr;
2507 2805
2508 2806 atomic_add_64(&hdr->b_state->arcs_size, size);
2509 2807 if (list_link_active(&hdr->b_arc_node)) {
2510 2808 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2511 2809 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2512 2810 }
2513 2811 /*
2514 2812 * If we are growing the cache, and we are adding anonymous
2515 2813 * data, and we have outgrown arc_p, update arc_p
2516 2814 */
2517 2815 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2518 2816 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2519 2817 arc_p = MIN(arc_c, arc_p + size);
2520 2818 }
2521 2819 }
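
The four cases in the block comment above arc_get_data_buf() boil down to one comparison per side; the sketch below is editorial and restates that policy as a pure function over hypothetical size snapshots rather than live arc state.

/*
 * Editorial sketch of victim selection: cases 1/2 apply when the
 * insert is headed for the MRU, cases 3/4 when it is headed for the
 * MFU.  All sizes are caller-supplied snapshots.
 */
static arc_state_t *
example_choose_victim(arc_state_t *state, uint64_t size, uint64_t p,
    uint64_t c, uint64_t anon_sz, uint64_t mru_sz, uint64_t mru_lsize,
    uint64_t mfu_sz, uint64_t mfu_lsize)
{
	if (state == arc_mru || state == arc_anon) {
		/* cases 1 and 2: steal from MFU only if it has room */
		return ((mfu_lsize >= size && p > anon_sz + mru_sz) ?
		    arc_mfu : arc_mru);
	}
	/* cases 3 and 4: steal from MRU only if MFU is within c - p */
	return ((mru_lsize >= size && (c - p) > mfu_sz) ?
	    arc_mru : arc_mfu);
}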
2522 2820
2523 2821 /*
2524 2822 * This routine is called whenever a buffer is accessed.
2525 2823 * NOTE: the hash lock is dropped in this function.
2526 2824 */
2527 2825 static void
2528 2826 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2529 2827 {
2530 2828 clock_t now;
2531 2829
2532 2830 ASSERT(MUTEX_HELD(hash_lock));
2533 2831
2534 2832 if (buf->b_state == arc_anon) {
2535 2833 /*
2536 2834 * This buffer is not in the cache, and does not
2537 2835 * appear in our "ghost" list. Add the new buffer
2538 2836 * to the MRU state.
2539 2837 */
2540 2838
2541 2839 ASSERT(buf->b_arc_access == 0);
2542 2840 buf->b_arc_access = ddi_get_lbolt();
2543 2841 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2544 2842 arc_change_state(arc_mru, buf, hash_lock);
2545 2843
2546 2844 } else if (buf->b_state == arc_mru) {
2547 2845 now = ddi_get_lbolt();
2548 2846
2549 2847 /*
2550 2848 * If this buffer is here because of a prefetch, then either:
2551 2849 * - clear the flag if this is a "referencing" read
2552 2850 * (any subsequent access will bump this into the MFU state).
2553 2851 * or
2554 2852 * - move the buffer to the head of the list if this is
2555 2853 * another prefetch (to make it less likely to be evicted).
2556 2854 */
2557 2855 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2558 2856 if (refcount_count(&buf->b_refcnt) == 0) {
2559 2857 ASSERT(list_link_active(&buf->b_arc_node));
2560 2858 } else {
2561 2859 buf->b_flags &= ~ARC_PREFETCH;
2562 2860 ARCSTAT_BUMP(arcstat_mru_hits);
2563 2861 }
2564 2862 buf->b_arc_access = now;
2565 2863 return;
2566 2864 }
2567 2865
2568 2866 /*
2569 2867 * This buffer has been "accessed" only once so far,
2570 2868 * but it is still in the cache. Move it to the MFU
2571 2869 * state.
2572 2870 */
2573 2871 if (now > buf->b_arc_access + ARC_MINTIME) {
2574 2872 /*
2575 2873 * More than 125ms have passed since we
2576 2874 * instantiated this buffer. Move it to the
2577 2875 * most frequently used state.
2578 2876 */
2579 2877 buf->b_arc_access = now;
2580 2878 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2581 2879 arc_change_state(arc_mfu, buf, hash_lock);
2582 2880 }
2583 2881 ARCSTAT_BUMP(arcstat_mru_hits);
2584 2882 } else if (buf->b_state == arc_mru_ghost) {
2585 2883 arc_state_t *new_state;
2586 2884 /*
2587 2885 * This buffer has been "accessed" recently, but
2588 2886 * was evicted from the cache. Move it to the
2589 2887 * MFU state.
2590 2888 */
2591 2889
2592 2890 if (buf->b_flags & ARC_PREFETCH) {
2593 2891 new_state = arc_mru;
2594 2892 if (refcount_count(&buf->b_refcnt) > 0)
2595 2893 buf->b_flags &= ~ARC_PREFETCH;
2596 2894 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2597 2895 } else {
2598 2896 new_state = arc_mfu;
2599 2897 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2600 2898 }
2601 2899
2602 2900 buf->b_arc_access = ddi_get_lbolt();
2603 2901 arc_change_state(new_state, buf, hash_lock);
2604 2902
2605 2903 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2606 2904 } else if (buf->b_state == arc_mfu) {
2607 2905 /*
2608 2906 * This buffer has been accessed more than once and is
2609 2907 * still in the cache. Keep it in the MFU state.
2610 2908 *
2611 2909 * NOTE: an add_reference() that occurred when we did
2612 2910 * the arc_read() will have kicked this off the list.
2613 2911 * If it was a prefetch, we will explicitly move it to
2614 2912 * the head of the list now.
2615 2913 */
2616 2914 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2617 2915 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2618 2916 ASSERT(list_link_active(&buf->b_arc_node));
2619 2917 }
2620 2918 ARCSTAT_BUMP(arcstat_mfu_hits);
2621 2919 buf->b_arc_access = ddi_get_lbolt();
2622 2920 } else if (buf->b_state == arc_mfu_ghost) {
2623 2921 arc_state_t *new_state = arc_mfu;
2624 2922 /*
2625 2923 * This buffer has been accessed more than once but has
2626 2924 * been evicted from the cache. Move it back to the
2627 2925 * MFU state.
2628 2926 */
2629 2927
2630 2928 if (buf->b_flags & ARC_PREFETCH) {
2631 2929 /*
2632 2930 * This is a prefetch access...
2633 2931 * move this block back to the MRU state.
2634 2932 */
2635 2933 ASSERT0(refcount_count(&buf->b_refcnt));
2636 2934 new_state = arc_mru;
2637 2935 }
2638 2936
2639 2937 buf->b_arc_access = ddi_get_lbolt();
2640 2938 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2641 2939 arc_change_state(new_state, buf, hash_lock);
2642 2940
2643 2941 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2644 2942 } else if (buf->b_state == arc_l2c_only) {
2645 2943 /*
2646 2944 * This buffer is on the 2nd Level ARC.
2647 2945 */
2648 2946
2649 2947 buf->b_arc_access = ddi_get_lbolt();
2650 2948 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2651 2949 arc_change_state(arc_mfu, buf, hash_lock);
2652 2950 } else {
2653 2951 ASSERT(!"invalid arc state");
2654 2952 }
2655 2953 }
2656 2954
2657 2955 /* a generic arc_done_func_t which you can use */
2658 2956 /* ARGSUSED */
2659 2957 void
2660 2958 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2661 2959 {
2662 2960 if (zio == NULL || zio->io_error == 0)
2663 2961 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2664 2962 VERIFY(arc_buf_remove_ref(buf, arg));
2665 2963 }
2666 2964
2667 2965 /* a generic arc_done_func_t */
2668 2966 void
2669 2967 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2670 2968 {
2671 2969 arc_buf_t **bufp = arg;
2672 2970 if (zio && zio->io_error) {
2673 2971 VERIFY(arc_buf_remove_ref(buf, arg));
2674 2972 *bufp = NULL;
2675 2973 } else {
2676 2974 *bufp = buf;
2677 2975 ASSERT(buf->b_data);
2678 2976 }
2679 2977 }
2680 2978
2681 2979 static void
2682 2980 arc_read_done(zio_t *zio)
2683 2981 {
2684 2982 arc_buf_hdr_t *hdr, *found;
2685 2983 arc_buf_t *buf;
2686 2984 arc_buf_t *abuf; /* buffer we're assigning to callback */
2687 2985 kmutex_t *hash_lock;
2688 2986 arc_callback_t *callback_list, *acb;
2689 2987 int freeable = FALSE;
2690 2988
2691 2989 buf = zio->io_private;
2692 2990 hdr = buf->b_hdr;
2693 2991
2694 2992 /*
2695 2993 * The hdr was inserted into hash-table and removed from lists
2696 2994 * prior to starting I/O. We should find this header, since
2697 2995 * it's in the hash table, and it should be legit since it's
2698 2996 * not possible to evict it during the I/O. The only possible
2699 2997 * reason for it not to be found is if we were freed during the
2700 2998 * read.
2701 2999 */
2702 3000 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2703 3001 &hash_lock);
2704 3002
2705 3003 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2706 3004 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2707 3005 (found == hdr && HDR_L2_READING(hdr)));
2708 3006
2709 3007 hdr->b_flags &= ~ARC_L2_EVICTED;
2710 3008 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2711 3009 hdr->b_flags &= ~ARC_L2CACHE;
2712 3010
2713 3011 /* byteswap if necessary */
2714 3012 callback_list = hdr->b_acb;
2715 3013 ASSERT(callback_list != NULL);
2716 3014 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2717 3015 dmu_object_byteswap_t bswap =
2718 3016 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2719 3017 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2720 3018 byteswap_uint64_array :
2721 3019 dmu_ot_byteswap[bswap].ob_func;
2722 3020 func(buf->b_data, hdr->b_size);
2723 3021 }
2724 3022
2725 3023 arc_cksum_compute(buf, B_FALSE);
2726 3024 arc_buf_watch(buf);
2727 3025
2728 3026 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2729 3027 /*
2730 3028 * Only call arc_access on anonymous buffers. This is because
2731 3029 * if we've issued an I/O for an evicted buffer, we've already
2732 3030 * called arc_access (to prevent any simultaneous readers from
2733 3031 * getting confused).
2734 3032 */
2735 3033 arc_access(hdr, hash_lock);
2736 3034 }
2737 3035
2738 3036 /* create copies of the data buffer for the callers */
2739 3037 abuf = buf;
2740 3038 for (acb = callback_list; acb; acb = acb->acb_next) {
2741 3039 if (acb->acb_done) {
2742 3040 if (abuf == NULL) {
2743 3041 ARCSTAT_BUMP(arcstat_duplicate_reads);
2744 3042 abuf = arc_buf_clone(buf);
2745 3043 }
2746 3044 acb->acb_buf = abuf;
2747 3045 abuf = NULL;
2748 3046 }
2749 3047 }
2750 3048 hdr->b_acb = NULL;
2751 3049 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2752 3050 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2753 3051 if (abuf == buf) {
2754 3052 ASSERT(buf->b_efunc == NULL);
2755 3053 ASSERT(hdr->b_datacnt == 1);
2756 3054 hdr->b_flags |= ARC_BUF_AVAILABLE;
2757 3055 }
2758 3056
2759 3057 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2760 3058
2761 3059 if (zio->io_error != 0) {
2762 3060 hdr->b_flags |= ARC_IO_ERROR;
2763 3061 if (hdr->b_state != arc_anon)
2764 3062 arc_change_state(arc_anon, hdr, hash_lock);
2765 3063 if (HDR_IN_HASH_TABLE(hdr))
2766 3064 buf_hash_remove(hdr);
2767 3065 freeable = refcount_is_zero(&hdr->b_refcnt);
2768 3066 }
2769 3067
2770 3068 /*
2771 3069 * Broadcast before we drop the hash_lock to avoid the possibility
2772 3070 * that the hdr (and hence the cv) might be freed before we get to
2773 3071 * the cv_broadcast().
2774 3072 */
2775 3073 cv_broadcast(&hdr->b_cv);
2776 3074
2777 3075 if (hash_lock) {
2778 3076 mutex_exit(hash_lock);
2779 3077 } else {
2780 3078 /*
2781 3079 * This block was freed while we waited for the read to
2782 3080 * complete. It has been removed from the hash table and
2783 3081 * moved to the anonymous state (so that it won't show up
2784 3082 * in the cache).
2785 3083 */
2786 3084 ASSERT3P(hdr->b_state, ==, arc_anon);
2787 3085 freeable = refcount_is_zero(&hdr->b_refcnt);
2788 3086 }
2789 3087
2790 3088 /* execute each callback and free its structure */
2791 3089 while ((acb = callback_list) != NULL) {
2792 3090 if (acb->acb_done)
2793 3091 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2794 3092
2795 3093 if (acb->acb_zio_dummy != NULL) {
2796 3094 acb->acb_zio_dummy->io_error = zio->io_error;
2797 3095 zio_nowait(acb->acb_zio_dummy);
2798 3096 }
2799 3097
2800 3098 callback_list = acb->acb_next;
2801 3099 kmem_free(acb, sizeof (arc_callback_t));
2802 3100 }
2803 3101
2804 3102 if (freeable)
2805 3103 arc_hdr_destroy(hdr);
2806 3104 }
2807 3105
2808 3106 /*
2809 3107 * "Read" the block at the specified DVA (in bp) via the
2810 3108 * cache. If the block is found in the cache, invoke the provided
2811 3109 * callback immediately and return. Note that the `zio' parameter
2812 3110 * in the callback will be NULL in this case, since no IO was
2813 3111 * required. If the block is not in the cache pass the read request
2814 3112 * on to the spa with a substitute callback function, so that the
2815 3113 * requested block will be added to the cache.
2816 3114 *
2817 3115 * If a read request arrives for a block that has a read in-progress,
2818 3116 * either wait for the in-progress read to complete (and return the
2819 3117 * results); or, if this is a read with a "done" func, add a record
2820 3118 * to the read to invoke the "done" func when the read completes,
2821 3119 * and return; or just return.
2822 3120 *
2823 3121 * arc_read_done() will invoke all the requested "done" functions
2824 3122 * for readers of this block.
2825 3123 */
2826 3124 int
2827 3125 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2828 3126 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2829 3127 const zbookmark_t *zb)
2830 3128 {
2831 3129 arc_buf_hdr_t *hdr;
2832 3130 arc_buf_t *buf = NULL;
2833 3131 kmutex_t *hash_lock;
2834 3132 zio_t *rzio;
2835 3133 uint64_t guid = spa_load_guid(spa);
2836 3134
2837 3135 top:
2838 3136 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2839 3137 &hash_lock);
2840 3138 if (hdr && hdr->b_datacnt > 0) {
2841 3139
2842 3140 *arc_flags |= ARC_CACHED;
2843 3141
2844 3142 if (HDR_IO_IN_PROGRESS(hdr)) {
2845 3143
2846 3144 if (*arc_flags & ARC_WAIT) {
2847 3145 cv_wait(&hdr->b_cv, hash_lock);
2848 3146 mutex_exit(hash_lock);
2849 3147 goto top;
2850 3148 }
2851 3149 ASSERT(*arc_flags & ARC_NOWAIT);
2852 3150
2853 3151 if (done) {
2854 3152 arc_callback_t *acb = NULL;
2855 3153
2856 3154 acb = kmem_zalloc(sizeof (arc_callback_t),
2857 3155 KM_SLEEP);
2858 3156 acb->acb_done = done;
2859 3157 acb->acb_private = private;
2860 3158 if (pio != NULL)
2861 3159 acb->acb_zio_dummy = zio_null(pio,
2862 3160 spa, NULL, NULL, NULL, zio_flags);
2863 3161
2864 3162 ASSERT(acb->acb_done != NULL);
2865 3163 acb->acb_next = hdr->b_acb;
2866 3164 hdr->b_acb = acb;
2867 3165 add_reference(hdr, hash_lock, private);
2868 3166 mutex_exit(hash_lock);
2869 3167 return (0);
2870 3168 }
2871 3169 mutex_exit(hash_lock);
2872 3170 return (0);
2873 3171 }
2874 3172
2875 3173 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2876 3174
2877 3175 if (done) {
2878 3176 add_reference(hdr, hash_lock, private);
2879 3177 /*
2880 3178 * If this block is already in use, create a new
2881 3179 * copy of the data so that we will be guaranteed
2882 3180 * that arc_release() will always succeed.
2883 3181 */
2884 3182 buf = hdr->b_buf;
2885 3183 ASSERT(buf);
2886 3184 ASSERT(buf->b_data);
2887 3185 if (HDR_BUF_AVAILABLE(hdr)) {
2888 3186 ASSERT(buf->b_efunc == NULL);
2889 3187 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2890 3188 } else {
2891 3189 buf = arc_buf_clone(buf);
2892 3190 }
2893 3191
2894 3192 } else if (*arc_flags & ARC_PREFETCH &&
2895 3193 refcount_count(&hdr->b_refcnt) == 0) {
2896 3194 hdr->b_flags |= ARC_PREFETCH;
2897 3195 }
2898 3196 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2899 3197 arc_access(hdr, hash_lock);
2900 3198 if (*arc_flags & ARC_L2CACHE)
2901 3199 hdr->b_flags |= ARC_L2CACHE;
2902 3200 if (*arc_flags & ARC_L2COMPRESS)
2903 3201 hdr->b_flags |= ARC_L2COMPRESS;
2904 3202 mutex_exit(hash_lock);
2905 3203 ARCSTAT_BUMP(arcstat_hits);
2906 3204 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2907 3205 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2908 3206 data, metadata, hits);
2909 3207
2910 3208 if (done)
2911 3209 done(NULL, buf, private);
2912 3210 } else {
2913 3211 uint64_t size = BP_GET_LSIZE(bp);
2914 3212 arc_callback_t *acb;
2915 3213 vdev_t *vd = NULL;
2916 3214 uint64_t addr = 0;
2917 3215 boolean_t devw = B_FALSE;
2918 3216
2919 3217 if (hdr == NULL) {
2920 3218 /* this block is not in the cache */
2921 3219 arc_buf_hdr_t *exists;
2922 3220 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2923 3221 buf = arc_buf_alloc(spa, size, private, type);
2924 3222 hdr = buf->b_hdr;
2925 3223 hdr->b_dva = *BP_IDENTITY(bp);
2926 3224 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2927 3225 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2928 3226 exists = buf_hash_insert(hdr, &hash_lock);
2929 3227 if (exists) {
2930 3228 /* somebody beat us to the hash insert */
2931 3229 mutex_exit(hash_lock);
2932 3230 buf_discard_identity(hdr);
2933 3231 (void) arc_buf_remove_ref(buf, private);
2934 3232 goto top; /* restart the IO request */
2935 3233 }
2936 3234 /* if this is a prefetch, we don't have a reference */
2937 3235 if (*arc_flags & ARC_PREFETCH) {
2938 3236 (void) remove_reference(hdr, hash_lock,
2939 3237 private);
2940 3238 hdr->b_flags |= ARC_PREFETCH;
2941 3239 }
2942 3240 if (*arc_flags & ARC_L2CACHE)
2943 3241 hdr->b_flags |= ARC_L2CACHE;
2944 3242 if (*arc_flags & ARC_L2COMPRESS)
2945 3243 hdr->b_flags |= ARC_L2COMPRESS;
2946 3244 if (BP_GET_LEVEL(bp) > 0)
2947 3245 hdr->b_flags |= ARC_INDIRECT;
2948 3246 } else {
2949 3247 /* this block is in the ghost cache */
2950 3248 ASSERT(GHOST_STATE(hdr->b_state));
2951 3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2952 3250 ASSERT0(refcount_count(&hdr->b_refcnt));
2953 3251 ASSERT(hdr->b_buf == NULL);
2954 3252
2955 3253 /* if this is a prefetch, we don't have a reference */
2956 3254 if (*arc_flags & ARC_PREFETCH)
2957 3255 hdr->b_flags |= ARC_PREFETCH;
2958 3256 else
2959 3257 add_reference(hdr, hash_lock, private);
2960 3258 if (*arc_flags & ARC_L2CACHE)
2961 3259 hdr->b_flags |= ARC_L2CACHE;
2962 3260 if (*arc_flags & ARC_L2COMPRESS)
2963 3261 hdr->b_flags |= ARC_L2COMPRESS;
2964 3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2965 3263 buf->b_hdr = hdr;
2966 3264 buf->b_data = NULL;
2967 3265 buf->b_efunc = NULL;
2968 3266 buf->b_private = NULL;
2969 3267 buf->b_next = NULL;
2970 3268 hdr->b_buf = buf;
2971 3269 ASSERT(hdr->b_datacnt == 0);
2972 3270 hdr->b_datacnt = 1;
2973 3271 arc_get_data_buf(buf);
2974 3272 arc_access(hdr, hash_lock);
2975 3273 }
2976 3274
2977 3275 ASSERT(!GHOST_STATE(hdr->b_state));
2978 3276
2979 3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2980 3278 acb->acb_done = done;
2981 3279 acb->acb_private = private;
2982 3280
2983 3281 ASSERT(hdr->b_acb == NULL);
2984 3282 hdr->b_acb = acb;
2985 3283 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2986 3284
2987 3285 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2988 3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2989 3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2990 3288 addr = hdr->b_l2hdr->b_daddr;
2991 3289 /*
2992 3290 * Lock out device removal.
2993 3291 */
2994 3292 if (vdev_is_dead(vd) ||
2995 3293 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2996 3294 vd = NULL;
2997 3295 }
2998 3296
2999 3297 mutex_exit(hash_lock);
3000 3298
3001 3299 /*
3002 3300 * At this point, we have a level 1 cache miss. Try again in
3003 3301 * L2ARC if possible.
3004 3302 */
3005 3303 ASSERT3U(hdr->b_size, ==, size);
3006 3304 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3007 3305 uint64_t, size, zbookmark_t *, zb);
3008 3306 ARCSTAT_BUMP(arcstat_misses);
3009 3307 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3010 3308 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3011 3309 data, metadata, misses);
3012 3310
3013 3311 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3014 3312 /*
3015 3313 * Read from the L2ARC if the following are true:
3016 3314 * 1. The L2ARC vdev was previously cached.
3017 3315 * 2. This buffer still has L2ARC metadata.
3018 3316 * 3. This buffer isn't currently writing to the L2ARC.
3019 3317 * 4. The L2ARC entry wasn't evicted, which may
3020 3318 * also have invalidated the vdev.
3021 3319 	 *    5. This isn't a prefetch, or l2arc_noprefetch is not set.
3022 3320 */
3023 3321 if (hdr->b_l2hdr != NULL &&
3024 3322 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3025 3323 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3026 3324 l2arc_read_callback_t *cb;
3027 3325
3028 3326 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3029 3327 ARCSTAT_BUMP(arcstat_l2_hits);
3030 3328
3031 3329 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3032 3330 KM_SLEEP);
3033 3331 cb->l2rcb_buf = buf;
3034 3332 cb->l2rcb_spa = spa;
3035 3333 cb->l2rcb_bp = *bp;
3036 3334 cb->l2rcb_zb = *zb;
3037 3335 cb->l2rcb_flags = zio_flags;
3038 3336 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3039 3337
3040 3338 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3041 3339 addr + size < vd->vdev_psize -
3042 3340 VDEV_LABEL_END_SIZE);
3043 3341
3044 3342 /*
3045 3343 * l2arc read. The SCL_L2ARC lock will be
3046 3344 * released by l2arc_read_done().
3047 3345 * Issue a null zio if the underlying buffer
3048 3346 * was squashed to zero size by compression.
3049 3347 */
3050 3348 if (hdr->b_l2hdr->b_compress ==
3051 3349 ZIO_COMPRESS_EMPTY) {
3052 3350 rzio = zio_null(pio, spa, vd,
3053 3351 l2arc_read_done, cb,
3054 3352 zio_flags | ZIO_FLAG_DONT_CACHE |
3055 3353 ZIO_FLAG_CANFAIL |
3056 3354 ZIO_FLAG_DONT_PROPAGATE |
3057 3355 ZIO_FLAG_DONT_RETRY);
3058 3356 } else {
3059 3357 rzio = zio_read_phys(pio, vd, addr,
3060 3358 hdr->b_l2hdr->b_asize,
3061 3359 buf->b_data, ZIO_CHECKSUM_OFF,
3062 3360 l2arc_read_done, cb, priority,
3063 3361 zio_flags | ZIO_FLAG_DONT_CACHE |
3064 3362 ZIO_FLAG_CANFAIL |
3065 3363 ZIO_FLAG_DONT_PROPAGATE |
3066 3364 ZIO_FLAG_DONT_RETRY, B_FALSE);
3067 3365 }
3068 3366 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3069 3367 zio_t *, rzio);
3070 3368 ARCSTAT_INCR(arcstat_l2_read_bytes,
3071 3369 hdr->b_l2hdr->b_asize);
3072 3370
3073 3371 if (*arc_flags & ARC_NOWAIT) {
3074 3372 zio_nowait(rzio);
3075 3373 return (0);
3076 3374 }
3077 3375
3078 3376 ASSERT(*arc_flags & ARC_WAIT);
3079 3377 if (zio_wait(rzio) == 0)
3080 3378 return (0);
3081 3379
3082 3380 /* l2arc read error; goto zio_read() */
3083 3381 } else {
3084 3382 DTRACE_PROBE1(l2arc__miss,
3085 3383 arc_buf_hdr_t *, hdr);
3086 3384 ARCSTAT_BUMP(arcstat_l2_misses);
3087 3385 if (HDR_L2_WRITING(hdr))
3088 3386 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3089 3387 spa_config_exit(spa, SCL_L2ARC, vd);
3090 3388 }
3091 3389 } else {
3092 3390 if (vd != NULL)
3093 3391 spa_config_exit(spa, SCL_L2ARC, vd);
3094 3392 if (l2arc_ndev != 0) {
3095 3393 DTRACE_PROBE1(l2arc__miss,
3096 3394 arc_buf_hdr_t *, hdr);
3097 3395 ARCSTAT_BUMP(arcstat_l2_misses);
3098 3396 }
3099 3397 }
3100 3398
3101 3399 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3102 3400 arc_read_done, buf, priority, zio_flags, zb);
3103 3401
3104 3402 if (*arc_flags & ARC_WAIT)
3105 3403 return (zio_wait(rzio));
3106 3404
3107 3405 ASSERT(*arc_flags & ARC_NOWAIT);
3108 3406 zio_nowait(rzio);
3109 3407 }
3110 3408 return (0);
3111 3409 }
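
/*
 * Example (sketch): a typical synchronous consumer of arc_read() using the
 * generic arc_getbuf_func() callback above.  The wrapper example_read_block()
 * and its error handling are illustrative only; existing callers in the DMU
 * follow the same pattern.
 */
static int
example_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int err;

	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* consume BP_GET_LSIZE(bp) bytes at abuf->b_data, then: */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}
	return (err);
}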
3112 3410
3113 3411 void
3114 3412 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3115 3413 {
3116 3414 ASSERT(buf->b_hdr != NULL);
3117 3415 ASSERT(buf->b_hdr->b_state != arc_anon);
3118 3416 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3119 3417 ASSERT(buf->b_efunc == NULL);
3120 3418 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3121 3419
3122 3420 buf->b_efunc = func;
3123 3421 buf->b_private = private;
3124 3422 }
3125 3423
3126 3424 /*
3127 3425 * Notify the arc that a block was freed, and thus will never be used again.
3128 3426 */
3129 3427 void
3130 3428 arc_freed(spa_t *spa, const blkptr_t *bp)
3131 3429 {
3132 3430 arc_buf_hdr_t *hdr;
3133 3431 kmutex_t *hash_lock;
3134 3432 uint64_t guid = spa_load_guid(spa);
3135 3433
3136 3434 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3137 3435 &hash_lock);
3138 3436 if (hdr == NULL)
3139 3437 return;
3140 3438 if (HDR_BUF_AVAILABLE(hdr)) {
3141 3439 arc_buf_t *buf = hdr->b_buf;
3142 3440 add_reference(hdr, hash_lock, FTAG);
3143 3441 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3144 3442 mutex_exit(hash_lock);
3145 3443
3146 3444 arc_release(buf, FTAG);
3147 3445 (void) arc_buf_remove_ref(buf, FTAG);
3148 3446 } else {
3149 3447 mutex_exit(hash_lock);
3150 3448 }
3151 3449
3152 3450 }
3153 3451
3154 3452 /*
3155 3453 * This is used by the DMU to let the ARC know that a buffer is
3156 3454 * being evicted, so the ARC should clean up. If this arc buf
3157 3455 * is not yet in the evicted state, it will be put there.
3158 3456 */
3159 3457 int
3160 3458 arc_buf_evict(arc_buf_t *buf)
3161 3459 {
3162 3460 arc_buf_hdr_t *hdr;
3163 3461 kmutex_t *hash_lock;
3164 3462 arc_buf_t **bufp;
3165 3463
3166 3464 mutex_enter(&buf->b_evict_lock);
3167 3465 hdr = buf->b_hdr;
3168 3466 if (hdr == NULL) {
3169 3467 /*
3170 3468 * We are in arc_do_user_evicts().
3171 3469 */
3172 3470 ASSERT(buf->b_data == NULL);
3173 3471 mutex_exit(&buf->b_evict_lock);
3174 3472 return (0);
3175 3473 } else if (buf->b_data == NULL) {
3176 3474 arc_buf_t copy = *buf; /* structure assignment */
3177 3475 /*
3178 3476 * We are on the eviction list; process this buffer now
3179 3477 * but let arc_do_user_evicts() do the reaping.
3180 3478 */
3181 3479 buf->b_efunc = NULL;
3182 3480 mutex_exit(&buf->b_evict_lock);
3183 3481 		VERIFY(copy.b_efunc(&copy) == 0);
3184 3482 return (1);
3185 3483 }
3186 3484 hash_lock = HDR_LOCK(hdr);
3187 3485 mutex_enter(hash_lock);
3188 3486 hdr = buf->b_hdr;
3189 3487 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3190 3488
3191 3489 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3192 3490 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3193 3491
3194 3492 /*
3195 3493 * Pull this buffer off of the hdr
3196 3494 */
3197 3495 bufp = &hdr->b_buf;
3198 3496 while (*bufp != buf)
3199 3497 bufp = &(*bufp)->b_next;
3200 3498 *bufp = buf->b_next;
3201 3499
3202 3500 ASSERT(buf->b_data != NULL);
3203 3501 arc_buf_destroy(buf, FALSE, FALSE);
3204 3502
3205 3503 if (hdr->b_datacnt == 0) {
3206 3504 arc_state_t *old_state = hdr->b_state;
3207 3505 arc_state_t *evicted_state;
3208 3506
3209 3507 ASSERT(hdr->b_buf == NULL);
3210 3508 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3211 3509
3212 3510 evicted_state =
3213 3511 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3214 3512
3215 3513 mutex_enter(&old_state->arcs_mtx);
3216 3514 mutex_enter(&evicted_state->arcs_mtx);
3217 3515
3218 3516 arc_change_state(evicted_state, hdr, hash_lock);
3219 3517 ASSERT(HDR_IN_HASH_TABLE(hdr));
3220 3518 hdr->b_flags |= ARC_IN_HASH_TABLE;
3221 3519 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3222 3520
3223 3521 mutex_exit(&evicted_state->arcs_mtx);
3224 3522 mutex_exit(&old_state->arcs_mtx);
3225 3523 }
3226 3524 mutex_exit(hash_lock);
3227 3525 mutex_exit(&buf->b_evict_lock);
3228 3526
3229 3527 VERIFY(buf->b_efunc(buf) == 0);
3230 3528 buf->b_efunc = NULL;
3231 3529 buf->b_private = NULL;
3232 3530 buf->b_hdr = NULL;
3233 3531 buf->b_next = NULL;
3234 3532 kmem_cache_free(buf_cache, buf);
3235 3533 return (1);
3236 3534 }
3237 3535
3238 3536 /*
3239 3537 * Release this buffer from the cache, making it an anonymous buffer. This
3240 3538 * must be done after a read and prior to modifying the buffer contents.
3241 3539 * If the buffer has more than one reference, we must make
3242 3540 * a new hdr for the buffer.
3243 3541 */
3244 3542 void
3245 3543 arc_release(arc_buf_t *buf, void *tag)
3246 3544 {
3247 3545 arc_buf_hdr_t *hdr;
3248 3546 kmutex_t *hash_lock = NULL;
3249 3547 l2arc_buf_hdr_t *l2hdr;
3250 3548 uint64_t buf_size;
3251 3549
3252 3550 /*
3253 3551 * It would be nice to assert that if it's DMU metadata (level >
3254 3552 * 0 || it's the dnode file), then it must be syncing context.
3255 3553 * But we don't know that information at this level.
3256 3554 */
3257 3555
3258 3556 mutex_enter(&buf->b_evict_lock);
3259 3557 hdr = buf->b_hdr;
3260 3558
3261 3559 /* this buffer is not on any list */
3262 3560 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3263 3561
3264 3562 if (hdr->b_state == arc_anon) {
3265 3563 /* this buffer is already released */
3266 3564 ASSERT(buf->b_efunc == NULL);
3267 3565 } else {
3268 3566 hash_lock = HDR_LOCK(hdr);
3269 3567 mutex_enter(hash_lock);
3270 3568 hdr = buf->b_hdr;
3271 3569 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3272 3570 }
3273 3571
3274 3572 l2hdr = hdr->b_l2hdr;
3275 3573 if (l2hdr) {
3276 3574 mutex_enter(&l2arc_buflist_mtx);
3277 3575 hdr->b_l2hdr = NULL;
3278 3576 }
3279 3577 buf_size = hdr->b_size;
3280 3578
3281 3579 /*
3282 3580 * Do we have more than one buf?
3283 3581 */
3284 3582 if (hdr->b_datacnt > 1) {
3285 3583 arc_buf_hdr_t *nhdr;
3286 3584 arc_buf_t **bufp;
3287 3585 uint64_t blksz = hdr->b_size;
3288 3586 uint64_t spa = hdr->b_spa;
3289 3587 arc_buf_contents_t type = hdr->b_type;
3290 3588 uint32_t flags = hdr->b_flags;
3291 3589
3292 3590 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3293 3591 /*
3294 3592 * Pull the data off of this hdr and attach it to
3295 3593 * a new anonymous hdr.
3296 3594 */
3297 3595 (void) remove_reference(hdr, hash_lock, tag);
3298 3596 bufp = &hdr->b_buf;
3299 3597 while (*bufp != buf)
3300 3598 bufp = &(*bufp)->b_next;
3301 3599 *bufp = buf->b_next;
3302 3600 buf->b_next = NULL;
3303 3601
3304 3602 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3305 3603 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3306 3604 if (refcount_is_zero(&hdr->b_refcnt)) {
3307 3605 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3308 3606 ASSERT3U(*size, >=, hdr->b_size);
3309 3607 atomic_add_64(size, -hdr->b_size);
3310 3608 }
3311 3609
3312 3610 /*
3313 3611 * We're releasing a duplicate user data buffer, update
3314 3612 * our statistics accordingly.
3315 3613 */
3316 3614 if (hdr->b_type == ARC_BUFC_DATA) {
3317 3615 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3318 3616 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3319 3617 -hdr->b_size);
3320 3618 }
3321 3619 hdr->b_datacnt -= 1;
3322 3620 arc_cksum_verify(buf);
3323 3621 arc_buf_unwatch(buf);
3324 3622
3325 3623 mutex_exit(hash_lock);
3326 3624
3327 3625 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3328 3626 nhdr->b_size = blksz;
3329 3627 nhdr->b_spa = spa;
3330 3628 nhdr->b_type = type;
3331 3629 nhdr->b_buf = buf;
3332 3630 nhdr->b_state = arc_anon;
3333 3631 nhdr->b_arc_access = 0;
3334 3632 nhdr->b_flags = flags & ARC_L2_WRITING;
3335 3633 nhdr->b_l2hdr = NULL;
3336 3634 nhdr->b_datacnt = 1;
3337 3635 nhdr->b_freeze_cksum = NULL;
3338 3636 (void) refcount_add(&nhdr->b_refcnt, tag);
3339 3637 buf->b_hdr = nhdr;
3340 3638 mutex_exit(&buf->b_evict_lock);
3341 3639 atomic_add_64(&arc_anon->arcs_size, blksz);
3342 3640 } else {
3343 3641 mutex_exit(&buf->b_evict_lock);
3344 3642 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3345 3643 ASSERT(!list_link_active(&hdr->b_arc_node));
3346 3644 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3347 3645 if (hdr->b_state != arc_anon)
3348 3646 arc_change_state(arc_anon, hdr, hash_lock);
3349 3647 hdr->b_arc_access = 0;
3350 3648 if (hash_lock)
3351 3649 mutex_exit(hash_lock);
3352 3650
3353 3651 buf_discard_identity(hdr);
3354 3652 arc_buf_thaw(buf);
3355 3653 }
3356 3654 buf->b_efunc = NULL;
3357 3655 buf->b_private = NULL;
3358 3656
3359 3657 if (l2hdr) {
3360 3658 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3361 3659 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3362 3660 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3363 3661 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3364 3662 mutex_exit(&l2arc_buflist_mtx);
3365 3663 }
3366 3664 }
3367 3665
3368 3666 int
3369 3667 arc_released(arc_buf_t *buf)
3370 3668 {
3371 3669 int released;
3372 3670
3373 3671 mutex_enter(&buf->b_evict_lock);
3374 3672 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3375 3673 mutex_exit(&buf->b_evict_lock);
3376 3674 return (released);
3377 3675 }
3378 3676
3379 3677 int
3380 3678 arc_has_callback(arc_buf_t *buf)
3381 3679 {
3382 3680 int callback;
3383 3681
3384 3682 mutex_enter(&buf->b_evict_lock);
3385 3683 callback = (buf->b_efunc != NULL);
3386 3684 mutex_exit(&buf->b_evict_lock);
3387 3685 return (callback);
3388 3686 }
3389 3687
3390 3688 #ifdef ZFS_DEBUG
3391 3689 int
3392 3690 arc_referenced(arc_buf_t *buf)
3393 3691 {
3394 3692 int referenced;
3395 3693
3396 3694 mutex_enter(&buf->b_evict_lock);
3397 3695 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3398 3696 mutex_exit(&buf->b_evict_lock);
3399 3697 return (referenced);
3400 3698 }
3401 3699 #endif
3402 3700
3403 3701 static void
3404 3702 arc_write_ready(zio_t *zio)
3405 3703 {
3406 3704 arc_write_callback_t *callback = zio->io_private;
3407 3705 arc_buf_t *buf = callback->awcb_buf;
3408 3706 arc_buf_hdr_t *hdr = buf->b_hdr;
3409 3707
3410 3708 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3411 3709 callback->awcb_ready(zio, buf, callback->awcb_private);
3412 3710
3413 3711 /*
3414 3712 * If the IO is already in progress, then this is a re-write
3415 3713 * attempt, so we need to thaw and re-compute the cksum.
3416 3714 * It is the responsibility of the callback to handle the
3417 3715 * accounting for any re-write attempt.
3418 3716 */
3419 3717 if (HDR_IO_IN_PROGRESS(hdr)) {
3420 3718 mutex_enter(&hdr->b_freeze_lock);
3421 3719 if (hdr->b_freeze_cksum != NULL) {
3422 3720 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3423 3721 hdr->b_freeze_cksum = NULL;
3424 3722 }
3425 3723 mutex_exit(&hdr->b_freeze_lock);
3426 3724 }
3427 3725 arc_cksum_compute(buf, B_FALSE);
3428 3726 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3429 3727 }
3430 3728
3431 3729 static void
3432 3730 arc_write_done(zio_t *zio)
3433 3731 {
3434 3732 arc_write_callback_t *callback = zio->io_private;
3435 3733 arc_buf_t *buf = callback->awcb_buf;
3436 3734 arc_buf_hdr_t *hdr = buf->b_hdr;
3437 3735
3438 3736 ASSERT(hdr->b_acb == NULL);
3439 3737
3440 3738 if (zio->io_error == 0) {
3441 3739 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3442 3740 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3443 3741 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3444 3742 } else {
3445 3743 ASSERT(BUF_EMPTY(hdr));
3446 3744 }
3447 3745
3448 3746 /*
3449 3747 * If the block to be written was all-zero, we may have
3450 3748 * compressed it away. In this case no write was performed
3451 3749 * so there will be no dva/birth/checksum. The buffer must
3452 3750 * therefore remain anonymous (and uncached).
3453 3751 */
3454 3752 if (!BUF_EMPTY(hdr)) {
3455 3753 arc_buf_hdr_t *exists;
3456 3754 kmutex_t *hash_lock;
3457 3755
3458 3756 ASSERT(zio->io_error == 0);
3459 3757
3460 3758 arc_cksum_verify(buf);
3461 3759
3462 3760 exists = buf_hash_insert(hdr, &hash_lock);
3463 3761 if (exists) {
3464 3762 /*
3465 3763 * This can only happen if we overwrite for
3466 3764 * sync-to-convergence, because we remove
3467 3765 * buffers from the hash table when we arc_free().
3468 3766 */
3469 3767 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3470 3768 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3471 3769 panic("bad overwrite, hdr=%p exists=%p",
3472 3770 (void *)hdr, (void *)exists);
3473 3771 ASSERT(refcount_is_zero(&exists->b_refcnt));
3474 3772 arc_change_state(arc_anon, exists, hash_lock);
3475 3773 mutex_exit(hash_lock);
3476 3774 arc_hdr_destroy(exists);
3477 3775 exists = buf_hash_insert(hdr, &hash_lock);
3478 3776 ASSERT3P(exists, ==, NULL);
3479 3777 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3480 3778 /* nopwrite */
3481 3779 ASSERT(zio->io_prop.zp_nopwrite);
3482 3780 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3483 3781 panic("bad nopwrite, hdr=%p exists=%p",
3484 3782 (void *)hdr, (void *)exists);
3485 3783 } else {
3486 3784 /* Dedup */
3487 3785 ASSERT(hdr->b_datacnt == 1);
3488 3786 ASSERT(hdr->b_state == arc_anon);
3489 3787 ASSERT(BP_GET_DEDUP(zio->io_bp));
3490 3788 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3491 3789 }
3492 3790 }
3493 3791 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3494 3792 /* if it's not anon, we are doing a scrub */
3495 3793 if (!exists && hdr->b_state == arc_anon)
3496 3794 arc_access(hdr, hash_lock);
3497 3795 mutex_exit(hash_lock);
3498 3796 } else {
3499 3797 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3500 3798 }
3501 3799
3502 3800 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3503 3801 callback->awcb_done(zio, buf, callback->awcb_private);
3504 3802
3505 3803 kmem_free(callback, sizeof (arc_write_callback_t));
3506 3804 }
3507 3805
3508 3806 zio_t *
3509 3807 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3510 3808 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3511 3809 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3512 3810 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3513 3811 {
3514 3812 arc_buf_hdr_t *hdr = buf->b_hdr;
3515 3813 arc_write_callback_t *callback;
3516 3814 zio_t *zio;
3517 3815
3518 3816 ASSERT(ready != NULL);
3519 3817 ASSERT(done != NULL);
3520 3818 ASSERT(!HDR_IO_ERROR(hdr));
3521 3819 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3522 3820 ASSERT(hdr->b_acb == NULL);
3523 3821 if (l2arc)
3524 3822 hdr->b_flags |= ARC_L2CACHE;
3525 3823 if (l2arc_compress)
3526 3824 hdr->b_flags |= ARC_L2COMPRESS;
3527 3825 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3528 3826 callback->awcb_ready = ready;
3529 3827 callback->awcb_done = done;
3530 3828 callback->awcb_private = private;
3531 3829 callback->awcb_buf = buf;
3532 3830
3533 3831 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3534 3832 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3535 3833
3536 3834 return (zio);
3537 3835 }
3538 3836
3539 3837 static int
3540 3838 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3541 3839 {
3542 3840 #ifdef _KERNEL
3543 3841 uint64_t available_memory = ptob(freemem);
3544 3842 static uint64_t page_load = 0;
3545 3843 static uint64_t last_txg = 0;
3546 3844
3547 3845 #if defined(__i386)
3548 3846 available_memory =
3549 3847 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3550 3848 #endif
3551 3849 if (available_memory >= zfs_write_limit_max)
3552 3850 return (0);
3553 3851
3554 3852 if (txg > last_txg) {
3555 3853 last_txg = txg;
3556 3854 page_load = 0;
3557 3855 }
3558 3856 /*
3559 3857 * If we are in pageout, we know that memory is already tight,
3560 3858 * the arc is already going to be evicting, so we just want to
3561 3859 * continue to let page writes occur as quickly as possible.
3562 3860 */
3563 3861 if (curproc == proc_pageout) {
3564 3862 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3565 3863 return (SET_ERROR(ERESTART));
3566 3864 /* Note: reserve is inflated, so we deflate */
3567 3865 page_load += reserve / 8;
3568 3866 return (0);
3569 3867 } else if (page_load > 0 && arc_reclaim_needed()) {
3570 3868 /* memory is low, delay before restarting */
3571 3869 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3572 3870 return (SET_ERROR(EAGAIN));
3573 3871 }
3574 3872 page_load = 0;
3575 3873
3576 3874 if (arc_size > arc_c_min) {
3577 3875 uint64_t evictable_memory =
3578 3876 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3579 3877 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3580 3878 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3581 3879 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3582 3880 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3583 3881 }
3584 3882
3585 3883 if (inflight_data > available_memory / 4) {
3586 3884 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3587 3885 return (SET_ERROR(ERESTART));
3588 3886 }
3589 3887 #endif
3590 3888 return (0);
3591 3889 }
3592 3890
3593 3891 void
3594 3892 arc_tempreserve_clear(uint64_t reserve)
3595 3893 {
3596 3894 atomic_add_64(&arc_tempreserve, -reserve);
3597 3895 ASSERT((int64_t)arc_tempreserve >= 0);
3598 3896 }
3599 3897
3600 3898 int
3601 3899 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3602 3900 {
3603 3901 int error;
3604 3902 uint64_t anon_size;
3605 3903
3606 3904 #ifdef ZFS_DEBUG
3607 3905 /*
3608 3906 * Once in a while, fail for no reason. Everything should cope.
3609 3907 */
3610 3908 if (spa_get_random(10000) == 0) {
3611 3909 dprintf("forcing random failure\n");
3612 3910 return (SET_ERROR(ERESTART));
3613 3911 }
3614 3912 #endif
3615 3913 if (reserve > arc_c/4 && !arc_no_grow)
3616 3914 arc_c = MIN(arc_c_max, reserve * 4);
3617 3915 if (reserve > arc_c)
3618 3916 return (SET_ERROR(ENOMEM));
3619 3917
3620 3918 /*
3621 3919 * Don't count loaned bufs as in flight dirty data to prevent long
3622 3920 * network delays from blocking transactions that are ready to be
3623 3921 * assigned to a txg.
3624 3922 */
3625 3923 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3626 3924
3627 3925 /*
3628 3926 * Writes will, almost always, require additional memory allocations
3629 3927 * in order to compress/encrypt/etc the data. We therefore need to
3630 3928 * make sure that there is sufficient available memory for this.
3631 3929 */
3632 3930 if (error = arc_memory_throttle(reserve, anon_size, txg))
3633 3931 return (error);
3634 3932
3635 3933 /*
3636 3934 * Throttle writes when the amount of dirty data in the cache
3637 3935 * gets too large. We try to keep the cache less than half full
3638 3936 * of dirty blocks so that our sync times don't grow too large.
3639 3937 * Note: if two requests come in concurrently, we might let them
3640 3938 * both succeed, when one of them should fail. Not a huge deal.
3641 3939 */
3642 3940
3643 3941 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3644 3942 anon_size > arc_c / 4) {
3645 3943 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3646 3944 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3647 3945 arc_tempreserve>>10,
3648 3946 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3649 3947 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3650 3948 reserve>>10, arc_c>>10);
3651 3949 return (SET_ERROR(ERESTART));
3652 3950 }
3653 3951 atomic_add_64(&arc_tempreserve, reserve);
3654 3952 return (0);
3655 3953 }
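
/*
 * Worked example for the dirty-data throttle above, assuming arc_c is 4 GB
 * and no tunable overrides: arc_tempreserve_space() starts returning ERESTART
 * (forcing callers to retry later) once anonymous dirty data exceeds
 * arc_c/4 = 1 GB and reserve + arc_tempreserve + anon_size together exceed
 * arc_c/2 = 2 GB.
 */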
3656 3954
3657 3955 void
3658 3956 arc_init(void)
3659 3957 {
3660 3958 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3661 3959 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3662 3960
3663 3961 /* Convert seconds to clock ticks */
3664 3962 arc_min_prefetch_lifespan = 1 * hz;
3665 3963
3666 3964 /* Start out with 1/8 of all memory */
3667 3965 arc_c = physmem * PAGESIZE / 8;
3668 3966
3669 3967 #ifdef _KERNEL
3670 3968 /*
3671 3969 * On architectures where the physical memory can be larger
3672 3970 * than the addressable space (intel in 32-bit mode), we may
3673 3971 * need to limit the cache to 1/8 of VM size.
3674 3972 */
3675 3973 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3676 3974 #endif
3677 3975
3678 3976 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3679 3977 arc_c_min = MAX(arc_c / 4, 64<<20);
3680 3978 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3681 3979 if (arc_c * 8 >= 1<<30)
3682 3980 arc_c_max = (arc_c * 8) - (1<<30);
3683 3981 else
3684 3982 arc_c_max = arc_c_min;
3685 3983 arc_c_max = MAX(arc_c * 6, arc_c_max);
3686 3984
3687 3985 /*
3688 3986 * Allow the tunables to override our calculations if they are
3689 3987 * reasonable (ie. over 64MB)
3690 3988 */
3691 3989 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3692 3990 arc_c_max = zfs_arc_max;
3693 3991 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3694 3992 arc_c_min = zfs_arc_min;
3695 3993
3696 3994 arc_c = arc_c_max;
3697 3995 arc_p = (arc_c >> 1);
3698 3996
3699 3997 /* limit meta-data to 1/4 of the arc capacity */
3700 3998 arc_meta_limit = arc_c_max / 4;
3701 3999
3702 4000 /* Allow the tunable to override if it is reasonable */
3703 4001 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3704 4002 arc_meta_limit = zfs_arc_meta_limit;
3705 4003
3706 4004 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3707 4005 arc_c_min = arc_meta_limit / 2;
3708 4006
3709 4007 if (zfs_arc_grow_retry > 0)
3710 4008 arc_grow_retry = zfs_arc_grow_retry;
3711 4009
3712 4010 if (zfs_arc_shrink_shift > 0)
3713 4011 arc_shrink_shift = zfs_arc_shrink_shift;
3714 4012
3715 4013 if (zfs_arc_p_min_shift > 0)
3716 4014 arc_p_min_shift = zfs_arc_p_min_shift;
3717 4015
3718 4016 /* if kmem_flags are set, lets try to use less memory */
3719 4017 if (kmem_debugging())
3720 4018 arc_c = arc_c / 2;
3721 4019 if (arc_c < arc_c_min)
3722 4020 arc_c = arc_c_min;
3723 4021
3724 4022 arc_anon = &ARC_anon;
3725 4023 arc_mru = &ARC_mru;
3726 4024 arc_mru_ghost = &ARC_mru_ghost;
3727 4025 arc_mfu = &ARC_mfu;
3728 4026 arc_mfu_ghost = &ARC_mfu_ghost;
3729 4027 arc_l2c_only = &ARC_l2c_only;
3730 4028 arc_size = 0;
3731 4029
3732 4030 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3733 4031 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3734 4032 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3735 4033 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3736 4034 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3737 4035 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3738 4036
3739 4037 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3740 4038 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3741 4039 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3742 4040 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3743 4041 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3744 4042 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3745 4043 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3746 4044 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3747 4045 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3748 4046 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3749 4047 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3750 4048 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3751 4049 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3752 4050 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3753 4051 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3754 4052 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3755 4053 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3756 4054 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3757 4055 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3758 4056 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3759 4057
3760 4058 buf_init();
3761 4059
3762 4060 arc_thread_exit = 0;
3763 4061 arc_eviction_list = NULL;
3764 4062 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3765 4063 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3766 4064
3767 4065 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3768 4066 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3769 4067
3770 4068 if (arc_ksp != NULL) {
3771 4069 arc_ksp->ks_data = &arc_stats;
3772 4070 kstat_install(arc_ksp);
3773 4071 }
3774 4072
3775 4073 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776 4074 TS_RUN, minclsyspri);
3777 4075
3778 4076 arc_dead = FALSE;
3779 4077 arc_warm = B_FALSE;
3780 4078
3781 4079 if (zfs_write_limit_max == 0)
3782 4080 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3783 4081 else
3784 4082 zfs_write_limit_shift = 0;
3785 4083 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3786 4084 }
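
/*
 * Worked example of the sizing above, assuming a 64-bit kernel with 16 GB of
 * physical memory and no tunable overrides: arc_c starts at 2 GB (1/8 of
 * memory), arc_c_max becomes MAX(6 * 2 GB, 16 GB - 1 GB) = 15 GB,
 * arc_meta_limit becomes 15 GB / 4 ~= 3.75 GB, and arc_c_min is raised from
 * 512 MB to arc_meta_limit / 2 ~= 1.9 GB.  arc_c and arc_p then start out at
 * 15 GB and 7.5 GB respectively.
 */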
3787 4085
3788 4086 void
3789 4087 arc_fini(void)
3790 4088 {
3791 4089 mutex_enter(&arc_reclaim_thr_lock);
3792 4090 arc_thread_exit = 1;
3793 4091 while (arc_thread_exit != 0)
3794 4092 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3795 4093 mutex_exit(&arc_reclaim_thr_lock);
3796 4094
3797 4095 arc_flush(NULL);
3798 4096
3799 4097 arc_dead = TRUE;
3800 4098
3801 4099 if (arc_ksp != NULL) {
3802 4100 kstat_delete(arc_ksp);
3803 4101 arc_ksp = NULL;
3804 4102 }
3805 4103
3806 4104 mutex_destroy(&arc_eviction_mtx);
3807 4105 mutex_destroy(&arc_reclaim_thr_lock);
3808 4106 cv_destroy(&arc_reclaim_thr_cv);
3809 4107
3810 4108 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3811 4109 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3812 4110 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3813 4111 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3814 4112 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3815 4113 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3816 4114 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3817 4115 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3818 4116
3819 4117 mutex_destroy(&arc_anon->arcs_mtx);
3820 4118 mutex_destroy(&arc_mru->arcs_mtx);
3821 4119 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3822 4120 mutex_destroy(&arc_mfu->arcs_mtx);
3823 4121 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3824 4122 mutex_destroy(&arc_l2c_only->arcs_mtx);
3825 4123
3826 4124 mutex_destroy(&zfs_write_limit_lock);
3827 4125
3828 4126 buf_fini();
3829 4127
3830 4128 ASSERT(arc_loaned_bytes == 0);
3831 4129 }
3832 4130
3833 4131 /*
3834 4132 * Level 2 ARC
3835 4133 *
3836 4134 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3837 4135 * It uses dedicated storage devices to hold cached data, which are populated
3838 4136 * using large infrequent writes. The main role of this cache is to boost
3839 4137 * the performance of random read workloads. The intended L2ARC devices
3840 4138 * include short-stroked disks, solid state disks, and other media with
3841 4139 * substantially faster read latency than disk.
3842 4140 *
3843 4141 * +-----------------------+
3844 4142 * | ARC |
3845 4143 * +-----------------------+
3846 4144 * | ^ ^
3847 4145 * | | |
3848 4146 * l2arc_feed_thread() arc_read()
3849 4147 * | | |
3850 4148 * | l2arc read |
3851 4149 * V | |
3852 4150 * +---------------+ |
3853 4151 * | L2ARC | |
3854 4152 * +---------------+ |
3855 4153 * | ^ |
3856 4154 * l2arc_write() | |
3857 4155 * | | |
3858 4156 * V | |
3859 4157 * +-------+ +-------+
3860 4158 * | vdev | | vdev |
3861 4159 * | cache | | cache |
3862 4160 * +-------+ +-------+
3863 4161 * +=========+ .-----.
3864 4162 * : L2ARC : |-_____-|
3865 4163 * : devices : | Disks |
3866 4164 * +=========+ `-_____-'
3867 4165 *
3868 4166 * Read requests are satisfied from the following sources, in order:
3869 4167 *
3870 4168 * 1) ARC
3871 4169 * 2) vdev cache of L2ARC devices
3872 4170 * 3) L2ARC devices
3873 4171 * 4) vdev cache of disks
3874 4172 * 5) disks
3875 4173 *
3876 4174 * Some L2ARC device types exhibit extremely slow write performance.
3877 4175 * To accommodate for this there are some significant differences between
3878 4176 * the L2ARC and traditional cache design:
3879 4177 *
3880 4178 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3881 4179 * the ARC behave as usual, freeing buffers and placing headers on ghost
3882 4180 * lists. The ARC does not send buffers to the L2ARC during eviction as
3883 4181 * this would add inflated write latencies for all ARC memory pressure.
3884 4182 *
3885 4183 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3886 4184 * It does this by periodically scanning buffers from the eviction-end of
3887 4185 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3888 4186 * not already there. It scans until a headroom of buffers is satisfied,
3889 4187 * which itself is a buffer for ARC eviction. If a compressible buffer is
3890 4188 * found during scanning and selected for writing to an L2ARC device, we
3891 4189 * temporarily boost scanning headroom during the next scan cycle to make
3892 4190 * sure we adapt to compression effects (which might significantly reduce
3893 4191 * the data volume we write to L2ARC). The thread that does this is
3894 4192 * l2arc_feed_thread(), illustrated below; example sizes are included to
3895 4193 * provide a better sense of ratio than this diagram:
3896 4194 *
3897 4195 * head --> tail
3898 4196 * +---------------------+----------+
3899 4197 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3900 4198 * +---------------------+----------+ | o L2ARC eligible
3901 4199 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3902 4200 * +---------------------+----------+ |
3903 4201 * 15.9 Gbytes ^ 32 Mbytes |
3904 4202 * headroom |
3905 4203 * l2arc_feed_thread()
3906 4204 * |
3907 4205 * l2arc write hand <--[oooo]--'
3908 4206 * | 8 Mbyte
3909 4207 * | write max
3910 4208 * V
3911 4209 * +==============================+
3912 4210 * L2ARC dev |####|#|###|###| |####| ... |
3913 4211 * +==============================+
3914 4212 * 32 Gbytes
3915 4213 *
3916 4214 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3917 4215 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3918 4216 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3919 4217 * safe to say that this is an uncommon case, since buffers at the end of
3920 4218 * the ARC lists have moved there due to inactivity.
3921 4219 *
3922 4220 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3923 4221 * then the L2ARC simply misses copying some buffers. This serves as a
3924 4222 * pressure valve to prevent heavy read workloads from both stalling the ARC
3925 4223 * with waits and clogging the L2ARC with writes. This also helps prevent
3926 4224 * the potential for the L2ARC to churn if it attempts to cache content too
3927 4225 * quickly, such as during backups of the entire pool.
3928 4226 *
3929 4227 * 5. After system boot and before the ARC has filled main memory, there are
3930 4228 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3931 4229 * lists can remain mostly static. Instead of searching from tail of these
3932 4230 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3933 4231 * for eligible buffers, greatly increasing its chance of finding them.
3934 4232 *
3935 4233 * The L2ARC device write speed is also boosted during this time so that
3936 4234 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3937 4235 * there are no L2ARC reads, and no fear of degrading read performance
3938 4236 * through increased writes.
3939 4237 *
3940 4238 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3941 4239 * the vdev queue can aggregate them into larger and fewer writes. Each
3942 4240 * device is written to in a rotor fashion, sweeping writes through
3943 4241 * available space then repeating.
3944 4242 *
3945 4243 * 7. The L2ARC does not store dirty content. It never needs to flush
3946 4244 * write buffers back to disk based storage.
3947 4245 *
3948 4246 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3949 4247 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3950 4248 *
3951 4249 * The performance of the L2ARC can be tweaked by a number of tunables, which
3952 4250 * may be necessary for different workloads:
3953 4251 *
3954 4252 * l2arc_write_max max write bytes per interval
3955 4253 * l2arc_write_boost extra write bytes during device warmup
3956 4254 * l2arc_noprefetch skip caching prefetched buffers
3957 4255 * l2arc_headroom number of max device writes to precache
3958 4256 * l2arc_headroom_boost when we find compressed buffers during ARC
3959 4257 * scanning, we multiply headroom by this
3960 4258 * percentage factor for the next scan cycle,
3961 4259 * since more compressed buffers are likely to
3962 4260 * be present
3963 4261 * l2arc_feed_secs seconds between L2ARC writing
3964 4262 *
3965 4263 * Tunables may be removed or added as future performance improvements are
3966 4264 * integrated, and also may become zpool properties.
3967 4265 *
3968 4266 * There are three key functions that control how the L2ARC warms up:
3969 4267 *
3970 4268 * l2arc_write_eligible() check if a buffer is eligible to cache
3971 4269 * l2arc_write_size() calculate how much to write
3972 4270 * l2arc_write_interval() calculate sleep delay between writes
3973 4271 *
3974 4272 * These three functions determine what to write, how much, and how quickly
3975 4273 * to send writes.
4274 + *
4275 + * L2ARC persistency:
4276 + *
4277 + * When writing buffers to L2ARC, we periodically add some metadata to
4278 + * make sure we can pick them up after reboot, thus dramatically reducing
4279 + * the impact that any downtime has on the performance of storage systems
4280 + * with large caches.
4281 + *
4282 + * The implementation works fairly simply by integrating the following two
4283 + * modifications:
4284 + *
4285 + *  *) Every now and then, at the end of an L2ARC feed cycle, we append a piece
4286 + * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
4287 + *     write. This allows us to understand what's been written, so that
4288 + * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
4289 + * The pbuf also includes a "back-reference" pointer to the previous
4290 + * pbuf, forming a linked list of pbufs on the L2ARC device.
4291 + *
4292 + * *) We reserve 4k of space at the start of each L2ARC device for our
4293 + * header bookkeeping purposes. This contains a single 4k uberblock, which
4294 + * contains our top-level reference structures. We update it on each pbuf
4295 + * write. If this write results in an inconsistent uberblock (e.g. due to
4296 + * power failure), we detect this by verifying the uberblock's checksum
4297 + * and simply drop the entries from L2ARC. Once an L2ARC pbuf update
4298 + * completes, we update the uberblock to point to it.
4299 + *
4300 + * Implementation diagram:
4301 + *
4302 + * +=== L2ARC device (not to scale) ======================================+
4303 + * | ____________newest pbuf pointer_____________ |
4304 + * | / \ |
4305 + * | / V |
4306 + * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
4307 + * | ^ / ^ / ^ / |
4308 + * | `-prev-' `-prev-' `-prev-' |
4309 + * | pbuf pbuf pbuf |
4310 + * +======================================================================+
4311 + *
4312 + * On-device data structures:
4313 + *
4314 + * (L2ARC persistent uberblock)
4315 + * struct l2uberblock {
4316 + * (these fields are in network byte order)
4317 + * uint32_t magic = 0x12bab10c; l2-ber-block
4318 + * uint8_t version = 0x1;
4319 + * uint8_t reserved = 0x0;
4320 + * uint16_t ublk_flags; see l2uberblock_flags_t
4321 + *
4322 + * (byte order of fields below determined by `ublk_flags')
4323 + * uint64_t spa_guid; what pool this l2arc dev belongs to
4324 + * uint64_t birth_txg; ublk with highest birth_txg is newest
4325 + * uint64_t evict_tail; current evict pointer on l2arc dev
4326 + * uint64_t alloc_space; how much space is alloc'd on the dev
4327 + * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t
4328 + * uint32_t pbuf_asize; size of newest pbuf
4329 + * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf
4330 + *
4331 + * uint8_t reserved[3996] = {0x0, 0x0, ... 0x0};
4332 + *
4333 + * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
4334 + * } l2dev_uberblock;
4335 + *
4336 + * (L2ARC persistent buffer list)
4337 + * typedef struct l2pbuf_t {
4338 + * (these fields are in network byte order)
4339 + * uint32_t magic = 0xdb0faba6; the-buffer-bag
4340 + * uint8_t version = 0x1;
4341 + * uint8_t reserved = 0x0;
4342 + * uint16_t pbuf_flags; see l2pbuf_flags_t
4343 + *
4344 + * (byte order of fields below determined by `pbuf_flags')
4345 + * uint64_t prev_pbuf_daddr; previous pbuf dev addr
4346 + * uint32_t prev_pbuf_asize; previous pbuf size
4347 + * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf)
4348 + *
4349 + * uint32_t items_size; uncompressed size of `items' below
4350 + * (if (pbuf_flags & compress) decompress `items' prior to decoding)
4351 + * struct l2pbuf_buf_item {
4352 + * (these fields mirror [l2]arc_buf_hdr fields)
4353 + * uint64_t dva[2]; buffer's DVA
4354 + * uint64_t birth; buffer's birth TXG in ARC
4355 + * uint64_t cksum0; lower 64-bits of buffer's cksum
4356 + * uint64_t freeze_cksum[4]; buffer's freeze cksum
4357 + * uint32_t size; uncompressed buffer data size
4358 + * uint64_t l2daddr; device address (offset) of buf
4359 + * uint32_t l2asize; actual space occupied by buf
4360 + * uint8_t compress; compress algo used on data
4361 + * uint8_t contents_type; buffer's contents type
4362 + * uint16_t reserved = 0x0; for alignment and future use
4363 + * uint32_t flags; buffer's persistent flags
4364 + * } items[]; continues for remainder of pbuf
4365 + * } l2pbuf_t;
4366 + *
4367 + * L2ARC reconstruction:
4368 + *
4369 + * When writing data, we simply write in the standard rotary fashion,
4370 + * evicting buffers as we go and writing new data over them (appending
4371 + * an updated l2pbuf_t every now and then). This obviously means that once we
4372 + * loop around the end of the device, we will start cutting into an already
4373 + * committed l2pbuf (and its referenced data buffers), like so:
4374 + *
4375 + * current write head__ __old tail
4376 + * \ /
4377 + * V V
4378 + * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|-->
4379 + * ^ ^^^^^^^^^_____________________________
4380 + * | \
4381 + * <<nextwrite>> - will overwrite this pbuf --/
4382 + *
4383 + * When importing the pool, we detect this situation and use it to stop
4384 + * our scanning process:
4385 + * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
4386 + * previous one.
4387 + * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
4388 + *    then prev_pbuf is invalid, so stop scanning (go to step 3 below).
4389 + * 3) if (this is the last valid pbuf)
4390 + * discard this pbuf as well (its ARC bufs may have been damaged by a
4391 + * partial overwrite).
4392 + * (We could potentially salvage the remaining good arc bufs above in step 3,
4393 + * but the cost of doing so probably outweighs the value of the entire pbuf).
4394 + *
4395 + * There is one significant caveat to consider when rebuilding ARC contents
4396 + * from an L2ARC device: what about invalidated buffers? Given the above
4397 + * construction, we cannot update pbufs which we've already written to amend
4398 + * them to remove buffers which were invalidated. Thus, during reconstruction,
4399 + * we might be populating the cache with buffers for data that's not on the
4400 + * main pool anymore, or may have been overwritten!
4401 + *
4402 + * As it turns out, this isn't a problem. Every arc_read request includes
4403 + * both the DVA and, crucially, the birth TXG of the BP the caller is
4404 + * looking for. So even if the cache were populated by completely rotten
4405 + * blocks for data that had been long deleted and/or overwritten, we'll
4406 + * never actually return bad data from the cache, since the DVA with the
4407 + * birth TXG uniquely identifies a block in space and time - once created,
4408 + * a block is immutable on disk. The worst we will have done is waste
4409 + * some time and memory at l2arc rebuild reconstructing outdated ARC
4410 + * entries that will get dropped from the l2arc as it is being updated
4411 + * with new blocks.
3976 4412 */
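
/*
 * Illustrative sketch of the uberblock self-check described above: the final
 * 32 bytes of the 4k header hold a fletcher-4 checksum of the preceding 4064
 * bytes, and an uberblock that fails this check is simply ignored.  The
 * sketch_/SKETCH_ names are hypothetical and byte-order handling of the
 * individual fields is omitted.
 */
#define	SKETCH_UBLK_SIZE	4096
#define	SKETCH_UBLK_CKSUM_OFF	(SKETCH_UBLK_SIZE - sizeof (zio_cksum_t))

static boolean_t
sketch_l2uberblock_valid(const uint8_t *ublk)
{
	zio_cksum_t expected, actual;

	/* trailing checksum as stored on the device */
	bcopy(ublk + SKETCH_UBLK_CKSUM_OFF, &expected, sizeof (expected));
	/* checksum of everything that precedes it */
	fletcher_4_native(ublk, SKETCH_UBLK_CKSUM_OFF, &actual);
	return (ZIO_CHECKSUM_EQUAL(expected, actual) ? B_TRUE : B_FALSE);
}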
3977 4413
3978 4414 static boolean_t
3979 4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3980 4416 {
3981 4417 /*
3982 4418 * A buffer is *not* eligible for the L2ARC if it:
3983 4419 * 1. belongs to a different spa.
3984 4420 * 2. is already cached on the L2ARC.
3985 4421 * 3. has an I/O in progress (it may be an incomplete read).
3986 4422 * 4. is flagged not eligible (zfs property).
3987 4423 */
3988 4424 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3989 4425 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3990 4426 return (B_FALSE);
3991 4427
3992 4428 return (B_TRUE);
3993 4429 }
3994 4430
3995 4431 static uint64_t
3996 4432 l2arc_write_size(void)
3997 4433 {
3998 4434 uint64_t size;
3999 4435
4000 4436 /*
4001 4437 * Make sure our globals have meaningful values in case the user
4002 4438 * altered them.
4003 4439 */
4004 4440 size = l2arc_write_max;
4005 4441 if (size == 0) {
4006 4442 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4007 4443 "be greater than zero, resetting it to the default (%d)",
4008 4444 L2ARC_WRITE_SIZE);
4009 4445 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4010 4446 }
4011 4447
4012 4448 if (arc_warm == B_FALSE)
4013 4449 size += l2arc_write_boost;
4014 4450
4015 4451 return (size);
4016 4452
4017 4453 }
4018 4454
4019 4455 static clock_t
4020 4456 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4021 4457 {
4022 4458 clock_t interval, next, now;
4023 4459
4024 4460 /*
4025 4461 * If the ARC lists are busy, increase our write rate; if the
4026 4462 * lists are stale, idle back. This is achieved by checking
4027 4463 * how much we previously wrote - if it was more than half of
4028 4464 * what we wanted, schedule the next write much sooner.
4029 4465 */
4030 4466 if (l2arc_feed_again && wrote > (wanted / 2))
4031 4467 interval = (hz * l2arc_feed_min_ms) / 1000;
4032 4468 else
4033 4469 interval = hz * l2arc_feed_secs;
4034 4470
4035 4471 now = ddi_get_lbolt();
4036 4472 next = MAX(now, MIN(now + interval, began + interval));
4037 4473
4038 4474 return (next);
4039 4475 }
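
/*
 * Worked example, assuming the default tunables (l2arc_feed_secs = 1,
 * l2arc_feed_min_ms = 200, l2arc_feed_again enabled): if the previous cycle
 * wrote more than half of its target, the next feed is scheduled roughly
 * 200ms after the previous cycle began; otherwise it waits the full second.
 * The MAX(now, ...) clamp guarantees the next feed is never in the past.
 */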
4040 4476
4041 4477 static void
4042 -l2arc_hdr_stat_add(void)
4478 +l2arc_hdr_stat_add(boolean_t from_arc)
4043 4479 {
4044 4480 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4045 - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4481 + if (from_arc)
4482 + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4046 4483 }
4047 4484
4048 4485 static void
4049 4486 l2arc_hdr_stat_remove(void)
4050 4487 {
4051 4488 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4052 4489 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4053 4490 }
4054 4491
4055 4492 /*
4056 4493 * Cycle through L2ARC devices. This is how L2ARC load balances.
4057 4494 * If a device is returned, this also returns holding the spa config lock.
4058 4495 */
4059 4496 static l2arc_dev_t *
4060 4497 l2arc_dev_get_next(void)
4061 4498 {
4062 4499 l2arc_dev_t *first, *next = NULL;
4063 4500
4064 4501 /*
4065 4502 * Lock out the removal of spas (spa_namespace_lock), then removal
4066 4503 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4067 4504 * both locks will be dropped and a spa config lock held instead.
4068 4505 */
4069 4506 mutex_enter(&spa_namespace_lock);
4070 4507 mutex_enter(&l2arc_dev_mtx);
4071 4508
4072 4509 /* if there are no vdevs, there is nothing to do */
4073 4510 if (l2arc_ndev == 0)
4074 4511 goto out;
4075 4512
4076 4513 first = NULL;
4077 4514 next = l2arc_dev_last;
4078 4515 do {
4079 - /* loop around the list looking for a non-faulted vdev */
4516 + /*
4517 + * Loop around the list looking for a non-faulted vdev
4518 + * and one that isn't currently doing an L2ARC rebuild.
4519 + */
4080 4520 if (next == NULL) {
4081 4521 next = list_head(l2arc_dev_list);
4082 4522 } else {
4083 4523 next = list_next(l2arc_dev_list, next);
4084 4524 if (next == NULL)
4085 4525 next = list_head(l2arc_dev_list);
4086 4526 }
4087 4527
4088 4528 /* if we have come back to the start, bail out */
4089 4529 if (first == NULL)
4090 4530 first = next;
4091 4531 else if (next == first)
4092 4532 break;
4093 4533
4094 - } while (vdev_is_dead(next->l2ad_vdev));
4534 + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4095 4535
4096 4536 /* if we were unable to find any usable vdevs, return NULL */
4097 - if (vdev_is_dead(next->l2ad_vdev))
4537 + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4098 4538 next = NULL;
4099 4539
4100 4540 l2arc_dev_last = next;
4101 4541
4102 4542 out:
4103 4543 mutex_exit(&l2arc_dev_mtx);
4104 4544
4105 4545 /*
4106 4546 * Grab the config lock to prevent the 'next' device from being
4107 4547 * removed while we are writing to it.
4108 4548 */
4109 4549 if (next != NULL)
4110 4550 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111 4551 mutex_exit(&spa_namespace_lock);
4112 4552
4113 4553 return (next);
4114 4554 }
4115 4555
4116 4556 /*
4117 4557 * Free buffers that were tagged for destruction.
4118 4558 */
4119 4559 static void
4120 4560 l2arc_do_free_on_write()
4121 4561 {
4122 4562 list_t *buflist;
4123 4563 l2arc_data_free_t *df, *df_prev;
4124 4564
4125 4565 mutex_enter(&l2arc_free_on_write_mtx);
4126 4566 buflist = l2arc_free_on_write;
4127 4567
4128 4568 for (df = list_tail(buflist); df; df = df_prev) {
4129 4569 df_prev = list_prev(buflist, df);
4130 4570 ASSERT(df->l2df_data != NULL);
4131 4571 ASSERT(df->l2df_func != NULL);
4132 4572 df->l2df_func(df->l2df_data, df->l2df_size);
4133 4573 list_remove(buflist, df);
4134 4574 kmem_free(df, sizeof (l2arc_data_free_t));
4135 4575 }
4136 4576
4137 4577 mutex_exit(&l2arc_free_on_write_mtx);
4138 4578 }
4139 4579
4140 4580 /*
4141 4581 * A write to a cache device has completed. Update all headers to allow
4142 4582 * reads from these buffers to begin.
4143 4583 */
4144 4584 static void
4145 4585 l2arc_write_done(zio_t *zio)
4146 4586 {
4147 4587 l2arc_write_callback_t *cb;
4148 4588 l2arc_dev_t *dev;
4149 4589 list_t *buflist;
4150 4590 arc_buf_hdr_t *head, *ab, *ab_prev;
4151 4591 l2arc_buf_hdr_t *abl2;
4152 4592 kmutex_t *hash_lock;
4153 4593
4154 4594 cb = zio->io_private;
4155 4595 ASSERT(cb != NULL);
4156 4596 dev = cb->l2wcb_dev;
4157 4597 ASSERT(dev != NULL);
4158 4598 head = cb->l2wcb_head;
4159 4599 ASSERT(head != NULL);
4160 4600 buflist = dev->l2ad_buflist;
4161 4601 ASSERT(buflist != NULL);
4162 4602 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163 4603 l2arc_write_callback_t *, cb);
4164 4604
4165 4605 if (zio->io_error != 0)
4166 4606 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 4607
4168 4608 mutex_enter(&l2arc_buflist_mtx);
4169 4609
4170 4610 /*
4171 4611 * All writes completed, or an error was hit.
4172 4612 */
4173 4613 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174 4614 ab_prev = list_prev(buflist, ab);
4615 + abl2 = ab->b_l2hdr;
4175 4616
4617 + /*
4618 + * Release the temporary compressed buffer as soon as possible.
4619 + */
4620 + if (abl2->b_compress != ZIO_COMPRESS_OFF)
4621 + l2arc_release_cdata_buf(ab);
4622 +
4176 4623 hash_lock = HDR_LOCK(ab);
4177 4624 if (!mutex_tryenter(hash_lock)) {
4178 4625 /*
4179 4626 * This buffer misses out. It may be in a stage
4180 4627 * of eviction. Its ARC_L2_WRITING flag will be
4181 4628 * left set, denying reads to this buffer.
4182 4629 */
4183 4630 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184 4631 continue;
4185 4632 }
4186 4633
4187 - abl2 = ab->b_l2hdr;
4188 -
4189 - /*
4190 - * Release the temporary compressed buffer as soon as possible.
4191 - */
4192 - if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193 - l2arc_release_cdata_buf(ab);
4194 -
4195 4634 if (zio->io_error != 0) {
4196 4635 /*
4197 4636 * Error - drop L2ARC entry.
4198 4637 */
4199 4638 list_remove(buflist, ab);
4200 4639 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201 4640 ab->b_l2hdr = NULL;
4202 4641 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203 4642 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204 4643 }
4205 4644
4206 4645 /*
4207 4646 * Allow ARC to begin reads to this L2ARC entry.
4208 4647 */
4209 4648 ab->b_flags &= ~ARC_L2_WRITING;
4210 4649
4211 4650 mutex_exit(hash_lock);
4212 4651 }
4213 4652
4214 4653 atomic_inc_64(&l2arc_writes_done);
4215 4654 list_remove(buflist, head);
4216 4655 kmem_cache_free(hdr_cache, head);
4217 4656 mutex_exit(&l2arc_buflist_mtx);
4218 4657
4219 4658 l2arc_do_free_on_write();
4220 4659
4660 + if (cb->l2wcb_pbuf)
4661 + kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
4662 + if (cb->l2wcb_ub_buf)
4663 + kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4221 4664 kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 4665 }
4223 4666
4224 4667 /*
4225 4668 * A read to a cache device completed. Validate buffer contents before
4226 4669 * handing over to the regular ARC routines.
4227 4670 */
4228 4671 static void
4229 4672 l2arc_read_done(zio_t *zio)
4230 4673 {
4231 4674 l2arc_read_callback_t *cb;
4232 4675 arc_buf_hdr_t *hdr;
4233 4676 arc_buf_t *buf;
4234 4677 kmutex_t *hash_lock;
4235 4678 int equal;
4236 4679
4237 4680 ASSERT(zio->io_vd != NULL);
4238 4681 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4239 4682
4240 4683 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4241 4684
4242 4685 cb = zio->io_private;
4243 4686 ASSERT(cb != NULL);
4244 4687 buf = cb->l2rcb_buf;
4245 4688 ASSERT(buf != NULL);
4246 4689
4247 4690 hash_lock = HDR_LOCK(buf->b_hdr);
4248 4691 mutex_enter(hash_lock);
4249 4692 hdr = buf->b_hdr;
4250 4693 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4251 4694
4252 4695 /*
4253 4696 * If the buffer was compressed, decompress it first.
4254 4697 */
4255 4698 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4256 4699 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4257 4700 ASSERT(zio->io_data != NULL);
4258 4701
4259 4702 /*
4260 4703 * Check this survived the L2ARC journey.
4261 4704 */
4262 4705 equal = arc_cksum_equal(buf);
4263 4706 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4264 4707 mutex_exit(hash_lock);
4265 4708 zio->io_private = buf;
4266 4709 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4267 4710 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4268 4711 arc_read_done(zio);
4269 4712 } else {
4270 4713 mutex_exit(hash_lock);
4271 4714 /*
4272 4715 * Buffer didn't survive caching. Increment stats and
4273 4716 * reissue to the original storage device.
4274 4717 */
4275 4718 if (zio->io_error != 0) {
4276 4719 ARCSTAT_BUMP(arcstat_l2_io_error);
4277 4720 } else {
4278 4721 zio->io_error = SET_ERROR(EIO);
4279 4722 }
4280 4723 if (!equal)
4281 4724 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4282 4725
4283 4726 /*
4284 4727 * If there's no waiter, issue an async i/o to the primary
4285 4728 * storage now. If there *is* a waiter, the caller must
4286 4729 * issue the i/o in a context where it's OK to block.
4287 4730 */
4288 4731 if (zio->io_waiter == NULL) {
4289 4732 zio_t *pio = zio_unique_parent(zio);
4290 4733
4291 4734 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4292 4735
4293 4736 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4294 4737 buf->b_data, zio->io_size, arc_read_done, buf,
4295 4738 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4296 4739 }
4297 4740 }
4298 4741
4299 4742 kmem_free(cb, sizeof (l2arc_read_callback_t));
4300 4743 }
4301 4744
4302 4745 /*
4303 4746 * This is the list priority from which the L2ARC will search for pages to
4304 4747 * cache. This is used within loops (0..3) to cycle through lists in the
4305 4748 * desired order. This order can have a significant effect on cache
4306 4749 * performance.
4307 4750 *
4308 4751 * Currently the metadata lists are hit first, MFU then MRU, followed by
4309 4752 * the data lists. This function returns a locked list, and also returns
4310 4753 * the lock pointer.
4311 4754 */
4312 4755 static list_t *
4313 4756 l2arc_list_locked(int list_num, kmutex_t **lock)
4314 4757 {
4315 4758 list_t *list = NULL;
4316 4759
4317 4760 ASSERT(list_num >= 0 && list_num <= 3);
4318 4761
4319 4762 switch (list_num) {
4320 4763 case 0:
4321 4764 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4322 4765 *lock = &arc_mfu->arcs_mtx;
4323 4766 break;
4324 4767 case 1:
4325 4768 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4326 4769 *lock = &arc_mru->arcs_mtx;
4327 4770 break;
4328 4771 case 2:
4329 4772 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4330 4773 *lock = &arc_mfu->arcs_mtx;
4331 4774 break;
4332 4775 case 3:
4333 4776 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4334 4777 *lock = &arc_mru->arcs_mtx;
4335 4778 break;
4336 4779 }
4337 4780
4338 4781 ASSERT(!(MUTEX_HELD(*lock)));
4339 4782 mutex_enter(*lock);
4340 4783 return (list);
4341 4784 }
4342 4785
4343 4786 /*
4344 4787 * Evict buffers from the device write hand to the distance specified in
4345 4788  * bytes. This distance may span populated buffers, or it may span nothing.
4346 4789  * This clears a region on the L2ARC device, making it ready for writing.
4347 4790 * If the 'all' boolean is set, every buffer is evicted.
4348 4791 */
4349 4792 static void
4350 4793 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4351 4794 {
4352 4795 list_t *buflist;
4353 4796 l2arc_buf_hdr_t *abl2;
4354 4797 arc_buf_hdr_t *ab, *ab_prev;
4355 4798 kmutex_t *hash_lock;
4356 4799 uint64_t taddr;
4357 4800
4358 4801 buflist = dev->l2ad_buflist;
4359 4802
4360 4803 if (buflist == NULL)
4361 4804 return;
4362 4805
4363 4806 if (!all && dev->l2ad_first) {
4364 4807 /*
4365 4808 * This is the first sweep through the device. There is
4366 4809 * nothing to evict.
4367 4810 */
4368 4811 return;
4369 4812 }
4370 4813
4371 4814 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4372 4815 /*
4373 4816 * When nearing the end of the device, evict to the end
4374 4817 * before the device write hand jumps to the start.
4375 4818 */
4376 4819 taddr = dev->l2ad_end;
4377 4820 } else {
4378 4821 taddr = dev->l2ad_hand + distance;
4379 4822 }
4380 4823 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4381 4824 uint64_t, taddr, boolean_t, all);
4382 4825
4383 4826 top:
4384 4827 mutex_enter(&l2arc_buflist_mtx);
4385 4828 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4386 4829 ab_prev = list_prev(buflist, ab);
4387 4830
4388 4831 hash_lock = HDR_LOCK(ab);
4389 4832 if (!mutex_tryenter(hash_lock)) {
4390 4833 /*
4391 4834 * Missed the hash lock. Retry.
4392 4835 */
4393 4836 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4394 4837 mutex_exit(&l2arc_buflist_mtx);
4395 4838 mutex_enter(hash_lock);
4396 4839 mutex_exit(hash_lock);
4397 4840 goto top;
4398 4841 }
4399 4842
4400 4843 if (HDR_L2_WRITE_HEAD(ab)) {
4401 4844 /*
4402 4845 * We hit a write head node. Leave it for
4403 4846 * l2arc_write_done().
4404 4847 */
4405 4848 list_remove(buflist, ab);
4406 4849 mutex_exit(hash_lock);
4407 4850 continue;
4408 4851 }
4409 4852
4410 4853 if (!all && ab->b_l2hdr != NULL &&
4411 4854 (ab->b_l2hdr->b_daddr > taddr ||
4412 4855 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4413 4856 /*
4414 4857 * We've evicted to the target address,
4415 4858 * or the end of the device.
4416 4859 */
4417 4860 mutex_exit(hash_lock);
4418 4861 break;
4419 4862 }
4420 4863
4421 4864 if (HDR_FREE_IN_PROGRESS(ab)) {
4422 4865 /*
4423 4866 * Already on the path to destruction.
4424 4867 */
4425 4868 mutex_exit(hash_lock);
4426 4869 continue;
4427 4870 }
4428 4871
4429 4872 if (ab->b_state == arc_l2c_only) {
4430 4873 ASSERT(!HDR_L2_READING(ab));
4431 4874 /*
4432 4875 * This doesn't exist in the ARC. Destroy.
4433 4876 * arc_hdr_destroy() will call list_remove()
4434 4877 * and decrement arcstat_l2_size.
4435 4878 */
4436 4879 arc_change_state(arc_anon, ab, hash_lock);
4437 4880 arc_hdr_destroy(ab);
4438 4881 } else {
4439 4882 /*
4440 4883 * Invalidate issued or about to be issued
4441 4884 * reads, since we may be about to write
4442 4885 * over this location.
4443 4886 */
4444 4887 if (HDR_L2_READING(ab)) {
4445 4888 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4446 4889 ab->b_flags |= ARC_L2_EVICTED;
4447 4890 }
4448 4891
4449 4892 /*
4450 4893 * Tell ARC this no longer exists in L2ARC.
4451 4894 */
4452 4895 if (ab->b_l2hdr != NULL) {
4453 4896 abl2 = ab->b_l2hdr;
4454 4897 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4455 4898 ab->b_l2hdr = NULL;
4456 4899 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4457 4900 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4458 4901 }
4459 4902 list_remove(buflist, ab);
4460 4903
4461 4904 /*
4462 4905 * This may have been leftover after a
4463 4906 * failed write.
4464 4907 */
4465 4908 ab->b_flags &= ~ARC_L2_WRITING;
4466 4909 }
4467 4910 mutex_exit(hash_lock);
4468 4911 }
4469 4912 mutex_exit(&l2arc_buflist_mtx);
4470 4913
4471 4914 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4472 4915 dev->l2ad_evict = taddr;
4473 4916 }
4474 4917
4475 4918 /*
4476 4919 * Find and write ARC buffers to the L2ARC device.
4477 4920 *
4478 4921 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4479 4922 * for reading until they have completed writing.
4480 4923 * The headroom_boost is an in-out parameter used to maintain headroom boost
4481 4924 * state between calls to this function.
4482 4925 *
4483 4926 * Returns the number of bytes actually written (which may be smaller than
4484 4927 * the delta by which the device hand has changed due to alignment).
4485 4928 */
4486 4929 static uint64_t
4487 4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488 4931 boolean_t *headroom_boost)
4489 4932 {
4490 4933 arc_buf_hdr_t *ab, *ab_prev, *head;
4491 4934 list_t *list;
4492 4935 uint64_t write_asize, write_psize, write_sz, headroom,
4493 4936 buf_compress_minsz;
4494 4937 void *buf_data;
4495 4938 kmutex_t *list_lock;
4496 4939 boolean_t full;
4497 4940 l2arc_write_callback_t *cb;
4498 4941 zio_t *pio, *wzio;
4499 4942 uint64_t guid = spa_load_guid(spa);
4500 4943 const boolean_t do_headroom_boost = *headroom_boost;
4501 4944
4945 + /* persistency-related */
4946 + l2pbuf_t *pb;
4947 + l2pbuf_buflist_t *pb_buflist;
4948 + int num_bufs, buf_index;
4949 +
4502 4950 ASSERT(dev->l2ad_vdev != NULL);
4503 4951
4504 4952 /* Lower the flag now, we might want to raise it again later. */
4505 4953 *headroom_boost = B_FALSE;
4506 4954
4507 4955 pio = NULL;
4956 + cb = NULL;
4508 4957 write_sz = write_asize = write_psize = 0;
4509 4958 full = B_FALSE;
4510 4959 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511 4960 head->b_flags |= ARC_L2_WRITE_HEAD;
4512 4961
4513 4962 /*
4514 4963 * We will want to try to compress buffers that are at least 2x the
4515 4964 * device sector size.
4516 4965 */
4517 4966 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518 4967
4968 + pb = &dev->l2ad_pbuf;
4969 + num_bufs = 0;
4970 +
4519 4971 /*
4520 4978 * Copy buffers for L2ARC writing.
4521 4979 */
4522 4980 mutex_enter(&l2arc_buflist_mtx);
4523 4981 for (int try = 0; try <= 3; try++) {
4524 4982 uint64_t passed_sz = 0;
4525 4983
4526 4984 list = l2arc_list_locked(try, &list_lock);
4527 4985
4528 4986 /*
4529 4987 * L2ARC fast warmup.
4530 4988 *
4531 4989 * Until the ARC is warm and starts to evict, read from the
4532 4990 * head of the ARC lists rather than the tail.
4533 4991 */
4534 4992 if (arc_warm == B_FALSE)
4535 4993 ab = list_head(list);
4536 4994 else
4537 4995 ab = list_tail(list);
4538 4996
4539 4997 headroom = target_sz * l2arc_headroom;
4540 4998 if (do_headroom_boost)
4541 4999 headroom = (headroom * l2arc_headroom_boost) / 100;
4542 5000
4543 5001 for (; ab; ab = ab_prev) {
4544 5002 l2arc_buf_hdr_t *l2hdr;
4545 5003 kmutex_t *hash_lock;
4546 5004 uint64_t buf_sz;
4547 5005
4548 5006 if (arc_warm == B_FALSE)
4549 5007 ab_prev = list_next(list, ab);
4550 5008 else
4551 5009 ab_prev = list_prev(list, ab);
4552 5010
4553 5011 hash_lock = HDR_LOCK(ab);
4554 5012 if (!mutex_tryenter(hash_lock)) {
4555 5013 /*
4556 5014 * Skip this buffer rather than waiting.
4557 5015 */
4558 5016 continue;
4559 5017 }
4560 5018
4561 5019 passed_sz += ab->b_size;
4562 5020 if (passed_sz > headroom) {
4563 5021 /*
4564 5022 * Searched too far.
4565 5023 */
4566 5024 mutex_exit(hash_lock);
4567 5025 break;
4568 5026 }
4569 5027
4570 5028 if (!l2arc_write_eligible(guid, ab)) {
4571 5029 mutex_exit(hash_lock);
4572 5030 continue;
4573 5031 }
4574 5032
4575 5033 if ((write_sz + ab->b_size) > target_sz) {
4576 5034 full = B_TRUE;
4577 5035 mutex_exit(hash_lock);
4578 5036 break;
4579 5037 }
4580 5038
4581 5039 if (pio == NULL) {
4582 5040 /*
4583 5041 * Insert a dummy header on the buflist so
4584 5042 * l2arc_write_done() can find where the
4585 5043 * write buffers begin without searching.
4586 5044 */
4587 5045 list_insert_head(dev->l2ad_buflist, head);
4588 5046
4589 - cb = kmem_alloc(
5047 + cb = kmem_zalloc(
4590 5048 sizeof (l2arc_write_callback_t), KM_SLEEP);
4591 5049 cb->l2wcb_dev = dev;
4592 5050 cb->l2wcb_head = head;
4593 5051 pio = zio_root(spa, l2arc_write_done, cb,
4594 5052 ZIO_FLAG_CANFAIL);
4595 5053 }
4596 5054
4597 5055 /*
4598 5056 * Create and add a new L2ARC header.
4599 5057 */
4600 5058 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601 5059 l2hdr->b_dev = dev;
4602 5060 ab->b_flags |= ARC_L2_WRITING;
4603 5061
4604 5062 /*
4605 5063 * Temporarily stash the data buffer in b_tmp_cdata.
4606 5064 * The subsequent write step will pick it up from
4607 5065  * there. This is because we can't access ab->b_buf
4608 5066 * without holding the hash_lock, which we in turn
4609 5067 * can't access without holding the ARC list locks
4610 5068 * (which we want to avoid during compression/writing).
4611 5069 */
4612 5070 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613 5071 l2hdr->b_asize = ab->b_size;
4614 5072 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4615 5073
4616 5074 buf_sz = ab->b_size;
4617 5075 ab->b_l2hdr = l2hdr;
4618 5076
4619 5077 list_insert_head(dev->l2ad_buflist, ab);
4620 5078
4621 5079 /*
4622 5080 * Compute and store the buffer cksum before
4623 5081 * writing. On debug the cksum is verified first.
4624 5082 */
4625 5083 arc_cksum_verify(ab->b_buf);
4626 5084 arc_cksum_compute(ab->b_buf, B_TRUE);
4627 5085
4628 5086 mutex_exit(hash_lock);
4629 5087
4630 5088 write_sz += buf_sz;
5089 + num_bufs++;
4631 5090 }
4632 5091
4633 5092 mutex_exit(list_lock);
4634 5093
4635 5094 if (full == B_TRUE)
4636 5095 break;
4637 5096 }
4638 5097
4639 5098 /* No buffers selected for writing? */
4640 5099 if (pio == NULL) {
4641 5100 ASSERT0(write_sz);
4642 5101 mutex_exit(&l2arc_buflist_mtx);
4643 5102 kmem_cache_free(hdr_cache, head);
4644 5103 return (0);
4645 5104 }
4646 5105
5106 + /* expand the pbuf to include a new list */
5107 + pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
5108 +
4647 5109 /*
4648 5110 * Now start writing the buffers. We're starting at the write head
4649 5111 * and work backwards, retracing the course of the buffer selector
4650 5112 * loop above.
4651 5113 */
4652 - for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653 - ab = list_prev(dev->l2ad_buflist, ab)) {
5114 + for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
5115 + ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
4654 5116 l2arc_buf_hdr_t *l2hdr;
4655 5117 uint64_t buf_sz;
4656 5118
4657 5119 /*
4658 5120 * We shouldn't need to lock the buffer here, since we flagged
4659 5121 * it as ARC_L2_WRITING in the previous step, but we must take
4660 5122 * care to only access its L2 cache parameters. In particular,
4661 5123 * ab->b_buf may be invalid by now due to ARC eviction.
4662 5124 */
4663 5125 l2hdr = ab->b_l2hdr;
4664 5126 l2hdr->b_daddr = dev->l2ad_hand;
4665 5127
4666 5128 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667 5129 l2hdr->b_asize >= buf_compress_minsz) {
4668 5130 if (l2arc_compress_buf(l2hdr)) {
4669 5131 /*
4670 5132 * If compression succeeded, enable headroom
4671 5133 * boost on the next scan cycle.
4672 5134 */
4673 5135 *headroom_boost = B_TRUE;
4674 5136 }
4675 5137 }
4676 5138
4677 5139 /*
4678 5140 * Pick up the buffer data we had previously stashed away
4679 5141 * (and now potentially also compressed).
4680 5142 */
4681 5143 buf_data = l2hdr->b_tmp_cdata;
4682 5144 buf_sz = l2hdr->b_asize;
4683 5145
4684 5146 /* Compression may have squashed the buffer to zero length. */
4685 5147 if (buf_sz != 0) {
4686 5148 uint64_t buf_p_sz;
4687 5149
4688 5150 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689 5151 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690 5152 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691 5153 ZIO_FLAG_CANFAIL, B_FALSE);
4692 5154
4693 5155 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694 5156 zio_t *, wzio);
4695 5157 (void) zio_nowait(wzio);
4696 5158
4697 5159 write_asize += buf_sz;
4698 5160 /*
4699 5161 * Keep the clock hand suitably device-aligned.
4700 5162 */
4701 5163 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702 5164 write_psize += buf_p_sz;
4703 5165 dev->l2ad_hand += buf_p_sz;
4704 5166 }
4705 - }
4706 5167
5168 + l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
5169 + }
5170 + ASSERT(buf_index == num_bufs);
4707 5171 mutex_exit(&l2arc_buflist_mtx);
4708 5172
4709 5173 ASSERT3U(write_asize, <=, target_sz);
4710 5174 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711 5175 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712 5176 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713 5177 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714 5178 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715 5179
5180 + /* Is it time to commit this pbuf? */
5181 + if (L2PBUF_IS_FULL(pb) &&
5182 + dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
5183 + l2arc_pbuf_commit(dev, pio, cb);
5184 + l2arc_pbuf_destroy(pb);
5185 + l2arc_pbuf_init(pb);
5186 + }
5187 +
4716 5188 /*
4717 5189 * Bump device hand to the device start if it is approaching the end.
4718 5190 * l2arc_evict() will already have evicted ahead for this case.
4719 5191 */
4720 5192 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721 5193 vdev_space_update(dev->l2ad_vdev,
4722 5194 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723 5195 dev->l2ad_hand = dev->l2ad_start;
4724 5196 dev->l2ad_evict = dev->l2ad_start;
4725 5197 dev->l2ad_first = B_FALSE;
4726 5198 }
4727 5199
4728 5200 dev->l2ad_writing = B_TRUE;
4729 5201 (void) zio_wait(pio);
4730 5202 dev->l2ad_writing = B_FALSE;
4731 5203
4732 5204 return (write_asize);
4733 5205 }
4734 5206
4735 5207 /*
4736 5208 * Compresses an L2ARC buffer.
4737 5209 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4738 5210 * size in l2hdr->b_asize. This routine tries to compress the data and
4739 5211 * depending on the compression result there are three possible outcomes:
4740 5212 * *) The buffer was incompressible. The original l2hdr contents were left
4741 5213 * untouched and are ready for writing to an L2 device.
4742 5214 * *) The buffer was all-zeros, so there is no need to write it to an L2
4743 5215 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4744 5216 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4745 5217 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4746 5218 * data buffer which holds the compressed data to be written, and b_asize
4747 5219 * tells us how much data there is. b_compress is set to the appropriate
4748 5220 * compression algorithm. Once writing is done, invoke
4749 5221 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4750 5222 *
4751 5223 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4752 5224 * buffer was incompressible).
4753 5225 */
4754 5226 static boolean_t
4755 5227 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4756 5228 {
4757 5229 void *cdata;
4758 5230 size_t csize, len;
4759 5231
4760 5232 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4761 5233 ASSERT(l2hdr->b_tmp_cdata != NULL);
4762 5234
4763 5235 len = l2hdr->b_asize;
4764 5236 cdata = zio_data_buf_alloc(len);
4765 5237 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4766 5238 cdata, l2hdr->b_asize);
4767 5239
4768 5240 if (csize == 0) {
4769 5241 /* zero block, indicate that there's nothing to write */
4770 5242 zio_data_buf_free(cdata, len);
4771 5243 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4772 5244 l2hdr->b_asize = 0;
4773 5245 l2hdr->b_tmp_cdata = NULL;
4774 5246 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4775 5247 return (B_TRUE);
4776 5248 } else if (csize > 0 && csize < len) {
4777 5249 /*
4778 5250 * Compression succeeded, we'll keep the cdata around for
4779 5251 * writing and release it afterwards.
4780 5252 */
4781 5253 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4782 5254 l2hdr->b_asize = csize;
4783 5255 l2hdr->b_tmp_cdata = cdata;
4784 5256 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4785 5257 return (B_TRUE);
4786 5258 } else {
4787 5259 /*
4788 5260 * Compression failed, release the compressed buffer.
4789 5261 * l2hdr will be left unmodified.
4790 5262 */
4791 5263 zio_data_buf_free(cdata, len);
4792 5264 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4793 5265 return (B_FALSE);
4794 5266 }
4795 5267 }
4796 5268
4797 5269 /*
4798 5270 * Decompresses a zio read back from an l2arc device. On success, the
4799 5271 * underlying zio's io_data buffer is overwritten by the uncompressed
4800 5272 * version. On decompression error (corrupt compressed stream), the
4801 5273 * zio->io_error value is set to signal an I/O error.
4802 5274 *
4803 5275 * Please note that the compressed data stream is not checksummed, so
4804 5276 * if the underlying device is experiencing data corruption, we may feed
4805 5277 * corrupt data to the decompressor, so the decompressor needs to be
4806 5278 * able to handle this situation (LZ4 does).
4807 5279 */
4808 5280 static void
4809 5281 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4810 5282 {
4811 5283 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4812 5284
4813 5285 if (zio->io_error != 0) {
4814 5286 /*
4815 5287  * An io error has occurred; just restore the original io
4816 5288 * size in preparation for a main pool read.
4817 5289 */
4818 5290 zio->io_orig_size = zio->io_size = hdr->b_size;
4819 5291 return;
4820 5292 }
4821 5293
4822 5294 if (c == ZIO_COMPRESS_EMPTY) {
4823 5295 /*
4824 5296 * An empty buffer results in a null zio, which means we
4825 5297 * need to fill its io_data after we're done restoring the
4826 5298 * buffer's contents.
4827 5299 */
4828 5300 ASSERT(hdr->b_buf != NULL);
4829 5301 bzero(hdr->b_buf->b_data, hdr->b_size);
4830 5302 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4831 5303 } else {
4832 5304 ASSERT(zio->io_data != NULL);
4833 5305 /*
4834 5306 * We copy the compressed data from the start of the arc buffer
4835 5307 * (the zio_read will have pulled in only what we need, the
4836 5308 * rest is garbage which we will overwrite at decompression)
4837 5309 * and then decompress back to the ARC data buffer. This way we
4838 5310 * can minimize copying by simply decompressing back over the
4839 5311 * original compressed data (rather than decompressing to an
4840 5312 * aux buffer and then copying back the uncompressed buffer,
4841 5313 * which is likely to be much larger).
4842 5314 */
4843 5315 uint64_t csize;
4844 5316 void *cdata;
4845 5317
4846 5318 csize = zio->io_size;
4847 5319 cdata = zio_data_buf_alloc(csize);
4848 5320 bcopy(zio->io_data, cdata, csize);
4849 5321 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4850 5322 hdr->b_size) != 0)
4851 5323 zio->io_error = EIO;
4852 5324 zio_data_buf_free(cdata, csize);
4853 5325 }
4854 5326
4855 5327 /* Restore the expected uncompressed IO size. */
4856 5328 zio->io_orig_size = zio->io_size = hdr->b_size;
4857 5329 }
4858 5330
4859 5331 /*
4860 5332 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4861 5333 * This buffer serves as a temporary holder of compressed data while
4862 5334 * the buffer entry is being written to an l2arc device. Once that is
4863 5335 * done, we can dispose of it.
4864 5336 */
4865 5337 static void
4866 5338 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4867 5339 {
4868 5340 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4869 5341
4870 5342 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4871 5343 /*
4872 5344 * If the data was compressed, then we've allocated a
4873 5345 * temporary buffer for it, so now we need to release it.
4874 5346 */
4875 5347 ASSERT(l2hdr->b_tmp_cdata != NULL);
4876 5348 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4877 5349 }
4878 5350 l2hdr->b_tmp_cdata = NULL;
4879 5351 }
4880 5352
4881 5353 /*
4882 5354 * This thread feeds the L2ARC at regular intervals. This is the beating
4883 5355 * heart of the L2ARC.
4884 5356 */
4885 5357 static void
4886 5358 l2arc_feed_thread(void)
4887 5359 {
4888 5360 callb_cpr_t cpr;
4889 5361 l2arc_dev_t *dev;
4890 5362 spa_t *spa;
4891 5363 uint64_t size, wrote;
4892 5364 clock_t begin, next = ddi_get_lbolt();
4893 5365 boolean_t headroom_boost = B_FALSE;
4894 5366
4895 5367 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4896 5368
4897 5369 mutex_enter(&l2arc_feed_thr_lock);
4898 5370
4899 5371 while (l2arc_thread_exit == 0) {
4900 5372 CALLB_CPR_SAFE_BEGIN(&cpr);
4901 5373 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4902 5374 next);
4903 5375 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4904 5376 next = ddi_get_lbolt() + hz;
4905 5377
4906 5378 /*
4907 5379 * Quick check for L2ARC devices.
4908 5380 */
4909 5381 mutex_enter(&l2arc_dev_mtx);
4910 5382 if (l2arc_ndev == 0) {
4911 5383 mutex_exit(&l2arc_dev_mtx);
4912 5384 continue;
4913 5385 }
4914 5386 mutex_exit(&l2arc_dev_mtx);
4915 5387 begin = ddi_get_lbolt();
4916 5388
4917 5389 /*
4918 5390 * This selects the next l2arc device to write to, and in
4919 5391 * doing so the next spa to feed from: dev->l2ad_spa. This
4920 5392 * will return NULL if there are now no l2arc devices or if
4921 5393 * they are all faulted.
4922 5394 *
4923 5395 * If a device is returned, its spa's config lock is also
4924 5396 * held to prevent device removal. l2arc_dev_get_next()
4925 5397 * will grab and release l2arc_dev_mtx.
4926 5398 */
4927 5399 if ((dev = l2arc_dev_get_next()) == NULL)
4928 5400 continue;
4929 5401
4930 5402 spa = dev->l2ad_spa;
4931 5403 ASSERT(spa != NULL);
4932 5404
4933 5405 /*
4934 5406 * If the pool is read-only then force the feed thread to
4935 5407 * sleep a little longer.
4936 5408 */
4937 5409 if (!spa_writeable(spa)) {
4938 5410 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4939 5411 spa_config_exit(spa, SCL_L2ARC, dev);
4940 5412 continue;
4941 5413 }
4942 5414
4943 5415 /*
4944 5416 * Avoid contributing to memory pressure.
4945 5417 */
4946 5418 if (arc_reclaim_needed()) {
4947 5419 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4948 5420 spa_config_exit(spa, SCL_L2ARC, dev);
4949 5421 continue;
4950 5422 }
4951 5423
4952 5424 ARCSTAT_BUMP(arcstat_l2_feeds);
4953 5425
4954 5426 size = l2arc_write_size();
4955 5427
4956 5428 /*
4957 5429 * Evict L2ARC buffers that will be overwritten.
4958 5430 */
4959 5431 l2arc_evict(dev, size, B_FALSE);
4960 5432
4961 5433 /*
4962 5434 * Write ARC buffers.
4963 5435 */
4964 5436 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
4965 5437
4966 5438 /*
4967 5439 * Calculate interval between writes.
4968 5440 */
4969 5441 next = l2arc_write_interval(begin, size, wrote);
4970 5442 spa_config_exit(spa, SCL_L2ARC, dev);
4971 5443 }
4972 5444
4973 5445 l2arc_thread_exit = 0;
4974 5446 cv_broadcast(&l2arc_feed_thr_cv);
4975 5447 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
4976 5448 thread_exit();
4977 5449 }
4978 5450
4979 5451 boolean_t
4980 5452 l2arc_vdev_present(vdev_t *vd)
4981 5453 {
4982 5454 l2arc_dev_t *dev;
4983 5455
4984 5456 mutex_enter(&l2arc_dev_mtx);
4985 5457 for (dev = list_head(l2arc_dev_list); dev != NULL;
4986 5458 dev = list_next(l2arc_dev_list, dev)) {
4987 5459 if (dev->l2ad_vdev == vd)
4988 5460 break;
4989 5461 }
4990 5462 mutex_exit(&l2arc_dev_mtx);
4991 5463
4992 5464 return (dev != NULL);
4993 5465 }
4994 5466
4995 5467 /*
4996 5468 * Add a vdev for use by the L2ARC. By this point the spa has already
4997 - * validated the vdev and opened it.
5469 + * validated the vdev and opened it. The `rebuild' flag indicates whether
5470 + * we should attempt an L2ARC persistency rebuild.
4998 5471 */
4999 5472 void
5000 -l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5473 +l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5001 5474 {
5002 5475 l2arc_dev_t *adddev;
5003 5476
5004 5477 ASSERT(!l2arc_vdev_present(vd));
5005 5478
5006 5479 /*
5007 5480 * Create a new l2arc device entry.
5008 5481 */
5009 5482 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5010 5483 adddev->l2ad_spa = spa;
5011 5484 adddev->l2ad_vdev = vd;
5012 - adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5485 + adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5013 5486 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5014 5487 adddev->l2ad_hand = adddev->l2ad_start;
5015 5488 adddev->l2ad_evict = adddev->l2ad_start;
5016 5489 adddev->l2ad_first = B_TRUE;
5017 5490 adddev->l2ad_writing = B_FALSE;
5491 + l2arc_pbuf_init(&adddev->l2ad_pbuf);
5018 5492
5019 5493 /*
5020 5494 * This is a list of all ARC buffers that are still valid on the
5021 5495 * device.
5022 5496 */
5023 5497 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5024 5498 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5025 5499 offsetof(arc_buf_hdr_t, b_l2node));
5026 5500
5027 5501 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5028 5502
5029 5503 /*
5030 5504 * Add device to global list
5031 5505 */
5032 5506 mutex_enter(&l2arc_dev_mtx);
5033 5507 list_insert_head(l2arc_dev_list, adddev);
5034 5508 atomic_inc_64(&l2arc_ndev);
5509 + if (rebuild && l2arc_rebuild_enabled) {
5510 + adddev->l2ad_rebuilding = B_TRUE;
5511 + (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
5512 + 0, &p0, TS_RUN, minclsyspri);
5513 + }
5035 5514 mutex_exit(&l2arc_dev_mtx);
5036 5515 }
5037 5516
5038 5517 /*
5039 5518 * Remove a vdev from the L2ARC.
5040 5519 */
5041 5520 void
5042 5521 l2arc_remove_vdev(vdev_t *vd)
5043 5522 {
5044 5523 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5045 5524
5046 5525 /*
5047 5526 * Find the device by vdev
5048 5527 */
5049 5528 mutex_enter(&l2arc_dev_mtx);
5050 5529 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5051 5530 nextdev = list_next(l2arc_dev_list, dev);
5052 5531 if (vd == dev->l2ad_vdev) {
5053 5532 remdev = dev;
5054 5533 break;
5055 5534 }
5056 5535 }
5057 5536 ASSERT(remdev != NULL);
5058 5537
5059 5538 /*
5060 5539 * Remove device from global list
5061 5540 */
5062 5541 list_remove(l2arc_dev_list, remdev);
5063 5542 l2arc_dev_last = NULL; /* may have been invalidated */
5064 5543 atomic_dec_64(&l2arc_ndev);
5065 5544 mutex_exit(&l2arc_dev_mtx);
5066 5545
5067 5546 /*
5068 5547 * Clear all buflists and ARC references. L2ARC device flush.
5069 5548 */
5549 + l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5070 5550 l2arc_evict(remdev, 0, B_TRUE);
5071 5551 list_destroy(remdev->l2ad_buflist);
5072 5552 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5073 5553 kmem_free(remdev, sizeof (l2arc_dev_t));
5074 5554 }
5075 5555
5076 5556 void
5077 5557 l2arc_init(void)
5078 5558 {
5079 5559 l2arc_thread_exit = 0;
5080 5560 l2arc_ndev = 0;
5081 5561 l2arc_writes_sent = 0;
5082 5562 l2arc_writes_done = 0;
5083 5563
5084 5564 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5085 5565 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5086 5566 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5087 5567 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5088 5568 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5089 5569
5090 5570 l2arc_dev_list = &L2ARC_dev_list;
5091 5571 l2arc_free_on_write = &L2ARC_free_on_write;
5092 5572 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5093 5573 offsetof(l2arc_dev_t, l2ad_node));
5094 5574 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5095 5575 offsetof(l2arc_data_free_t, l2df_list_node));
5096 5576 }
5097 5577
5098 5578 void
5099 5579 l2arc_fini(void)
5100 5580 {
5101 5581 /*
5102 5582 * This is called from dmu_fini(), which is called from spa_fini();
5103 5583 * Because of this, we can assume that all l2arc devices have
5104 5584 * already been removed when the pools themselves were removed.
5105 5585 */
5106 5586
5107 5587 l2arc_do_free_on_write();
5108 5588
5109 5589 mutex_destroy(&l2arc_feed_thr_lock);
5110 5590 cv_destroy(&l2arc_feed_thr_cv);
5111 5591 mutex_destroy(&l2arc_dev_mtx);
5112 5592 mutex_destroy(&l2arc_buflist_mtx);
5113 5593 mutex_destroy(&l2arc_free_on_write_mtx);
5114 5594
5115 5595 list_destroy(l2arc_dev_list);
5116 5596 list_destroy(l2arc_free_on_write);
5117 5597 }
5118 5598
5119 5599 void
5120 5600 l2arc_start(void)
5121 5601 {
5122 5602 if (!(spa_mode_global & FWRITE))
5123 5603 return;
5124 5604
5125 5605 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5126 5606 TS_RUN, minclsyspri);
5127 5607 }
5128 5608
5129 5609 void
5130 5610 l2arc_stop(void)
5131 5611 {
5132 5612 if (!(spa_mode_global & FWRITE))
5133 5613 return;
5134 5614
5135 5615 mutex_enter(&l2arc_feed_thr_lock);
5136 5616 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5137 5617 l2arc_thread_exit = 1;
5138 5618 while (l2arc_thread_exit != 0)
5139 5619 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5140 5620 mutex_exit(&l2arc_feed_thr_lock);
5621 +}
5622 +
5623 +/*
5624 + * Main entry point for L2ARC metadata rebuilding. This function must be
5625 + * called via thread_create so that the L2ARC metadata rebuild doesn't block
5626 + * pool import and may proceed in parallel on all available L2ARC devices.
5627 + */
5628 +static void
5629 +l2arc_rebuild_start(l2arc_dev_t *dev)
5630 +{
5631 + vdev_t *vd = dev->l2ad_vdev;
5632 + spa_t *spa = dev->l2ad_spa;
5633 +
5634 + /* Lock out device removal. */
5635 + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5636 + ASSERT(dev->l2ad_rebuilding == B_TRUE);
5637 + l2arc_rebuild(dev);
5638 + dev->l2ad_rebuilding = B_FALSE;
5639 + spa_config_exit(spa, SCL_L2ARC, vd);
5640 + thread_exit();
5641 +}
5642 +
5643 +/*
5644 + * This function implements the actual L2ARC metadata rebuild. It:
5645 + *
5646 + * 1) scans the device for valid l2uberblocks
5647 + * 2) if it finds a good uberblock, starts reading the pbuf chain
5648 + * 3) restores each pbuf's contents to memory
5649 + *
5650 + * Operation stops under any of the following conditions:
5651 + *
5652 + * 1) We reach the end of the pbuf chain (the previous-buffer reference
5653 + * in the pbuf is zero).
5654 + * 2) We encounter *any* error condition (cksum errors, io errors, looped
5655 + * pbufs, etc.).
5656 + * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
5657 + *    severely fragmented L2ARC pbufs or slow L2ARC devices from
5658 + *    preventing a machine from importing the pool (and letting the
5659 + * administrator take corrective action, e.g. by kicking the misbehaving
5660 + * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661 + * rebuilding disabled).
5662 + */
5663 +static void
5664 +l2arc_rebuild(l2arc_dev_t *dev)
5665 +{
5666 + int err;
5667 + l2uberblock_t ub;
5668 + l2pbuf_t pb;
5669 + zio_t *this_io = NULL, *next_io = NULL;
5670 + int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671 +
5672 + if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673 + return;
5674 + L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675 +
5676 + /* set up uberblock update info */
5677 + dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678 +
5679 + /* initial sanity checks */
5680 + l2arc_pbuf_init(&pb);
5681 + if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682 + ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683 + /* root pbuf is bad, we can't do anything about that */
5684 + if (err == EINVAL) {
5685 + ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686 + } else {
5687 + ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688 + }
5689 + l2arc_pbuf_destroy(&pb);
5690 + return;
5691 + }
5692 + L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693 +
5694 + dev->l2ad_evict = ub.ub_evict_tail;
5695 +
5696 + /* keep on chaining in new blocks */
5697 + dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698 + dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699 + dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700 + dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701 + ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702 + dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703 +
5704 + /* start the rebuild process */
5705 + for (;;) {
5706 + l2pbuf_t pb_prev;
5707 +
5708 + l2arc_pbuf_init(&pb_prev);
5709 + if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710 + pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711 + &next_io)) != 0) {
5712 + /*
5713 + * We are done reading, discard the last good buffer.
5714 + */
5715 + if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716 + pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717 + /* this is an error, we stopped too early */
5718 + if (err == EINVAL) {
5719 + ARCSTAT_BUMP(
5720 + arcstat_l2_rebuild_cksum_errors);
5721 + } else {
5722 + ARCSTAT_BUMP(
5723 + arcstat_l2_rebuild_io_errors);
5724 + }
5725 + }
5726 + l2arc_pbuf_destroy(&pb_prev);
5727 + l2arc_pbuf_destroy(&pb);
5728 + break;
5729 + }
5730 +
5731 + /*
5732 + * Protection against infinite loops of pbufs. This is also
5733 + * our primary termination mechanism - once the buffer list
5734 + * loops around our starting pbuf, we can stop.
5735 + */
5736 + if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737 + pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738 + ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739 + l2arc_pbuf_destroy(&pb);
5740 + l2arc_pbuf_destroy(&pb_prev);
5741 + if (next_io)
5742 + l2arc_pbuf_prefetch_abort(next_io);
5743 + return;
5744 + }
5745 +
5746 + /*
5747 + * Our memory pressure valve. If the system is running low
5748 + * on memory, rather than swamping memory with new ARC buf
5749 + * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750 + * however, we have already set up our L2ARC dev to chain in
5751 + * new metadata pbufs, so the user may choose to re-add the
5752 + * L2ARC dev at a later time to reconstruct it (when there's
5753 + * less memory pressure).
5754 + */
5755 + if (arc_reclaim_needed()) {
5756 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5757 + cmn_err(CE_NOTE, "System running low on memory, "
5758 + "aborting L2ARC rebuild.");
5759 + l2arc_pbuf_destroy(&pb);
5760 + l2arc_pbuf_destroy(&pb_prev);
5761 + if (next_io)
5762 + l2arc_pbuf_prefetch_abort(next_io);
5763 + break;
5764 + }
5765 +
5766 + /*
5767 + * Now that we know that the prev_pbuf checks out alright, we
5768 + * can start reconstruction from this pbuf - we can be sure
5769 + * that the L2ARC write hand has not yet reached any of our
5770 + * buffers.
5771 + */
5772 + l2arc_pbuf_restore(dev, &pb);
5773 +
5774 + /* pbuf restored, continue with next one in the list */
5775 + l2arc_pbuf_destroy(&pb);
5776 + pb = pb_prev;
5777 + this_io = next_io;
5778 + next_io = NULL;
5779 +
5780 + L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5781 + }
5782 +
5783 + ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5784 +}
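A compact standalone model of the chain-walk termination logic above; the pbuf_model_t type and the device addresses are hypothetical, and the loop guard is reduced to a single comparison for brevity:

/* Standalone model of the pbuf chain walk; illustration only, not arc.c code. */
#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t daddr;		/* where this pbuf lives on the device */
	uint64_t prev_daddr;	/* device address of the previous pbuf */
	int	 valid;		/* does reading/cksumming it succeed? */
} pbuf_model_t;

int
main(void)
{
	/* A small chain written at increasing addresses; index 3 is newest. */
	pbuf_model_t chain[] = {
		{ 1000, 0,    1 },	/* oldest; prev_daddr 0 ends the walk */
		{ 2000, 1000, 1 },
		{ 3000, 2000, 1 },
		{ 4000, 3000, 1 },	/* pointed to by the uberblock */
	};
	uint64_t start_daddr = chain[3].daddr;
	int restored = 0;

	for (int i = 3; ; ) {
		pbuf_model_t *pb = &chain[i];
		int prev = -1;

		/* Find the previous pbuf by address; stop if unreadable. */
		for (int j = 0; j < 4; j++)
			if (chain[j].daddr == pb->prev_daddr && chain[j].valid)
				prev = j;
		if (prev == -1)
			break;			/* end of chain or bad pbuf */
		/* Loop guard: stop once the chain wraps past our start. */
		if (pb->prev_daddr >= start_daddr)
			break;
		restored++;	/* "restore" pb, then step to its predecessor */
		i = prev;
	}
	printf("restored %d pbufs\n", restored);	/* prints 3 */
	return (0);
}

As in the function above, the oldest readable pbuf - the one whose predecessor cannot be fetched - is discarded rather than restored.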
5785 +
5786 +/*
5787 + * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
5788 + * which only contain an l2arc hdr, essentially restoring the buffers to
5789 + * their L2ARC evicted state. This function also updates space usage on the
5790 + * L2ARC vdev to make sure it tracks restored buffers.
5791 + */
5792 +static void
5793 +l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
5794 +{
5795 + spa_t *spa;
5796 + uint64_t guid;
5797 + list_t *buflists_list;
5798 + l2pbuf_buflist_t *buflist;
5799 +
5800 + mutex_enter(&l2arc_buflist_mtx);
5801 + spa = dev->l2ad_vdev->vdev_spa;
5802 + guid = spa_load_guid(spa);
5803 + buflists_list = pb->pb_buflists_list;
5804 + for (buflist = list_head(buflists_list); buflist;
5805 + buflist = list_next(buflists_list, buflist)) {
5806 + int i;
5807 + uint64_t size, asize, psize;
5808 +
5809 + size = asize = psize = 0;
5810 + for (i = 0; i < buflist->l2pbl_nbufs; i++) {
5811 + l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
5812 + guid);
5813 + size += buflist->l2pbl_bufs[i].b_size;
5814 + asize += buflist->l2pbl_bufs[i].b_l2asize;
5815 + psize += vdev_psize_to_asize(dev->l2ad_vdev,
5816 + buflist->l2pbl_bufs[i].b_l2asize);
5817 + }
5818 + ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
5819 + ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
5820 + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
5821 + vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5822 + }
5823 + mutex_exit(&l2arc_buflist_mtx);
5824 + ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
5825 + vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
5826 + pb->pb_asize), 0, 0);
5827 +}
5828 +
5829 +/*
5830 + * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
5831 + * a state indicating that it has been evicted to L2ARC.
5832 + * The `guid' here is the ARC-load-guid from spa_load_guid.
5833 + */
5834 +static void
5835 +l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
5836 +{
5837 + arc_buf_hdr_t *hdr;
5838 + kmutex_t *hash_lock;
5839 + dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
5840 +
5841 + hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
5842 + if (hdr == NULL) {
5843 + /* not in cache, try to insert */
5844 + arc_buf_hdr_t *exists;
5845 + arc_buf_contents_t type = buf->b_contents_type;
5846 + l2arc_buf_hdr_t *l2hdr;
5847 +
5848 + hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
5849 + hdr->b_dva = buf->b_dva;
5850 + hdr->b_birth = buf->b_birth;
5851 + hdr->b_cksum0 = buf->b_cksum0;
5852 + hdr->b_size = buf->b_size;
5853 + exists = buf_hash_insert(hdr, &hash_lock);
5854 + if (exists) {
5855 + /* somebody beat us to the hash insert */
5856 + mutex_exit(hash_lock);
5857 + arc_hdr_destroy(hdr);
5858 + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5859 + return;
5860 + }
5861 + hdr->b_flags = buf->b_flags;
5862 + mutex_enter(&hdr->b_freeze_lock);
5863 + ASSERT(hdr->b_freeze_cksum == NULL);
5864 + hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
5865 + KM_SLEEP);
5866 + *hdr->b_freeze_cksum = buf->b_freeze_cksum;
5867 + mutex_exit(&hdr->b_freeze_lock);
5868 +
5869 + /* now rebuild the l2arc entry */
5870 + ASSERT(hdr->b_l2hdr == NULL);
5871 + l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5872 + l2hdr->b_dev = dev;
5873 + l2hdr->b_daddr = buf->b_l2daddr;
5874 + l2hdr->b_asize = buf->b_l2asize;
5875 + l2hdr->b_compress = buf->b_l2compress;
5876 + hdr->b_l2hdr = l2hdr;
5877 + list_insert_head(dev->l2ad_buflist, hdr);
5878 + ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5879 + ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5880 +
5881 + arc_change_state(arc_l2c_only, hdr, hash_lock);
5882 + }
5883 + mutex_exit(hash_lock);
5884 +}
5885 +
5886 +/*
5887 + * Attempts to locate and read the newest valid uberblock on the provided
5888 + * L2ARC device and writes it to `ub'. On success, this function returns 0,
5889 + * otherwise the appropriate error code is returned.
5890 + */
5891 +static int
5892 +l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
5893 +{
5894 + int err = 0;
5895 + uint8_t *ub_buf;
5896 + uint64_t guid;
5897 +
5898 + ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
5899 + ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
5900 + guid = spa_guid(dev->l2ad_vdev->vdev_spa);
5901 +
5902 + if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
5903 + VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
5904 + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
5905 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
5906 + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
5907 + ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5908 + goto cleanup;
5909 + }
5910 +
5911 + /*
5912 + * Initial peek - does the device even have any usable uberblocks?
5913 + * If not, don't bother continuing.
5914 + */
5915 + l2arc_uberblock_decode(ub_buf, ub);
5916 + if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
5917 + ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
5918 + ub->ub_spa_guid != guid) {
5919 + err = ENOTSUP;
5920 + ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
5921 + goto cleanup;
5922 + }
5923 +
5924 + /* now check to make sure that what we selected is okay */
5925 + if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
5926 + if (err == EINVAL) {
5927 + ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5928 + } else {
5929 + ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
5930 + }
5931 + goto cleanup;
5932 + }
5933 +
5934 + /* this uberblock is valid */
5935 +
5936 +cleanup:
5937 + kmem_free(ub_buf, L2UBERBLOCK_SIZE);
5938 + return (err);
5939 +}
5940 +
5941 +/*
5942 + * Reads a pbuf from storage, decodes it and validates its contents against
5943 + * the provided checksum. The result is placed in `pb'.
5944 + *
5945 + * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
5946 + * When issuing the first pbuf IO during rebuild, you should pass NULL for
5947 + * `this_io'. This function will then issue a sync IO to read the pbuf and
5948 + * also issue an async IO to fetch the next pbuf in the pbuf chain. The
5949 + * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
5950 + * function, pass the value returned in `prefetch_io' from the previous
5951 + * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
5952 + * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
5953 + * pointer to be NULL. If no prefetch IO was issued, the pointer is left
5954 + * set at NULL.
5955 + *
5956 + * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
5957 + * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
5958 + * IO is used internally in this function to be able to `peek' at the next
5959 + * buffer's header before the main IO to read it in completely has finished.
5960 + * We can then begin to issue the IO for the next buffer in the chain before
5961 + * we are done reading, keeping the L2ARC device's pipeline saturated with
5962 + * reads (rather than issuing an IO, waiting for it to complete, validating
5963 + * the returned buffer and issuing the next one). This will make sure that
5964 + * the rebuild proceeds at maximum read throughput.
5965 + *
5966 + * On success, this function returns 0, otherwise it returns an appropriate
5967 + * error code. On error the prefetching IO is aborted and cleared before
5968 + * returning from this function. Therefore, if we return `success', the
5969 + * caller can assume that we have taken care of cleanup of prefetch IOs.
5970 + */
5971 +static int
5972 +l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
5973 + zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
5974 +{
5975 + int err = 0;
5976 + uint64_t prev_pb_start;
5977 + uint32_t prev_pb_asize;
5978 + zio_cksum_t calc_cksum, prev_pb_cksum;
5979 + l2arc_prefetch_info_t *pi = NULL;
5980 +
5981 + ASSERT(dev != NULL);
5982 + ASSERT(pb != NULL);
5983 + ASSERT(*prefetch_io == NULL);
5984 +
5985 + if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
5986 + /* We could not have issued a prefetch IO for this */
5987 + ASSERT(this_io == NULL);
5988 + return (EINVAL);
5989 + }
5990 +
5991 + /*
5992 + * Check to see if we have issued the IO for this pbuf in a previous
5993 + * run. If not, issue it now.
5994 + */
5995 + if (this_io == NULL)
5996 + this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
5997 +
5998 + /* Pick up the prefetch info buffer and read its contents */
5999 + pi = this_io->io_private;
6000 + ASSERT(pi != NULL);
6001 + ASSERT(asize <= pi->pi_buflen);
6002 +
6003 + /* Wait for the IO to read this pbuf's header to complete */
6004 + if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
6005 + (void) zio_wait(this_io);
6006 + goto cleanup;
6007 + }
6008 +
6009 + /*
6010 + * Peek to see if we can start issuing the next pbuf IO immediately.
6011 + * At this point, only the current pbuf's header has been read.
6012 + */
6013 + if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
6014 + &prev_pb_asize, &prev_pb_cksum) == 0) {
6015 + uint64_t this_pb_start, this_pb_end, prev_pb_end;
6016 + /* Detect malformed pbuf references and loops */
6017 + this_pb_start = daddr;
6018 + this_pb_end = daddr + asize;
6019 + prev_pb_end = prev_pb_start + prev_pb_asize;
6020 + if ((prev_pb_start >= this_pb_start && prev_pb_start <
6021 + this_pb_end) ||
6022 + (prev_pb_end >= this_pb_start && prev_pb_end <
6023 + this_pb_end)) {
6024 + ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
6025 + cmn_err(CE_WARN, "Looping L2ARC metadata reference "
6026 + "detected, aborting rebuild.");
6027 + err = EINVAL;
6028 + goto cleanup;
6029 + }
6030 + /*
6031 + * Start issuing IO for the next pbuf early - this should
6032 + * help keep the L2ARC device busy while we read, decode
6033 + * and restore this pbuf.
6034 + */
6035 + if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
6036 + *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
6037 + prev_pb_start, prev_pb_asize);
6038 + }
6039 +
6040 + /* Wait for the main pbuf IO to complete */
6041 + if ((err = zio_wait(this_io)) != 0)
6042 + goto cleanup;
6043 +
6044 + /* Make sure the buffer checks out ok */
6045 + fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
6046 + if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
6047 + err = EINVAL;
6048 + goto cleanup;
6049 + }
6050 +
6051 + /* Now we can take our time decoding this buffer */
6052 + if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
6053 + goto cleanup;
6054 +
6055 + /* This will be used in l2arc_pbuf_restore for space accounting */
6056 + pb->pb_asize = asize;
6057 +
6058 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
6059 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
6060 + ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6061 + pb->pb_payload_asz / asize);
6062 +
6063 +cleanup:
6064 + kmem_free(pi->pi_buf, pi->pi_buflen);
6065 + pi->pi_buf = NULL;
6066 + kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6067 + /* Abort an in-flight prefetch in case of error */
6068 + if (err != 0 && *prefetch_io != NULL) {
6069 + l2arc_pbuf_prefetch_abort(*prefetch_io);
6070 + *prefetch_io = NULL;
6071 + }
6072 + return (err);
6073 +}
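
A minimal sketch of how a caller might chain these reads, assuming `daddr', `asize' and `cksum' have been seeded from a verified uberblock; the actual rebuild loop lives elsewhere in this patch and may differ in detail:

	l2pbuf_t pb;
	zio_t *this_io = NULL, *next_io = NULL;

	for (;;) {
		l2arc_pbuf_init(&pb);
		if (l2arc_pbuf_read(dev, daddr, asize, cksum, &pb,
		    this_io, &next_io) != 0) {
			l2arc_pbuf_destroy(&pb);
			break;
		}
		/* restore pb's contents into the ARC, then follow the chain */
		daddr = pb.pb_prev_daddr;
		asize = pb.pb_prev_asize;
		cksum = pb.pb_prev_cksum;
		l2arc_pbuf_destroy(&pb);
		this_io = next_io;
		next_io = NULL;
	}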
6074 +
6075 +/*
6076 + * Validates a pbuf device address to make sure that it can be read
6077 + * from the provided L2ARC device. Returns 1 if the address is within
6078 + * the device's bounds, or 0 if not.
6079 + */
6080 +static int
6081 +l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
6082 +{
6083 + uint32_t psize;
6084 + uint64_t end;
6085 +
6086 + psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
6087 + end = daddr + psize;
6088 +
6089 + if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
6090 + asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
6091 + /* check that the buffer address is correctly aligned */
6092 + (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
6093 + SPA_MINBLOCKSIZE) - 1)) != 0)
6094 + return (0);
6095 + else
6096 + return (1);
6097 +}
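
The alignment clause above relies on the usual power-of-two mask trick; a standalone sketch, assuming a hypothetical 512-byte minimum allocation unit (the real code derives it from the vdev via vdev_psize_to_asize(..., SPA_MINBLOCKSIZE)):

	#include <stdint.h>

	/* Nonzero if daddr is a multiple of min_alloc; min_alloc must be a power of 2. */
	static int
	daddr_is_aligned(uint64_t daddr, uint64_t min_alloc)
	{
		return ((daddr & (min_alloc - 1)) == 0);
	}

	/* e.g. daddr_is_aligned(9216, 512) is 1; daddr_is_aligned(9217, 512) is 0. */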
6098 +
6099 +/*
6100 + * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
6101 + * reconstruction to start reading the next pbuf before we are done
6102 + * decoding and reconstructing the current pbuf. This keeps the L2ARC device
6103 + * busy with read IO while the CPU-intensive decode work proceeds.
6104 + * The returned zio contains a newly allocated memory buffer for the IO
6105 + * data, which the caller should free once the zio is no longer needed
6106 + * (i.e. once it has completed). To abort this zio, use
6107 + * l2arc_pbuf_prefetch_abort, which takes care of disposing of the
6108 + * allocated buffer correctly.
6109 + */
6110 +static zio_t *
6111 +l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
6112 +{
6113 + uint32_t i, psize;
6114 + zio_t *pio, *hdr_io;
6115 + uint64_t hdr_rsize;
6116 + uint8_t *buf;
6117 + l2arc_prefetch_info_t *pinfo;
6118 +
6119 + psize = vdev_psize_to_asize(vd, asize);
6120 + buf = kmem_alloc(psize, KM_SLEEP);
6121 + pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
6122 + pinfo->pi_buf = buf;
6123 + pinfo->pi_buflen = psize;
6124 +
6125 + /*
6126 + * We start issuing the IO for the pbuf header early. This
6127 + * allows l2arc_pbuf_read to start issuing IO for the next
6128 + * buffer before the current pbuf is read in completely.
6129 + */
6130 +
6131 + hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
6132 + ASSERT(hdr_rsize <= psize);
6133 + pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
6134 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6135 + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
6136 + hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
6137 + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
6138 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6139 + ZIO_FLAG_DONT_RETRY, B_FALSE);
6140 + (void) zio_nowait(hdr_io);
6141 +
6142 + /*
6143 + * Read in the rest of the pbuf - this can take longer than just
6144 + * having a peek at the header.
6145 + */
6146 + pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
6147 + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6148 + ZIO_FLAG_DONT_RETRY);
6149 + for (i = hdr_rsize; i < psize; ) {
6150 + uint64_t rsize = psize - i;
6151 + zio_t *rzio;
6152 +
6153 + if (psize - i > SPA_MAXBLOCKSIZE)
6154 + rsize = SPA_MAXBLOCKSIZE;
6155 + ASSERT(rsize >= SPA_MINBLOCKSIZE);
6156 + rzio = zio_read_phys(pio, vd, daddr + i,
6157 + rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6158 + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
6159 + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6160 + ZIO_FLAG_DONT_RETRY, B_FALSE);
6161 + (void) zio_nowait(rzio);
6162 + i += rsize;
6163 + }
6164 +
6165 + return (pio);
6166 +}
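
The read loop above never issues a single physical IO larger than SPA_MAXBLOCKSIZE; a minimal userland-style sketch of the same chunking pattern, with a hypothetical callback standing in for zio_read_phys():

	#include <stdint.h>

	typedef void (*issue_read_fn)(uint64_t offset, uint64_t len, void *arg);

	/* Split the region [start, start + total) into reads of at most max_chunk bytes. */
	static void
	issue_chunked_reads(uint64_t start, uint64_t total, uint64_t max_chunk,
	    issue_read_fn issue, void *arg)
	{
		uint64_t done;

		for (done = 0; done < total; ) {
			uint64_t len = total - done;

			if (len > max_chunk)
				len = max_chunk;
			issue(start + done, len, arg);
			done += len;
		}
	}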
6167 +
6168 +/*
6169 + * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
6170 + * buffers allocated for it.
6171 + */
6172 +static void
6173 +l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 +{
6175 + l2arc_prefetch_info_t *pi;
6176 +
6177 + pi = zio->io_private;
6178 + ASSERT(pi != NULL);
6179 + if (pi->pi_hdr_io != NULL)
6180 + (void) zio_wait(pi->pi_hdr_io);
6181 + (void) zio_wait(zio);
6182 + kmem_free(pi->pi_buf, pi->pi_buflen);
6183 + pi->pi_buf = NULL;
6184 + kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 +}
6186 +
6187 +/*
6188 + * Encodes an l2uberblock_t structure into a destination buffer. This
6189 + * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190 + * uberblock is always of this constant size.
6191 + */
6192 +static void
6193 +l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 +{
6195 + zio_cksum_t cksum;
6196 +
6197 + bzero(buf, L2UBERBLOCK_SIZE);
6198 +
6199 +#if defined(_BIG_ENDIAN)
6200 + *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
6201 +	*(uint16_t *)(buf + 6) = L2UBLK_BIG_ENDIAN;
6202 +#else /* !defined(_BIG_ENDIAN) */
6203 + *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
6204 + /* zero flags is ok */
6205 +#endif /* !defined(_BIG_ENDIAN) */
6206 + buf[4] = L2UBERBLOCK_MAX_VERSION;
6207 +
6208 + /* rest in native byte order */
6209 + *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210 + *(uint64_t *)(buf + 16) = ub->ub_birth;
6211 + *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212 + *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213 + *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214 + *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215 + bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216 +
6217 + fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218 + bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 +}
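
For reference, the byte layout implied by the stores above; the magic and flags are stored big-endian, everything from offset 8 onward is in the writer's native order, and L2UBERBLOCK_SIZE is defined elsewhere in this patch:

	/*
	 *	offset	field
	 *	 0	uint32	ub_magic	(L2UBERBLOCK_MAGIC)
	 *	 4	uint8	ub_version	(L2UBERBLOCK_MAX_VERSION)
	 *	 6	uint16	ub_flags
	 *	 8	uint64	ub_spa_guid
	 *	16	uint64	ub_birth
	 *	24	uint64	ub_evict_tail
	 *	32	uint64	ub_alloc_space
	 *	40	uint64	ub_pbuf_daddr
	 *	48	uint32	ub_pbuf_asize
	 *	52	zio_cksum_t (32 bytes)	ub_pbuf_cksum
	 *	...	zero padding, if any
	 *	L2UBERBLOCK_SIZE - 32	zio_cksum_t	ub_cksum
	 *		(fletcher4 over the preceding L2UBERBLOCK_SIZE - 32 bytes)
	 */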
6220 +
6221 +/*
6222 + * Decodes an l2uberblock_t from an on-disk representation. Please note
6223 + * that this function does not perform any uberblock validation and
6224 + * checksumming - call l2arc_uberblock_verify() for that.
6225 + */
6226 +static void
6227 +l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 +{
6229 + boolean_t bswap_needed;
6230 +
6231 + /* these always come in big endian */
6232 +#if defined(_BIG_ENDIAN)
6233 + ub->ub_magic = *(uint32_t *)buf;
6234 + ub->ub_flags = *(uint16_t *)(buf + 6);
6235 +	bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) == 0);
6236 +#else /* !defined(_BIG_ENDIAN) */
6237 + ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238 + ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239 + bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 +#endif /* !defined(_BIG_ENDIAN) */
6241 + ub->ub_version = buf[4];
6242 +
6243 + ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244 + ub->ub_birth = *(uint64_t *)(buf + 16);
6245 + ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246 + ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247 + ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248 + ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
6249 +	bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250 + bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251 +
6252 + /* swap the rest if endianness doesn't match us */
6253 + if (bswap_needed) {
6254 + ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255 + ub->ub_birth = BSWAP_64(ub->ub_birth);
6256 + ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257 + ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258 + ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259 + ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260 + ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261 + ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262 + }
6263 +}
6264 +
6265 +/*
6266 + * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267 + * valid and matches its checksum.
6268 + */
6269 +static int
6270 +l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
6271 + uint64_t guid)
6272 +{
6273 + zio_cksum_t cksum;
6274 +
6275 + if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
6276 + ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
6277 + /*
6278 + * bad magic or invalid version => persistent l2arc not
6279 + * supported
6280 + */
6281 + return (ENOTSUP);
6282 +
6283 + if (ub->ub_spa_guid != guid)
6284 + /* this l2arc dev isn't ours */
6285 + return (EINVAL);
6286 +
6287 + fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6288 + if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
6289 + /* bad checksum, corrupt uberblock */
6290 + return (EINVAL);
6291 +
6292 + return (0);
6293 +}
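
A minimal sketch of how the decode and verify steps pair up on the read side, assuming `buf' holds L2UBERBLOCK_SIZE bytes read from offset VDEV_LABEL_START_SIZE of the cache device (the offset used by l2arc_uberblock_update below); the real rebuild path elsewhere in this patch may differ:

	l2uberblock_t ub;

	l2arc_uberblock_decode(buf, &ub);
	if ((err = l2arc_uberblock_verify(buf, &ub,
	    spa_guid(dev->l2ad_vdev->vdev_spa))) != 0)
		return (err);	/* ENOTSUP or EINVAL: not ours or corrupt */
	/* ub.ub_pbuf_daddr, ub_pbuf_asize and ub_pbuf_cksum seed the pbuf chain */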
6294 +
6295 +/*
6296 + * Schedules a zio to update the uberblock on an l2arc device. The zio is
6297 + * initiated as a child of `pio' and `cb' is filled with the information
6298 + * needed to free the uberblock data buffer after writing.
6299 + */
6300 +static void
6301 +l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6302 +{
6303 + uint8_t *ub_buf;
6304 + l2uberblock_t ub;
6305 + zio_t *wzio;
6306 + vdev_stat_t st;
6307 +
6308 + ASSERT(cb->l2wcb_ub_buf == NULL);
6309 + vdev_get_stats(dev->l2ad_vdev, &st);
6310 +
6311 + bzero(&ub, sizeof (ub));
6312 + ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6313 + ub.ub_birth = dev->l2ad_uberblock_birth++;
6314 + ub.ub_evict_tail = dev->l2ad_evict;
6315 + ub.ub_alloc_space = st.vs_alloc;
6316 + ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
6317 + ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
6318 + ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
6319 + if (dev->l2ad_first)
6320 + ub.ub_flags |= L2UBLK_EVICT_FIRST;
6321 +
6322 + ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
6323 + cb->l2wcb_ub_buf = ub_buf;
6324 + l2arc_uberblock_encode(&ub, ub_buf);
6325 + wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6326 + L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
6327 + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6328 + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6329 + zio_t *, wzio);
6330 + (void) zio_nowait(wzio);
6331 +}
6332 +
6333 +/*
6334 + * Encodes a l2pbuf_t structure into the portable on-disk format. The
6335 + * `buf' buffer must be suitably sized to hold the entire uncompressed
6336 + * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
6337 + * also compresses the buffer.
6338 + *
6339 + * The return value is the length of the resulting encoded pbuf structure.
6340 + * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
6341 + * was applied, or smaller if compression was applied. In either case,
6342 + * prior to writing to disk, the caller must suitably pad the output
6343 + * buffer so that it is aligned on a multiple of the underlying storage
6344 + * system's block size.
6345 + */
6346 +static uint32_t
6347 +l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
6348 +{
6349 + uint16_t flags = 0;
6350 + uint8_t *dst_buf;
6351 + uint32_t enclen;
6352 + l2pbuf_buflist_t *buflist;
6353 +
6354 + enclen = L2PBUF_ENCODED_SIZE(pb);
6355 + ASSERT(buflen >= enclen);
6356 + bzero(buf, enclen);
6357 +
6358 + /* non-header portions of pbufs are in native byte order */
6359 + *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
6360 + *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
6361 + bcopy(&pb->pb_prev_cksum, buf + 20, 32);
6362 + *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
6363 +
6364 + /* first we encode the buflists uncompressed */
6365 + dst_buf = buf + L2PBUF_HDR_SIZE;
6366 + for (buflist = list_head(pb->pb_buflists_list); buflist;
6367 + buflist = list_next(pb->pb_buflists_list, buflist)) {
6368 + int i;
6369 +
6370 + ASSERT(buflist->l2pbl_nbufs != 0);
6371 + for (i = 0; i < buflist->l2pbl_nbufs; i++) {
6372 + l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6373 +
6374 + ASSERT(pbl_buf->b_size != 0);
6375 + *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
6376 + *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
6377 + *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
6378 + *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
6379 + bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
6380 + *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
6381 + *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
6382 + *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
6383 + dst_buf[80] = pbl_buf->b_l2compress;
6384 + dst_buf[81] = pbl_buf->b_contents_type;
6385 + *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
6386 + dst_buf += L2PBUF_BUF_SIZE;
6387 + }
6388 + }
6389 + ASSERT((uint32_t)(dst_buf - buf) == enclen);
6390 +
6391 + /* and then compress them if necessary */
6392 + if (enclen >= l2arc_pbuf_compress_minsz) {
6393 + uint8_t *cbuf;
6394 + size_t slen, clen;
6395 +
6396 + slen = l2arc_pbuf_items_encoded_size(pb);
6397 + cbuf = kmem_alloc(slen, KM_SLEEP);
6398 + clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
6399 + ASSERT(clen != 0);
6400 + if (clen < slen) {
6401 + bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
6402 + flags |= L2PBUF_COMPRESSED;
6403 + /* zero out the rest of the input buffer */
6404 + bzero(buf + L2PBUF_HDR_SIZE + clen,
6405 + buflen - (L2PBUF_HDR_SIZE + clen));
6406 + /* adjust our buffer length now that it's shortened */
6407 + enclen = L2PBUF_HDR_SIZE + clen;
6408 + }
6409 + kmem_free(cbuf, slen);
6410 + }
6411 +
6412 + /* the header goes last since `flags' may change due to compression */
6413 +#if defined(_BIG_ENDIAN)
6414 + *(uint32_t *)buf = L2PBUF_MAGIC;
6415 + flags |= L2PBUF_BIG_ENDIAN;
6416 + *(uint16_t *)(buf + 6) = flags;
6417 +#else /* !defined(_BIG_ENDIAN) */
6418 + *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
6419 + *(uint16_t *)(buf + 6) = BSWAP_16(flags);
6420 +#endif /* !defined(_BIG_ENDIAN) */
6421 + buf[4] = L2PBUF_MAX_VERSION;
6422 +
6423 + return (enclen);
6424 +}
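
For reference, the byte offsets implied by the stores above (the payload portion may subsequently be LZ4-compressed in place, as handled at the end of the function):

	/*
	 * Header (magic and flags big-endian, the rest native order):
	 *	 0	uint32	magic		(L2PBUF_MAGIC)
	 *	 4	uint8	version		(L2PBUF_MAX_VERSION)
	 *	 6	uint16	flags
	 *	 8	uint64	pb_prev_daddr
	 *	16	uint32	pb_prev_asize
	 *	20	zio_cksum_t (32 bytes)	pb_prev_cksum
	 *	52	uint32	uncompressed payload length
	 *
	 * Payload entries, one per cached buffer, L2PBUF_BUF_SIZE bytes each,
	 * starting at L2PBUF_HDR_SIZE:
	 *	 0	uint64	b_dva.dva_word[0]
	 *	 8	uint64	b_dva.dva_word[1]
	 *	16	uint64	b_birth
	 *	24	uint64	b_cksum0
	 *	32	zio_cksum_t (32 bytes)	b_freeze_cksum
	 *	64	uint32	b_size
	 *	68	uint64	b_l2daddr
	 *	76	uint32	b_l2asize
	 *	80	uint8	b_l2compress
	 *	81	uint8	b_contents_type
	 *	84	uint32	b_flags
	 */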
6425 +
6426 +/*
6427 + * Decodes a stored l2pbuf_t structure previously encoded using
6428 + * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
6429 + * must be initialized by l2arc_pbuf_init by the caller beforehand, but
6430 + * must not have been used to store any buffers yet.
6431 + *
6432 + * Please note that we don't do checksum verification here, as we don't
6433 + * know our own checksum (that's known by the previous block in the linked
6434 + * list, or by the uberblock). This should be performed by the caller
6435 + * prior to calling l2arc_pbuf_decode.
6436 + */
6437 +static int
6438 +l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 +{
6440 + boolean_t bswap_needed;
6441 + uint32_t payload_sz, payload_asz;
6442 + uint8_t *src_bufs;
6443 + l2pbuf_buflist_t *buflist;
6444 + int i, nbufs;
6445 +
6446 + ASSERT(input_buf != NULL);
6447 + ASSERT(pb != NULL);
6448 + ASSERT(pb->pb_version != 0);
6449 + ASSERT(pb->pb_nbuflists == 0);
6450 +
6451 + /* no valid buffer can be this small */
6452 + if (buflen < L2PBUF_HDR_SIZE)
6453 + return (EINVAL);
6454 +
6455 + /* these always come in big endian */
6456 +#if defined(_BIG_ENDIAN)
6457 + pb->pb_magic = *(uint32_t *)input_buf;
6458 + pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459 +	bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
6460 +#else /* !defined(_BIG_ENDIAN) */
6461 + pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462 + pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463 + bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 +#endif /* !defined(_BIG_ENDIAN) */
6465 + pb->pb_version = input_buf[4];
6466 +
6467 + if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468 + return (EINVAL);
6469 + if (pb->pb_version > L2PBUF_MAX_VERSION)
6470 + return (ENOTSUP);
6471 +
6472 + /* remainder of pbuf may need bswap'ping */
6473 + pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
6474 +	pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475 + bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476 + payload_sz = *(uint32_t *)(input_buf + 52);
6477 + payload_asz = buflen - L2PBUF_HDR_SIZE;
6478 +
6479 + if (bswap_needed) {
6480 + pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
6481 +		pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482 + ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483 + payload_sz = BSWAP_32(payload_sz);
6484 + }
6485 +
6486 + /* check for sensible buffer allocation limits */
6487 + if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488 + (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489 + (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490 + return (EINVAL);
6491 + nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492 +
6493 + /* decompression might be needed */
6494 + if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495 + src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496 + if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497 + payload_asz, payload_sz, 0) != 0) {
6498 + kmem_free(src_bufs, payload_sz);
6499 + return (EINVAL);
6500 + }
6501 + } else {
6502 + src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503 + }
6504 +
6505 + /* Decode individual pbuf items from our source buffer. */
6506 + buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507 + for (i = 0; i < nbufs; i++) {
6508 + l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509 + const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510 +
6511 + pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512 + pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513 + pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514 + pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515 + bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516 + pbl_buf->b_size = *(uint32_t *)(src + 64);
6517 + pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518 + pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519 + pbl_buf->b_l2compress = src[80];
6520 + pbl_buf->b_contents_type = src[81];
6521 + pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522 +
6523 + if (bswap_needed) {
6524 + pbl_buf->b_dva.dva_word[0] =
6525 + BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526 + pbl_buf->b_dva.dva_word[1] =
6527 + BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528 + pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529 + pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530 + ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531 + pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532 + pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533 + pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534 + pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535 + }
6536 +
6537 + pb->pb_payload_asz += pbl_buf->b_l2asize;
6538 + }
6539 +
6540 + if (pb->pb_flags & L2PBUF_COMPRESSED)
6541 + kmem_free(src_bufs, payload_sz);
6542 +
6543 + return (0);
6544 +}
6545 +
6546 +/*
6547 + * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548 + * during L2ARC reconstruction to "peek" at the next buffer and start
6549 + * issuing IO to fetch it early, before decoding of the current buffer
6550 + * is done (which can take time due to decompression).
6551 + * Returns 0 on success (and fills in the return parameters `daddr',
6552 + * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553 + * on error.
6554 + */
6555 +static int
6556 +l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557 + uint32_t *asize, zio_cksum_t *cksum)
6558 +{
6559 + boolean_t bswap_needed;
6560 + uint16_t version, flags;
6561 + uint32_t magic;
6562 +
6563 + ASSERT(buf != NULL);
6564 +
6565 + /* no valid buffer can be this small */
6566 + if (buflen <= L2PBUF_HDR_SIZE)
6567 + return (EINVAL);
6568 +
6569 + /* these always come in big endian */
6570 +#if defined(_BIG_ENDIAN)
6571 + magic = *(uint32_t *)buf;
6572 + flags = *(uint16_t *)(buf + 6);
6573 +	bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
6574 +#else /* !defined(_BIG_ENDIAN) */
6575 + magic = BSWAP_32(*(uint32_t *)buf);
6576 + flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577 + bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 +#endif /* !defined(_BIG_ENDIAN) */
6579 + version = buf[4];
6580 +
6581 + if (magic != L2PBUF_MAGIC || version == 0)
6582 + return (EINVAL);
6583 + if (version > L2PBUF_MAX_VERSION)
6584 + return (ENOTSUP);
6585 +
6586 +	*daddr = *(uint64_t *)(buf + 8);
6587 +	*asize = *(uint32_t *)(buf + 16);
6588 +	bcopy(buf + 20, cksum, 32);
6589 +
6590 + if (bswap_needed) {
6591 + *daddr = BSWAP_64(*daddr);
6592 +		*asize = BSWAP_32(*asize);
6593 + ZIO_CHECKSUM_BSWAP(cksum);
6594 + }
6595 +
6596 + return (0);
6597 +}
6598 +
6599 +/*
6600 + * Initializes a pbuf structure into a clean state. All version and flags
6601 + * fields are filled in as appropriate for this architecture.
6602 + * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603 + * as this function assumes the structure is uninitialized.
6604 + */
6605 +static void
6606 +l2arc_pbuf_init(l2pbuf_t *pb)
6607 +{
6608 + bzero(pb, sizeof (l2pbuf_t));
6609 + pb->pb_version = L2PBUF_MAX_VERSION;
6610 +#if defined(_BIG_ENDIAN)
6611 +	pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 +#endif
6613 + pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614 + list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615 + offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 +}
6617 +
6618 +/*
6619 + * Destroys a pbuf structure and puts it into a clean state ready to be
6620 + * initialized by l2arc_pbuf_init. All buflists created by
6621 + * l2arc_pbuf_buflist_alloc are released as well.
6622 + */
6623 +static void
6624 +l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 +{
6626 + list_t *buflist_list = pb->pb_buflists_list;
6627 + l2pbuf_buflist_t *buflist;
6628 +
6629 + while ((buflist = list_head(buflist_list)) != NULL) {
6630 + ASSERT(buflist->l2pbl_nbufs > 0);
6631 + kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632 + buflist->l2pbl_nbufs);
6633 + list_remove(buflist_list, buflist);
6634 + kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635 + }
6636 + pb->pb_nbuflists = 0;
6637 + list_destroy(pb->pb_buflists_list);
6638 + kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639 + bzero(pb, sizeof (l2pbuf_t));
6640 +}
6641 +
6642 +/*
6643 + * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644 + * buffers. This is used during the buffer write cycle - each cycle allocates
6645 + * a new buflist and fills it with buffers it writes. Then, when the pbuf
6646 + * reaches its buflist limit, it is committed to stable storage.
6647 + */
6648 +static l2pbuf_buflist_t *
6649 +l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 +{
6651 + l2pbuf_buflist_t *buflist;
6652 +
6653 + ASSERT(pb->pb_buflists_list != NULL);
6654 + buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655 + buflist->l2pbl_nbufs = nbufs;
6656 + buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657 + KM_SLEEP);
6658 + list_insert_tail(pb->pb_buflists_list, buflist);
6659 + pb->pb_nbuflists++;
6660 +
6661 + return (buflist);
6662 +}
6663 +
6664 +/*
6665 + * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `idx'.
6666 + * The buffer being inserted must be present in L2ARC.
6667 + */
6668 +static void
6669 +l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670 + const arc_buf_hdr_t *ab, int index)
6671 +{
6672 + l2pbuf_buf_t *pb_buf;
6673 + const l2arc_buf_hdr_t *l2hdr;
6674 +
6675 + l2hdr = ab->b_l2hdr;
6676 + ASSERT(l2hdr != NULL);
6677 + ASSERT(pbl->l2pbl_nbufs > index);
6678 +
6679 + pb_buf = &pbl->l2pbl_bufs[index];
6680 + pb_buf->b_dva = ab->b_dva;
6681 + pb_buf->b_birth = ab->b_birth;
6682 + pb_buf->b_cksum0 = ab->b_cksum0;
6683 + pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684 + pb_buf->b_size = ab->b_size;
6685 + pb_buf->b_l2daddr = l2hdr->b_daddr;
6686 + pb_buf->b_l2asize = l2hdr->b_asize;
6687 + pb_buf->b_l2compress = l2hdr->b_compress;
6688 + pb_buf->b_contents_type = ab->b_type;
6689 + pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690 + pb->pb_payload_asz += l2hdr->b_asize;
6691 +}
6692 +
6693 +/*
6694 + * Commits a pbuf to stable storage. This routine is invoked when writing
6695 + * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696 + * has reached its limits (either in size or in number of writes), it is
6697 + * scheduled here for writing.
6698 + * This function allocates some memory to temporarily hold the serialized
6699 + * buffer to be written. This is then released in l2arc_write_done.
6700 + */
6701 +static void
6702 +l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6703 +{
6704 + l2pbuf_t *pb = &dev->l2ad_pbuf;
6705 + uint64_t i, est_encsize, bufsize, encsize, io_size;
6706 + uint8_t *pb_buf;
6707 +
6708 + pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
6709 + pb->pb_prev_asize = dev->l2ad_pbuf_asize;
6710 + pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
6711 +
6712 + est_encsize = L2PBUF_ENCODED_SIZE(pb);
6713 + bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
6714 + pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
6715 + encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
6716 + cb->l2wcb_pbuf = pb_buf;
6717 + cb->l2wcb_pbuf_size = bufsize;
6718 +
6719 + dev->l2ad_pbuf_daddr = dev->l2ad_hand;
6720 + dev->l2ad_pbuf_asize = encsize;
6721 + fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
6722 +
6723 + io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
6724 + for (i = 0; i < io_size; ) {
6725 + zio_t *wzio;
6726 + uint64_t wsize = io_size - i;
6727 +
6728 + if (wsize > SPA_MAXBLOCKSIZE)
6729 + wsize = SPA_MAXBLOCKSIZE;
6730 + ASSERT(wsize >= SPA_MINBLOCKSIZE);
6731 + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
6732 + wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6733 + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6734 + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6735 + zio_t *, wzio);
6736 + (void) zio_nowait(wzio);
6737 + i += wsize;
6738 + }
6739 +
6740 + dev->l2ad_hand += io_size;
6741 + vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
6742 + l2arc_uberblock_update(dev, pio, cb);
6743 +
6744 + ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
6745 + ARCSTAT_BUMP(arcstat_l2_meta_writes);
6746 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
6747 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
6748 + ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6749 + pb->pb_payload_asz / encsize);
6750 +}
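
Taken together with l2arc_uberblock_update, each commit extends a backward-linked chain of metadata on the cache device; a sketch of the resulting layout:

	/*
	 *	uberblock (at VDEV_LABEL_START_SIZE)
	 *	  ub_pbuf_daddr -----> newest pbuf (written at the pre-commit l2ad_hand)
	 *	                         pb_prev_daddr -----> older pbuf
	 *	                                                pb_prev_daddr -----> ...
	 *
	 * Rebuild starts at the uberblock and walks the pb_prev_* pointers
	 * backward through write time, which is why l2arc_pbuf_read() prefetches
	 * the "previous" pbuf while the current one is being decoded.
	 */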
6751 +
6752 +/*
6753 + * Returns the number of bytes occupied by the payload buffer items of
6754 + * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
6755 + * L2PBUF_HDR_SIZE.
6756 + */
6757 +static uint32_t
6758 +l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
6759 +{
6760 + uint32_t size = 0;
6761 + l2pbuf_buflist_t *buflist;
6762 +
6763 + for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
6764 + buflist = list_next(pb->pb_buflists_list, buflist))
6765 + size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
6766 +
6767 + return (size);
5141 6768 }