3525 Persistent L2ARC
--- old/usr/src/uts/common/fs/zfs/arc.c
+++ new/usr/src/uts/common/fs/zfs/arc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
27 27 */
28 28
29 29 /*
30 30 * DVA-based Adjustable Replacement Cache
31 31 *
32 32 * While much of the theory of operation used here is
33 33 * based on the self-tuning, low overhead replacement cache
34 34 * presented by Megiddo and Modha at FAST 2003, there are some
35 35 * significant differences:
36 36 *
37 37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 38 * Pages in its cache cannot be "locked" into memory. This makes
39 39 * the eviction algorithm simple: evict the last page in the list.
40 40 * This also makes the performance characteristics easy to reason
41 41 * about. Our cache is not so simple. At any given moment, some
42 42 * subset of the blocks in the cache are un-evictable because we
43 43 * have handed out a reference to them. Blocks are only evictable
44 44 * when there are no external references active. This makes
45 45 * eviction far more problematic: we choose to evict the evictable
46 46 * blocks that are the "lowest" in the list.
47 47 *
48 48 * There are times when it is not possible to evict the requested
49 49 * space. In these circumstances we are unable to adjust the cache
50 50 * size. To prevent the cache growing unbounded at these times we
51 51 * implement a "cache throttle" that slows the flow of new data
52 52 * into the cache until we can make space available.
53 53 *
54 54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 55 * Pages are evicted when the cache is full and there is a cache
56 56 * miss. Our model has a variable sized cache. It grows with
57 57 * high use, but also tries to react to memory pressure from the
58 58 * operating system: decreasing its size when system memory is
59 59 * tight.
60 60 *
61 61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 62 * elements of the cache are therefore exactly the same size. So
63 63 * when adjusting the cache size following a cache miss, it's simply
64 64 * a matter of choosing a single page to evict. In our model, we
65 65 * have variable sized cache blocks (ranging from 512 bytes to
66 66 * 128K bytes). We therefore choose a set of blocks to evict to make
67 67 * space for a cache miss that approximates as closely as possible
68 68 * the space used by the new block.
69 69 *
70 70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 71 * by N. Megiddo & D. Modha, FAST 2003
72 72 */
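
To illustrate point 3, here is a minimal sketch (the cache_blk_t type and cache_blk_free() helper are hypothetical, and this is not the actual arc_evict() code) of how a variable-block-size cache picks a set of victims that approximates the space needed:

	static uint64_t
	evict_approx(list_t *evictable, uint64_t needed)
	{
		cache_blk_t *blk, *prev;
		uint64_t freed = 0;

		/* Walk from the coldest end and free unreferenced blocks. */
		for (blk = list_tail(evictable); blk != NULL && freed < needed;
		    blk = prev) {
			prev = list_prev(evictable, blk);
			if (blk->refcnt != 0)
				continue;	/* handed out to a consumer, skip */
			freed += blk->size;	/* sizes range from 512B to 128K */
			list_remove(evictable, blk);
			cache_blk_free(blk);	/* hypothetical helper */
		}
		return (freed);
	}
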
73 73
74 74 /*
75 75 * The locking model:
76 76 *
77 77 * A new reference to a cache buffer can be obtained in two
78 78 * ways: 1) via a hash table lookup using the DVA as a key,
79 79 * or 2) via one of the ARC lists. The arc_read() interface
80 80 * uses method 1, while the internal arc algorithms for
81 81 * adjusting the cache use method 2. We therefore provide two
82 82 * types of locks: 1) the hash table lock array, and 2) the
83 83 * arc list locks.
84 84 *
85 85 * Buffers do not have their own mutexes, rather they rely on the
86 86 * hash table mutexes for the bulk of their protection (i.e. most
87 87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 88 *
89 89 * buf_hash_find() returns the appropriate mutex (held) when it
90 90 * locates the requested buffer in the hash table. It returns
91 91 * NULL for the mutex if the buffer was not in the table.
92 92 *
93 93 * buf_hash_remove() expects the appropriate hash mutex to be
94 94 * already held before it is invoked.
95 95 *
96 96 * Each arc state also has a mutex which is used to protect the
97 97 * buffer list associated with the state. When attempting to
98 98 * obtain a hash table lock while holding an arc list lock you
99 99 * must use: mutex_tryenter() to avoid deadlock. Also note that
100 100 * the active state mutex must be held before the ghost state mutex.
101 101 *
102 102 * Arc buffers may have an associated eviction callback function.
103 103 * This function will be invoked prior to removing the buffer (e.g.
104 104 * in arc_do_user_evicts()). Note however that the data associated
105 105 * with the buffer may be evicted prior to the callback. The callback
106 106 * must be made with *no locks held* (to prevent deadlock). Additionally,
107 107 * the users of callbacks must ensure that their private data is
108 108 * protected from simultaneous callbacks from arc_buf_evict()
109 109 * and arc_do_user_evicts().
110 110 *
111 111 * Note that the majority of the performance stats are manipulated
112 112 * with atomic operations.
113 113 *
114 114 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
115 115 *
116 116 * - L2ARC buflist creation
117 117 * - L2ARC buflist eviction
118 118 * - L2ARC write completion, which walks L2ARC buflists
119 119 * - ARC header destruction, as it removes from L2ARC buflists
120 120 * - ARC header release, as it removes from L2ARC buflists
121 121 */
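
A minimal sketch of the lock-ordering rule above: while an arc list lock is held, the hash lock is only ever tried, never blocked on (arc_state_t, arc_buf_hdr_t and HDR_LOCK() are defined further down in this file; the function name is illustrative):

	static void
	walk_state_list(arc_state_t *state, arc_buf_contents_t type)
	{
		arc_buf_hdr_t *ab, *prev;

		mutex_enter(&state->arcs_mtx);		/* arc list lock first */
		for (ab = list_tail(&state->arcs_list[type]); ab != NULL;
		    ab = prev) {
			kmutex_t *hash_lock = HDR_LOCK(ab);

			prev = list_prev(&state->arcs_list[type], ab);
			if (!mutex_tryenter(hash_lock))
				continue;	/* blocking here could deadlock */
			/* ... examine or evict the buffer ... */
			mutex_exit(hash_lock);
		}
		mutex_exit(&state->arcs_mtx);
	}
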
122 122
123 123 #include <sys/spa.h>
124 124 #include <sys/zio.h>
125 125 #include <sys/zio_compress.h>
126 126 #include <sys/zfs_context.h>
127 127 #include <sys/arc.h>
128 128 #include <sys/refcount.h>
129 129 #include <sys/vdev.h>
130 130 #include <sys/vdev_impl.h>
131 131 #include <sys/dsl_pool.h>
132 132 #ifdef _KERNEL
133 133 #include <sys/vmsystm.h>
134 134 #include <vm/anon.h>
135 135 #include <sys/fs/swapnode.h>
136 136 #include <sys/dnlc.h>
137 137 #endif
138 138 #include <sys/callb.h>
139 139 #include <sys/kstat.h>
140 140 #include <zfs_fletcher.h>
141 +#include <sys/byteorder.h>
142 +#include <sys/spa_impl.h>
141 143
142 144 #ifndef _KERNEL
143 145 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144 146 boolean_t arc_watch = B_FALSE;
145 147 int arc_procfd;
146 148 #endif
147 149
148 150 static kmutex_t arc_reclaim_thr_lock;
149 151 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 152 static uint8_t arc_thread_exit;
151 153
152 154 #define ARC_REDUCE_DNLC_PERCENT 3
153 155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
154 156
155 157 typedef enum arc_reclaim_strategy {
156 158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 159 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 160 } arc_reclaim_strategy_t;
159 161
160 162 /*
161 163 * The number of iterations through arc_evict_*() before we
162 164 * drop & reacquire the lock.
163 165 */
164 166 int arc_evict_iterations = 100;
165 167
166 168 /* number of seconds before growing cache again */
167 169 static int arc_grow_retry = 60;
168 170
169 171 /* shift of arc_c for calculating both min and max arc_p */
170 172 static int arc_p_min_shift = 4;
171 173
172 174 /* log2(fraction of arc to reclaim) */
173 175 static int arc_shrink_shift = 5;
174 176
175 177 /*
176 178 * minimum lifespan of a prefetch block in clock ticks
177 179 * (initialized in arc_init())
178 180 */
179 181 static int arc_min_prefetch_lifespan;
180 182
181 183 /*
182 184 * If this percent of memory is free, don't throttle.
183 185 */
184 186 int arc_lotsfree_percent = 10;
185 187
186 188 static int arc_dead;
187 189
188 190 /*
189 191 * The arc has filled available memory and has now warmed up.
190 192 */
191 193 static boolean_t arc_warm;
192 194
193 195 /*
194 196 * These tunables are for performance analysis.
195 197 */
196 198 uint64_t zfs_arc_max;
197 199 uint64_t zfs_arc_min;
198 200 uint64_t zfs_arc_meta_limit = 0;
199 201 int zfs_arc_grow_retry = 0;
200 202 int zfs_arc_shrink_shift = 0;
201 203 int zfs_arc_p_min_shift = 0;
202 204 int zfs_disable_dup_eviction = 0;
203 205
204 206 /*
205 207 * Note that buffers can be in one of 6 states:
206 208 * ARC_anon - anonymous (discussed below)
207 209 * ARC_mru - recently used, currently cached
208 210 * ARC_mru_ghost - recently used, no longer in cache
209 211 * ARC_mfu - frequently used, currently cached
210 212 * ARC_mfu_ghost - frequently used, no longer in cache
211 213 * ARC_l2c_only - exists in L2ARC but not other states
212 214 * When there are no active references to the buffer, they are
213 215 * linked onto a list in one of these arc states. These are
214 216 * the only buffers that can be evicted or deleted. Within each
215 217 * state there are multiple lists, one for meta-data and one for
216 218 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
217 219 * etc.) is tracked separately so that it can be managed more
218 220 * explicitly: favored over data, limited explicitly.
219 221 *
220 222 * Anonymous buffers are buffers that are not associated with
221 223 * a DVA. These are buffers that hold dirty block copies
222 224 * before they are written to stable storage. By definition,
223 225 * they are "ref'd" and are considered part of arc_mru
224 226 * that cannot be freed. Generally, they will acquire a DVA
225 227 * as they are written and migrate onto the arc_mru list.
226 228 *
227 229 * The ARC_l2c_only state is for buffers that are in the second
228 230 * level ARC but no longer in any of the ARC_m* lists. The second
229 231 * level ARC itself may also contain buffers that are in any of
230 232 * the ARC_m* states - meaning that a buffer can exist in two
231 233 * places. The reason for the ARC_l2c_only state is to keep the
232 234 * buffer header in the hash table, so that reads that hit the
233 235 * second level ARC benefit from these fast lookups.
234 236 */
235 237
236 238 typedef struct arc_state {
237 239 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
238 240 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
239 241 uint64_t arcs_size; /* total amount of data in this state */
240 242 kmutex_t arcs_mtx;
241 243 } arc_state_t;
242 244
243 245 /* The 6 states: */
244 246 static arc_state_t ARC_anon;
245 247 static arc_state_t ARC_mru;
246 248 static arc_state_t ARC_mru_ghost;
247 249 static arc_state_t ARC_mfu;
248 250 static arc_state_t ARC_mfu_ghost;
249 251 static arc_state_t ARC_l2c_only;
250 252
251 253 typedef struct arc_stats {
252 254 kstat_named_t arcstat_hits;
253 255 kstat_named_t arcstat_misses;
254 256 kstat_named_t arcstat_demand_data_hits;
255 257 kstat_named_t arcstat_demand_data_misses;
256 258 kstat_named_t arcstat_demand_metadata_hits;
257 259 kstat_named_t arcstat_demand_metadata_misses;
258 260 kstat_named_t arcstat_prefetch_data_hits;
259 261 kstat_named_t arcstat_prefetch_data_misses;
260 262 kstat_named_t arcstat_prefetch_metadata_hits;
261 263 kstat_named_t arcstat_prefetch_metadata_misses;
262 264 kstat_named_t arcstat_mru_hits;
263 265 kstat_named_t arcstat_mru_ghost_hits;
264 266 kstat_named_t arcstat_mfu_hits;
265 267 kstat_named_t arcstat_mfu_ghost_hits;
266 268 kstat_named_t arcstat_deleted;
267 269 kstat_named_t arcstat_recycle_miss;
268 270 /*
269 271 * Number of buffers that could not be evicted because the hash lock
270 272 * was held by another thread. The lock may not necessarily be held
271 273 * by something using the same buffer, since hash locks are shared
272 274 * by multiple buffers.
273 275 */
274 276 kstat_named_t arcstat_mutex_miss;
275 277 /*
276 278 * Number of buffers skipped because they have I/O in progress, are
277 279 * indirect prefetch buffers that have not lived long enough, or are
278 280 * not from the spa we're trying to evict from.
279 281 */
280 282 kstat_named_t arcstat_evict_skip;
281 283 kstat_named_t arcstat_evict_l2_cached;
282 284 kstat_named_t arcstat_evict_l2_eligible;
283 285 kstat_named_t arcstat_evict_l2_ineligible;
284 286 kstat_named_t arcstat_hash_elements;
285 287 kstat_named_t arcstat_hash_elements_max;
286 288 kstat_named_t arcstat_hash_collisions;
287 289 kstat_named_t arcstat_hash_chains;
288 290 kstat_named_t arcstat_hash_chain_max;
289 291 kstat_named_t arcstat_p;
290 292 kstat_named_t arcstat_c;
291 293 kstat_named_t arcstat_c_min;
292 294 kstat_named_t arcstat_c_max;
293 295 kstat_named_t arcstat_size;
294 296 kstat_named_t arcstat_hdr_size;
295 297 kstat_named_t arcstat_data_size;
296 298 kstat_named_t arcstat_other_size;
297 299 kstat_named_t arcstat_l2_hits;
298 300 kstat_named_t arcstat_l2_misses;
299 301 kstat_named_t arcstat_l2_feeds;
300 302 kstat_named_t arcstat_l2_rw_clash;
301 303 kstat_named_t arcstat_l2_read_bytes;
302 304 kstat_named_t arcstat_l2_write_bytes;
303 305 kstat_named_t arcstat_l2_writes_sent;
304 306 kstat_named_t arcstat_l2_writes_done;
305 307 kstat_named_t arcstat_l2_writes_error;
306 308 kstat_named_t arcstat_l2_writes_hdr_miss;
307 309 kstat_named_t arcstat_l2_evict_lock_retry;
308 310 kstat_named_t arcstat_l2_evict_reading;
309 311 kstat_named_t arcstat_l2_free_on_write;
310 312 kstat_named_t arcstat_l2_abort_lowmem;
311 313 kstat_named_t arcstat_l2_cksum_bad;
312 314 kstat_named_t arcstat_l2_io_error;
313 315 kstat_named_t arcstat_l2_size;
314 316 kstat_named_t arcstat_l2_asize;
315 317 kstat_named_t arcstat_l2_hdr_size;
316 318 kstat_named_t arcstat_l2_compress_successes;
317 319 kstat_named_t arcstat_l2_compress_zeros;
318 320 kstat_named_t arcstat_l2_compress_failures;
321 + kstat_named_t arcstat_l2_log_blk_writes;
322 + kstat_named_t arcstat_l2_log_blk_avg_size;
323 + kstat_named_t arcstat_l2_data_to_meta_ratio;
324 + kstat_named_t arcstat_l2_rebuild_successes;
325 + kstat_named_t arcstat_l2_rebuild_abort_unsupported;
326 + kstat_named_t arcstat_l2_rebuild_abort_timeout;
327 + kstat_named_t arcstat_l2_rebuild_abort_io_errors;
328 + kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
329 + kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
330 + kstat_named_t arcstat_l2_rebuild_abort_lowmem;
331 + kstat_named_t arcstat_l2_rebuild_size;
332 + kstat_named_t arcstat_l2_rebuild_bufs;
333 + kstat_named_t arcstat_l2_rebuild_bufs_precached;
334 + kstat_named_t arcstat_l2_rebuild_psize;
335 + kstat_named_t arcstat_l2_rebuild_log_blks;
319 336 kstat_named_t arcstat_memory_throttle_count;
320 337 kstat_named_t arcstat_duplicate_buffers;
321 338 kstat_named_t arcstat_duplicate_buffers_size;
322 339 kstat_named_t arcstat_duplicate_reads;
323 340 kstat_named_t arcstat_meta_used;
324 341 kstat_named_t arcstat_meta_limit;
325 342 kstat_named_t arcstat_meta_max;
326 343 } arc_stats_t;
327 344
328 345 static arc_stats_t arc_stats = {
329 346 { "hits", KSTAT_DATA_UINT64 },
330 347 { "misses", KSTAT_DATA_UINT64 },
331 348 { "demand_data_hits", KSTAT_DATA_UINT64 },
332 349 { "demand_data_misses", KSTAT_DATA_UINT64 },
333 350 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
334 351 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
335 352 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
336 353 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
337 354 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
338 355 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
339 356 { "mru_hits", KSTAT_DATA_UINT64 },
340 357 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
341 358 { "mfu_hits", KSTAT_DATA_UINT64 },
342 359 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
343 360 { "deleted", KSTAT_DATA_UINT64 },
344 361 { "recycle_miss", KSTAT_DATA_UINT64 },
345 362 { "mutex_miss", KSTAT_DATA_UINT64 },
346 363 { "evict_skip", KSTAT_DATA_UINT64 },
347 364 { "evict_l2_cached", KSTAT_DATA_UINT64 },
348 365 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
349 366 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
350 367 { "hash_elements", KSTAT_DATA_UINT64 },
351 368 { "hash_elements_max", KSTAT_DATA_UINT64 },
352 369 { "hash_collisions", KSTAT_DATA_UINT64 },
353 370 { "hash_chains", KSTAT_DATA_UINT64 },
354 371 { "hash_chain_max", KSTAT_DATA_UINT64 },
355 372 { "p", KSTAT_DATA_UINT64 },
356 373 { "c", KSTAT_DATA_UINT64 },
357 374 { "c_min", KSTAT_DATA_UINT64 },
358 375 { "c_max", KSTAT_DATA_UINT64 },
359 376 { "size", KSTAT_DATA_UINT64 },
360 377 { "hdr_size", KSTAT_DATA_UINT64 },
361 378 { "data_size", KSTAT_DATA_UINT64 },
362 379 { "other_size", KSTAT_DATA_UINT64 },
363 380 { "l2_hits", KSTAT_DATA_UINT64 },
364 381 { "l2_misses", KSTAT_DATA_UINT64 },
365 382 { "l2_feeds", KSTAT_DATA_UINT64 },
366 383 { "l2_rw_clash", KSTAT_DATA_UINT64 },
367 384 { "l2_read_bytes", KSTAT_DATA_UINT64 },
368 385 { "l2_write_bytes", KSTAT_DATA_UINT64 },
369 386 { "l2_writes_sent", KSTAT_DATA_UINT64 },
370 387 { "l2_writes_done", KSTAT_DATA_UINT64 },
371 388 { "l2_writes_error", KSTAT_DATA_UINT64 },
372 389 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
373 390 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
374 391 { "l2_evict_reading", KSTAT_DATA_UINT64 },
375 392 { "l2_free_on_write", KSTAT_DATA_UINT64 },
376 393 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
377 394 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
378 395 { "l2_io_error", KSTAT_DATA_UINT64 },
379 396 { "l2_size", KSTAT_DATA_UINT64 },
380 397 { "l2_asize", KSTAT_DATA_UINT64 },
381 398 { "l2_hdr_size", KSTAT_DATA_UINT64 },
382 399 { "l2_compress_successes", KSTAT_DATA_UINT64 },
383 400 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
384 401 { "l2_compress_failures", KSTAT_DATA_UINT64 },
402 + { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
403 + { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
404 + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
405 + { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
406 + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
407 + { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
408 + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
409 + { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
410 + { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
411 + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
412 + { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
413 + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
414 + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
415 + { "l2_rebuild_size", KSTAT_DATA_UINT64 },
416 + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
385 417 { "memory_throttle_count", KSTAT_DATA_UINT64 },
386 418 { "duplicate_buffers", KSTAT_DATA_UINT64 },
387 419 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
388 420 { "duplicate_reads", KSTAT_DATA_UINT64 },
389 421 { "arc_meta_used", KSTAT_DATA_UINT64 },
390 422 { "arc_meta_limit", KSTAT_DATA_UINT64 },
391 423 { "arc_meta_max", KSTAT_DATA_UINT64 }
392 424 };
393 425
394 426 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
395 427
396 428 #define ARCSTAT_INCR(stat, val) \
397 429 atomic_add_64(&arc_stats.stat.value.ui64, (val))
398 430
399 431 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
400 432 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
401 433
402 434 #define ARCSTAT_MAX(stat, val) { \
403 435 uint64_t m; \
404 436 while ((val) > (m = arc_stats.stat.value.ui64) && \
405 437 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
406 438 continue; \
407 439 }
408 440
409 441 #define ARCSTAT_MAXSTAT(stat) \
410 442 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
411 443
412 444 /*
413 445 * We define a macro to allow ARC hits/misses to be easily broken down by
414 446 * two separate conditions, giving a total of four different subtypes for
415 447 * each of hits and misses (so eight statistics total).
416 448 */
417 449 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
418 450 if (cond1) { \
419 451 if (cond2) { \
420 452 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
421 453 } else { \
422 454 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
423 455 } \
424 456 } else { \
425 457 if (cond2) { \
426 458 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
427 459 } else { \
428 460 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
429 461 } \
430 462 }
431 463
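
For instance, the hit path classifies a buffer along the demand/prefetch and data/metadata axes roughly like this (illustrative, with hdr being the arc_buf_hdr_t in question; see the actual call sites for the exact conditions used):

	/* Bumps exactly one of the four counters, e.g. arcstat_demand_data_hits. */
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
	    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);
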
464 +/*
465 + * This macro allows us to use kstats as floating averages. Each time we
466 + * update this kstat, we first factor it and the update value by
467 + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
468 + * average. This macro assumes that integer loads and stores are atomic, but
469 + * is not safe for multiple writers updating the kstat in parallel (only the
470 + * last writer's update will remain).
471 + */
472 +#define ARCSTAT_F_AVG_FACTOR 3
473 +#define ARCSTAT_F_AVG(stat, value) \
474 + do { \
475 + uint64_t x = ARCSTAT(stat); \
476 + x = x - x / ARCSTAT_F_AVG_FACTOR + \
477 + (value) / ARCSTAT_F_AVG_FACTOR; \
478 + ARCSTAT(stat) = x; \
479 + _NOTE(NOTREACHED) \
480 + _NOTE(CONSTCOND) \
481 + } while (0)
482 +
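
To make the factoring concrete (integer arithmetic, ARCSTAT_F_AVG_FACTOR == 3): each update keeps roughly two thirds of the old average and blends in one third of the new sample, for example:

	/*
	 * old average 900, new sample 300:  900 - 900/3 + 300/3 = 700
	 * old average 700, new sample 300:  700 - 700/3 + 300/3 = 567
	 * Repeated samples of 300 converge on roughly 300 from above
	 * (with a small residual error from the integer divisions).
	 */
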
432 483 kstat_t *arc_ksp;
433 484 static arc_state_t *arc_anon;
434 485 static arc_state_t *arc_mru;
435 486 static arc_state_t *arc_mru_ghost;
436 487 static arc_state_t *arc_mfu;
437 488 static arc_state_t *arc_mfu_ghost;
438 489 static arc_state_t *arc_l2c_only;
439 490
440 491 /*
441 492 * There are several ARC variables that are critical to export as kstats --
442 493 * but we don't want to have to grovel around in the kstat whenever we wish to
443 494 * manipulate them. For these variables, we therefore define them to be in
444 495 * terms of the statistic variable. This assures that we are not introducing
445 496 * the possibility of inconsistency by having shadow copies of the variables,
446 497 * while still allowing the code to be readable.
447 498 */
448 499 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
449 500 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
450 501 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
451 502 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
452 503 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
453 504 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
454 505 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
455 506 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
456 507
457 508 #define L2ARC_IS_VALID_COMPRESS(_c_) \
458 509 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
459 510
460 511 static int arc_no_grow; /* Don't try to grow cache size */
461 512 static uint64_t arc_tempreserve;
462 513 static uint64_t arc_loaned_bytes;
463 514
464 515 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
465 516
466 517 typedef struct arc_callback arc_callback_t;
467 518
468 519 struct arc_callback {
469 520 void *acb_private;
470 521 arc_done_func_t *acb_done;
471 522 arc_buf_t *acb_buf;
472 523 zio_t *acb_zio_dummy;
473 524 arc_callback_t *acb_next;
474 525 };
475 526
476 527 typedef struct arc_write_callback arc_write_callback_t;
477 528
478 529 struct arc_write_callback {
479 530 void *awcb_private;
480 531 arc_done_func_t *awcb_ready;
481 532 arc_done_func_t *awcb_physdone;
482 533 arc_done_func_t *awcb_done;
483 534 arc_buf_t *awcb_buf;
484 535 };
485 536
486 537 struct arc_buf_hdr {
487 538 /* protected by hash lock */
488 539 dva_t b_dva;
489 540 uint64_t b_birth;
490 541 uint64_t b_cksum0;
491 542
492 543 kmutex_t b_freeze_lock;
493 544 zio_cksum_t *b_freeze_cksum;
494 545 void *b_thawed;
495 546
496 547 arc_buf_hdr_t *b_hash_next;
497 548 arc_buf_t *b_buf;
498 549 uint32_t b_flags;
499 550 uint32_t b_datacnt;
500 551
501 552 arc_callback_t *b_acb;
502 553 kcondvar_t b_cv;
503 554
504 555 /* immutable */
505 556 arc_buf_contents_t b_type;
506 557 uint64_t b_size;
507 558 uint64_t b_spa;
508 559
509 560 /* protected by arc state mutex */
510 561 arc_state_t *b_state;
511 562 list_node_t b_arc_node;
512 563
513 564 /* updated atomically */
514 565 clock_t b_arc_access;
515 566
516 567 /* self protecting */
517 568 refcount_t b_refcnt;
518 569
519 570 l2arc_buf_hdr_t *b_l2hdr;
520 571 list_node_t b_l2node;
521 572 };
522 573
523 574 static arc_buf_t *arc_eviction_list;
524 575 static kmutex_t arc_eviction_mtx;
525 576 static arc_buf_hdr_t arc_eviction_hdr;
526 577 static void arc_get_data_buf(arc_buf_t *buf);
527 578 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
528 579 static int arc_evict_needed(arc_buf_contents_t type);
529 580 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
530 581 static void arc_buf_watch(arc_buf_t *buf);
531 582
532 583 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
533 584
534 585 #define GHOST_STATE(state) \
535 586 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
536 587 (state) == arc_l2c_only)
537 588
538 589 /*
539 590 * Private ARC flags. These flags are private ARC only flags that will show up
540 591 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can
541 592 * be passed in as arc_flags in things like arc_read. However, these flags
542 593 * should never be passed and should only be set by ARC code. When adding new
543 594 * public flags, make sure not to smash the private ones.
544 595 */
545 596
546 597 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
547 598 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
548 599 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
549 600 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
550 601 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
551 602 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
552 603 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
553 604 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
554 605 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
555 606 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
556 607
557 608 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
558 609 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
559 610 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
560 611 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
561 612 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
562 613 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
563 614 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
564 615 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
565 616 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
566 617 (hdr)->b_l2hdr != NULL)
567 618 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
568 619 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
569 620 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
570 621
571 622 /*
572 623 * Other sizes
573 624 */
574 625
575 626 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
576 627 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
577 628
578 629 /*
579 630 * Hash table routines
580 631 */
581 632
582 633 #define HT_LOCK_PAD 64
583 634
584 635 struct ht_lock {
585 636 kmutex_t ht_lock;
586 637 #ifdef _KERNEL
587 638 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
588 639 #endif
589 640 };
590 641
591 642 #define BUF_LOCKS 256
592 643 typedef struct buf_hash_table {
593 644 uint64_t ht_mask;
594 645 arc_buf_hdr_t **ht_table;
595 646 struct ht_lock ht_locks[BUF_LOCKS];
596 647 } buf_hash_table_t;
597 648
598 649 static buf_hash_table_t buf_hash_table;
599 650
600 651 #define BUF_HASH_INDEX(spa, dva, birth) \
601 652 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
602 653 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
603 654 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
604 655 #define HDR_LOCK(hdr) \
605 656 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
606 657
607 658 uint64_t zfs_crc64_table[256];
608 659
609 660 /*
610 661 * Level 2 ARC
611 662 */
612 663
613 664 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
614 665 #define L2ARC_HEADROOM 2 /* num of writes */
615 666 /*
616 667 * If we discover during ARC scan any buffers to be compressed, we boost
617 668 * our headroom for the next scanning cycle by this percentage multiple.
618 669 */
619 670 #define L2ARC_HEADROOM_BOOST 200
620 671 #define L2ARC_FEED_SECS 1 /* caching interval secs */
621 672 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
622 673
623 674 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
624 675 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
625 676
626 677 /* L2ARC Performance Tunables */
627 678 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
628 679 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
629 680 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
630 681 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
631 682 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
632 683 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
633 684 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
634 685 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
635 686 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
636 687
637 688 /*
638 689 * L2ARC Internals
639 690 */
640 -typedef struct l2arc_dev {
641 - vdev_t *l2ad_vdev; /* vdev */
642 - spa_t *l2ad_spa; /* spa */
643 - uint64_t l2ad_hand; /* next write location */
644 - uint64_t l2ad_start; /* first addr on device */
645 - uint64_t l2ad_end; /* last addr on device */
646 - uint64_t l2ad_evict; /* last addr eviction reached */
647 - boolean_t l2ad_first; /* first sweep through */
648 - boolean_t l2ad_writing; /* currently writing */
649 - list_t *l2ad_buflist; /* buffer list */
650 - list_node_t l2ad_node; /* device list node */
651 -} l2arc_dev_t;
652 -
691 +typedef struct l2arc_dev l2arc_dev_t;
653 692 static list_t L2ARC_dev_list; /* device list */
654 693 static list_t *l2arc_dev_list; /* device list pointer */
655 694 static kmutex_t l2arc_dev_mtx; /* device list mutex */
656 695 static l2arc_dev_t *l2arc_dev_last; /* last device used */
657 696 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
658 697 static list_t L2ARC_free_on_write; /* free after write buf list */
659 698 static list_t *l2arc_free_on_write; /* free after write list ptr */
660 699 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
661 700 static uint64_t l2arc_ndev; /* number of devices */
662 701
663 702 typedef struct l2arc_read_callback {
664 703 arc_buf_t *l2rcb_buf; /* read buffer */
665 704 spa_t *l2rcb_spa; /* spa */
666 705 blkptr_t l2rcb_bp; /* original blkptr */
667 706 zbookmark_t l2rcb_zb; /* original bookmark */
668 707 int l2rcb_flags; /* original flags */
669 708 enum zio_compress l2rcb_compress; /* applied compress */
670 709 } l2arc_read_callback_t;
671 710
672 711 typedef struct l2arc_write_callback {
673 712 l2arc_dev_t *l2wcb_dev; /* device info */
674 713 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
714 + /* list of in-flight l2arc_log_blk_buf_t's */
715 + list_t l2wcb_log_blk_buf_list;
675 716 } l2arc_write_callback_t;
676 717
677 718 struct l2arc_buf_hdr {
678 719 /* protected by arc_buf_hdr mutex */
679 720 l2arc_dev_t *b_dev; /* L2ARC device */
680 721 uint64_t b_daddr; /* disk address, offset byte */
681 722 /* compression applied to buffer data */
682 723 enum zio_compress b_compress;
683 724 /* real alloc'd buffer size depending on b_compress applied */
684 725 int b_asize;
685 726 /* temporary buffer holder for in-flight compressed data */
686 727 void *b_tmp_cdata;
687 728 };
688 729
689 730 typedef struct l2arc_data_free {
690 731 /* protected by l2arc_free_on_write_mtx */
691 732 void *l2df_data;
692 733 size_t l2df_size;
693 734 void (*l2df_func)(void *, size_t);
694 735 list_node_t l2df_list_node;
695 736 } l2arc_data_free_t;
696 737
697 738 static kmutex_t l2arc_feed_thr_lock;
698 739 static kcondvar_t l2arc_feed_thr_cv;
699 740 static uint8_t l2arc_thread_exit;
700 741
701 742 static void l2arc_read_done(zio_t *zio);
702 -static void l2arc_hdr_stat_add(void);
743 +static void l2arc_hdr_stat_add(boolean_t from_arc);
703 744 static void l2arc_hdr_stat_remove(void);
745 +static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
704 746
705 747 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
706 748 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
707 749 enum zio_compress c);
708 750 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
709 751
710 -static uint64_t
752 +enum {
753 + L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
754 +};
755 +
756 +/*
757 + * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
758 + */
759 +typedef struct l2arc_log_blk_ptr {
760 + uint64_t l2lbp_daddr; /* device address of log */
761 + /*
762 + * l2lbp_prop is the same format as the blk_prop in blkptr_t:
763 + * * logical size (in sectors)
764 + * * physical (compressed) size (in sectors)
765 + * * compression algorithm (we always LZ4-compress l2arc logs)
766 + * * checksum algorithm (used for l2lbp_cksum)
767 + * * object type & level (unused for now)
768 + */
769 + uint64_t l2lbp_prop;
770 + zio_cksum_t l2lbp_cksum; /* fletcher4 of log */
771 +} l2arc_log_blk_ptr_t;
772 +
773 +/*
774 + * The persistent L2ARC device header.
775 + */
776 +typedef struct l2arc_dev_hdr_phys {
777 + uint64_t l2dh_magic;
778 + zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */
779 +
780 + /*
781 + * Global L2ARC device state and metadata.
782 + */
783 + uint64_t l2dh_spa_guid;
784 + uint64_t l2dh_evict_tail; /* current evict pointer */
785 + uint64_t l2dh_alloc_space; /* vdev space alloc status */
786 + uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */
787 +
788 + /*
789 + * Start of log block chain. [0] -> newest log, [1] -> one older (used
790 + * for initiating prefetch).
791 + */
792 + l2arc_log_blk_ptr_t l2dh_start_lbps[2];
793 +
794 + const uint64_t l2dh_pad[43]; /* pad to 512 bytes */
795 +} l2arc_dev_hdr_phys_t;
796 +CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
797 +
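
The CTASSERT works out because every field is a multiple of 8 bytes (so there is no compiler padding) and the sizes sum to exactly 512 bytes:

	/*
	 * l2dh_magic           8
	 * l2dh_self_cksum     32  (zio_cksum_t is 4 x uint64_t)
	 * l2dh_spa_guid        8
	 * l2dh_evict_tail      8
	 * l2dh_alloc_space     8
	 * l2dh_flags           8
	 * l2dh_start_lbps[2]  96  (2 x 48-byte l2arc_log_blk_ptr_t)
	 * l2dh_pad[43]       344
	 *                    ---
	 *                    512  == SPA_MINBLOCKSIZE
	 */
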
798 +/*
799 + * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
800 + */
801 +typedef struct l2arc_log_ent_phys {
802 + dva_t l2le_dva; /* dva of buffer */
803 + uint64_t l2le_birth; /* birth txg of buffer */
804 + uint64_t l2le_cksum0;
805 + zio_cksum_t l2le_freeze_cksum;
806 + /*
807 + * l2le_prop is the same format as the blk_prop in blkptr_t:
808 + * * logical size (in sectors)
809 + * * physical (compressed) size (in sectors)
810 + * * compression algorithm
811 + * * checksum algorithm (used for cksum0)
812 + * * object type & level (used to restore arc_buf_contents_t)
813 + */
814 + uint64_t l2le_prop;
815 + uint64_t l2le_daddr; /* buf location on l2dev */
816 + const uint64_t l2le_pad[6]; /* resv'd for future use */
817 +} l2arc_log_ent_phys_t;
818 +
819 +/*
820 + * These design limits give us the following overhead (before compression):
821 + * avg_blk_sz overhead
822 + * 1k 12.51 %
823 + * 2k 6.26 %
824 + * 4k 3.13 %
825 + * 8k 1.56 %
826 + * 16k 0.78 %
827 + * 32k 0.39 %
828 + * 64k 0.20 %
829 + * 128k 0.10 %
830 + * Compression should be able to squeeze these down by about a factor of 2x.
831 + */
832 +#define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
833 +#define L2ARC_LOG_BLK_HEADER_LEN (128)
834 +#define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
835 + ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
836 + sizeof (l2arc_log_ent_phys_t))
837 +/*
838 + * Maximum amount of data in an l2arc log block (used to terminate rebuilding
839 + * before we hit the write head and restore potentially corrupted blocks).
840 + */
841 +#define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
842 + (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
843 +/*
844 + * For the persistency and rebuild algorithms to operate reliably we need
845 + * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
846 + * excessive log block looping might confuse the log chain end detection).
847 + * Under normal circumstances this is not a problem, since this is somewhere
848 + * around only 400 MB.
849 + */
850 +#define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
851 +
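
The figures above follow directly from the structure sizes: an l2arc_log_ent_phys_t is 128 bytes (16 + 8 + 8 + 32 + 8 + 8 + 48 of reserved padding), so a 128k log block holds (131072 - 128) / 128 = 1023 entries, and the per-buffer metadata overhead is 128 bytes divided by the average cached block size (128 / 1024 = 12.5% at 1k, 128 / 131072 = 0.10% at 128k). L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE is therefore 1023 x 128k, just under 128 MB, and L2ARC_PERSIST_MIN_SIZE is three times that, roughly 400 MB, matching the comment above.
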
852 +/*
853 + * A log block of up to 1023 ARC buffer log entries, chained into the
854 + * persistent L2ARC metadata linked list.
855 + */
856 +typedef struct l2arc_log_blk_phys {
857 + /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
858 + uint64_t l2lb_magic;
859 + l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */
860 + uint64_t l2lb_pad[9]; /* resv'd for future use */
861 + /* Payload */
862 + l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
863 +} l2arc_log_blk_phys_t;
864 +
865 +CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
866 +CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
867 + offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
868 +
869 +/*
870 + * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
871 + * written to the L2ARC device. They may be compressed, hence the uint8_t[].
872 + */
873 +typedef struct l2arc_log_blk_buf {
874 + uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
875 + list_node_t l2lbb_node;
876 +} l2arc_log_blk_buf_t;
877 +
878 +/* Macros for manipulating fields in the blk_prop format of blkptr_t */
879 +#define BLKPROP_GET_LSIZE(_obj, _field) \
880 + BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
881 +#define BLKPROP_SET_LSIZE(_obj, _field, x) \
882 + BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
883 +#define BLKPROP_GET_PSIZE(_obj, _field) \
884 + BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
885 +#define BLKPROP_SET_PSIZE(_obj, _field, x) \
886 + BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
887 +#define BLKPROP_GET_COMPRESS(_obj, _field) \
888 + BF64_GET((_obj)->_field, 32, 8)
889 +#define BLKPROP_SET_COMPRESS(_obj, _field, x) \
890 + BF64_SET((_obj)->_field, 32, 8, x)
891 +#define BLKPROP_GET_CHECKSUM(_obj, _field) \
892 + BF64_GET((_obj)->_field, 40, 8)
893 +#define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
894 + BF64_SET((_obj)->_field, 40, 8, x)
895 +#define BLKPROP_GET_TYPE(_obj, _field) \
896 + BF64_GET((_obj)->_field, 48, 8)
897 +#define BLKPROP_SET_TYPE(_obj, _field, x) \
898 + BF64_SET((_obj)->_field, 48, 8, x)
899 +
900 +/* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
901 +#define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop)
902 +#define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
903 +#define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop)
904 +#define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
905 +#define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
906 +#define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
907 + x)
908 +#define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
909 +#define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
910 + x)
911 +#define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop)
912 +#define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
913 +
914 +/* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
915 +#define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop)
916 +#define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x)
917 +#define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop)
918 +#define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x)
919 +#define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop)
920 +#define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
921 +#define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop)
922 +#define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
923 +#define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop)
924 +#define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x)
925 +
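
A minimal sketch of how these accessors could be used to encode a buffer into a log entry (illustrative only, with lb, idx and ab assumed from context; the real encoding lives in l2arc_log_blk_insert()):

	l2arc_log_ent_phys_t *le = &lb->l2lb_entries[idx];

	le->l2le_dva = ab->b_dva;
	le->l2le_birth = ab->b_birth;
	le->l2le_daddr = ab->b_l2hdr->b_daddr;
	LE_SET_LSIZE(le, ab->b_size);			/* stored as sectors */
	LE_SET_PSIZE(le, ab->b_l2hdr->b_asize);		/* on-device size */
	LE_SET_COMPRESS(le, ab->b_l2hdr->b_compress);
	LE_SET_TYPE(le, ab->b_type);
	/* ...and read back with the LE_GET_* counterparts during rebuild. */
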
926 +#define PTR_SWAP(x, y) \
927 + do { \
928 + void *tmp = (x);\
929 + x = y; \
930 + y = tmp; \
931 + _NOTE(CONSTCOND)\
932 + } while (0)
933 +
934 +#define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU
935 +#define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU
936 +#define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */
937 +
938 +struct l2arc_dev {
939 + vdev_t *l2ad_vdev; /* vdev */
940 + spa_t *l2ad_spa; /* spa */
941 + uint64_t l2ad_hand; /* next write location */
942 + uint64_t l2ad_start; /* first addr on device */
943 + uint64_t l2ad_end; /* last addr on device */
944 + uint64_t l2ad_evict; /* last addr eviction reached */
945 + boolean_t l2ad_first; /* first sweep through */
946 + boolean_t l2ad_writing; /* currently writing */
947 + list_t *l2ad_buflist; /* buffer list */
948 + list_node_t l2ad_node; /* device list node */
949 + l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */
950 + l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
951 + int l2ad_log_ent_idx; /* index into cur log blk */
952 + /* number of bytes in current log block's payload */
953 + uint64_t l2ad_log_blk_payload_asize;
954 + /* flag indicating whether a rebuild is scheduled or is going on */
955 + boolean_t l2ad_rebuild;
956 +};
957 +
958 +/*
959 + * Performance tuning of L2ARC persistency:
960 + *
961 + * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
962 + * pool import or when adding one manually later) will attempt
963 + * to rebuild L2ARC buffer contents. In special circumstances,
964 + * the administrator may want to set this to B_FALSE, if they
965 + * are having trouble importing a pool or attaching an L2ARC
966 + * device (e.g. the L2ARC device is slow to read in stored log
967 + * metadata, or the metadata has become somehow
968 + * fragmented/unusable).
969 + * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
970 + * avoid a slow L2ARC device from preventing pool import. If we
971 + * are not done rebuilding an L2ARC device by this time, we
972 + * stop the rebuild and return immediately.
973 + */
974 +boolean_t l2arc_rebuild_enabled = B_TRUE;
975 +uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
976 +
977 +/*
978 + * L2ARC persistency rebuild routines.
979 + */
980 +static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
981 +static int l2arc_rebuild(l2arc_dev_t *dev);
982 +static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
983 + l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
984 +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
985 + l2arc_dev_t *dev, uint64_t guid);
986 +
987 +/*
988 + * L2ARC persistency read I/O routines.
989 + */
990 +static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
991 +static int l2arc_log_blk_read(l2arc_dev_t *dev,
992 + const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
993 + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
994 + uint8_t *this_lb_buf, uint8_t *next_lb_buf,
995 + zio_t *this_io, zio_t **next_io);
996 +static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
997 + const l2arc_log_blk_ptr_t *lp);
998 +static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
999 + const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
1000 +static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1001 +
1002 +/*
1003 + * L2ARC persistency write I/O routines.
1004 + */
1005 +static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1006 +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1007 + l2arc_write_callback_t *cb);
1008 +
1009 +/*
1010 + * L2ARC persistency auxiliary routines.
1011 + */
1012 +static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1013 + zio_cksum_t *cksum);
1014 +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1015 + const arc_buf_hdr_t *ab);
1016 +static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1017 + uint64_t top, uint64_t check);
1018 +static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
1019 +
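
Taken together, these declarations suggest the following overall rebuild flow. This is only a rough sketch of the control flow under stated assumptions (dev and guid come from the caller, lb is the buffer the current log block is read into, deadline is now plus l2arc_rebuild_timeout in ticks; error handling, checksum verification, next-block prefetching and kstat updates are omitted), not the actual l2arc_rebuild() body:

	l2arc_dev_hdr_phys_t hdr;
	l2arc_log_blk_ptr_t lbps[2];

	if (l2arc_dev_hdr_read(dev, &hdr) != 0)
		return;				/* no usable header: nothing to rebuild */
	lbps[0] = hdr.l2dh_start_lbps[0];	/* newest log block */
	lbps[1] = hdr.l2dh_start_lbps[1];	/* one step older */

	while (l2arc_log_blk_ptr_valid(dev, &lbps[0]) &&
	    !l2arc_check_rebuild_timeout_hit(deadline)) {
		/* read and verify the log block at lbps[0] into lb, then
		 * recreate an ARC header for each (non-empty) entry */
		for (int i = 0; i < L2ARC_LOG_BLK_ENTRIES; i++)
			l2arc_hdr_restore(&lb->l2lb_entries[i], dev, guid);
		/* the block just read names the block two steps back in the
		 * chain, so slide the two-pointer window by one */
		lbps[0] = lbps[1];
		lbps[1] = lb->l2lb_back2_lbp;
	}
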
1020 +static inline uint64_t
711 1021 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
712 1022 {
713 1023 uint8_t *vdva = (uint8_t *)dva;
714 1024 uint64_t crc = -1ULL;
715 1025 int i;
716 1026
717 1027 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
718 1028
719 1029 for (i = 0; i < sizeof (dva_t); i++)
720 1030 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
721 1031
722 1032 crc ^= (spa>>8) ^ birth;
723 1033
724 1034 return (crc);
725 1035 }
726 1036
727 1037 #define BUF_EMPTY(buf) \
728 1038 ((buf)->b_dva.dva_word[0] == 0 && \
729 1039 (buf)->b_dva.dva_word[1] == 0 && \
730 1040 (buf)->b_birth == 0)
731 1041
732 1042 #define BUF_EQUAL(spa, dva, birth, buf) \
733 1043 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
734 1044 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
735 1045 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
736 1046
737 1047 static void
738 1048 buf_discard_identity(arc_buf_hdr_t *hdr)
739 1049 {
740 1050 hdr->b_dva.dva_word[0] = 0;
741 1051 hdr->b_dva.dva_word[1] = 0;
742 1052 hdr->b_birth = 0;
743 1053 hdr->b_cksum0 = 0;
744 1054 }
745 1055
746 1056 static arc_buf_hdr_t *
747 1057 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
748 1058 {
749 1059 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
750 1060 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
751 1061 arc_buf_hdr_t *buf;
752 1062
753 1063 mutex_enter(hash_lock);
754 1064 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
755 1065 buf = buf->b_hash_next) {
756 1066 if (BUF_EQUAL(spa, dva, birth, buf)) {
757 1067 *lockp = hash_lock;
758 1068 return (buf);
759 1069 }
760 1070 }
761 1071 mutex_exit(hash_lock);
762 1072 *lockp = NULL;
763 1073 return (NULL);
764 1074 }
765 1075
766 1076 /*
767 1077 * Insert an entry into the hash table. If there is already an element
768 1078 * equal to elem in the hash table, then the already existing element
769 1079 * will be returned and the new element will not be inserted.
770 1080 * Otherwise returns NULL.
771 1081 */
772 1082 static arc_buf_hdr_t *
773 1083 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
774 1084 {
775 1085 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
776 1086 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
777 1087 arc_buf_hdr_t *fbuf;
778 1088 uint32_t i;
779 1089
780 1090 ASSERT(!HDR_IN_HASH_TABLE(buf));
781 1091 *lockp = hash_lock;
782 1092 mutex_enter(hash_lock);
783 1093 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
784 1094 fbuf = fbuf->b_hash_next, i++) {
785 1095 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
786 1096 return (fbuf);
787 1097 }
788 1098
789 1099 buf->b_hash_next = buf_hash_table.ht_table[idx];
790 1100 buf_hash_table.ht_table[idx] = buf;
791 1101 buf->b_flags |= ARC_IN_HASH_TABLE;
792 1102
793 1103 /* collect some hash table performance data */
794 1104 if (i > 0) {
795 1105 ARCSTAT_BUMP(arcstat_hash_collisions);
796 1106 if (i == 1)
797 1107 ARCSTAT_BUMP(arcstat_hash_chains);
798 1108
799 1109 ARCSTAT_MAX(arcstat_hash_chain_max, i);
800 1110 }
801 1111
802 1112 ARCSTAT_BUMP(arcstat_hash_elements);
803 1113 ARCSTAT_MAXSTAT(arcstat_hash_elements);
804 1114
805 1115 return (NULL);
806 1116 }
807 1117
808 1118 static void
809 1119 buf_hash_remove(arc_buf_hdr_t *buf)
810 1120 {
811 1121 arc_buf_hdr_t *fbuf, **bufp;
812 1122 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
813 1123
814 1124 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
815 1125 ASSERT(HDR_IN_HASH_TABLE(buf));
816 1126
817 1127 bufp = &buf_hash_table.ht_table[idx];
818 1128 while ((fbuf = *bufp) != buf) {
819 1129 ASSERT(fbuf != NULL);
820 1130 bufp = &fbuf->b_hash_next;
821 1131 }
822 1132 *bufp = buf->b_hash_next;
823 1133 buf->b_hash_next = NULL;
824 1134 buf->b_flags &= ~ARC_IN_HASH_TABLE;
825 1135
826 1136 /* collect some hash table performance data */
827 1137 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
828 1138
829 1139 if (buf_hash_table.ht_table[idx] &&
830 1140 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
831 1141 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
832 1142 }
833 1143
834 1144 /*
835 1145 * Global data structures and functions for the buf kmem cache.
836 1146 */
837 1147 static kmem_cache_t *hdr_cache;
838 1148 static kmem_cache_t *buf_cache;
839 1149
840 1150 static void
841 1151 buf_fini(void)
842 1152 {
843 1153 int i;
844 1154
845 1155 kmem_free(buf_hash_table.ht_table,
846 1156 (buf_hash_table.ht_mask + 1) * sizeof (void *));
847 1157 for (i = 0; i < BUF_LOCKS; i++)
848 1158 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
849 1159 kmem_cache_destroy(hdr_cache);
850 1160 kmem_cache_destroy(buf_cache);
851 1161 }
852 1162
853 1163 /*
854 1164 * Constructor callback - called when the cache is empty
855 1165 * and a new buf is requested.
856 1166 */
857 1167 /* ARGSUSED */
858 1168 static int
859 1169 hdr_cons(void *vbuf, void *unused, int kmflag)
860 1170 {
861 1171 arc_buf_hdr_t *buf = vbuf;
862 1172
863 1173 bzero(buf, sizeof (arc_buf_hdr_t));
864 1174 refcount_create(&buf->b_refcnt);
865 1175 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
866 1176 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
867 1177 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
868 1178
869 1179 return (0);
870 1180 }
871 1181
872 1182 /* ARGSUSED */
873 1183 static int
874 1184 buf_cons(void *vbuf, void *unused, int kmflag)
875 1185 {
876 1186 arc_buf_t *buf = vbuf;
877 1187
878 1188 bzero(buf, sizeof (arc_buf_t));
879 1189 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
880 1190 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
881 1191
882 1192 return (0);
883 1193 }
884 1194
885 1195 /*
886 1196 * Destructor callback - called when a cached buf is
887 1197 * no longer required.
888 1198 */
889 1199 /* ARGSUSED */
890 1200 static void
891 1201 hdr_dest(void *vbuf, void *unused)
892 1202 {
893 1203 arc_buf_hdr_t *buf = vbuf;
894 1204
895 1205 ASSERT(BUF_EMPTY(buf));
896 1206 refcount_destroy(&buf->b_refcnt);
897 1207 cv_destroy(&buf->b_cv);
898 1208 mutex_destroy(&buf->b_freeze_lock);
899 1209 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
900 1210 }
901 1211
902 1212 /* ARGSUSED */
903 1213 static void
904 1214 buf_dest(void *vbuf, void *unused)
905 1215 {
906 1216 arc_buf_t *buf = vbuf;
907 1217
908 1218 mutex_destroy(&buf->b_evict_lock);
909 1219 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
910 1220 }
911 1221
912 1222 /*
913 1223 * Reclaim callback -- invoked when memory is low.
914 1224 */
915 1225 /* ARGSUSED */
916 1226 static void
917 1227 hdr_recl(void *unused)
918 1228 {
919 1229 dprintf("hdr_recl called\n");
920 1230 /*
921 1231 * umem calls the reclaim func when we destroy the buf cache,
922 1232 * which is after we do arc_fini().
923 1233 */
924 1234 if (!arc_dead)
925 1235 cv_signal(&arc_reclaim_thr_cv);
926 1236 }
927 1237
928 1238 static void
929 1239 buf_init(void)
930 1240 {
931 1241 uint64_t *ct;
932 1242 uint64_t hsize = 1ULL << 12;
933 1243 int i, j;
934 1244
935 1245 /*
936 1246 * The hash table is big enough to fill all of physical memory
937 1247 * with an average 64K block size. The table will take up
938 1248 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
939 1249 */
940 1250 while (hsize * 65536 < physmem * PAGESIZE)
941 1251 hsize <<= 1;
942 1252 retry:
943 1253 buf_hash_table.ht_mask = hsize - 1;
944 1254 buf_hash_table.ht_table =
945 1255 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
946 1256 if (buf_hash_table.ht_table == NULL) {
947 1257 ASSERT(hsize > (1ULL << 8));
948 1258 hsize >>= 1;
949 1259 goto retry;
950 1260 }
951 1261
952 1262 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
953 1263 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
954 1264 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
955 1265 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
956 1266
957 1267 for (i = 0; i < 256; i++)
958 1268 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
959 1269 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
960 1270
961 1271 for (i = 0; i < BUF_LOCKS; i++) {
962 1272 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
963 1273 NULL, MUTEX_DEFAULT, NULL);
964 1274 }
965 1275 }
966 1276
967 1277 #define ARC_MINTIME (hz>>4) /* 62 ms */
968 1278
969 1279 static void
970 1280 arc_cksum_verify(arc_buf_t *buf)
971 1281 {
972 1282 zio_cksum_t zc;
973 1283
974 1284 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
975 1285 return;
976 1286
977 1287 mutex_enter(&buf->b_hdr->b_freeze_lock);
978 1288 if (buf->b_hdr->b_freeze_cksum == NULL ||
979 1289 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
980 1290 mutex_exit(&buf->b_hdr->b_freeze_lock);
981 1291 return;
982 1292 }
983 1293 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
984 1294 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
985 1295 panic("buffer modified while frozen!");
986 1296 mutex_exit(&buf->b_hdr->b_freeze_lock);
987 1297 }
988 1298
989 1299 static int
990 1300 arc_cksum_equal(arc_buf_t *buf)
991 1301 {
992 1302 zio_cksum_t zc;
993 1303 int equal;
994 1304
995 1305 mutex_enter(&buf->b_hdr->b_freeze_lock);
996 1306 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
997 1307 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
998 1308 mutex_exit(&buf->b_hdr->b_freeze_lock);
999 1309
1000 1310 return (equal);
1001 1311 }
1002 1312
1003 1313 static void
1004 1314 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1005 1315 {
1006 1316 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1007 1317 return;
1008 1318
1009 1319 mutex_enter(&buf->b_hdr->b_freeze_lock);
1010 1320 if (buf->b_hdr->b_freeze_cksum != NULL) {
1011 1321 mutex_exit(&buf->b_hdr->b_freeze_lock);
1012 1322 return;
1013 1323 }
1014 1324 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1015 1325 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1016 1326 buf->b_hdr->b_freeze_cksum);
1017 1327 mutex_exit(&buf->b_hdr->b_freeze_lock);
1018 1328 arc_buf_watch(buf);
1019 1329 }
1020 1330
1021 1331 #ifndef _KERNEL
1022 1332 typedef struct procctl {
1023 1333 long cmd;
1024 1334 prwatch_t prwatch;
1025 1335 } procctl_t;
1026 1336 #endif
1027 1337
1028 1338 /* ARGSUSED */
1029 1339 static void
1030 1340 arc_buf_unwatch(arc_buf_t *buf)
1031 1341 {
1032 1342 #ifndef _KERNEL
1033 1343 if (arc_watch) {
1034 1344 int result;
1035 1345 procctl_t ctl;
1036 1346 ctl.cmd = PCWATCH;
1037 1347 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1038 1348 ctl.prwatch.pr_size = 0;
1039 1349 ctl.prwatch.pr_wflags = 0;
1040 1350 result = write(arc_procfd, &ctl, sizeof (ctl));
1041 1351 ASSERT3U(result, ==, sizeof (ctl));
1042 1352 }
1043 1353 #endif
1044 1354 }
1045 1355
1046 1356 /* ARGSUSED */
1047 1357 static void
1048 1358 arc_buf_watch(arc_buf_t *buf)
1049 1359 {
1050 1360 #ifndef _KERNEL
1051 1361 if (arc_watch) {
1052 1362 int result;
1053 1363 procctl_t ctl;
1054 1364 ctl.cmd = PCWATCH;
1055 1365 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1056 1366 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1057 1367 ctl.prwatch.pr_wflags = WA_WRITE;
1058 1368 result = write(arc_procfd, &ctl, sizeof (ctl));
1059 1369 ASSERT3U(result, ==, sizeof (ctl));
1060 1370 }
1061 1371 #endif
1062 1372 }
1063 1373
1064 1374 void
1065 1375 arc_buf_thaw(arc_buf_t *buf)
1066 1376 {
1067 1377 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1068 1378 if (buf->b_hdr->b_state != arc_anon)
1069 1379 panic("modifying non-anon buffer!");
1070 1380 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1071 1381 panic("modifying buffer while i/o in progress!");
1072 1382 arc_cksum_verify(buf);
1073 1383 }
1074 1384
1075 1385 mutex_enter(&buf->b_hdr->b_freeze_lock);
1076 1386 if (buf->b_hdr->b_freeze_cksum != NULL) {
1077 1387 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1078 1388 buf->b_hdr->b_freeze_cksum = NULL;
1079 1389 }
1080 1390
1081 1391 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1082 1392 if (buf->b_hdr->b_thawed)
1083 1393 kmem_free(buf->b_hdr->b_thawed, 1);
1084 1394 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1085 1395 }
1086 1396
1087 1397 mutex_exit(&buf->b_hdr->b_freeze_lock);
1088 1398
1089 1399 arc_buf_unwatch(buf);
1090 1400 }
1091 1401
1092 1402 void
1093 1403 arc_buf_freeze(arc_buf_t *buf)
1094 1404 {
1095 1405 kmutex_t *hash_lock;
1096 1406
1097 1407 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1098 1408 return;
1099 1409
1100 1410 hash_lock = HDR_LOCK(buf->b_hdr);
1101 1411 mutex_enter(hash_lock);
1102 1412
1103 1413 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1104 1414 buf->b_hdr->b_state == arc_anon);
1105 1415 arc_cksum_compute(buf, B_FALSE);
1106 1416 mutex_exit(hash_lock);
1107 1417
1108 1418 }
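The freeze/thaw helpers above implement the ZFS_DEBUG_MODIFY machinery: a frozen buffer carries b_freeze_cksum, and arc_cksum_verify() panics if the data changed while frozen. The following is an illustrative sketch, not part of this webrev, of how a caller is expected to bracket a legitimate modification of an anonymous buffer; the bzero() (a standard kernel routine) merely stands in for whatever mutation the caller performs.

	static void
	example_modify_frozen_buf(arc_buf_t *buf)
	{
		arc_buf_thaw(buf);		/* discard the frozen checksum */
		bzero(buf->b_data, arc_buf_size(buf));	/* legitimate change */
		arc_buf_freeze(buf);		/* recompute and re-arm the check */
	}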
1109 1419
1110 1420 static void
1111 1421 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1112 1422 {
1113 1423 ASSERT(MUTEX_HELD(hash_lock));
1114 1424
1115 1425 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1116 1426 (ab->b_state != arc_anon)) {
1117 1427 uint64_t delta = ab->b_size * ab->b_datacnt;
1118 1428 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1119 1429 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1120 1430
1121 1431 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1122 1432 mutex_enter(&ab->b_state->arcs_mtx);
1123 1433 ASSERT(list_link_active(&ab->b_arc_node));
1124 1434 list_remove(list, ab);
1125 1435 if (GHOST_STATE(ab->b_state)) {
1126 1436 ASSERT0(ab->b_datacnt);
1127 1437 ASSERT3P(ab->b_buf, ==, NULL);
1128 1438 delta = ab->b_size;
1129 1439 }
1130 1440 ASSERT(delta > 0);
1131 1441 ASSERT3U(*size, >=, delta);
1132 1442 atomic_add_64(size, -delta);
1133 1443 mutex_exit(&ab->b_state->arcs_mtx);
1134 1444 /* remove the prefetch flag if we get a reference */
1135 1445 if (ab->b_flags & ARC_PREFETCH)
1136 1446 ab->b_flags &= ~ARC_PREFETCH;
1137 1447 }
1138 1448 }
1139 1449
1140 1450 static int
1141 1451 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1142 1452 {
1143 1453 int cnt;
1144 1454 arc_state_t *state = ab->b_state;
1145 1455
1146 1456 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1147 1457 ASSERT(!GHOST_STATE(state));
1148 1458
1149 1459 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1150 1460 (state != arc_anon)) {
1151 1461 uint64_t *size = &state->arcs_lsize[ab->b_type];
1152 1462
1153 1463 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1154 1464 mutex_enter(&state->arcs_mtx);
1155 1465 ASSERT(!list_link_active(&ab->b_arc_node));
1156 1466 list_insert_head(&state->arcs_list[ab->b_type], ab);
1157 1467 ASSERT(ab->b_datacnt > 0);
1158 1468 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1159 1469 mutex_exit(&state->arcs_mtx);
1160 1470 }
1161 1471 return (cnt);
1162 1472 }
1163 1473
1164 1474 /*
1165 1475 * Move the supplied buffer to the indicated state. The mutex
1166 1476 * for the buffer must be held by the caller.
1167 1477 */
1168 1478 static void
1169 1479 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1170 1480 {
1171 1481 arc_state_t *old_state = ab->b_state;
1172 1482 int64_t refcnt = refcount_count(&ab->b_refcnt);
1173 1483 uint64_t from_delta, to_delta;
1174 1484
1175 1485 ASSERT(MUTEX_HELD(hash_lock));
1176 1486 ASSERT3P(new_state, !=, old_state);
1177 1487 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1178 1488 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1179 1489 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1180 1490
1181 1491 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1182 1492
1183 1493 /*
1184 1494 * If this buffer is evictable, transfer it from the
1185 1495 * old state list to the new state list.
1186 1496 */
1187 1497 if (refcnt == 0) {
1188 1498 if (old_state != arc_anon) {
1189 1499 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1190 1500 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1191 1501
1192 1502 if (use_mutex)
1193 1503 mutex_enter(&old_state->arcs_mtx);
1194 1504
1195 1505 ASSERT(list_link_active(&ab->b_arc_node));
1196 1506 list_remove(&old_state->arcs_list[ab->b_type], ab);
1197 1507
1198 1508 /*
1199 1509 * If prefetching out of the ghost cache,
1200 1510 * we will have a non-zero datacnt.
1201 1511 */
1202 1512 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1203 1513 /* ghost elements have a ghost size */
1204 1514 ASSERT(ab->b_buf == NULL);
1205 1515 from_delta = ab->b_size;
1206 1516 }
1207 1517 ASSERT3U(*size, >=, from_delta);
1208 1518 atomic_add_64(size, -from_delta);
1209 1519
1210 1520 if (use_mutex)
1211 1521 mutex_exit(&old_state->arcs_mtx);
1212 1522 }
1213 1523 if (new_state != arc_anon) {
1214 1524 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1215 1525 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1216 1526
1217 1527 if (use_mutex)
1218 1528 mutex_enter(&new_state->arcs_mtx);
1219 1529
1220 1530 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1221 1531
1222 1532 /* ghost elements have a ghost size */
1223 1533 if (GHOST_STATE(new_state)) {
1224 1534 ASSERT(ab->b_datacnt == 0);
1225 1535 ASSERT(ab->b_buf == NULL);
1226 1536 to_delta = ab->b_size;
1227 1537 }
1228 1538 atomic_add_64(size, to_delta);
1229 1539
1230 1540 if (use_mutex)
1231 1541 mutex_exit(&new_state->arcs_mtx);
1232 1542 }
1233 1543 }
1234 1544
1235 1545 ASSERT(!BUF_EMPTY(ab));
1236 1546 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1237 1547 buf_hash_remove(ab);
1238 1548
1239 1549 /* adjust state sizes */
1240 1550 if (to_delta)
1241 1551 atomic_add_64(&new_state->arcs_size, to_delta);
1242 1552 if (from_delta) {
1243 1553 ASSERT3U(old_state->arcs_size, >=, from_delta);
1244 1554 atomic_add_64(&old_state->arcs_size, -from_delta);
1245 1555 }
1246 1556 ab->b_state = new_state;
1247 1557
1248 1558 /* adjust l2arc hdr stats */
1249 1559 if (new_state == arc_l2c_only)
1250 - l2arc_hdr_stat_add();
1560 + l2arc_hdr_stat_add(old_state != arc_anon);
1251 1561 else if (old_state == arc_l2c_only)
1252 1562 l2arc_hdr_stat_remove();
1253 1563 }
1254 1564
1255 1565 void
1256 1566 arc_space_consume(uint64_t space, arc_space_type_t type)
1257 1567 {
1258 1568 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259 1569
1260 1570 switch (type) {
1261 1571 case ARC_SPACE_DATA:
1262 1572 ARCSTAT_INCR(arcstat_data_size, space);
1263 1573 break;
1264 1574 case ARC_SPACE_OTHER:
1265 1575 ARCSTAT_INCR(arcstat_other_size, space);
1266 1576 break;
1267 1577 case ARC_SPACE_HDRS:
1268 1578 ARCSTAT_INCR(arcstat_hdr_size, space);
1269 1579 break;
1270 1580 case ARC_SPACE_L2HDRS:
1271 1581 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272 1582 break;
1273 1583 }
1274 1584
1275 1585 ARCSTAT_INCR(arcstat_meta_used, space);
1276 1586 atomic_add_64(&arc_size, space);
1277 1587 }
1278 1588
1279 1589 void
1280 1590 arc_space_return(uint64_t space, arc_space_type_t type)
1281 1591 {
1282 1592 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283 1593
1284 1594 switch (type) {
1285 1595 case ARC_SPACE_DATA:
1286 1596 ARCSTAT_INCR(arcstat_data_size, -space);
1287 1597 break;
1288 1598 case ARC_SPACE_OTHER:
1289 1599 ARCSTAT_INCR(arcstat_other_size, -space);
1290 1600 break;
1291 1601 case ARC_SPACE_HDRS:
1292 1602 ARCSTAT_INCR(arcstat_hdr_size, -space);
1293 1603 break;
1294 1604 case ARC_SPACE_L2HDRS:
1295 1605 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1296 1606 break;
1297 1607 }
1298 1608
1299 1609 ASSERT(arc_meta_used >= space);
1300 1610 if (arc_meta_max < arc_meta_used)
1301 1611 arc_meta_max = arc_meta_used;
1302 1612 ARCSTAT_INCR(arcstat_meta_used, -space);
1303 1613 ASSERT(arc_size >= space);
1304 1614 atomic_add_64(&arc_size, -space);
1305 1615 }
1306 1616
1307 1617 void *
1308 1618 arc_data_buf_alloc(uint64_t size)
1309 1619 {
1310 1620 if (arc_evict_needed(ARC_BUFC_DATA))
1311 1621 cv_signal(&arc_reclaim_thr_cv);
1312 1622 atomic_add_64(&arc_size, size);
1313 1623 return (zio_data_buf_alloc(size));
1314 1624 }
1315 1625
1316 1626 void
1317 1627 arc_data_buf_free(void *buf, uint64_t size)
1318 1628 {
1319 1629 zio_data_buf_free(buf, size);
1320 1630 ASSERT(arc_size >= size);
1321 1631 atomic_add_64(&arc_size, -size);
1322 1632 }
1323 1633
1324 1634 arc_buf_t *
1325 1635 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1326 1636 {
1327 1637 arc_buf_hdr_t *hdr;
1328 1638 arc_buf_t *buf;
1329 1639
1330 1640 ASSERT3U(size, >, 0);
1331 1641 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1332 1642 ASSERT(BUF_EMPTY(hdr));
1333 1643 hdr->b_size = size;
1334 1644 hdr->b_type = type;
1335 1645 hdr->b_spa = spa_load_guid(spa);
1336 1646 hdr->b_state = arc_anon;
1337 1647 hdr->b_arc_access = 0;
1338 1648 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1339 1649 buf->b_hdr = hdr;
1340 1650 buf->b_data = NULL;
1341 1651 buf->b_efunc = NULL;
1342 1652 buf->b_private = NULL;
1343 1653 buf->b_next = NULL;
1344 1654 hdr->b_buf = buf;
1345 1655 arc_get_data_buf(buf);
1346 1656 hdr->b_datacnt = 1;
1347 1657 hdr->b_flags = 0;
1348 1658 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349 1659 (void) refcount_add(&hdr->b_refcnt, tag);
1350 1660
1351 1661 return (buf);
1352 1662 }
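arc_buf_alloc() returns an anonymous buffer with a single reference held by the caller's tag. A minimal sketch of the allocate/use/free cycle follows; it is illustrative only and not part of this webrev (FTAG and SPA_MINBLOCKSIZE are standard kernel conveniences, and the size is arbitrary).

	static void
	example_anon_buf(spa_t *spa)
	{
		arc_buf_t *buf = arc_buf_alloc(spa, SPA_MINBLOCKSIZE, FTAG,
		    ARC_BUFC_DATA);

		bzero(buf->b_data, arc_buf_size(buf));	/* caller owns b_data */
		arc_buf_free(buf, FTAG);	/* drops the sole ref; hdr destroyed */
	}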
1353 1663
1664 +/*
1665 + * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
 1666 + * This is used during l2arc reconstruction to create empty ARC buffer
 1667 + * headers that circumvent the regular disk->arc->l2arc path and instead come
1668 + * into being in the reverse order, i.e. l2arc->arc->(disk).
1669 + */
1670 +arc_buf_hdr_t *
1671 +arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1672 +{
1673 + arc_buf_hdr_t *hdr;
1674 +
1675 + ASSERT3U(size, >, 0);
1676 + hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1677 + ASSERT(BUF_EMPTY(hdr));
1678 + hdr->b_size = size;
1679 + hdr->b_type = type;
1680 + hdr->b_spa = guid;
1681 + hdr->b_state = arc_anon;
1682 + hdr->b_arc_access = 0;
1683 + hdr->b_buf = NULL;
1684 + hdr->b_datacnt = 0;
1685 + hdr->b_flags = 0;
1686 + ASSERT(refcount_is_zero(&hdr->b_refcnt));
1687 +
1688 + return (hdr);
1689 +}
1690 +
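A brief contrast with arc_buf_alloc() above, illustrative only and not part of this webrev: the header produced here has no arc_buf_t attached and holds no reference, so b_datacnt is 0. The reconstruction code elsewhere in this patch is responsible for establishing the block's identity and its L2ARC location; those steps are deliberately omitted here.

	static arc_buf_hdr_t *
	example_rebuild_hdr(uint64_t guid, int size)
	{
		arc_buf_hdr_t *hdr = arc_buf_hdr_alloc(guid, size, ARC_BUFC_DATA);

		ASSERT(hdr->b_buf == NULL && hdr->b_datacnt == 0);
		/* b_dva/b_birth and l2hdr wiring intentionally omitted */
		return (hdr);
	}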
1354 1691 static char *arc_onloan_tag = "onloan";
1355 1692
1356 1693 /*
1357 1694 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1358 1695 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1359 1696 * buffers must be returned to the arc before they can be used by the DMU or
1360 1697 * freed.
1361 1698 */
1362 1699 arc_buf_t *
1363 1700 arc_loan_buf(spa_t *spa, int size)
1364 1701 {
1365 1702 arc_buf_t *buf;
1366 1703
1367 1704 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1368 1705
1369 1706 atomic_add_64(&arc_loaned_bytes, size);
1370 1707 return (buf);
1371 1708 }
1372 1709
1373 1710 /*
1374 1711 * Return a loaned arc buffer to the arc.
1375 1712 */
1376 1713 void
1377 1714 arc_return_buf(arc_buf_t *buf, void *tag)
1378 1715 {
1379 1716 arc_buf_hdr_t *hdr = buf->b_hdr;
1380 1717
1381 1718 ASSERT(buf->b_data != NULL);
1382 1719 (void) refcount_add(&hdr->b_refcnt, tag);
1383 1720 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1384 1721
1385 1722 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1386 1723 }
1387 1724
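A minimal sketch of the loan/return cycle described above, illustrative only and not part of this webrev; owner_tag is whatever reference tag the consumer will hold the buffer under once it has been returned.

	static void
	example_loan_cycle(spa_t *spa, void *owner_tag)
	{
		arc_buf_t *buf = arc_loan_buf(spa, SPA_MINBLOCKSIZE);

		bzero(buf->b_data, arc_buf_size(buf));	/* fill while on loan */
		arc_return_buf(buf, owner_tag);	/* re-tag; no longer "loaned" */
		arc_buf_free(buf, owner_tag);	/* owner drops its reference */
	}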
1388 1725 /* Detach an arc_buf from a dbuf (tag) */
1389 1726 void
1390 1727 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1391 1728 {
1392 1729 arc_buf_hdr_t *hdr;
1393 1730
1394 1731 ASSERT(buf->b_data != NULL);
1395 1732 hdr = buf->b_hdr;
1396 1733 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1397 1734 (void) refcount_remove(&hdr->b_refcnt, tag);
1398 1735 buf->b_efunc = NULL;
1399 1736 buf->b_private = NULL;
1400 1737
1401 1738 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1402 1739 }
1403 1740
1404 1741 static arc_buf_t *
1405 1742 arc_buf_clone(arc_buf_t *from)
1406 1743 {
1407 1744 arc_buf_t *buf;
1408 1745 arc_buf_hdr_t *hdr = from->b_hdr;
1409 1746 uint64_t size = hdr->b_size;
1410 1747
1411 1748 ASSERT(hdr->b_state != arc_anon);
1412 1749
1413 1750 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1414 1751 buf->b_hdr = hdr;
1415 1752 buf->b_data = NULL;
1416 1753 buf->b_efunc = NULL;
1417 1754 buf->b_private = NULL;
1418 1755 buf->b_next = hdr->b_buf;
1419 1756 hdr->b_buf = buf;
1420 1757 arc_get_data_buf(buf);
1421 1758 bcopy(from->b_data, buf->b_data, size);
1422 1759
1423 1760 /*
1424 1761 * This buffer already exists in the arc so create a duplicate
1425 1762 * copy for the caller. If the buffer is associated with user data
1426 1763 * then track the size and number of duplicates. These stats will be
1427 1764 * updated as duplicate buffers are created and destroyed.
1428 1765 */
1429 1766 if (hdr->b_type == ARC_BUFC_DATA) {
1430 1767 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1431 1768 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1432 1769 }
1433 1770 hdr->b_datacnt += 1;
1434 1771 return (buf);
1435 1772 }
1436 1773
1437 1774 void
1438 1775 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1439 1776 {
1440 1777 arc_buf_hdr_t *hdr;
1441 1778 kmutex_t *hash_lock;
1442 1779
1443 1780 /*
1444 1781 * Check to see if this buffer is evicted. Callers
1445 1782 * must verify b_data != NULL to know if the add_ref
1446 1783 * was successful.
1447 1784 */
1448 1785 mutex_enter(&buf->b_evict_lock);
1449 1786 if (buf->b_data == NULL) {
1450 1787 mutex_exit(&buf->b_evict_lock);
1451 1788 return;
1452 1789 }
1453 1790 hash_lock = HDR_LOCK(buf->b_hdr);
1454 1791 mutex_enter(hash_lock);
1455 1792 hdr = buf->b_hdr;
1456 1793 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1457 1794 mutex_exit(&buf->b_evict_lock);
1458 1795
1459 1796 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1460 1797 add_reference(hdr, hash_lock, tag);
1461 1798 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1462 1799 arc_access(hdr, hash_lock);
1463 1800 mutex_exit(hash_lock);
1464 1801 ARCSTAT_BUMP(arcstat_hits);
1465 1802 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1466 1803 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1467 1804 data, metadata, hits);
1468 1805 }
1469 1806
1470 1807 /*
1471 1808 * Free the arc data buffer. If it is an l2arc write in progress,
1472 1809 * the buffer is placed on l2arc_free_on_write to be freed later.
1473 1810 */
1474 1811 static void
1475 1812 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1476 1813 {
1477 1814 arc_buf_hdr_t *hdr = buf->b_hdr;
1478 1815
1479 1816 if (HDR_L2_WRITING(hdr)) {
1480 1817 l2arc_data_free_t *df;
1481 1818 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1482 1819 df->l2df_data = buf->b_data;
1483 1820 df->l2df_size = hdr->b_size;
1484 1821 df->l2df_func = free_func;
1485 1822 mutex_enter(&l2arc_free_on_write_mtx);
1486 1823 list_insert_head(l2arc_free_on_write, df);
1487 1824 mutex_exit(&l2arc_free_on_write_mtx);
1488 1825 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1489 1826 } else {
1490 1827 free_func(buf->b_data, hdr->b_size);
1491 1828 }
1492 1829 }
1493 1830
1494 1831 static void
1495 1832 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1496 1833 {
1497 1834 arc_buf_t **bufp;
1498 1835
1499 1836 /* free up data associated with the buf */
1500 1837 if (buf->b_data) {
1501 1838 arc_state_t *state = buf->b_hdr->b_state;
1502 1839 uint64_t size = buf->b_hdr->b_size;
1503 1840 arc_buf_contents_t type = buf->b_hdr->b_type;
1504 1841
1505 1842 arc_cksum_verify(buf);
1506 1843 arc_buf_unwatch(buf);
1507 1844
1508 1845 if (!recycle) {
1509 1846 if (type == ARC_BUFC_METADATA) {
1510 1847 arc_buf_data_free(buf, zio_buf_free);
1511 1848 arc_space_return(size, ARC_SPACE_DATA);
1512 1849 } else {
1513 1850 ASSERT(type == ARC_BUFC_DATA);
1514 1851 arc_buf_data_free(buf, zio_data_buf_free);
1515 1852 ARCSTAT_INCR(arcstat_data_size, -size);
1516 1853 atomic_add_64(&arc_size, -size);
1517 1854 }
1518 1855 }
1519 1856 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1520 1857 uint64_t *cnt = &state->arcs_lsize[type];
1521 1858
1522 1859 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1523 1860 ASSERT(state != arc_anon);
1524 1861
1525 1862 ASSERT3U(*cnt, >=, size);
1526 1863 atomic_add_64(cnt, -size);
1527 1864 }
1528 1865 ASSERT3U(state->arcs_size, >=, size);
1529 1866 atomic_add_64(&state->arcs_size, -size);
1530 1867 buf->b_data = NULL;
1531 1868
1532 1869 /*
1533 1870 * If we're destroying a duplicate buffer make sure
1534 1871 * that the appropriate statistics are updated.
1535 1872 */
1536 1873 if (buf->b_hdr->b_datacnt > 1 &&
1537 1874 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1538 1875 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1539 1876 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1540 1877 }
1541 1878 ASSERT(buf->b_hdr->b_datacnt > 0);
1542 1879 buf->b_hdr->b_datacnt -= 1;
1543 1880 }
1544 1881
1545 1882 /* only remove the buf if requested */
1546 1883 if (!all)
1547 1884 return;
1548 1885
1549 1886 /* remove the buf from the hdr list */
1550 1887 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1551 1888 continue;
1552 1889 *bufp = buf->b_next;
1553 1890 buf->b_next = NULL;
1554 1891
1555 1892 ASSERT(buf->b_efunc == NULL);
1556 1893
1557 1894 /* clean up the buf */
1558 1895 buf->b_hdr = NULL;
1559 1896 kmem_cache_free(buf_cache, buf);
1560 1897 }
1561 1898
1562 1899 static void
1563 1900 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1564 1901 {
1565 1902 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1566 1903 ASSERT3P(hdr->b_state, ==, arc_anon);
1567 1904 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1568 1905 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1569 1906
1570 1907 if (l2hdr != NULL) {
1571 1908 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1572 1909 /*
1573 1910 * To prevent arc_free() and l2arc_evict() from
1574 1911 * attempting to free the same buffer at the same time,
1575 1912 * a FREE_IN_PROGRESS flag is given to arc_free() to
1576 1913 * give it priority. l2arc_evict() can't destroy this
1577 1914 * header while we are waiting on l2arc_buflist_mtx.
1578 1915 *
1579 1916 * The hdr may be removed from l2ad_buflist before we
1580 1917 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1581 1918 */
1582 1919 if (!buflist_held) {
1583 1920 mutex_enter(&l2arc_buflist_mtx);
1584 1921 l2hdr = hdr->b_l2hdr;
1585 1922 }
1586 1923
1587 1924 if (l2hdr != NULL) {
1588 1925 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1589 1926 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1590 1927 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1591 - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1928 + kmem_free(l2hdr, sizeof (*l2hdr));
1592 1929 if (hdr->b_state == arc_l2c_only)
1593 1930 l2arc_hdr_stat_remove();
1594 1931 hdr->b_l2hdr = NULL;
1595 1932 }
1596 1933
1597 1934 if (!buflist_held)
1598 1935 mutex_exit(&l2arc_buflist_mtx);
1599 1936 }
1600 1937
1601 1938 if (!BUF_EMPTY(hdr)) {
1602 1939 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1603 1940 buf_discard_identity(hdr);
1604 1941 }
1605 1942 while (hdr->b_buf) {
1606 1943 arc_buf_t *buf = hdr->b_buf;
1607 1944
1608 1945 if (buf->b_efunc) {
1609 1946 mutex_enter(&arc_eviction_mtx);
1610 1947 mutex_enter(&buf->b_evict_lock);
1611 1948 ASSERT(buf->b_hdr != NULL);
1612 1949 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1613 1950 hdr->b_buf = buf->b_next;
1614 1951 buf->b_hdr = &arc_eviction_hdr;
1615 1952 buf->b_next = arc_eviction_list;
1616 1953 arc_eviction_list = buf;
1617 1954 mutex_exit(&buf->b_evict_lock);
1618 1955 mutex_exit(&arc_eviction_mtx);
1619 1956 } else {
1620 1957 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1621 1958 }
1622 1959 }
1623 1960 if (hdr->b_freeze_cksum != NULL) {
1624 1961 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1625 1962 hdr->b_freeze_cksum = NULL;
1626 1963 }
1627 1964 if (hdr->b_thawed) {
1628 1965 kmem_free(hdr->b_thawed, 1);
1629 1966 hdr->b_thawed = NULL;
1630 1967 }
1631 1968
1632 1969 ASSERT(!list_link_active(&hdr->b_arc_node));
1633 1970 ASSERT3P(hdr->b_hash_next, ==, NULL);
1634 1971 ASSERT3P(hdr->b_acb, ==, NULL);
1635 1972 kmem_cache_free(hdr_cache, hdr);
1636 1973 }
1637 1974
1638 1975 void
1639 1976 arc_buf_free(arc_buf_t *buf, void *tag)
1640 1977 {
1641 1978 arc_buf_hdr_t *hdr = buf->b_hdr;
1642 1979 int hashed = hdr->b_state != arc_anon;
1643 1980
1644 1981 ASSERT(buf->b_efunc == NULL);
1645 1982 ASSERT(buf->b_data != NULL);
1646 1983
1647 1984 if (hashed) {
1648 1985 kmutex_t *hash_lock = HDR_LOCK(hdr);
1649 1986
1650 1987 mutex_enter(hash_lock);
1651 1988 hdr = buf->b_hdr;
1652 1989 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1653 1990
1654 1991 (void) remove_reference(hdr, hash_lock, tag);
1655 1992 if (hdr->b_datacnt > 1) {
1656 1993 arc_buf_destroy(buf, FALSE, TRUE);
1657 1994 } else {
1658 1995 ASSERT(buf == hdr->b_buf);
1659 1996 ASSERT(buf->b_efunc == NULL);
1660 1997 hdr->b_flags |= ARC_BUF_AVAILABLE;
1661 1998 }
1662 1999 mutex_exit(hash_lock);
1663 2000 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1664 2001 int destroy_hdr;
1665 2002 /*
1666 2003 * We are in the middle of an async write. Don't destroy
1667 2004 * this buffer unless the write completes before we finish
1668 2005 * decrementing the reference count.
1669 2006 */
1670 2007 mutex_enter(&arc_eviction_mtx);
1671 2008 (void) remove_reference(hdr, NULL, tag);
1672 2009 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1673 2010 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1674 2011 mutex_exit(&arc_eviction_mtx);
1675 2012 if (destroy_hdr)
1676 2013 arc_hdr_destroy(hdr);
1677 2014 } else {
1678 2015 if (remove_reference(hdr, NULL, tag) > 0)
1679 2016 arc_buf_destroy(buf, FALSE, TRUE);
1680 2017 else
1681 2018 arc_hdr_destroy(hdr);
1682 2019 }
1683 2020 }
1684 2021
1685 2022 boolean_t
1686 2023 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1687 2024 {
1688 2025 arc_buf_hdr_t *hdr = buf->b_hdr;
1689 2026 kmutex_t *hash_lock = HDR_LOCK(hdr);
1690 2027 boolean_t no_callback = (buf->b_efunc == NULL);
1691 2028
1692 2029 if (hdr->b_state == arc_anon) {
1693 2030 ASSERT(hdr->b_datacnt == 1);
1694 2031 arc_buf_free(buf, tag);
1695 2032 return (no_callback);
1696 2033 }
1697 2034
1698 2035 mutex_enter(hash_lock);
1699 2036 hdr = buf->b_hdr;
1700 2037 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1701 2038 ASSERT(hdr->b_state != arc_anon);
1702 2039 ASSERT(buf->b_data != NULL);
1703 2040
1704 2041 (void) remove_reference(hdr, hash_lock, tag);
1705 2042 if (hdr->b_datacnt > 1) {
1706 2043 if (no_callback)
1707 2044 arc_buf_destroy(buf, FALSE, TRUE);
1708 2045 } else if (no_callback) {
1709 2046 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1710 2047 ASSERT(buf->b_efunc == NULL);
1711 2048 hdr->b_flags |= ARC_BUF_AVAILABLE;
1712 2049 }
1713 2050 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1714 2051 refcount_is_zero(&hdr->b_refcnt));
1715 2052 mutex_exit(hash_lock);
1716 2053 return (no_callback);
1717 2054 }
1718 2055
1719 2056 int
1720 2057 arc_buf_size(arc_buf_t *buf)
1721 2058 {
1722 2059 return (buf->b_hdr->b_size);
1723 2060 }
1724 2061
1725 2062 /*
1726 2063 * Called from the DMU to determine if the current buffer should be
1727 2064 * evicted. In order to ensure proper locking, the eviction must be initiated
1728 2065 * from the DMU. Return true if the buffer is associated with user data and
1729 2066 * duplicate buffers still exist.
1730 2067 */
1731 2068 boolean_t
1732 2069 arc_buf_eviction_needed(arc_buf_t *buf)
1733 2070 {
1734 2071 arc_buf_hdr_t *hdr;
1735 2072 boolean_t evict_needed = B_FALSE;
1736 2073
1737 2074 if (zfs_disable_dup_eviction)
1738 2075 return (B_FALSE);
1739 2076
1740 2077 mutex_enter(&buf->b_evict_lock);
1741 2078 hdr = buf->b_hdr;
1742 2079 if (hdr == NULL) {
1743 2080 /*
1744 2081 * We are in arc_do_user_evicts(); let that function
1745 2082 * perform the eviction.
1746 2083 */
1747 2084 ASSERT(buf->b_data == NULL);
1748 2085 mutex_exit(&buf->b_evict_lock);
1749 2086 return (B_FALSE);
1750 2087 } else if (buf->b_data == NULL) {
1751 2088 /*
1752 2089 * We have already been added to the arc eviction list;
1753 2090 * recommend eviction.
1754 2091 */
1755 2092 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1756 2093 mutex_exit(&buf->b_evict_lock);
1757 2094 return (B_TRUE);
1758 2095 }
1759 2096
1760 2097 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1761 2098 evict_needed = B_TRUE;
1762 2099
1763 2100 mutex_exit(&buf->b_evict_lock);
1764 2101 return (evict_needed);
1765 2102 }
1766 2103
1767 2104 /*
1768 2105 * Evict buffers from list until we've removed the specified number of
1769 2106 * bytes. Move the removed buffers to the appropriate evict state.
1770 2107 * If the recycle flag is set, then attempt to "recycle" a buffer:
1771 2108 * - look for a buffer to evict that is `bytes' long.
1772 2109 * - return the data block from this buffer rather than freeing it.
1773 2110 * This flag is used by callers that are trying to make space for a
1774 2111 * new buffer in a full arc cache.
1775 2112 *
1776 2113 * This function makes a "best effort". It skips over any buffers
1777 2114 * it can't get a hash_lock on, and so may not catch all candidates.
1778 2115 * It may also return without evicting as much space as requested.
1779 2116 */
1780 2117 static void *
1781 2118 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1782 2119 arc_buf_contents_t type)
1783 2120 {
1784 2121 arc_state_t *evicted_state;
1785 2122 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1786 2123 arc_buf_hdr_t *ab, *ab_prev = NULL;
1787 2124 list_t *list = &state->arcs_list[type];
1788 2125 kmutex_t *hash_lock;
1789 2126 boolean_t have_lock;
1790 2127 void *stolen = NULL;
1791 2128 arc_buf_hdr_t marker = { 0 };
1792 2129 int count = 0;
1793 2130
1794 2131 ASSERT(state == arc_mru || state == arc_mfu);
1795 2132
1796 2133 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1797 2134
1798 2135 mutex_enter(&state->arcs_mtx);
1799 2136 mutex_enter(&evicted_state->arcs_mtx);
1800 2137
1801 2138 for (ab = list_tail(list); ab; ab = ab_prev) {
1802 2139 ab_prev = list_prev(list, ab);
1803 2140 /* prefetch buffers have a minimum lifespan */
1804 2141 if (HDR_IO_IN_PROGRESS(ab) ||
1805 2142 (spa && ab->b_spa != spa) ||
1806 2143 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1807 2144 ddi_get_lbolt() - ab->b_arc_access <
1808 2145 arc_min_prefetch_lifespan)) {
1809 2146 skipped++;
1810 2147 continue;
1811 2148 }
1812 2149 /* "lookahead" for better eviction candidate */
1813 2150 if (recycle && ab->b_size != bytes &&
1814 2151 ab_prev && ab_prev->b_size == bytes)
1815 2152 continue;
1816 2153
1817 2154 /* ignore markers */
1818 2155 if (ab->b_spa == 0)
1819 2156 continue;
1820 2157
1821 2158 /*
1822 2159 * It may take a long time to evict all the bufs requested.
1823 2160 * To avoid blocking all arc activity, periodically drop
1824 2161 * the arcs_mtx and give other threads a chance to run
1825 2162 * before reacquiring the lock.
1826 2163 *
1827 2164 * If we are looking for a buffer to recycle, we are in
1828 2165 * the hot code path, so don't sleep.
1829 2166 */
1830 2167 if (!recycle && count++ > arc_evict_iterations) {
1831 2168 list_insert_after(list, ab, &marker);
1832 2169 mutex_exit(&evicted_state->arcs_mtx);
1833 2170 mutex_exit(&state->arcs_mtx);
1834 2171 kpreempt(KPREEMPT_SYNC);
1835 2172 mutex_enter(&state->arcs_mtx);
1836 2173 mutex_enter(&evicted_state->arcs_mtx);
1837 2174 ab_prev = list_prev(list, &marker);
1838 2175 list_remove(list, &marker);
1839 2176 count = 0;
1840 2177 continue;
1841 2178 }
1842 2179
1843 2180 hash_lock = HDR_LOCK(ab);
1844 2181 have_lock = MUTEX_HELD(hash_lock);
1845 2182 if (have_lock || mutex_tryenter(hash_lock)) {
1846 2183 ASSERT0(refcount_count(&ab->b_refcnt));
1847 2184 ASSERT(ab->b_datacnt > 0);
1848 2185 while (ab->b_buf) {
1849 2186 arc_buf_t *buf = ab->b_buf;
1850 2187 if (!mutex_tryenter(&buf->b_evict_lock)) {
1851 2188 missed += 1;
1852 2189 break;
1853 2190 }
1854 2191 if (buf->b_data) {
1855 2192 bytes_evicted += ab->b_size;
1856 2193 if (recycle && ab->b_type == type &&
1857 2194 ab->b_size == bytes &&
1858 2195 !HDR_L2_WRITING(ab)) {
1859 2196 stolen = buf->b_data;
1860 2197 recycle = FALSE;
1861 2198 }
1862 2199 }
1863 2200 if (buf->b_efunc) {
1864 2201 mutex_enter(&arc_eviction_mtx);
1865 2202 arc_buf_destroy(buf,
1866 2203 buf->b_data == stolen, FALSE);
1867 2204 ab->b_buf = buf->b_next;
1868 2205 buf->b_hdr = &arc_eviction_hdr;
1869 2206 buf->b_next = arc_eviction_list;
1870 2207 arc_eviction_list = buf;
1871 2208 mutex_exit(&arc_eviction_mtx);
1872 2209 mutex_exit(&buf->b_evict_lock);
1873 2210 } else {
1874 2211 mutex_exit(&buf->b_evict_lock);
1875 2212 arc_buf_destroy(buf,
1876 2213 buf->b_data == stolen, TRUE);
1877 2214 }
1878 2215 }
1879 2216
1880 2217 if (ab->b_l2hdr) {
1881 2218 ARCSTAT_INCR(arcstat_evict_l2_cached,
1882 2219 ab->b_size);
1883 2220 } else {
1884 2221 if (l2arc_write_eligible(ab->b_spa, ab)) {
1885 2222 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1886 2223 ab->b_size);
1887 2224 } else {
1888 2225 ARCSTAT_INCR(
1889 2226 arcstat_evict_l2_ineligible,
1890 2227 ab->b_size);
1891 2228 }
1892 2229 }
1893 2230
1894 2231 if (ab->b_datacnt == 0) {
1895 2232 arc_change_state(evicted_state, ab, hash_lock);
1896 2233 ASSERT(HDR_IN_HASH_TABLE(ab));
1897 2234 ab->b_flags |= ARC_IN_HASH_TABLE;
1898 2235 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1899 2236 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1900 2237 }
1901 2238 if (!have_lock)
1902 2239 mutex_exit(hash_lock);
1903 2240 if (bytes >= 0 && bytes_evicted >= bytes)
1904 2241 break;
1905 2242 } else {
1906 2243 missed += 1;
1907 2244 }
1908 2245 }
1909 2246
1910 2247 mutex_exit(&evicted_state->arcs_mtx);
1911 2248 mutex_exit(&state->arcs_mtx);
1912 2249
1913 2250 if (bytes_evicted < bytes)
1914 2251 		dprintf("only evicted %lld bytes from %p",
1915 2252 (longlong_t)bytes_evicted, state);
1916 2253
1917 2254 if (skipped)
1918 2255 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1919 2256
1920 2257 if (missed)
1921 2258 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1922 2259
1923 2260 /*
1924 2261 * Note: we have just evicted some data into the ghost state,
1925 2262 * potentially putting the ghost size over the desired size. Rather
1926 2263 	 * than evicting from the ghost list in this hot code path, leave
1927 2264 * this chore to the arc_reclaim_thread().
1928 2265 */
1929 2266
1930 2267 return (stolen);
1931 2268 }
1932 2269
1933 2270 /*
1934 2271 * Remove buffers from list until we've removed the specified number of
1935 2272 * bytes. Destroy the buffers that are removed.
1936 2273 */
1937 2274 static void
1938 2275 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1939 2276 {
1940 2277 arc_buf_hdr_t *ab, *ab_prev;
1941 2278 arc_buf_hdr_t marker = { 0 };
1942 2279 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1943 2280 kmutex_t *hash_lock;
1944 2281 uint64_t bytes_deleted = 0;
1945 2282 uint64_t bufs_skipped = 0;
1946 2283 int count = 0;
1947 2284
1948 2285 ASSERT(GHOST_STATE(state));
1949 2286 top:
1950 2287 mutex_enter(&state->arcs_mtx);
1951 2288 for (ab = list_tail(list); ab; ab = ab_prev) {
1952 2289 ab_prev = list_prev(list, ab);
1953 2290 if (ab->b_type > ARC_BUFC_NUMTYPES)
1954 2291 panic("invalid ab=%p", (void *)ab);
1955 2292 if (spa && ab->b_spa != spa)
1956 2293 continue;
1957 2294
1958 2295 /* ignore markers */
1959 2296 if (ab->b_spa == 0)
1960 2297 continue;
1961 2298
1962 2299 hash_lock = HDR_LOCK(ab);
1963 2300 /* caller may be trying to modify this buffer, skip it */
1964 2301 if (MUTEX_HELD(hash_lock))
1965 2302 continue;
1966 2303
1967 2304 /*
1968 2305 * It may take a long time to evict all the bufs requested.
1969 2306 * To avoid blocking all arc activity, periodically drop
1970 2307 * the arcs_mtx and give other threads a chance to run
1971 2308 * before reacquiring the lock.
1972 2309 */
1973 2310 if (count++ > arc_evict_iterations) {
1974 2311 list_insert_after(list, ab, &marker);
1975 2312 mutex_exit(&state->arcs_mtx);
1976 2313 kpreempt(KPREEMPT_SYNC);
1977 2314 mutex_enter(&state->arcs_mtx);
1978 2315 ab_prev = list_prev(list, &marker);
1979 2316 list_remove(list, &marker);
1980 2317 count = 0;
1981 2318 continue;
1982 2319 }
1983 2320 if (mutex_tryenter(hash_lock)) {
1984 2321 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1985 2322 ASSERT(ab->b_buf == NULL);
1986 2323 ARCSTAT_BUMP(arcstat_deleted);
1987 2324 bytes_deleted += ab->b_size;
1988 2325
1989 2326 if (ab->b_l2hdr != NULL) {
1990 2327 /*
1991 2328 * This buffer is cached on the 2nd Level ARC;
1992 2329 * don't destroy the header.
1993 2330 */
1994 2331 arc_change_state(arc_l2c_only, ab, hash_lock);
1995 2332 mutex_exit(hash_lock);
1996 2333 } else {
1997 2334 arc_change_state(arc_anon, ab, hash_lock);
1998 2335 mutex_exit(hash_lock);
1999 2336 arc_hdr_destroy(ab);
2000 2337 }
2001 2338
2002 2339 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2003 2340 if (bytes >= 0 && bytes_deleted >= bytes)
2004 2341 break;
2005 2342 } else if (bytes < 0) {
2006 2343 /*
2007 2344 * Insert a list marker and then wait for the
2008 2345 			 * hash lock to become available. Once it's
2009 2346 * available, restart from where we left off.
2010 2347 */
2011 2348 list_insert_after(list, ab, &marker);
2012 2349 mutex_exit(&state->arcs_mtx);
2013 2350 mutex_enter(hash_lock);
2014 2351 mutex_exit(hash_lock);
2015 2352 mutex_enter(&state->arcs_mtx);
2016 2353 ab_prev = list_prev(list, &marker);
2017 2354 list_remove(list, &marker);
2018 2355 } else {
2019 2356 bufs_skipped += 1;
2020 2357 }
2021 2358
2022 2359 }
2023 2360 mutex_exit(&state->arcs_mtx);
2024 2361
2025 2362 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2026 2363 (bytes < 0 || bytes_deleted < bytes)) {
2027 2364 list = &state->arcs_list[ARC_BUFC_METADATA];
2028 2365 goto top;
2029 2366 }
2030 2367
2031 2368 if (bufs_skipped) {
2032 2369 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2033 2370 ASSERT(bytes >= 0);
2034 2371 }
2035 2372
2036 2373 if (bytes_deleted < bytes)
2037 2374 dprintf("only deleted %lld bytes from %p",
2038 2375 (longlong_t)bytes_deleted, state);
2039 2376 }
2040 2377
2041 2378 static void
2042 2379 arc_adjust(void)
2043 2380 {
2044 2381 int64_t adjustment, delta;
2045 2382
2046 2383 /*
2047 2384 * Adjust MRU size
2048 2385 */
2049 2386
2050 2387 adjustment = MIN((int64_t)(arc_size - arc_c),
2051 2388 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2052 2389 arc_p));
2053 2390
2054 2391 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2055 2392 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2056 2393 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2057 2394 adjustment -= delta;
2058 2395 }
2059 2396
2060 2397 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2061 2398 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2062 2399 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2063 2400 ARC_BUFC_METADATA);
2064 2401 }
2065 2402
2066 2403 /*
2067 2404 * Adjust MFU size
2068 2405 */
2069 2406
2070 2407 adjustment = arc_size - arc_c;
2071 2408
2072 2409 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2073 2410 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2074 2411 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2075 2412 adjustment -= delta;
2076 2413 }
2077 2414
2078 2415 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2079 2416 int64_t delta = MIN(adjustment,
2080 2417 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2081 2418 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2082 2419 ARC_BUFC_METADATA);
2083 2420 }
2084 2421
2085 2422 /*
2086 2423 * Adjust ghost lists
2087 2424 */
2088 2425
2089 2426 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2090 2427
2091 2428 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2092 2429 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2093 2430 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2094 2431 }
2095 2432
2096 2433 adjustment =
2097 2434 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2098 2435
2099 2436 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2100 2437 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2101 2438 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2102 2439 }
2103 2440 }
2104 2441
2105 2442 static void
2106 2443 arc_do_user_evicts(void)
2107 2444 {
2108 2445 mutex_enter(&arc_eviction_mtx);
2109 2446 while (arc_eviction_list != NULL) {
2110 2447 arc_buf_t *buf = arc_eviction_list;
2111 2448 arc_eviction_list = buf->b_next;
2112 2449 mutex_enter(&buf->b_evict_lock);
2113 2450 buf->b_hdr = NULL;
2114 2451 mutex_exit(&buf->b_evict_lock);
2115 2452 mutex_exit(&arc_eviction_mtx);
2116 2453
2117 2454 if (buf->b_efunc != NULL)
2118 2455 VERIFY(buf->b_efunc(buf) == 0);
2119 2456
2120 2457 buf->b_efunc = NULL;
2121 2458 buf->b_private = NULL;
2122 2459 kmem_cache_free(buf_cache, buf);
2123 2460 mutex_enter(&arc_eviction_mtx);
2124 2461 }
2125 2462 mutex_exit(&arc_eviction_mtx);
2126 2463 }
2127 2464
2128 2465 /*
2129 2466 * Flush all *evictable* data from the cache for the given spa.
2130 2467 * NOTE: this will not touch "active" (i.e. referenced) data.
2131 2468 */
2132 2469 void
2133 2470 arc_flush(spa_t *spa)
2134 2471 {
2135 2472 uint64_t guid = 0;
2136 2473
2137 2474 if (spa)
2138 2475 guid = spa_load_guid(spa);
2139 2476
2140 2477 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2141 2478 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2142 2479 if (spa)
2143 2480 break;
2144 2481 }
2145 2482 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2146 2483 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2147 2484 if (spa)
2148 2485 break;
2149 2486 }
2150 2487 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2151 2488 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2152 2489 if (spa)
2153 2490 break;
2154 2491 }
2155 2492 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2156 2493 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2157 2494 if (spa)
2158 2495 break;
2159 2496 }
2160 2497
2161 2498 arc_evict_ghost(arc_mru_ghost, guid, -1);
2162 2499 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2163 2500
2164 2501 mutex_enter(&arc_reclaim_thr_lock);
2165 2502 arc_do_user_evicts();
2166 2503 mutex_exit(&arc_reclaim_thr_lock);
2167 2504 ASSERT(spa || arc_eviction_list == NULL);
2168 2505 }
2169 2506
2170 2507 void
2171 2508 arc_shrink(void)
2172 2509 {
2173 2510 if (arc_c > arc_c_min) {
2174 2511 uint64_t to_free;
2175 2512
2176 2513 #ifdef _KERNEL
2177 2514 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2178 2515 #else
2179 2516 to_free = arc_c >> arc_shrink_shift;
2180 2517 #endif
2181 2518 if (arc_c > arc_c_min + to_free)
2182 2519 atomic_add_64(&arc_c, -to_free);
2183 2520 else
2184 2521 arc_c = arc_c_min;
2185 2522
2186 2523 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2187 2524 if (arc_c > arc_size)
2188 2525 arc_c = MAX(arc_size, arc_c_min);
2189 2526 if (arc_p > arc_c)
2190 2527 arc_p = (arc_c >> 1);
2191 2528 ASSERT(arc_c >= arc_c_min);
2192 2529 ASSERT((int64_t)arc_p >= 0);
2193 2530 }
2194 2531
2195 2532 if (arc_size > arc_c)
2196 2533 arc_adjust();
2197 2534 }
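A worked example of the shrink arithmetic, with illustrative numbers (arc_shrink_shift is a tunable and the kernel path also factors in ptob(needfree)):

	arc_c = 4096 MiB, arc_c_min = 1024 MiB, arc_shrink_shift = 5
	to_free = arc_c >> 5           = 128 MiB
	arc_c   = 4096 MiB - 128 MiB   = 3968 MiB   (still above arc_c_min)
	arc_p  -= arc_p >> 5
	arc_c is then pulled down to MAX(arc_size, arc_c_min) if it exceeds
	arc_size, and arc_p is reset to arc_c / 2 if it now exceeds arc_c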
2198 2535
2199 2536 /*
2200 2537 * Determine if the system is under memory pressure and is asking
2201 2538 * to reclaim memory. A return value of 1 indicates that the system
2202 2539 * is under memory pressure and that the arc should adjust accordingly.
2203 2540 */
2204 2541 static int
2205 2542 arc_reclaim_needed(void)
2206 2543 {
2207 2544 uint64_t extra;
2208 2545
2209 2546 #ifdef _KERNEL
2210 2547
2211 2548 if (needfree)
2212 2549 return (1);
2213 2550
2214 2551 /*
2215 2552 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2216 2553 */
2217 2554 extra = desfree;
2218 2555
2219 2556 /*
2220 2557 * check that we're out of range of the pageout scanner. It starts to
2221 2558 * schedule paging if freemem is less than lotsfree and needfree.
2222 2559 * lotsfree is the high-water mark for pageout, and needfree is the
2223 2560 * number of needed free pages. We add extra pages here to make sure
2224 2561 * the scanner doesn't start up while we're freeing memory.
2225 2562 */
2226 2563 if (freemem < lotsfree + needfree + extra)
2227 2564 return (1);
2228 2565
2229 2566 /*
2230 2567 * check to make sure that swapfs has enough space so that anon
2231 2568 * reservations can still succeed. anon_resvmem() checks that the
2232 2569 * availrmem is greater than swapfs_minfree, and the number of reserved
2233 2570 * swap pages. We also add a bit of extra here just to prevent
2234 2571 * circumstances from getting really dire.
2235 2572 */
2236 2573 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2237 2574 return (1);
2238 2575
2239 2576 /*
2240 2577 * Check that we have enough availrmem that memory locking (e.g., via
2241 2578 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
2242 2579 * stores the number of pages that cannot be locked; when availrmem
2243 2580 * drops below pages_pp_maximum, page locking mechanisms such as
2244 2581 * page_pp_lock() will fail.)
2245 2582 */
2246 2583 if (availrmem <= pages_pp_maximum)
2247 2584 return (1);
2248 2585
2249 2586 #if defined(__i386)
2250 2587 /*
2251 2588 * If we're on an i386 platform, it's possible that we'll exhaust the
2252 2589 * kernel heap space before we ever run out of available physical
2253 2590 * memory. Most checks of the size of the heap_area compare against
2254 2591 * tune.t_minarmem, which is the minimum available real memory that we
2255 2592 * can have in the system. However, this is generally fixed at 25 pages
2256 2593 * which is so low that it's useless. In this comparison, we seek to
2257 2594 	 * calculate the total heap size, and reclaim if more than 3/4 of the
2258 2595 	 * heap is allocated.  (Or, in the calculation, if less than 1/4 is
2259 2596 	 * free.)
2260 2597 */
2261 2598 if (vmem_size(heap_arena, VMEM_FREE) <
2262 2599 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2263 2600 return (1);
2264 2601 #endif
2265 2602
2266 2603 /*
2267 2604 * If zio data pages are being allocated out of a separate heap segment,
2268 2605 * then enforce that the size of available vmem for this arena remains
2269 2606 * above about 1/16th free.
2270 2607 *
2271 2608 * Note: The 1/16th arena free requirement was put in place
2272 2609 * to aggressively evict memory from the arc in order to avoid
2273 2610 * memory fragmentation issues.
2274 2611 */
2275 2612 if (zio_arena != NULL &&
2276 2613 vmem_size(zio_arena, VMEM_FREE) <
2277 2614 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2278 2615 return (1);
2279 2616 #else
2280 2617 if (spa_get_random(100) == 0)
2281 2618 return (1);
2282 2619 #endif
2283 2620 return (0);
2284 2621 }
2285 2622
2286 2623 static void
2287 2624 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2288 2625 {
2289 2626 size_t i;
2290 2627 kmem_cache_t *prev_cache = NULL;
2291 2628 kmem_cache_t *prev_data_cache = NULL;
2292 2629 extern kmem_cache_t *zio_buf_cache[];
2293 2630 extern kmem_cache_t *zio_data_buf_cache[];
2294 2631
2295 2632 #ifdef _KERNEL
2296 2633 if (arc_meta_used >= arc_meta_limit) {
2297 2634 /*
2298 2635 * We are exceeding our meta-data cache limit.
2299 2636 * Purge some DNLC entries to release holds on meta-data.
2300 2637 */
2301 2638 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2302 2639 }
2303 2640 #if defined(__i386)
2304 2641 /*
2305 2642 * Reclaim unused memory from all kmem caches.
2306 2643 */
2307 2644 kmem_reap();
2308 2645 #endif
2309 2646 #endif
2310 2647
2311 2648 /*
2312 2649 * An aggressive reclamation will shrink the cache size as well as
2313 2650 * reap free buffers from the arc kmem caches.
2314 2651 */
2315 2652 if (strat == ARC_RECLAIM_AGGR)
2316 2653 arc_shrink();
2317 2654
2318 2655 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2319 2656 if (zio_buf_cache[i] != prev_cache) {
2320 2657 prev_cache = zio_buf_cache[i];
2321 2658 kmem_cache_reap_now(zio_buf_cache[i]);
2322 2659 }
2323 2660 if (zio_data_buf_cache[i] != prev_data_cache) {
2324 2661 prev_data_cache = zio_data_buf_cache[i];
2325 2662 kmem_cache_reap_now(zio_data_buf_cache[i]);
2326 2663 }
2327 2664 }
2328 2665 kmem_cache_reap_now(buf_cache);
2329 2666 kmem_cache_reap_now(hdr_cache);
2330 2667
2331 2668 /*
2332 2669 	 * Ask the vmem arena to reclaim unused memory from its
2333 2670 * quantum caches.
2334 2671 */
2335 2672 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2336 2673 vmem_qcache_reap(zio_arena);
2337 2674 }
2338 2675
2339 2676 static void
2340 2677 arc_reclaim_thread(void)
2341 2678 {
2342 2679 clock_t growtime = 0;
2343 2680 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2344 2681 callb_cpr_t cpr;
2345 2682
2346 2683 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2347 2684
2348 2685 mutex_enter(&arc_reclaim_thr_lock);
2349 2686 while (arc_thread_exit == 0) {
2350 2687 if (arc_reclaim_needed()) {
2351 2688
2352 2689 if (arc_no_grow) {
2353 2690 if (last_reclaim == ARC_RECLAIM_CONS) {
2354 2691 last_reclaim = ARC_RECLAIM_AGGR;
2355 2692 } else {
2356 2693 last_reclaim = ARC_RECLAIM_CONS;
2357 2694 }
2358 2695 } else {
2359 2696 arc_no_grow = TRUE;
2360 2697 last_reclaim = ARC_RECLAIM_AGGR;
2361 2698 membar_producer();
2362 2699 }
2363 2700
2364 2701 /* reset the growth delay for every reclaim */
2365 2702 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2366 2703
2367 2704 arc_kmem_reap_now(last_reclaim);
2368 2705 arc_warm = B_TRUE;
2369 2706
2370 2707 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2371 2708 arc_no_grow = FALSE;
2372 2709 }
2373 2710
2374 2711 arc_adjust();
2375 2712
2376 2713 if (arc_eviction_list != NULL)
2377 2714 arc_do_user_evicts();
2378 2715
2379 2716 /* block until needed, or one second, whichever is shorter */
2380 2717 CALLB_CPR_SAFE_BEGIN(&cpr);
2381 2718 (void) cv_timedwait(&arc_reclaim_thr_cv,
2382 2719 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2383 2720 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2384 2721 }
2385 2722
2386 2723 arc_thread_exit = 0;
2387 2724 cv_broadcast(&arc_reclaim_thr_cv);
2388 2725 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2389 2726 thread_exit();
2390 2727 }
2391 2728
2392 2729 /*
2393 2730 * Adapt arc info given the number of bytes we are trying to add and
2394 2731  * the state that we are coming from. This function is only called
2395 2732 * when we are adding new content to the cache.
2396 2733 */
2397 2734 static void
2398 2735 arc_adapt(int bytes, arc_state_t *state)
2399 2736 {
2400 2737 int mult;
2401 2738 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2402 2739
2403 2740 if (state == arc_l2c_only)
2404 2741 return;
2405 2742
2406 2743 ASSERT(bytes > 0);
2407 2744 /*
2408 2745 * Adapt the target size of the MRU list:
2409 2746 * - if we just hit in the MRU ghost list, then increase
2410 2747 * the target size of the MRU list.
2411 2748 * - if we just hit in the MFU ghost list, then increase
2412 2749 * the target size of the MFU list by decreasing the
2413 2750 * target size of the MRU list.
2414 2751 */
2415 2752 if (state == arc_mru_ghost) {
2416 2753 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2417 2754 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2418 2755 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2419 2756
2420 2757 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2421 2758 } else if (state == arc_mfu_ghost) {
2422 2759 uint64_t delta;
2423 2760
2424 2761 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2425 2762 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2426 2763 mult = MIN(mult, 10);
2427 2764
2428 2765 delta = MIN(bytes * mult, arc_p);
2429 2766 arc_p = MAX(arc_p_min, arc_p - delta);
2430 2767 }
2431 2768 ASSERT((int64_t)arc_p >= 0);
2432 2769
2433 2770 if (arc_reclaim_needed()) {
2434 2771 cv_signal(&arc_reclaim_thr_cv);
2435 2772 return;
2436 2773 }
2437 2774
2438 2775 if (arc_no_grow)
2439 2776 return;
2440 2777
2441 2778 if (arc_c >= arc_c_max)
2442 2779 return;
2443 2780
2444 2781 /*
2445 2782 * If we're within (2 * maxblocksize) bytes of the target
2446 2783 * cache size, increment the target cache size
2447 2784 */
2448 2785 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2449 2786 atomic_add_64(&arc_c, (int64_t)bytes);
2450 2787 if (arc_c > arc_c_max)
2451 2788 arc_c = arc_c_max;
2452 2789 else if (state == arc_anon)
2453 2790 atomic_add_64(&arc_p, (int64_t)bytes);
2454 2791 if (arc_p > arc_c)
2455 2792 arc_p = arc_c;
2456 2793 }
2457 2794 ASSERT((int64_t)arc_p >= 0);
2458 2795 }
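A worked example of the ghost-hit adaptation above, with illustrative sizes:

	hit of bytes = 8K in arc_mru_ghost, with
	arc_mru_ghost->arcs_size = 1 GiB and arc_mfu_ghost->arcs_size = 4 GiB
	=> mult = 4 (capped at 10), so arc_p grows by 8K * 4 = 32K,
	   limited to arc_c - arc_p_min

A hit in arc_mfu_ghost moves arc_p the other way, by MIN(bytes * mult, arc_p), never dropping below arc_p_min.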
2459 2796
2460 2797 /*
2461 2798 * Check if the cache has reached its limits and eviction is required
2462 2799 * prior to insert.
2463 2800 */
2464 2801 static int
2465 2802 arc_evict_needed(arc_buf_contents_t type)
2466 2803 {
2467 2804 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2468 2805 return (1);
2469 2806
2470 2807 if (arc_reclaim_needed())
2471 2808 return (1);
2472 2809
2473 2810 return (arc_size > arc_c);
2474 2811 }
2475 2812
2476 2813 /*
2477 2814 * The buffer, supplied as the first argument, needs a data block.
2478 2815 * So, if we are at cache max, determine which cache should be victimized.
2479 2816 * We have the following cases:
2480 2817 *
2481 2818 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2482 2819 * In this situation if we're out of space, but the resident size of the MFU is
2483 2820 * under the limit, victimize the MFU cache to satisfy this insertion request.
2484 2821 *
2485 2822 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2486 2823 * Here, we've used up all of the available space for the MRU, so we need to
2487 2824 * evict from our own cache instead. Evict from the set of resident MRU
2488 2825 * entries.
2489 2826 *
2490 2827 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2491 2828 * c minus p represents the MFU space in the cache, since p is the size of the
2492 2829 * cache that is dedicated to the MRU. In this situation there's still space on
2493 2830 * the MFU side, so the MRU side needs to be victimized.
2494 2831 *
2495 2832 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2496 2833 * MFU's resident set is consuming more space than it has been allotted. In
2497 2834 * this situation, we must victimize our own cache, the MFU, for this insertion.
2498 2835 */
2499 2836 static void
2500 2837 arc_get_data_buf(arc_buf_t *buf)
2501 2838 {
2502 2839 arc_state_t *state = buf->b_hdr->b_state;
2503 2840 uint64_t size = buf->b_hdr->b_size;
2504 2841 arc_buf_contents_t type = buf->b_hdr->b_type;
2505 2842
2506 2843 arc_adapt(size, state);
2507 2844
2508 2845 /*
2509 2846 * We have not yet reached cache maximum size,
2510 2847 * just allocate a new buffer.
2511 2848 */
2512 2849 if (!arc_evict_needed(type)) {
2513 2850 if (type == ARC_BUFC_METADATA) {
2514 2851 buf->b_data = zio_buf_alloc(size);
2515 2852 arc_space_consume(size, ARC_SPACE_DATA);
2516 2853 } else {
2517 2854 ASSERT(type == ARC_BUFC_DATA);
2518 2855 buf->b_data = zio_data_buf_alloc(size);
2519 2856 ARCSTAT_INCR(arcstat_data_size, size);
2520 2857 atomic_add_64(&arc_size, size);
2521 2858 }
2522 2859 goto out;
2523 2860 }
2524 2861
2525 2862 /*
2526 2863 * If we are prefetching from the mfu ghost list, this buffer
2527 2864 * will end up on the mru list; so steal space from there.
2528 2865 */
2529 2866 if (state == arc_mfu_ghost)
2530 2867 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2531 2868 else if (state == arc_mru_ghost)
2532 2869 state = arc_mru;
2533 2870
2534 2871 if (state == arc_mru || state == arc_anon) {
2535 2872 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2536 2873 state = (arc_mfu->arcs_lsize[type] >= size &&
2537 2874 arc_p > mru_used) ? arc_mfu : arc_mru;
2538 2875 } else {
2539 2876 /* MFU cases */
2540 2877 uint64_t mfu_space = arc_c - arc_p;
2541 2878 state = (arc_mru->arcs_lsize[type] >= size &&
2542 2879 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2543 2880 }
2544 2881 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2545 2882 if (type == ARC_BUFC_METADATA) {
2546 2883 buf->b_data = zio_buf_alloc(size);
2547 2884 arc_space_consume(size, ARC_SPACE_DATA);
2548 2885 } else {
2549 2886 ASSERT(type == ARC_BUFC_DATA);
2550 2887 buf->b_data = zio_data_buf_alloc(size);
2551 2888 ARCSTAT_INCR(arcstat_data_size, size);
2552 2889 atomic_add_64(&arc_size, size);
2553 2890 }
2554 2891 ARCSTAT_BUMP(arcstat_recycle_miss);
2555 2892 }
2556 2893 ASSERT(buf->b_data != NULL);
2557 2894 out:
2558 2895 /*
2559 2896 * Update the state size. Note that ghost states have a
2560 2897 * "ghost size" and so don't need to be updated.
2561 2898 */
2562 2899 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2563 2900 arc_buf_hdr_t *hdr = buf->b_hdr;
2564 2901
2565 2902 atomic_add_64(&hdr->b_state->arcs_size, size);
2566 2903 if (list_link_active(&hdr->b_arc_node)) {
2567 2904 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2568 2905 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2569 2906 }
2570 2907 /*
2571 2908 * If we are growing the cache, and we are adding anonymous
2572 2909 * data, and we have outgrown arc_p, update arc_p
2573 2910 */
2574 2911 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2575 2912 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2576 2913 arc_p = MIN(arc_c, arc_p + size);
2577 2914 }
2578 2915 }
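A worked trace through the case analysis above, illustrative only (lsize is the evictable size of the given buffer type):

	insert destined for MRU, arc_size == arc_c:
	  arc_anon->arcs_size + arc_mru->arcs_size >= arc_p  -> case 2, evict from arc_mru
	  arc_p > anon + mru and arc_mfu lsize >= size       -> case 1, evict from arc_mfu
	insert destined for MFU:
	  arc_c - arc_p > arc_mfu->arcs_size and arc_mru lsize >= size
	                                                     -> case 3, evict from arc_mru
	  otherwise (MFU over its share)                     -> case 4, evict from arc_mfu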
2579 2916
2580 2917 /*
2581 2918 * This routine is called whenever a buffer is accessed.
2582 2919 * NOTE: the hash lock is dropped in this function.
2583 2920 */
2584 2921 static void
2585 2922 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2586 2923 {
2587 2924 clock_t now;
2588 2925
2589 2926 ASSERT(MUTEX_HELD(hash_lock));
2590 2927
2591 2928 if (buf->b_state == arc_anon) {
2592 2929 /*
2593 2930 * This buffer is not in the cache, and does not
2594 2931 * appear in our "ghost" list. Add the new buffer
2595 2932 * to the MRU state.
2596 2933 */
2597 2934
2598 2935 ASSERT(buf->b_arc_access == 0);
2599 2936 buf->b_arc_access = ddi_get_lbolt();
2600 2937 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2601 2938 arc_change_state(arc_mru, buf, hash_lock);
2602 2939
2603 2940 } else if (buf->b_state == arc_mru) {
2604 2941 now = ddi_get_lbolt();
2605 2942
2606 2943 /*
2607 2944 * If this buffer is here because of a prefetch, then either:
2608 2945 * - clear the flag if this is a "referencing" read
2609 2946 * (any subsequent access will bump this into the MFU state).
2610 2947 * or
2611 2948 * - move the buffer to the head of the list if this is
2612 2949 * another prefetch (to make it less likely to be evicted).
2613 2950 */
2614 2951 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2615 2952 if (refcount_count(&buf->b_refcnt) == 0) {
2616 2953 ASSERT(list_link_active(&buf->b_arc_node));
2617 2954 } else {
2618 2955 buf->b_flags &= ~ARC_PREFETCH;
2619 2956 ARCSTAT_BUMP(arcstat_mru_hits);
2620 2957 }
2621 2958 buf->b_arc_access = now;
2622 2959 return;
2623 2960 }
2624 2961
2625 2962 /*
2626 2963 * This buffer has been "accessed" only once so far,
2627 2964 * but it is still in the cache. Move it to the MFU
2628 2965 * state.
2629 2966 */
2630 2967 if (now > buf->b_arc_access + ARC_MINTIME) {
2631 2968 /*
2632 2969 			 * More than ARC_MINTIME (about 62ms) has passed since we
2633 2970 * instantiated this buffer. Move it to the
2634 2971 * most frequently used state.
2635 2972 */
2636 2973 buf->b_arc_access = now;
2637 2974 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2638 2975 arc_change_state(arc_mfu, buf, hash_lock);
2639 2976 }
2640 2977 ARCSTAT_BUMP(arcstat_mru_hits);
2641 2978 } else if (buf->b_state == arc_mru_ghost) {
2642 2979 arc_state_t *new_state;
2643 2980 /*
2644 2981 * This buffer has been "accessed" recently, but
2645 2982 * was evicted from the cache. Move it to the
2646 2983 * MFU state.
2647 2984 */
2648 2985
2649 2986 if (buf->b_flags & ARC_PREFETCH) {
2650 2987 new_state = arc_mru;
2651 2988 if (refcount_count(&buf->b_refcnt) > 0)
2652 2989 buf->b_flags &= ~ARC_PREFETCH;
2653 2990 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2654 2991 } else {
2655 2992 new_state = arc_mfu;
2656 2993 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2657 2994 }
2658 2995
2659 2996 buf->b_arc_access = ddi_get_lbolt();
2660 2997 arc_change_state(new_state, buf, hash_lock);
2661 2998
2662 2999 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2663 3000 } else if (buf->b_state == arc_mfu) {
2664 3001 /*
2665 3002 * This buffer has been accessed more than once and is
2666 3003 * still in the cache. Keep it in the MFU state.
2667 3004 *
2668 3005 * NOTE: an add_reference() that occurred when we did
2669 3006 * the arc_read() will have kicked this off the list.
2670 3007 * If it was a prefetch, we will explicitly move it to
2671 3008 * the head of the list now.
2672 3009 */
2673 3010 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2674 3011 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2675 3012 ASSERT(list_link_active(&buf->b_arc_node));
2676 3013 }
2677 3014 ARCSTAT_BUMP(arcstat_mfu_hits);
2678 3015 buf->b_arc_access = ddi_get_lbolt();
2679 3016 } else if (buf->b_state == arc_mfu_ghost) {
2680 3017 arc_state_t *new_state = arc_mfu;
2681 3018 /*
2682 3019 * This buffer has been accessed more than once but has
2683 3020 * been evicted from the cache. Move it back to the
2684 3021 * MFU state.
2685 3022 */
2686 3023
2687 3024 if (buf->b_flags & ARC_PREFETCH) {
2688 3025 /*
2689 3026 * This is a prefetch access...
2690 3027 * move this block back to the MRU state.
2691 3028 */
2692 3029 ASSERT0(refcount_count(&buf->b_refcnt));
2693 3030 new_state = arc_mru;
2694 3031 }
2695 3032
2696 3033 buf->b_arc_access = ddi_get_lbolt();
2697 3034 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2698 3035 arc_change_state(new_state, buf, hash_lock);
2699 3036
2700 3037 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2701 3038 } else if (buf->b_state == arc_l2c_only) {
2702 3039 /*
2703 3040 * This buffer is on the 2nd Level ARC.
2704 3041 */
2705 3042
2706 3043 buf->b_arc_access = ddi_get_lbolt();
2707 3044 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2708 3045 arc_change_state(arc_mfu, buf, hash_lock);
2709 3046 } else {
2710 3047 ASSERT(!"invalid arc state");
2711 3048 }
2712 3049 }
2713 3050
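As a reading aid for the state machine above, here is an informal transition summary derived directly from arc_access() itself (not part of the change; ARC_MINTIME is defined earlier in this file):

/*
 * arc_access() transitions, informally:
 *
 *   current state    access type                        next state
 *   --------------   --------------------------------   ----------
 *   arc_anon         first insertion                    arc_mru
 *   arc_mru          prefetch-flagged hit               arc_mru (flag cleared
 *                                                       if now referenced)
 *   arc_mru          demand hit within ARC_MINTIME      arc_mru
 *   arc_mru          demand hit after ARC_MINTIME       arc_mfu
 *   arc_mru_ghost    prefetch hit                       arc_mru
 *   arc_mru_ghost    demand hit                         arc_mfu
 *   arc_mfu          any hit                            arc_mfu
 *   arc_mfu_ghost    prefetch hit                       arc_mru
 *   arc_mfu_ghost    demand hit                         arc_mfu
 *   arc_l2c_only     any hit                            arc_mfu
 */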
2714 3051 /* a generic arc_done_func_t which you can use */
2715 3052 /* ARGSUSED */
2716 3053 void
2717 3054 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2718 3055 {
2719 3056 if (zio == NULL || zio->io_error == 0)
2720 3057 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2721 3058 VERIFY(arc_buf_remove_ref(buf, arg));
2722 3059 }
2723 3060
2724 3061 /* a generic arc_done_func_t */
2725 3062 void
2726 3063 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2727 3064 {
2728 3065 arc_buf_t **bufp = arg;
2729 3066 if (zio && zio->io_error) {
2730 3067 VERIFY(arc_buf_remove_ref(buf, arg));
2731 3068 *bufp = NULL;
2732 3069 } else {
2733 3070 *bufp = buf;
2734 3071 ASSERT(buf->b_data);
2735 3072 }
2736 3073 }
2737 3074
2738 3075 static void
2739 3076 arc_read_done(zio_t *zio)
2740 3077 {
2741 3078 arc_buf_hdr_t *hdr, *found;
2742 3079 arc_buf_t *buf;
2743 3080 arc_buf_t *abuf; /* buffer we're assigning to callback */
2744 3081 kmutex_t *hash_lock;
2745 3082 arc_callback_t *callback_list, *acb;
2746 3083 int freeable = FALSE;
2747 3084
2748 3085 buf = zio->io_private;
2749 3086 hdr = buf->b_hdr;
2750 3087
2751 3088 /*
2752 3089 * The hdr was inserted into hash-table and removed from lists
2753 3090 * prior to starting I/O. We should find this header, since
2754 3091 * it's in the hash table, and it should be legit since it's
2755 3092 * not possible to evict it during the I/O. The only possible
2756 3093 * reason for it not to be found is if we were freed during the
2757 3094 * read.
2758 3095 */
2759 3096 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2760 3097 &hash_lock);
2761 3098
2762 3099 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2763 3100 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2764 3101 (found == hdr && HDR_L2_READING(hdr)));
2765 3102
2766 3103 hdr->b_flags &= ~ARC_L2_EVICTED;
2767 3104 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2768 3105 hdr->b_flags &= ~ARC_L2CACHE;
2769 3106
2770 3107 /* byteswap if necessary */
2771 3108 callback_list = hdr->b_acb;
2772 3109 ASSERT(callback_list != NULL);
2773 3110 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2774 3111 dmu_object_byteswap_t bswap =
2775 3112 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2776 3113 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2777 3114 byteswap_uint64_array :
2778 3115 dmu_ot_byteswap[bswap].ob_func;
2779 3116 func(buf->b_data, hdr->b_size);
2780 3117 }
2781 3118
2782 3119 arc_cksum_compute(buf, B_FALSE);
2783 3120 arc_buf_watch(buf);
2784 3121
2785 3122 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2786 3123 /*
2787 3124 * Only call arc_access on anonymous buffers. This is because
2788 3125 * if we've issued an I/O for an evicted buffer, we've already
2789 3126 * called arc_access (to prevent any simultaneous readers from
2790 3127 * getting confused).
2791 3128 */
2792 3129 arc_access(hdr, hash_lock);
2793 3130 }
2794 3131
2795 3132 /* create copies of the data buffer for the callers */
2796 3133 abuf = buf;
2797 3134 for (acb = callback_list; acb; acb = acb->acb_next) {
2798 3135 if (acb->acb_done) {
2799 3136 if (abuf == NULL) {
2800 3137 ARCSTAT_BUMP(arcstat_duplicate_reads);
2801 3138 abuf = arc_buf_clone(buf);
2802 3139 }
2803 3140 acb->acb_buf = abuf;
2804 3141 abuf = NULL;
2805 3142 }
2806 3143 }
2807 3144 hdr->b_acb = NULL;
2808 3145 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2809 3146 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2810 3147 if (abuf == buf) {
2811 3148 ASSERT(buf->b_efunc == NULL);
2812 3149 ASSERT(hdr->b_datacnt == 1);
2813 3150 hdr->b_flags |= ARC_BUF_AVAILABLE;
2814 3151 }
2815 3152
2816 3153 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2817 3154
2818 3155 if (zio->io_error != 0) {
2819 3156 hdr->b_flags |= ARC_IO_ERROR;
2820 3157 if (hdr->b_state != arc_anon)
2821 3158 arc_change_state(arc_anon, hdr, hash_lock);
2822 3159 if (HDR_IN_HASH_TABLE(hdr))
2823 3160 buf_hash_remove(hdr);
2824 3161 freeable = refcount_is_zero(&hdr->b_refcnt);
2825 3162 }
2826 3163
2827 3164 /*
2828 3165 * Broadcast before we drop the hash_lock to avoid the possibility
2829 3166 * that the hdr (and hence the cv) might be freed before we get to
2830 3167 * the cv_broadcast().
2831 3168 */
2832 3169 cv_broadcast(&hdr->b_cv);
2833 3170
2834 3171 if (hash_lock) {
2835 3172 mutex_exit(hash_lock);
2836 3173 } else {
2837 3174 /*
2838 3175 * This block was freed while we waited for the read to
2839 3176 * complete. It has been removed from the hash table and
2840 3177 * moved to the anonymous state (so that it won't show up
2841 3178 * in the cache).
2842 3179 */
2843 3180 ASSERT3P(hdr->b_state, ==, arc_anon);
2844 3181 freeable = refcount_is_zero(&hdr->b_refcnt);
2845 3182 }
2846 3183
2847 3184 /* execute each callback and free its structure */
2848 3185 while ((acb = callback_list) != NULL) {
2849 3186 if (acb->acb_done)
2850 3187 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2851 3188
2852 3189 if (acb->acb_zio_dummy != NULL) {
2853 3190 acb->acb_zio_dummy->io_error = zio->io_error;
2854 3191 zio_nowait(acb->acb_zio_dummy);
2855 3192 }
2856 3193
2857 3194 callback_list = acb->acb_next;
2858 3195 kmem_free(acb, sizeof (arc_callback_t));
2859 3196 }
2860 3197
2861 3198 if (freeable)
2862 3199 arc_hdr_destroy(hdr);
2863 3200 }
2864 3201
2865 3202 /*
2866 3203 * "Read" the block at the specified DVA (in bp) via the
2867 3204 * cache. If the block is found in the cache, invoke the provided
2868 3205 * callback immediately and return. Note that the `zio' parameter
2869 3206 * in the callback will be NULL in this case, since no IO was
2870 3207 * required. If the block is not in the cache pass the read request
2871 3208 * on to the spa with a substitute callback function, so that the
2872 3209 * requested block will be added to the cache.
2873 3210 *
2874 3211 * If a read request arrives for a block that has a read in-progress,
2875 3212 * either wait for the in-progress read to complete (and return the
2876 3213 * results); or, if this is a read with a "done" func, add a record
2877 3214 * to the read to invoke the "done" func when the read completes,
2878 3215 * and return; or just return.
2879 3216 *
2880 3217 * arc_read_done() will invoke all the requested "done" functions
2881 3218 * for readers of this block.
2882 3219 */
2883 3220 int
2884 3221 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2885 3222 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2886 3223 const zbookmark_t *zb)
2887 3224 {
2888 3225 arc_buf_hdr_t *hdr;
2889 3226 arc_buf_t *buf = NULL;
2890 3227 kmutex_t *hash_lock;
2891 3228 zio_t *rzio;
2892 3229 uint64_t guid = spa_load_guid(spa);
2893 3230
2894 3231 top:
2895 3232 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2896 3233 &hash_lock);
2897 3234 if (hdr && hdr->b_datacnt > 0) {
2898 3235
2899 3236 *arc_flags |= ARC_CACHED;
2900 3237
2901 3238 if (HDR_IO_IN_PROGRESS(hdr)) {
2902 3239
2903 3240 if (*arc_flags & ARC_WAIT) {
2904 3241 cv_wait(&hdr->b_cv, hash_lock);
2905 3242 mutex_exit(hash_lock);
2906 3243 goto top;
2907 3244 }
2908 3245 ASSERT(*arc_flags & ARC_NOWAIT);
2909 3246
2910 3247 if (done) {
2911 3248 arc_callback_t *acb = NULL;
2912 3249
2913 3250 acb = kmem_zalloc(sizeof (arc_callback_t),
2914 3251 KM_SLEEP);
2915 3252 acb->acb_done = done;
2916 3253 acb->acb_private = private;
2917 3254 if (pio != NULL)
2918 3255 acb->acb_zio_dummy = zio_null(pio,
2919 3256 spa, NULL, NULL, NULL, zio_flags);
2920 3257
2921 3258 ASSERT(acb->acb_done != NULL);
2922 3259 acb->acb_next = hdr->b_acb;
2923 3260 hdr->b_acb = acb;
2924 3261 add_reference(hdr, hash_lock, private);
2925 3262 mutex_exit(hash_lock);
2926 3263 return (0);
2927 3264 }
2928 3265 mutex_exit(hash_lock);
2929 3266 return (0);
2930 3267 }
2931 3268
2932 3269 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2933 3270
2934 3271 if (done) {
2935 3272 add_reference(hdr, hash_lock, private);
2936 3273 /*
2937 3274 * If this block is already in use, create a new
2938 3275 * copy of the data so that we will be guaranteed
2939 3276 * that arc_release() will always succeed.
2940 3277 */
2941 3278 buf = hdr->b_buf;
2942 3279 ASSERT(buf);
2943 3280 ASSERT(buf->b_data);
2944 3281 if (HDR_BUF_AVAILABLE(hdr)) {
2945 3282 ASSERT(buf->b_efunc == NULL);
2946 3283 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2947 3284 } else {
2948 3285 buf = arc_buf_clone(buf);
2949 3286 }
2950 3287
2951 3288 } else if (*arc_flags & ARC_PREFETCH &&
2952 3289 refcount_count(&hdr->b_refcnt) == 0) {
2953 3290 hdr->b_flags |= ARC_PREFETCH;
2954 3291 }
2955 3292 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2956 3293 arc_access(hdr, hash_lock);
2957 3294 if (*arc_flags & ARC_L2CACHE)
2958 3295 hdr->b_flags |= ARC_L2CACHE;
2959 3296 if (*arc_flags & ARC_L2COMPRESS)
2960 3297 hdr->b_flags |= ARC_L2COMPRESS;
2961 3298 mutex_exit(hash_lock);
2962 3299 ARCSTAT_BUMP(arcstat_hits);
2963 3300 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2964 3301 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2965 3302 data, metadata, hits);
2966 3303
2967 3304 if (done)
2968 3305 done(NULL, buf, private);
2969 3306 } else {
2970 3307 uint64_t size = BP_GET_LSIZE(bp);
2971 3308 arc_callback_t *acb;
2972 3309 vdev_t *vd = NULL;
2973 3310 uint64_t addr = 0;
2974 3311 boolean_t devw = B_FALSE;
2975 3312 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
2976 3313 uint64_t b_asize = 0;
2977 3314
2978 3315 if (hdr == NULL) {
2979 3316 /* this block is not in the cache */
2980 3317 arc_buf_hdr_t *exists;
2981 3318 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2982 3319 buf = arc_buf_alloc(spa, size, private, type);
2983 3320 hdr = buf->b_hdr;
2984 3321 hdr->b_dva = *BP_IDENTITY(bp);
2985 3322 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2986 3323 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2987 3324 exists = buf_hash_insert(hdr, &hash_lock);
2988 3325 if (exists) {
2989 3326 /* somebody beat us to the hash insert */
2990 3327 mutex_exit(hash_lock);
2991 3328 buf_discard_identity(hdr);
2992 3329 (void) arc_buf_remove_ref(buf, private);
2993 3330 goto top; /* restart the IO request */
2994 3331 }
2995 3332 /* if this is a prefetch, we don't have a reference */
2996 3333 if (*arc_flags & ARC_PREFETCH) {
2997 3334 (void) remove_reference(hdr, hash_lock,
2998 3335 private);
2999 3336 hdr->b_flags |= ARC_PREFETCH;
3000 3337 }
3001 3338 if (*arc_flags & ARC_L2CACHE)
3002 3339 hdr->b_flags |= ARC_L2CACHE;
3003 3340 if (*arc_flags & ARC_L2COMPRESS)
3004 3341 hdr->b_flags |= ARC_L2COMPRESS;
3005 3342 if (BP_GET_LEVEL(bp) > 0)
3006 3343 hdr->b_flags |= ARC_INDIRECT;
3007 3344 } else {
3008 3345 /* this block is in the ghost cache */
3009 3346 ASSERT(GHOST_STATE(hdr->b_state));
3010 3347 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3011 3348 ASSERT0(refcount_count(&hdr->b_refcnt));
3012 3349 ASSERT(hdr->b_buf == NULL);
3013 3350
3014 3351 /* if this is a prefetch, we don't have a reference */
3015 3352 if (*arc_flags & ARC_PREFETCH)
3016 3353 hdr->b_flags |= ARC_PREFETCH;
3017 3354 else
3018 3355 add_reference(hdr, hash_lock, private);
3019 3356 if (*arc_flags & ARC_L2CACHE)
3020 3357 hdr->b_flags |= ARC_L2CACHE;
3021 3358 if (*arc_flags & ARC_L2COMPRESS)
3022 3359 hdr->b_flags |= ARC_L2COMPRESS;
3023 3360 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3024 3361 buf->b_hdr = hdr;
3025 3362 buf->b_data = NULL;
3026 3363 buf->b_efunc = NULL;
3027 3364 buf->b_private = NULL;
3028 3365 buf->b_next = NULL;
3029 3366 hdr->b_buf = buf;
3030 3367 ASSERT(hdr->b_datacnt == 0);
3031 3368 hdr->b_datacnt = 1;
3032 3369 arc_get_data_buf(buf);
3033 3370 arc_access(hdr, hash_lock);
3034 3371 }
3035 3372
3036 3373 ASSERT(!GHOST_STATE(hdr->b_state));
3037 3374
(1436 lines elided)
3038 3375 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3039 3376 acb->acb_done = done;
3040 3377 acb->acb_private = private;
3041 3378
3042 3379 ASSERT(hdr->b_acb == NULL);
3043 3380 hdr->b_acb = acb;
3044 3381 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3045 3382
3046 3383 if (hdr->b_l2hdr != NULL &&
3047 3384 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3385 + /*
3386 + * Need to stash these before letting go of hash_lock
3387 + */
3048 3388 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3049 3389 addr = hdr->b_l2hdr->b_daddr;
3050 3390 b_compress = hdr->b_l2hdr->b_compress;
3051 3391 b_asize = hdr->b_l2hdr->b_asize;
3052 3392 /*
3053 3393 * Lock out device removal.
3054 3394 */
3055 3395 if (vdev_is_dead(vd) ||
3056 3396 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3057 3397 vd = NULL;
3058 3398 }
3059 3399
3060 3400 mutex_exit(hash_lock);
3061 3401
3062 3402 /*
3063 3403 * At this point, we have a level 1 cache miss. Try again in
3064 3404 * L2ARC if possible.
3065 3405 */
3066 3406 ASSERT3U(hdr->b_size, ==, size);
3067 3407 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3068 3408 uint64_t, size, zbookmark_t *, zb);
3069 3409 ARCSTAT_BUMP(arcstat_misses);
3070 3410 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3071 3411 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3072 3412 data, metadata, misses);
3073 3413
3074 3414 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3075 3415 /*
3076 3416 * Read from the L2ARC if the following are true:
3077 3417 * 1. The L2ARC vdev was previously cached.
3078 3418 * 2. This buffer still has L2ARC metadata.
3079 3419 * 3. This buffer isn't currently writing to the L2ARC.
3080 3420 * 4. The L2ARC entry wasn't evicted, which may
3081 3421 * also have invalidated the vdev.
3082 3422 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3083 3423 */
3084 3424 if (hdr->b_l2hdr != NULL &&
3085 3425 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3086 3426 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3087 3427 l2arc_read_callback_t *cb;
3088 3428
3089 3429 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3090 3430 ARCSTAT_BUMP(arcstat_l2_hits);
3091 3431
3092 3432 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3093 3433 KM_SLEEP);
3094 3434 cb->l2rcb_buf = buf;
3095 3435 cb->l2rcb_spa = spa;
3096 3436 cb->l2rcb_bp = *bp;
3097 3437 cb->l2rcb_zb = *zb;
3098 3438 cb->l2rcb_flags = zio_flags;
3099 3439 cb->l2rcb_compress = b_compress;
3100 3440
3101 3441 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3102 3442 addr + size < vd->vdev_psize -
3103 3443 VDEV_LABEL_END_SIZE);
3104 3444
3105 3445 /*
3106 3446 * l2arc read. The SCL_L2ARC lock will be
3107 3447 * released by l2arc_read_done().
3108 3448 * Issue a null zio if the underlying buffer
3109 3449 * was squashed to zero size by compression.
3110 3450 */
3111 3451 if (b_compress == ZIO_COMPRESS_EMPTY) {
3112 3452 rzio = zio_null(pio, spa, vd,
3113 3453 l2arc_read_done, cb,
3114 3454 zio_flags | ZIO_FLAG_DONT_CACHE |
3115 3455 ZIO_FLAG_CANFAIL |
3116 3456 ZIO_FLAG_DONT_PROPAGATE |
3117 3457 ZIO_FLAG_DONT_RETRY);
3118 3458 } else {
3119 3459 rzio = zio_read_phys(pio, vd, addr,
3120 3460 b_asize, buf->b_data,
3121 3461 ZIO_CHECKSUM_OFF,
3122 3462 l2arc_read_done, cb, priority,
3123 3463 zio_flags | ZIO_FLAG_DONT_CACHE |
3124 3464 ZIO_FLAG_CANFAIL |
3125 3465 ZIO_FLAG_DONT_PROPAGATE |
3126 3466 ZIO_FLAG_DONT_RETRY, B_FALSE);
3127 3467 }
3128 3468 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3129 3469 zio_t *, rzio);
3130 3470 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3131 3471
3132 3472 if (*arc_flags & ARC_NOWAIT) {
3133 3473 zio_nowait(rzio);
3134 3474 return (0);
3135 3475 }
3136 3476
3137 3477 ASSERT(*arc_flags & ARC_WAIT);
3138 3478 if (zio_wait(rzio) == 0)
3139 3479 return (0);
3140 3480
3141 3481 /* l2arc read error; goto zio_read() */
3142 3482 } else {
3143 3483 DTRACE_PROBE1(l2arc__miss,
3144 3484 arc_buf_hdr_t *, hdr);
3145 3485 ARCSTAT_BUMP(arcstat_l2_misses);
3146 3486 if (HDR_L2_WRITING(hdr))
3147 3487 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3148 3488 spa_config_exit(spa, SCL_L2ARC, vd);
3149 3489 }
3150 3490 } else {
3151 3491 if (vd != NULL)
3152 3492 spa_config_exit(spa, SCL_L2ARC, vd);
3153 3493 if (l2arc_ndev != 0) {
3154 3494 DTRACE_PROBE1(l2arc__miss,
3155 3495 arc_buf_hdr_t *, hdr);
3156 3496 ARCSTAT_BUMP(arcstat_l2_misses);
3157 3497 }
3158 3498 }
3159 3499
3160 3500 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3161 3501 arc_read_done, buf, priority, zio_flags, zb);
3162 3502
3163 3503 if (*arc_flags & ARC_WAIT)
3164 3504 return (zio_wait(rzio));
3165 3505
3166 3506 ASSERT(*arc_flags & ARC_NOWAIT);
3167 3507 zio_nowait(rzio);
3168 3508 }
3169 3509 return (0);
3170 3510 }
3171 3511
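To illustrate the arc_read() contract described in the block comment above the function, here is a hypothetical caller sketch. example_read_block() and its error handling are inventions for illustration only; arc_read(), arc_getbuf_func(), ARC_WAIT, ZIO_PRIORITY_SYNC_READ and ZIO_FLAG_CANFAIL are the existing interfaces as used elsewhere in this file, and the sketch assumes the surrounding arc.c context:

/*
 * Illustrative only: a synchronous, cache-filling read using the generic
 * arc_getbuf_func callback. On a cache hit the callback runs immediately
 * with a NULL zio; on a miss arc_read() issues the I/O and, because of
 * ARC_WAIT, does not return until it completes.
 */
static int
example_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb,
    arc_buf_t **abufp)
{
	uint32_t aflags = ARC_WAIT;
	int err;

	*abufp = NULL;
	err = arc_read(NULL, spa, bp, arc_getbuf_func, abufp,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && *abufp == NULL)
		err = SET_ERROR(EIO);	/* defensive; callback saw an error */
	/*
	 * The caller now holds a reference (tag == abufp) and eventually
	 * drops it with arc_buf_remove_ref(*abufp, abufp).
	 */
	return (err);
}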
3172 3512 void
3173 3513 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3174 3514 {
3175 3515 ASSERT(buf->b_hdr != NULL);
3176 3516 ASSERT(buf->b_hdr->b_state != arc_anon);
3177 3517 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3178 3518 ASSERT(buf->b_efunc == NULL);
3179 3519 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3180 3520
3181 3521 buf->b_efunc = func;
3182 3522 buf->b_private = private;
3183 3523 }
3184 3524
3185 3525 /*
3186 3526 * Notify the arc that a block was freed, and thus will never be used again.
3187 3527 */
3188 3528 void
3189 3529 arc_freed(spa_t *spa, const blkptr_t *bp)
3190 3530 {
3191 3531 arc_buf_hdr_t *hdr;
3192 3532 kmutex_t *hash_lock;
3193 3533 uint64_t guid = spa_load_guid(spa);
3194 3534
3195 3535 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3196 3536 &hash_lock);
3197 3537 if (hdr == NULL)
3198 3538 return;
3199 3539 if (HDR_BUF_AVAILABLE(hdr)) {
3200 3540 arc_buf_t *buf = hdr->b_buf;
3201 3541 add_reference(hdr, hash_lock, FTAG);
3202 3542 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3203 3543 mutex_exit(hash_lock);
3204 3544
3205 3545 arc_release(buf, FTAG);
3206 3546 (void) arc_buf_remove_ref(buf, FTAG);
3207 3547 } else {
3208 3548 mutex_exit(hash_lock);
3209 3549 }
3210 3550
3211 3551 }
3212 3552
3213 3553 /*
3214 3554 * This is used by the DMU to let the ARC know that a buffer is
3215 3555 * being evicted, so the ARC should clean up. If this arc buf
3216 3556 * is not yet in the evicted state, it will be put there.
3217 3557 */
3218 3558 int
3219 3559 arc_buf_evict(arc_buf_t *buf)
3220 3560 {
3221 3561 arc_buf_hdr_t *hdr;
3222 3562 kmutex_t *hash_lock;
3223 3563 arc_buf_t **bufp;
3224 3564
3225 3565 mutex_enter(&buf->b_evict_lock);
3226 3566 hdr = buf->b_hdr;
3227 3567 if (hdr == NULL) {
3228 3568 /*
3229 3569 * We are in arc_do_user_evicts().
3230 3570 */
3231 3571 ASSERT(buf->b_data == NULL);
3232 3572 mutex_exit(&buf->b_evict_lock);
3233 3573 return (0);
3234 3574 } else if (buf->b_data == NULL) {
3235 3575 arc_buf_t copy = *buf; /* structure assignment */
3236 3576 /*
3237 3577 * We are on the eviction list; process this buffer now
3238 3578 * but let arc_do_user_evicts() do the reaping.
3239 3579 */
3240 3580 buf->b_efunc = NULL;
3241 3581 mutex_exit(&buf->b_evict_lock);
3242 3582 VERIFY(copy.b_efunc(&copy) == 0);
3243 3583 return (1);
3244 3584 }
3245 3585 hash_lock = HDR_LOCK(hdr);
3246 3586 mutex_enter(hash_lock);
3247 3587 hdr = buf->b_hdr;
3248 3588 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3249 3589
3250 3590 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3251 3591 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3252 3592
3253 3593 /*
3254 3594 * Pull this buffer off of the hdr
3255 3595 */
3256 3596 bufp = &hdr->b_buf;
3257 3597 while (*bufp != buf)
3258 3598 bufp = &(*bufp)->b_next;
3259 3599 *bufp = buf->b_next;
3260 3600
3261 3601 ASSERT(buf->b_data != NULL);
3262 3602 arc_buf_destroy(buf, FALSE, FALSE);
3263 3603
3264 3604 if (hdr->b_datacnt == 0) {
3265 3605 arc_state_t *old_state = hdr->b_state;
3266 3606 arc_state_t *evicted_state;
3267 3607
3268 3608 ASSERT(hdr->b_buf == NULL);
3269 3609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3270 3610
3271 3611 evicted_state =
3272 3612 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3273 3613
3274 3614 mutex_enter(&old_state->arcs_mtx);
3275 3615 mutex_enter(&evicted_state->arcs_mtx);
3276 3616
3277 3617 arc_change_state(evicted_state, hdr, hash_lock);
3278 3618 ASSERT(HDR_IN_HASH_TABLE(hdr));
3279 3619 hdr->b_flags |= ARC_IN_HASH_TABLE;
3280 3620 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3281 3621
3282 3622 mutex_exit(&evicted_state->arcs_mtx);
3283 3623 mutex_exit(&old_state->arcs_mtx);
3284 3624 }
3285 3625 mutex_exit(hash_lock);
3286 3626 mutex_exit(&buf->b_evict_lock);
3287 3627
3288 3628 VERIFY(buf->b_efunc(buf) == 0);
3289 3629 buf->b_efunc = NULL;
3290 3630 buf->b_private = NULL;
3291 3631 buf->b_hdr = NULL;
3292 3632 buf->b_next = NULL;
3293 3633 kmem_cache_free(buf_cache, buf);
3294 3634 return (1);
3295 3635 }
3296 3636
3297 3637 /*
3298 3638 * Release this buffer from the cache, making it an anonymous buffer. This
3299 3639 * must be done after a read and prior to modifying the buffer contents.
3300 3640 * If the buffer has more than one reference, we must make
3301 3641 * a new hdr for the buffer.
3302 3642 */
3303 3643 void
3304 3644 arc_release(arc_buf_t *buf, void *tag)
3305 3645 {
3306 3646 arc_buf_hdr_t *hdr;
3307 3647 kmutex_t *hash_lock = NULL;
3308 3648 l2arc_buf_hdr_t *l2hdr;
3309 3649 uint64_t buf_size;
3310 3650
3311 3651 /*
3312 3652 * It would be nice to assert that if it's DMU metadata (level >
3313 3653 * 0 || it's the dnode file), then it must be syncing context.
3314 3654 * But we don't know that information at this level.
3315 3655 */
3316 3656
3317 3657 mutex_enter(&buf->b_evict_lock);
3318 3658 hdr = buf->b_hdr;
3319 3659
3320 3660 /* this buffer is not on any list */
3321 3661 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3322 3662
3323 3663 if (hdr->b_state == arc_anon) {
3324 3664 /* this buffer is already released */
3325 3665 ASSERT(buf->b_efunc == NULL);
3326 3666 } else {
3327 3667 hash_lock = HDR_LOCK(hdr);
3328 3668 mutex_enter(hash_lock);
3329 3669 hdr = buf->b_hdr;
3330 3670 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3331 3671 }
3332 3672
3333 3673 l2hdr = hdr->b_l2hdr;
3334 3674 if (l2hdr) {
3335 3675 mutex_enter(&l2arc_buflist_mtx);
3336 3676 hdr->b_l2hdr = NULL;
3337 3677 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3338 3678 }
3339 3679 buf_size = hdr->b_size;
3340 3680
3341 3681 /*
3342 3682 * Do we have more than one buf?
3343 3683 */
3344 3684 if (hdr->b_datacnt > 1) {
3345 3685 arc_buf_hdr_t *nhdr;
3346 3686 arc_buf_t **bufp;
3347 3687 uint64_t blksz = hdr->b_size;
3348 3688 uint64_t spa = hdr->b_spa;
3349 3689 arc_buf_contents_t type = hdr->b_type;
3350 3690 uint32_t flags = hdr->b_flags;
3351 3691
3352 3692 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3353 3693 /*
3354 3694 * Pull the data off of this hdr and attach it to
3355 3695 * a new anonymous hdr.
3356 3696 */
3357 3697 (void) remove_reference(hdr, hash_lock, tag);
3358 3698 bufp = &hdr->b_buf;
3359 3699 while (*bufp != buf)
3360 3700 bufp = &(*bufp)->b_next;
3361 3701 *bufp = buf->b_next;
3362 3702 buf->b_next = NULL;
3363 3703
3364 3704 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3365 3705 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3366 3706 if (refcount_is_zero(&hdr->b_refcnt)) {
3367 3707 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3368 3708 ASSERT3U(*size, >=, hdr->b_size);
3369 3709 atomic_add_64(size, -hdr->b_size);
3370 3710 }
3371 3711
3372 3712 /*
3373 3713 * We're releasing a duplicate user data buffer, update
3374 3714 * our statistics accordingly.
3375 3715 */
3376 3716 if (hdr->b_type == ARC_BUFC_DATA) {
3377 3717 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3378 3718 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3379 3719 -hdr->b_size);
3380 3720 }
3381 3721 hdr->b_datacnt -= 1;
3382 3722 arc_cksum_verify(buf);
3383 3723 arc_buf_unwatch(buf);
3384 3724
3385 3725 mutex_exit(hash_lock);
3386 3726
3387 3727 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3388 3728 nhdr->b_size = blksz;
3389 3729 nhdr->b_spa = spa;
3390 3730 nhdr->b_type = type;
3391 3731 nhdr->b_buf = buf;
3392 3732 nhdr->b_state = arc_anon;
3393 3733 nhdr->b_arc_access = 0;
3394 3734 nhdr->b_flags = flags & ARC_L2_WRITING;
3395 3735 nhdr->b_l2hdr = NULL;
3396 3736 nhdr->b_datacnt = 1;
3397 3737 nhdr->b_freeze_cksum = NULL;
3398 3738 (void) refcount_add(&nhdr->b_refcnt, tag);
3399 3739 buf->b_hdr = nhdr;
3400 3740 mutex_exit(&buf->b_evict_lock);
3401 3741 atomic_add_64(&arc_anon->arcs_size, blksz);
3402 3742 } else {
3403 3743 mutex_exit(&buf->b_evict_lock);
3404 3744 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3405 3745 ASSERT(!list_link_active(&hdr->b_arc_node));
3406 3746 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3407 3747 if (hdr->b_state != arc_anon)
3408 3748 arc_change_state(arc_anon, hdr, hash_lock);
3409 3749 hdr->b_arc_access = 0;
3410 3750 if (hash_lock)
(353 lines elided)
3411 3751 mutex_exit(hash_lock);
3412 3752
3413 3753 buf_discard_identity(hdr);
3414 3754 arc_buf_thaw(buf);
3415 3755 }
3416 3756 buf->b_efunc = NULL;
3417 3757 buf->b_private = NULL;
3418 3758
3419 3759 if (l2hdr) {
3420 3760 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3421 - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3761 + kmem_free(l2hdr, sizeof (*l2hdr));
3422 3762 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3423 3763 mutex_exit(&l2arc_buflist_mtx);
3424 3764 }
3425 3765 }
3426 3766
3427 3767 int
3428 3768 arc_released(arc_buf_t *buf)
3429 3769 {
3430 3770 int released;
3431 3771
3432 3772 mutex_enter(&buf->b_evict_lock);
3433 3773 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3434 3774 mutex_exit(&buf->b_evict_lock);
3435 3775 return (released);
3436 3776 }
3437 3777
3438 3778 int
3439 3779 arc_has_callback(arc_buf_t *buf)
3440 3780 {
3441 3781 int callback;
3442 3782
3443 3783 mutex_enter(&buf->b_evict_lock);
3444 3784 callback = (buf->b_efunc != NULL);
3445 3785 mutex_exit(&buf->b_evict_lock);
3446 3786 return (callback);
3447 3787 }
3448 3788
3449 3789 #ifdef ZFS_DEBUG
3450 3790 int
3451 3791 arc_referenced(arc_buf_t *buf)
3452 3792 {
3453 3793 int referenced;
3454 3794
3455 3795 mutex_enter(&buf->b_evict_lock);
3456 3796 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3457 3797 mutex_exit(&buf->b_evict_lock);
3458 3798 return (referenced);
3459 3799 }
3460 3800 #endif
3461 3801
3462 3802 static void
3463 3803 arc_write_ready(zio_t *zio)
3464 3804 {
3465 3805 arc_write_callback_t *callback = zio->io_private;
3466 3806 arc_buf_t *buf = callback->awcb_buf;
3467 3807 arc_buf_hdr_t *hdr = buf->b_hdr;
3468 3808
3469 3809 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3470 3810 callback->awcb_ready(zio, buf, callback->awcb_private);
3471 3811
3472 3812 /*
3473 3813 * If the IO is already in progress, then this is a re-write
3474 3814 * attempt, so we need to thaw and re-compute the cksum.
3475 3815 * It is the responsibility of the callback to handle the
3476 3816 * accounting for any re-write attempt.
3477 3817 */
3478 3818 if (HDR_IO_IN_PROGRESS(hdr)) {
3479 3819 mutex_enter(&hdr->b_freeze_lock);
3480 3820 if (hdr->b_freeze_cksum != NULL) {
3481 3821 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3482 3822 hdr->b_freeze_cksum = NULL;
3483 3823 }
3484 3824 mutex_exit(&hdr->b_freeze_lock);
3485 3825 }
3486 3826 arc_cksum_compute(buf, B_FALSE);
3487 3827 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3488 3828 }
3489 3829
3490 3830 /*
3491 3831 * The SPA calls this callback for each physical write that happens on behalf
3492 3832 * of a logical write. See the comment in dbuf_write_physdone() for details.
3493 3833 */
3494 3834 static void
3495 3835 arc_write_physdone(zio_t *zio)
3496 3836 {
3497 3837 arc_write_callback_t *cb = zio->io_private;
3498 3838 if (cb->awcb_physdone != NULL)
3499 3839 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3500 3840 }
3501 3841
3502 3842 static void
3503 3843 arc_write_done(zio_t *zio)
3504 3844 {
3505 3845 arc_write_callback_t *callback = zio->io_private;
3506 3846 arc_buf_t *buf = callback->awcb_buf;
3507 3847 arc_buf_hdr_t *hdr = buf->b_hdr;
3508 3848
3509 3849 ASSERT(hdr->b_acb == NULL);
3510 3850
3511 3851 if (zio->io_error == 0) {
3512 3852 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3513 3853 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3514 3854 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3515 3855 } else {
3516 3856 ASSERT(BUF_EMPTY(hdr));
3517 3857 }
3518 3858
3519 3859 /*
3520 3860 * If the block to be written was all-zero, we may have
3521 3861 * compressed it away. In this case no write was performed
3522 3862 * so there will be no dva/birth/checksum. The buffer must
3523 3863 * therefore remain anonymous (and uncached).
3524 3864 */
3525 3865 if (!BUF_EMPTY(hdr)) {
3526 3866 arc_buf_hdr_t *exists;
3527 3867 kmutex_t *hash_lock;
3528 3868
3529 3869 ASSERT(zio->io_error == 0);
3530 3870
3531 3871 arc_cksum_verify(buf);
3532 3872
3533 3873 exists = buf_hash_insert(hdr, &hash_lock);
3534 3874 if (exists) {
3535 3875 /*
3536 3876 * This can only happen if we overwrite for
3537 3877 * sync-to-convergence, because we remove
3538 3878 * buffers from the hash table when we arc_free().
3539 3879 */
3540 3880 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3541 3881 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3542 3882 panic("bad overwrite, hdr=%p exists=%p",
3543 3883 (void *)hdr, (void *)exists);
3544 3884 ASSERT(refcount_is_zero(&exists->b_refcnt));
3545 3885 arc_change_state(arc_anon, exists, hash_lock);
3546 3886 mutex_exit(hash_lock);
3547 3887 arc_hdr_destroy(exists);
3548 3888 exists = buf_hash_insert(hdr, &hash_lock);
3549 3889 ASSERT3P(exists, ==, NULL);
3550 3890 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3551 3891 /* nopwrite */
3552 3892 ASSERT(zio->io_prop.zp_nopwrite);
3553 3893 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3554 3894 panic("bad nopwrite, hdr=%p exists=%p",
3555 3895 (void *)hdr, (void *)exists);
3556 3896 } else {
3557 3897 /* Dedup */
3558 3898 ASSERT(hdr->b_datacnt == 1);
3559 3899 ASSERT(hdr->b_state == arc_anon);
3560 3900 ASSERT(BP_GET_DEDUP(zio->io_bp));
3561 3901 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3562 3902 }
3563 3903 }
3564 3904 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3565 3905 /* if it's not anon, we are doing a scrub */
3566 3906 if (!exists && hdr->b_state == arc_anon)
3567 3907 arc_access(hdr, hash_lock);
3568 3908 mutex_exit(hash_lock);
3569 3909 } else {
3570 3910 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3571 3911 }
3572 3912
3573 3913 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3574 3914 callback->awcb_done(zio, buf, callback->awcb_private);
3575 3915
3576 3916 kmem_free(callback, sizeof (arc_write_callback_t));
3577 3917 }
3578 3918
3579 3919 zio_t *
3580 3920 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3581 3921 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3582 3922 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3583 3923 arc_done_func_t *done, void *private, zio_priority_t priority,
3584 3924 int zio_flags, const zbookmark_t *zb)
3585 3925 {
3586 3926 arc_buf_hdr_t *hdr = buf->b_hdr;
3587 3927 arc_write_callback_t *callback;
3588 3928 zio_t *zio;
3589 3929
3590 3930 ASSERT(ready != NULL);
3591 3931 ASSERT(done != NULL);
3592 3932 ASSERT(!HDR_IO_ERROR(hdr));
3593 3933 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3594 3934 ASSERT(hdr->b_acb == NULL);
3595 3935 if (l2arc)
3596 3936 hdr->b_flags |= ARC_L2CACHE;
3597 3937 if (l2arc_compress)
3598 3938 hdr->b_flags |= ARC_L2COMPRESS;
3599 3939 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3600 3940 callback->awcb_ready = ready;
3601 3941 callback->awcb_physdone = physdone;
3602 3942 callback->awcb_done = done;
3603 3943 callback->awcb_private = private;
3604 3944 callback->awcb_buf = buf;
3605 3945
3606 3946 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3607 3947 arc_write_ready, arc_write_physdone, arc_write_done, callback,
3608 3948 priority, zio_flags, zb);
3609 3949
3610 3950 return (zio);
3611 3951 }
3612 3952
3613 3953 static int
3614 3954 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3615 3955 {
3616 3956 #ifdef _KERNEL
3617 3957 uint64_t available_memory = ptob(freemem);
3618 3958 static uint64_t page_load = 0;
3619 3959 static uint64_t last_txg = 0;
3620 3960
3621 3961 #if defined(__i386)
3622 3962 available_memory =
3623 3963 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3624 3964 #endif
3625 3965
3626 3966 if (freemem > physmem * arc_lotsfree_percent / 100)
3627 3967 return (0);
3628 3968
3629 3969 if (txg > last_txg) {
3630 3970 last_txg = txg;
3631 3971 page_load = 0;
3632 3972 }
3633 3973 /*
3634 3974 * If we are in pageout, we know that memory is already tight,
3635 3975 * the arc is already going to be evicting, so we just want to
3636 3976 * continue to let page writes occur as quickly as possible.
3637 3977 */
3638 3978 if (curproc == proc_pageout) {
3639 3979 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3640 3980 return (SET_ERROR(ERESTART));
3641 3981 /* Note: reserve is inflated, so we deflate */
3642 3982 page_load += reserve / 8;
3643 3983 return (0);
3644 3984 } else if (page_load > 0 && arc_reclaim_needed()) {
3645 3985 /* memory is low, delay before restarting */
3646 3986 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3647 3987 return (SET_ERROR(EAGAIN));
3648 3988 }
3649 3989 page_load = 0;
3650 3990 #endif
3651 3991 return (0);
3652 3992 }
3653 3993
3654 3994 void
3655 3995 arc_tempreserve_clear(uint64_t reserve)
3656 3996 {
3657 3997 atomic_add_64(&arc_tempreserve, -reserve);
3658 3998 ASSERT((int64_t)arc_tempreserve >= 0);
3659 3999 }
3660 4000
3661 4001 int
3662 4002 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3663 4003 {
3664 4004 int error;
3665 4005 uint64_t anon_size;
3666 4006
3667 4007 if (reserve > arc_c/4 && !arc_no_grow)
3668 4008 arc_c = MIN(arc_c_max, reserve * 4);
3669 4009 if (reserve > arc_c)
3670 4010 return (SET_ERROR(ENOMEM));
3671 4011
3672 4012 /*
3673 4013 * Don't count loaned bufs as in flight dirty data to prevent long
3674 4014 * network delays from blocking transactions that are ready to be
3675 4015 * assigned to a txg.
3676 4016 */
3677 4017 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3678 4018
3679 4019 /*
3680 4020 * Writes will, almost always, require additional memory allocations
3681 4021 * in order to compress/encrypt/etc the data. We therefore need to
3682 4022 * make sure that there is sufficient available memory for this.
3683 4023 */
3684 4024 error = arc_memory_throttle(reserve, txg);
3685 4025 if (error != 0)
3686 4026 return (error);
3687 4027
3688 4028 /*
3689 4029 * Throttle writes when the amount of dirty data in the cache
3690 4030 * gets too large. We try to keep the cache less than half full
3691 4031 * of dirty blocks so that our sync times don't grow too large.
3692 4032 * Note: if two requests come in concurrently, we might let them
3693 4033 * both succeed, when one of them should fail. Not a huge deal.
3694 4034 */
3695 4035
3696 4036 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3697 4037 anon_size > arc_c / 4) {
3698 4038 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3699 4039 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3700 4040 arc_tempreserve>>10,
3701 4041 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3702 4042 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3703 4043 reserve>>10, arc_c>>10);
3704 4044 return (SET_ERROR(ERESTART));
3705 4045 }
3706 4046 atomic_add_64(&arc_tempreserve, reserve);
3707 4047 return (0);
3708 4048 }
3709 4049
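A quick worked example of the dirty-data throttle above (the 4 GB figure is only illustrative, not from the change):

  With arc_c = 4 GB:  arc_c / 2 = 2 GB and arc_c / 4 = 1 GB.
  A reservation fails with ERESTART only when both
      reserve + arc_tempreserve + anon_size > 2 GB, and
      anon_size > 1 GB,
  i.e. anonymous (dirty) data must itself exceed a quarter of the cache
  before the combined total above half of the cache pushes writers back.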
3710 4050 void
3711 4051 arc_init(void)
3712 4052 {
3713 4053 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3714 4054 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3715 4055
3716 4056 /* Convert seconds to clock ticks */
3717 4057 arc_min_prefetch_lifespan = 1 * hz;
3718 4058
3719 4059 /* Start out with 1/8 of all memory */
3720 4060 arc_c = physmem * PAGESIZE / 8;
3721 4061
3722 4062 #ifdef _KERNEL
3723 4063 /*
3724 4064 * On architectures where the physical memory can be larger
3725 4065 * than the addressable space (intel in 32-bit mode), we may
3726 4066 * need to limit the cache to 1/8 of VM size.
3727 4067 */
3728 4068 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3729 4069 #endif
3730 4070
3731 4071 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3732 4072 arc_c_min = MAX(arc_c / 4, 64<<20);
3733 4073 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3734 4074 if (arc_c * 8 >= 1<<30)
3735 4075 arc_c_max = (arc_c * 8) - (1<<30);
3736 4076 else
3737 4077 arc_c_max = arc_c_min;
3738 4078 arc_c_max = MAX(arc_c * 6, arc_c_max);
3739 4079
3740 4080 /*
3741 4081 * Allow the tunables to override our calculations if they are
3742 4082 * reasonable (i.e. over 64MB)
3743 4083 */
3744 4084 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3745 4085 arc_c_max = zfs_arc_max;
3746 4086 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3747 4087 arc_c_min = zfs_arc_min;
3748 4088
3749 4089 arc_c = arc_c_max;
3750 4090 arc_p = (arc_c >> 1);
3751 4091
3752 4092 /* limit meta-data to 1/4 of the arc capacity */
3753 4093 arc_meta_limit = arc_c_max / 4;
3754 4094
3755 4095 /* Allow the tunable to override if it is reasonable */
3756 4096 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3757 4097 arc_meta_limit = zfs_arc_meta_limit;
3758 4098
3759 4099 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3760 4100 arc_c_min = arc_meta_limit / 2;
3761 4101
3762 4102 if (zfs_arc_grow_retry > 0)
3763 4103 arc_grow_retry = zfs_arc_grow_retry;
3764 4104
3765 4105 if (zfs_arc_shrink_shift > 0)
3766 4106 arc_shrink_shift = zfs_arc_shrink_shift;
3767 4107
3768 4108 if (zfs_arc_p_min_shift > 0)
3769 4109 arc_p_min_shift = zfs_arc_p_min_shift;
3770 4110
3771 4111 /* if kmem_flags are set, lets try to use less memory */
3772 4112 if (kmem_debugging())
3773 4113 arc_c = arc_c / 2;
3774 4114 if (arc_c < arc_c_min)
3775 4115 arc_c = arc_c_min;
3776 4116
3777 4117 arc_anon = &ARC_anon;
3778 4118 arc_mru = &ARC_mru;
3779 4119 arc_mru_ghost = &ARC_mru_ghost;
3780 4120 arc_mfu = &ARC_mfu;
3781 4121 arc_mfu_ghost = &ARC_mfu_ghost;
3782 4122 arc_l2c_only = &ARC_l2c_only;
3783 4123 arc_size = 0;
3784 4124
3785 4125 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3786 4126 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3787 4127 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3788 4128 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3789 4129 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3790 4130 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3791 4131
3792 4132 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3793 4133 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3794 4134 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3795 4135 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3796 4136 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3797 4137 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3798 4138 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3799 4139 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3800 4140 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3801 4141 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3802 4142 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3803 4143 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3804 4144 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3805 4145 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3806 4146 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3807 4147 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3808 4148 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3809 4149 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3810 4150 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3811 4151 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3812 4152
3813 4153 buf_init();
3814 4154
3815 4155 arc_thread_exit = 0;
3816 4156 arc_eviction_list = NULL;
3817 4157 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3818 4158 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3819 4159
3820 4160 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3821 4161 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3822 4162
3823 4163 if (arc_ksp != NULL) {
3824 4164 arc_ksp->ks_data = &arc_stats;
3825 4165 kstat_install(arc_ksp);
3826 4166 }
3827 4167
3828 4168 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3829 4169 TS_RUN, minclsyspri);
3830 4170
3831 4171 arc_dead = FALSE;
3832 4172 arc_warm = B_FALSE;
3833 4173
3834 4174 /*
3835 4175 * Calculate maximum amount of dirty data per pool.
3836 4176 *
3837 4177 * If it has been set by /etc/system, take that.
3838 4178 * Otherwise, use a percentage of physical memory defined by
3839 4179 * zfs_dirty_data_max_percent (default 10%) with a cap at
3840 4180 * zfs_dirty_data_max_max (default 4GB).
3841 4181 */
3842 4182 if (zfs_dirty_data_max == 0) {
3843 4183 zfs_dirty_data_max = physmem * PAGESIZE *
3844 4184 zfs_dirty_data_max_percent / 100;
3845 4185 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3846 4186 zfs_dirty_data_max_max);
3847 4187 }
3848 4188 }
3849 4189
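For reference, the default sizing math in arc_init() works out as follows on a hypothetical 32 GB machine with no tunables set (illustrative arithmetic only, derived from the code above):

  arc_c              = 32 GB / 8                  = 4 GB
  arc_c_min          = MAX(arc_c / 4, 64 MB)      = 1 GB
  arc_c_max          = (arc_c * 8) - 1 GB         = 31 GB
  arc_c_max          = MAX(arc_c * 6, arc_c_max)  = MAX(24 GB, 31 GB) = 31 GB
  arc_c              = arc_c_max                  = 31 GB, so arc_p = 15.5 GB
  arc_meta_limit     = arc_c_max / 4              = 7.75 GB
  arc_c_min          = arc_meta_limit / 2         = 3.875 GB (since 1 GB is below
                                                    that and zfs_arc_min == 0)
  zfs_dirty_data_max = MIN(10% of 32 GB, 4 GB)    = 3.2 GB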
3850 4190 void
3851 4191 arc_fini(void)
3852 4192 {
3853 4193 mutex_enter(&arc_reclaim_thr_lock);
3854 4194 arc_thread_exit = 1;
3855 4195 while (arc_thread_exit != 0)
3856 4196 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3857 4197 mutex_exit(&arc_reclaim_thr_lock);
3858 4198
3859 4199 arc_flush(NULL);
3860 4200
3861 4201 arc_dead = TRUE;
3862 4202
3863 4203 if (arc_ksp != NULL) {
3864 4204 kstat_delete(arc_ksp);
3865 4205 arc_ksp = NULL;
3866 4206 }
3867 4207
3868 4208 mutex_destroy(&arc_eviction_mtx);
3869 4209 mutex_destroy(&arc_reclaim_thr_lock);
3870 4210 cv_destroy(&arc_reclaim_thr_cv);
3871 4211
3872 4212 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3873 4213 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3874 4214 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3875 4215 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3876 4216 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3877 4217 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3878 4218 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3879 4219 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3880 4220
3881 4221 mutex_destroy(&arc_anon->arcs_mtx);
3882 4222 mutex_destroy(&arc_mru->arcs_mtx);
3883 4223 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3884 4224 mutex_destroy(&arc_mfu->arcs_mtx);
3885 4225 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3886 4226 mutex_destroy(&arc_l2c_only->arcs_mtx);
3887 4227
3888 4228 buf_fini();
3889 4229
3890 4230 ASSERT(arc_loaned_bytes == 0);
3891 4231 }
3892 4232
3893 4233 /*
3894 4234 * Level 2 ARC
3895 4235 *
3896 4236 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3897 4237 * It uses dedicated storage devices to hold cached data, which are populated
3898 4238 * using large infrequent writes. The main role of this cache is to boost
3899 4239 * the performance of random read workloads. The intended L2ARC devices
3900 4240 * include short-stroked disks, solid state disks, and other media with
3901 4241 * substantially faster read latency than disk.
3902 4242 *
3903 4243 * +-----------------------+
3904 4244 * | ARC |
3905 4245 * +-----------------------+
3906 4246 * | ^ ^
3907 4247 * | | |
3908 4248 * l2arc_feed_thread() arc_read()
3909 4249 * | | |
3910 4250 * | l2arc read |
3911 4251 * V | |
3912 4252 * +---------------+ |
3913 4253 * | L2ARC | |
3914 4254 * +---------------+ |
3915 4255 * | ^ |
3916 4256 * l2arc_write() | |
3917 4257 * | | |
3918 4258 * V | |
3919 4259 * +-------+ +-------+
3920 4260 * | vdev | | vdev |
3921 4261 * | cache | | cache |
3922 4262 * +-------+ +-------+
3923 4263 * +=========+ .-----.
3924 4264 * : L2ARC : |-_____-|
3925 4265 * : devices : | Disks |
3926 4266 * +=========+ `-_____-'
3927 4267 *
3928 4268 * Read requests are satisfied from the following sources, in order:
3929 4269 *
3930 4270 * 1) ARC
3931 4271 * 2) vdev cache of L2ARC devices
3932 4272 * 3) L2ARC devices
3933 4273 * 4) vdev cache of disks
3934 4274 * 5) disks
3935 4275 *
3936 4276 * Some L2ARC device types exhibit extremely slow write performance.
3937 4277 * To accommodate this, there are some significant differences between
3938 4278 * the L2ARC and traditional cache design:
3939 4279 *
3940 4280 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3941 4281 * the ARC behave as usual, freeing buffers and placing headers on ghost
3942 4282 * lists. The ARC does not send buffers to the L2ARC during eviction as
3943 4283 * this would add inflated write latencies for all ARC memory pressure.
3944 4284 *
3945 4285 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3946 4286 * It does this by periodically scanning buffers from the eviction-end of
3947 4287 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3948 4288 * not already there. It scans until a headroom of buffers is satisfied,
3949 4289 * which itself is a buffer for ARC eviction. If a compressible buffer is
3950 4290 * found during scanning and selected for writing to an L2ARC device, we
3951 4291 * temporarily boost scanning headroom during the next scan cycle to make
3952 4292 * sure we adapt to compression effects (which might significantly reduce
3953 4293 * the data volume we write to L2ARC). The thread that does this is
3954 4294 * l2arc_feed_thread(), illustrated below; example sizes are included to
3955 4295 * provide a better sense of ratio than this diagram:
3956 4296 *
3957 4297 * head --> tail
3958 4298 * +---------------------+----------+
3959 4299 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3960 4300 * +---------------------+----------+ | o L2ARC eligible
3961 4301 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3962 4302 * +---------------------+----------+ |
3963 4303 * 15.9 Gbytes ^ 32 Mbytes |
3964 4304 * headroom |
3965 4305 * l2arc_feed_thread()
3966 4306 * |
3967 4307 * l2arc write hand <--[oooo]--'
3968 4308 * | 8 Mbyte
3969 4309 * | write max
3970 4310 * V
3971 4311 * +==============================+
3972 4312 * L2ARC dev |####|#|###|###| |####| ... |
3973 4313 * +==============================+
3974 4314 * 32 Gbytes
3975 4315 *
3976 4316 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3977 4317 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3978 4318 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3979 4319 * safe to say that this is an uncommon case, since buffers at the end of
3980 4320 * the ARC lists have moved there due to inactivity.
3981 4321 *
3982 4322 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3983 4323 * then the L2ARC simply misses copying some buffers. This serves as a
3984 4324 * pressure valve to prevent heavy read workloads from both stalling the ARC
3985 4325 * with waits and clogging the L2ARC with writes. This also helps prevent
3986 4326 * the potential for the L2ARC to churn if it attempts to cache content too
3987 4327 * quickly, such as during backups of the entire pool.
3988 4328 *
3989 4329 * 5. After system boot and before the ARC has filled main memory, there are
3990 4330 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3991 4331 * lists can remain mostly static. Instead of searching from tail of these
3992 4332 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3993 4333 * for eligible buffers, greatly increasing its chance of finding them.
3994 4334 *
3995 4335 * The L2ARC device write speed is also boosted during this time so that
3996 4336 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3997 4337 * there are no L2ARC reads, and no fear of degrading read performance
3998 4338 * through increased writes.
3999 4339 *
4000 4340 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4001 4341 * the vdev queue can aggregate them into larger and fewer writes. Each
4002 4342 * device is written to in a rotor fashion, sweeping writes through
4003 4343 * available space then repeating.
4004 4344 *
4005 4345 * 7. The L2ARC does not store dirty content. It never needs to flush
4006 4346 * write buffers back to disk based storage.
4007 4347 *
4008 4348 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4009 4349 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4010 4350 *
4011 4351 * The performance of the L2ARC can be tweaked by a number of tunables, which
4012 4352 * may be necessary for different workloads:
4013 4353 *
4014 4354 * l2arc_write_max max write bytes per interval
4015 4355 * l2arc_write_boost extra write bytes during device warmup
4016 4356 * l2arc_noprefetch skip caching prefetched buffers
4017 4357 * l2arc_headroom number of max device writes to precache
4018 4358 * l2arc_headroom_boost when we find compressed buffers during ARC
4019 4359 * scanning, we multiply headroom by this
4020 4360 * percentage factor for the next scan cycle,
4021 4361 * since more compressed buffers are likely to
4022 4362 * be present
4023 4363 * l2arc_feed_secs seconds between L2ARC writing
4024 4364 *
4025 4365 * Tunables may be removed or added as future performance improvements are
(594 lines elided)
4026 4366 * integrated, and also may become zpool properties.
4027 4367 *
4028 4368 * There are three key functions that control how the L2ARC warms up:
4029 4369 *
4030 4370 * l2arc_write_eligible() check if a buffer is eligible to cache
4031 4371 * l2arc_write_size() calculate how much to write
4032 4372 * l2arc_write_interval() calculate sleep delay between writes
4033 4373 *
4034 4374 * These three functions determine what to write, how much, and how quickly
4035 4375 * to send writes.
4376 + *
4377 + * L2ARC persistency:
4378 + *
4379 + * When writing buffers to L2ARC, we periodically add some metadata to
4380 + * make sure we can pick them up after reboot, thus dramatically reducing
4381 + * the impact that any downtime has on the performance of storage systems
4382 + * with large caches.
4383 + *
4384 + * The implementation works fairly simply by integrating the following two
4385 + * modifications:
4386 + *
4387 + * *) Every now and then we mix a piece of metadata (called a log block)
4388 + * into the L2ARC write. This allows us to understand what's been written,
4389 + * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
4390 + * buffers. The log block also includes a "back-reference" pointer to the
4391 + * previous block, forming a back-linked list of blocks on the L2ARC device.
4392 + *
4393 + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
4394 + * for our header bookkeeping purposes. This contains a device header, which
4395 + * contains our top-level reference structures. We update it each time we
4396 + * write a new log block, so that we're able to locate it in the L2ARC
4397 + * device. If this write results in an inconsistent device header (e.g. due
4398 + * to power failure), we detect this by verifying the header's checksum
4399 + * and simply drop the entries from L2ARC.
4400 + *
4401 + * Implementation diagram:
4402 + *
4403 + * +=== L2ARC device (not to scale) ======================================+
4404 + * | __________newest log block pointers_________ |
4405 + * | / \1 back \latest |
4406 + * | / V V |
4407 + * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
4408 + * | ^ / ^ / ^ / |
4409 + * | `-prev-' `-prev-' `-prev-' |
4410 + * | lb lb lb |
4411 + * +======================================================================+
4412 + *
4413 + * On-device data structures:
4414 + *
4415 + * L2ARC device header: l2arc_dev_hdr_phys_t
4416 + * L2ARC log block: l2arc_log_blk_phys_t
4417 + *
4418 + * L2ARC reconstruction:
4419 + *
4420 + * When writing data, we simply write in the standard rotary fashion,
4421 + * evicting buffers as we go and writing new data over them (writing
4422 + * a new log block every now and then). This obviously means that once we
4423 + * loop around the end of the device, we will start cutting into an already
4424 + * committed log block (and its referenced data buffers), like so:
4425 + *
4426 + * current write head__ __old tail
4427 + * \ /
4428 + * V V
4429 + * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
4430 + * ^ ^^^^^^^^^___________________________________
4431 + * | \
4432 + * <<nextwrite>> may overwrite this blk and/or its bufs --'
4433 + *
4434 + * When importing the pool, we detect this situation and use it to stop
4435 + * our scanning process (see l2arc_rebuild).
4436 + *
4437 + * There is one significant caveat to consider when rebuilding ARC contents
4438 + * from an L2ARC device: what about invalidated buffers? Given the above
4439 + * construction, we cannot go back and amend a log block we have already
4440 + * written in order to drop entries for buffers that were later invalidated.
4441 + * Thus, during reconstruction, we might populate the cache with buffers for
4442 + * data that is no longer on the main pool, or that has since been overwritten.
4443 + *
4444 + * As it turns out, this isn't a problem. Every arc_read request includes
4445 + * both the DVA and, crucially, the birth TXG of the BP the caller is
4446 + * looking for. So even if the cache were populated by completely rotten
4447 + * blocks for data that had been long deleted and/or overwritten, we'll
4448 + * never actually return bad data from the cache, since the DVA together
4449 + * with the birth TXG uniquely identifies a block in space and time - once
4450 + * created, a block is immutable on disk. The worst we will have done is
4451 + * waste some time and memory at l2arc rebuild reconstructing outdated ARC
4452 + * entries that will get dropped from the l2arc as it is being updated
4453 + * with new blocks.
4036 4454 */
4037 4455
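The back-linked log walk described above can be pictured with the toy sketch below. None of these names or structures are the ones introduced by this change (the real on-disk formats are l2arc_dev_hdr_phys_t and l2arc_log_blk_phys_t, per the comment); this is only a minimal, self-contained illustration of "follow the previous-block pointers until a block is invalid or has been overwritten", and it ignores the rotary wrap-around handling entirely:

#include <stdint.h>
#include <stddef.h>

#define	TOY_LB_MAGIC	0x4c32414243ULL		/* arbitrary marker */

typedef struct toy_log_blk {
	uint64_t	lb_magic;	/* detects torn/overwritten blocks */
	uint64_t	lb_prev_off;	/* device offset of the previous block */
	uint64_t	lb_nentries;	/* stand-in for the buffer headers */
} toy_log_blk_t;

/*
 * Walk from the newest log block back through older ones, stopping when a
 * pointer leads outside the device, into space already reclaimed by the
 * write hand, or to a block that no longer checks out.
 */
static int
toy_rebuild(const uint8_t *dev, size_t dev_size, uint64_t newest_off,
    uint64_t write_head)
{
	uint64_t off = newest_off;
	int restored = 0;

	while (off != 0 && off + sizeof (toy_log_blk_t) <= dev_size &&
	    off >= write_head) {
		const toy_log_blk_t *lb =
		    (const toy_log_blk_t *)(const void *)(dev + off);

		if (lb->lb_magic != TOY_LB_MAGIC)
			break;			/* damaged or overwritten */
		restored += (int)lb->lb_nentries; /* "re-create ARC headers" */
		off = lb->lb_prev_off;
	}
	return (restored);
}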
4038 4456 static boolean_t
4039 4457 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4040 4458 {
4041 4459 /*
4042 4460 * A buffer is *not* eligible for the L2ARC if it:
4043 4461 * 1. belongs to a different spa.
4044 4462 * 2. is already cached on the L2ARC.
4045 4463 * 3. has an I/O in progress (it may be an incomplete read).
4046 4464 * 4. is flagged not eligible (zfs property).
4047 4465 */
4048 4466 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4049 4467 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4050 4468 return (B_FALSE);
4051 4469
4052 4470 return (B_TRUE);
4053 4471 }
4054 4472
4055 4473 static uint64_t
4056 4474 l2arc_write_size(void)
4057 4475 {
4058 4476 uint64_t size;
4059 4477
4060 4478 /*
4061 4479 * Make sure our globals have meaningful values in case the user
4062 4480 * altered them.
4063 4481 */
4064 4482 size = l2arc_write_max;
4065 4483 if (size == 0) {
4066 4484 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4067 4485 "be greater than zero, resetting it to the default (%d)",
4068 4486 L2ARC_WRITE_SIZE);
4069 4487 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4070 4488 }
4071 4489
4072 4490 if (arc_warm == B_FALSE)
4073 4491 size += l2arc_write_boost;
4074 4492
4075 4493 return (size);
4076 4494
4077 4495 }
4078 4496
4079 4497 static clock_t
4080 4498 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4081 4499 {
4082 4500 clock_t interval, next, now;
4083 4501
4084 4502 /*
4085 4503 * If the ARC lists are busy, increase our write rate; if the
4086 4504 * lists are stale, idle back. This is achieved by checking
4087 4505 * how much we previously wrote - if it was more than half of
4088 4506 * what we wanted, schedule the next write much sooner.
4089 4507 */
4090 4508 if (l2arc_feed_again && wrote > (wanted / 2))
4091 4509 interval = (hz * l2arc_feed_min_ms) / 1000;
4092 4510 else
4093 4511 interval = hz * l2arc_feed_secs;
4094 4512
4095 4513 now = ddi_get_lbolt();
4096 4514 next = MAX(now, MIN(now + interval, began + interval));
4097 4515
4098 4516 return (next);
4099 4517 }
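
As a worked example of the scheduling above (the numbers are assumptions for
illustration: hz = 100 ticks/s, l2arc_feed_min_ms = 200, l2arc_feed_secs = 1):

	#include <stdio.h>
	#include <stdint.h>

	#define	XMAX(a, b)	((a) > (b) ? (a) : (b))
	#define	XMIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		const int64_t hz = 100;
		int64_t wanted = 8 << 20, wrote = 6 << 20; /* wrote > wanted/2 */
		int64_t began = 1000, now = 1020, interval;

		if (wrote > wanted / 2)
			interval = (hz * 200) / 1000;	/* busy: 20 ticks */
		else
			interval = hz * 1;		/* stale: 100 ticks */

		/*
		 * Never schedule before 'now', and never later than one full
		 * interval past the start of the previous feed pass.
		 */
		printf("next wakeup: tick %lld\n",
		    (long long)XMAX(now, XMIN(now + interval, began + interval)));
		return (0);	/* prints "next wakeup: tick 1020" */
	}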
4100 4518
4101 4519 static void
4102 -l2arc_hdr_stat_add(void)
4520 +l2arc_hdr_stat_add(boolean_t from_arc)
4103 4521 {
4104 4522 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4105 - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4523 + if (from_arc)
4524 + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4106 4525 }
4107 4526
4108 4527 static void
4109 4528 l2arc_hdr_stat_remove(void)
4110 4529 {
4111 4530 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4112 4531 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4113 4532 }
4114 4533
4115 4534 /*
4116 4535 * Cycle through L2ARC devices. This is how L2ARC load balances.
4117 4536 * If a device is returned, this also returns holding the spa config lock.
4118 4537 */
4119 4538 static l2arc_dev_t *
4120 4539 l2arc_dev_get_next(void)
4121 4540 {
4122 4541 l2arc_dev_t *first, *next = NULL;
4123 4542
4124 4543 /*
4125 4544 * Lock out the removal of spas (spa_namespace_lock), then removal
4126 4545 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4127 4546 * both locks will be dropped and a spa config lock held instead.
4128 4547 */
4129 4548 mutex_enter(&spa_namespace_lock);
4130 4549 mutex_enter(&l2arc_dev_mtx);
4131 4550
4132 4551 /* if there are no vdevs, there is nothing to do */
4133 4552 if (l2arc_ndev == 0)
4134 4553 goto out;
4135 4554
4136 4555 first = NULL;
4137 4556 next = l2arc_dev_last;
4138 4557 do {
4139 - /* loop around the list looking for a non-faulted vdev */
4558 + /*
4559 + * Loop around the list looking for a non-faulted vdev
4560 + * and one that isn't currently doing an L2ARC rebuild.
4561 + */
4140 4562 if (next == NULL) {
4141 4563 next = list_head(l2arc_dev_list);
4142 4564 } else {
4143 4565 next = list_next(l2arc_dev_list, next);
4144 4566 if (next == NULL)
4145 4567 next = list_head(l2arc_dev_list);
4146 4568 }
4147 4569
4148 4570 /* if we have come back to the start, bail out */
4149 4571 if (first == NULL)
4150 4572 first = next;
4151 4573 else if (next == first)
4152 4574 break;
4153 4575
4154 - } while (vdev_is_dead(next->l2ad_vdev));
4576 + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
4155 4577
4156 4578 /* if we were unable to find any usable vdevs, return NULL */
4157 - if (vdev_is_dead(next->l2ad_vdev))
4579 + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
4158 4580 next = NULL;
4159 4581
4160 4582 l2arc_dev_last = next;
4161 4583
4162 4584 out:
4163 4585 mutex_exit(&l2arc_dev_mtx);
4164 4586
4165 4587 /*
4166 4588 * Grab the config lock to prevent the 'next' device from being
4167 4589 * removed while we are writing to it.
4168 4590 */
4169 4591 if (next != NULL)
4170 4592 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4171 4593 mutex_exit(&spa_namespace_lock);
4172 4594
4173 4595 return (next);
4174 4596 }
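
The selection loop above is a round-robin walk over a circular list that uses
a 'first' marker to notice when it has wrapped all the way around. The same
pattern as a standalone sketch (toy names, an array instead of a linked list;
as with the l2arc_ndev check above, the caller must ensure the count is
non-zero):

	#include <stdbool.h>

	typedef struct {
		bool usable;	/* stands in for !vdev_is_dead() && !rebuild */
	} l2dev_sketch_t;

	/* Return the index of the next usable device after 'last', or -1. */
	int
	pick_next(const l2dev_sketch_t *devs, int ndevs, int last)
	{
		int first = -1;
		int next = last;

		do {
			next = (next + 1) % ndevs;	/* wrap to list head */
			if (first == -1)
				first = next;
			else if (next == first)
				break;		/* examined every device */
		} while (!devs[next].usable);

		return (devs[next].usable ? next : -1);
	}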
4175 4597
4176 4598 /*
4177 4599 * Free buffers that were tagged for destruction.
4178 4600 */
4179 4601 static void
4180 4602 l2arc_do_free_on_write()
4181 4603 {
4182 4604 list_t *buflist;
4183 4605 l2arc_data_free_t *df, *df_prev;
4184 4606
4185 4607 mutex_enter(&l2arc_free_on_write_mtx);
4186 4608 buflist = l2arc_free_on_write;
4187 4609
4188 4610 for (df = list_tail(buflist); df; df = df_prev) {
4189 4611 df_prev = list_prev(buflist, df);
4190 4612 ASSERT(df->l2df_data != NULL);
4191 4613 ASSERT(df->l2df_func != NULL);
4192 4614 df->l2df_func(df->l2df_data, df->l2df_size);
4193 4615 list_remove(buflist, df);
4194 4616 kmem_free(df, sizeof (l2arc_data_free_t));
4195 4617 }
4196 4618
4197 4619 mutex_exit(&l2arc_free_on_write_mtx);
4198 4620 }
4199 4621
4200 4622 /*
4201 4623 * A write to a cache device has completed. Update all headers to allow
4202 4624 * reads from these buffers to begin.
4203 4625 */
4204 4626 static void
4205 4627 l2arc_write_done(zio_t *zio)
4206 4628 {
4207 4629 l2arc_write_callback_t *cb;
4208 4630 l2arc_dev_t *dev;
4209 4631 list_t *buflist;
4210 4632 arc_buf_hdr_t *head, *ab, *ab_prev;
4211 - l2arc_buf_hdr_t *abl2;
4633 + l2arc_buf_hdr_t *l2hdr;
4212 4634 kmutex_t *hash_lock;
4635 + l2arc_log_blk_buf_t *lb_buf;
4213 4636
4214 4637 cb = zio->io_private;
4215 4638 ASSERT(cb != NULL);
4216 4639 dev = cb->l2wcb_dev;
4217 4640 ASSERT(dev != NULL);
4218 4641 head = cb->l2wcb_head;
4219 4642 ASSERT(head != NULL);
4220 4643 buflist = dev->l2ad_buflist;
4221 4644 ASSERT(buflist != NULL);
4222 4645 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4223 4646 l2arc_write_callback_t *, cb);
4224 4647
4225 4648 if (zio->io_error != 0)
4226 4649 ARCSTAT_BUMP(arcstat_l2_writes_error);
4227 4650
4228 4651 mutex_enter(&l2arc_buflist_mtx);
4229 4652
4230 4653 /*
4231 4654 * All writes completed, or an error was hit.
4232 4655 */
4233 4656 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4234 4657 ab_prev = list_prev(buflist, ab);
4658 + l2hdr = ab->b_l2hdr;
4235 4659
4660 + /*
4661 + * Release the temporary compressed buffer as soon as possible.
4662 + */
4663 + if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
4664 + l2arc_release_cdata_buf(ab);
4665 +
4236 4666 hash_lock = HDR_LOCK(ab);
4237 4667 if (!mutex_tryenter(hash_lock)) {
4238 4668 /*
4239 4669 * This buffer misses out. It may be in a stage
4240 4670 * of eviction. Its ARC_L2_WRITING flag will be
4241 4671 * left set, denying reads to this buffer.
4242 4672 */
4243 4673 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4244 4674 continue;
4245 4675 }
4246 4676
4247 - abl2 = ab->b_l2hdr;
4248 -
4249 - /*
4250 - * Release the temporary compressed buffer as soon as possible.
4251 - */
4252 - if (abl2->b_compress != ZIO_COMPRESS_OFF)
4253 - l2arc_release_cdata_buf(ab);
4254 -
4255 4677 if (zio->io_error != 0) {
4256 4678 /*
4257 4679 * Error - drop L2ARC entry.
4258 4680 */
4259 4681 list_remove(buflist, ab);
4260 - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4682 + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4261 4683 ab->b_l2hdr = NULL;
4262 - kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4684 + kmem_free(l2hdr, sizeof (*l2hdr));
4263 4685 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4264 4686 }
4265 4687
4266 4688 /*
4267 4689 * Allow ARC to begin reads to this L2ARC entry.
4268 4690 */
4269 4691 ab->b_flags &= ~ARC_L2_WRITING;
4270 4692
4271 4693 mutex_exit(hash_lock);
4272 4694 }
4273 4695
4274 4696 atomic_inc_64(&l2arc_writes_done);
4275 4697 list_remove(buflist, head);
4276 4698 kmem_cache_free(hdr_cache, head);
4277 4699 mutex_exit(&l2arc_buflist_mtx);
4278 4700
4279 4701 l2arc_do_free_on_write();
4280 4702
4703 + for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
4704 + lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
4705 + (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
4706 + kmem_free(lb_buf, sizeof (*lb_buf));
4707 + }
4708 + list_destroy(&cb->l2wcb_log_blk_buf_list);
4281 4709 kmem_free(cb, sizeof (l2arc_write_callback_t));
4282 4710 }
4283 4711
4284 4712 /*
4285 4713 * A read to a cache device completed. Validate buffer contents before
4286 4714 * handing over to the regular ARC routines.
4287 4715 */
4288 4716 static void
4289 4717 l2arc_read_done(zio_t *zio)
4290 4718 {
4291 4719 l2arc_read_callback_t *cb;
4292 4720 arc_buf_hdr_t *hdr;
4293 4721 arc_buf_t *buf;
4294 4722 kmutex_t *hash_lock;
4295 4723 int equal;
4296 4724
4297 4725 ASSERT(zio->io_vd != NULL);
4298 4726 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4299 4727
4300 4728 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4301 4729
4302 4730 cb = zio->io_private;
4303 4731 ASSERT(cb != NULL);
4304 4732 buf = cb->l2rcb_buf;
4305 4733 ASSERT(buf != NULL);
4306 4734
4307 4735 hash_lock = HDR_LOCK(buf->b_hdr);
4308 4736 mutex_enter(hash_lock);
4309 4737 hdr = buf->b_hdr;
4310 4738 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4311 4739
4312 4740 /*
4313 4741 * If the buffer was compressed, decompress it first.
4314 4742 */
4315 4743 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4316 4744 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4317 4745 ASSERT(zio->io_data != NULL);
4318 4746
4319 4747 /*
4320 4748 * Check this survived the L2ARC journey.
4321 4749 */
4322 4750 equal = arc_cksum_equal(buf);
4323 4751 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4324 4752 mutex_exit(hash_lock);
4325 4753 zio->io_private = buf;
4326 4754 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4327 4755 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4328 4756 arc_read_done(zio);
4329 4757 } else {
4330 4758 mutex_exit(hash_lock);
4331 4759 /*
4332 4760 * Buffer didn't survive caching. Increment stats and
4333 4761 * reissue to the original storage device.
4334 4762 */
4335 4763 if (zio->io_error != 0) {
4336 4764 ARCSTAT_BUMP(arcstat_l2_io_error);
4337 4765 } else {
4338 4766 zio->io_error = SET_ERROR(EIO);
4339 4767 }
4340 4768 if (!equal)
4341 4769 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4342 4770
4343 4771 /*
4344 4772 * If there's no waiter, issue an async i/o to the primary
4345 4773 * storage now. If there *is* a waiter, the caller must
4346 4774 * issue the i/o in a context where it's OK to block.
4347 4775 */
4348 4776 if (zio->io_waiter == NULL) {
4349 4777 zio_t *pio = zio_unique_parent(zio);
4350 4778
4351 4779 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4352 4780
4353 4781 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4354 4782 buf->b_data, zio->io_size, arc_read_done, buf,
4355 4783 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4356 4784 }
4357 4785 }
4358 4786
4359 4787 kmem_free(cb, sizeof (l2arc_read_callback_t));
4360 4788 }
4361 4789
4362 4790 /*
4363 4791 * This is the list priority from which the L2ARC will search for pages to
4364 4792 * cache. This is used within loops (0..3) to cycle through lists in the
4365 4793 * desired order. This order can have a significant effect on cache
4366 4794 * performance.
4367 4795 *
4368 4796 * Currently the metadata lists are hit first, MFU then MRU, followed by
4369 4797 * the data lists. This function returns a locked list, and also returns
4370 4798 * the lock pointer.
4371 4799 */
4372 4800 static list_t *
4373 4801 l2arc_list_locked(int list_num, kmutex_t **lock)
4374 4802 {
4375 4803 list_t *list = NULL;
4376 4804
4377 4805 ASSERT(list_num >= 0 && list_num <= 3);
4378 4806
4379 4807 switch (list_num) {
4380 4808 case 0:
4381 4809 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4382 4810 *lock = &arc_mfu->arcs_mtx;
4383 4811 break;
4384 4812 case 1:
4385 4813 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4386 4814 *lock = &arc_mru->arcs_mtx;
4387 4815 break;
4388 4816 case 2:
4389 4817 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4390 4818 *lock = &arc_mfu->arcs_mtx;
4391 4819 break;
4392 4820 case 3:
4393 4821 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4394 4822 *lock = &arc_mru->arcs_mtx;
4395 4823 break;
4396 4824 }
4397 4825
4398 4826 ASSERT(!(MUTEX_HELD(*lock)));
4399 4827 mutex_enter(*lock);
4400 4828 return (list);
4401 4829 }
4402 4830
4403 4831 /*
4832 + * Calculates the maximum overhead of L2ARC metadata log blocks for a given
4833 + * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
4834 + * overhead in processing to make sure there is enough headroom available
4835 + * when writing buffers.
4836 + */
4837 +static inline uint64_t
4838 +l2arc_log_blk_overhead(uint64_t write_sz)
4839 +{
4840 + return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
4841 + L2ARC_LOG_BLK_SIZE;
4842 +}
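
To see the shape of this worst-case bound, a small worked example (the
constants below are assumptions standing in for SPA_MINBLOCKSIZE,
L2ARC_LOG_BLK_ENTRIES and L2ARC_LOG_BLK_SIZE; only the form of the
calculation mirrors the function above):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		const uint64_t min_blk = 512;		/* smallest buffer */
		const uint64_t entries_per_lb = 1024;	/* entries per log blk */
		const uint64_t lb_size = 128 * 1024;	/* one log blk on disk */
		uint64_t write_sz = 8ULL << 20;		/* an 8 MB write pass */

		/*
		 * Worst case: every buffer written is minimally sized, so the
		 * pass generates the largest possible number of log entries
		 * and therefore of log blocks.
		 */
		uint64_t overhead =
		    ((write_sz / min_blk / entries_per_lb) + 1) * lb_size;

		printf("worst-case overhead: %llu bytes\n",
		    (unsigned long long)overhead);	/* 17 * 128 KB */
		return (0);
	}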
4843 +
4844 +/*
4404 4845 * Evict buffers from the device write hand to the distance specified in
4405 4846 * bytes. This distance may span populated buffers, it may span nothing.
4406 4847 * This is clearing a region on the L2ARC device ready for writing.
4407 4848 * If the 'all' boolean is set, every buffer is evicted.
4408 4849 */
4409 4850 static void
4410 4851 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4411 4852 {
4412 4853 list_t *buflist;
4413 - l2arc_buf_hdr_t *abl2;
4854 + l2arc_buf_hdr_t *l2hdr;
4414 4855 arc_buf_hdr_t *ab, *ab_prev;
4415 4856 kmutex_t *hash_lock;
4416 4857 uint64_t taddr;
4417 4858
4418 4859 buflist = dev->l2ad_buflist;
4419 4860
4420 4861 if (buflist == NULL)
4421 4862 return;
4422 4863
4423 4864 if (!all && dev->l2ad_first) {
4424 4865 /*
4425 4866 * This is the first sweep through the device. There is
4426 4867 * nothing to evict.
4427 4868 */
4428 4869 return;
4429 4870 }
4430 4871
4872 + /*
4873 + * We need to add in the worst case scenario of log block overhead.
4874 + */
4875 + distance += l2arc_log_blk_overhead(distance);
4431 4876 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4432 4877 /*
4433 4878 * When nearing the end of the device, evict to the end
4434 4879 * before the device write hand jumps to the start.
4435 4880 */
4436 4881 taddr = dev->l2ad_end;
4437 4882 } else {
4438 4883 taddr = dev->l2ad_hand + distance;
4439 4884 }
4440 4885 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4441 4886 uint64_t, taddr, boolean_t, all);
4442 4887
4443 4888 top:
4444 4889 mutex_enter(&l2arc_buflist_mtx);
4445 4890 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4446 4891 ab_prev = list_prev(buflist, ab);
4447 4892
4448 4893 hash_lock = HDR_LOCK(ab);
4449 4894 if (!mutex_tryenter(hash_lock)) {
4450 4895 /*
4451 4896 * Missed the hash lock. Retry.
4452 4897 */
4453 4898 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4454 4899 mutex_exit(&l2arc_buflist_mtx);
4455 4900 mutex_enter(hash_lock);
4456 4901 mutex_exit(hash_lock);
4457 4902 goto top;
4458 4903 }
4459 4904
4460 4905 if (HDR_L2_WRITE_HEAD(ab)) {
4461 4906 /*
4462 4907 * We hit a write head node. Leave it for
4463 4908 * l2arc_write_done().
4464 4909 */
4465 4910 list_remove(buflist, ab);
4466 4911 mutex_exit(hash_lock);
4467 4912 continue;
4468 4913 }
4469 4914
4470 4915 if (!all && ab->b_l2hdr != NULL &&
4471 4916 (ab->b_l2hdr->b_daddr > taddr ||
4472 4917 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4473 4918 /*
4474 4919 * We've evicted to the target address,
4475 4920 * or the end of the device.
4476 4921 */
4477 4922 mutex_exit(hash_lock);
4478 4923 break;
4479 4924 }
4480 4925
4481 4926 if (HDR_FREE_IN_PROGRESS(ab)) {
4482 4927 /*
4483 4928 * Already on the path to destruction.
4484 4929 */
4485 4930 mutex_exit(hash_lock);
4486 4931 continue;
4487 4932 }
4488 4933
4489 4934 if (ab->b_state == arc_l2c_only) {
4490 4935 ASSERT(!HDR_L2_READING(ab));
4491 4936 /*
4492 4937 * This doesn't exist in the ARC. Destroy.
4493 4938 * arc_hdr_destroy() will call list_remove()
4494 4939 * and decrement arcstat_l2_size.
4495 4940 */
4496 4941 arc_change_state(arc_anon, ab, hash_lock);
4497 4942 arc_hdr_destroy(ab);
4498 4943 } else {
4499 4944 /*
4500 4945 * Invalidate issued or about to be issued
4501 4946 * reads, since we may be about to write
4502 4947 * over this location.
4503 4948 */
4504 4949 if (HDR_L2_READING(ab)) {
4505 4950 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4506 4951 ab->b_flags |= ARC_L2_EVICTED;
4507 4952 }
4508 4953
4509 4954 /*
4510 4955 * Tell ARC this no longer exists in L2ARC.
4511 4956 */
4512 4957 if (ab->b_l2hdr != NULL) {
4513 - abl2 = ab->b_l2hdr;
4514 - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4958 + l2hdr = ab->b_l2hdr;
4959 + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4515 4960 ab->b_l2hdr = NULL;
4516 - kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4961 + kmem_free(l2hdr, sizeof (*l2hdr));
4517 4962 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4518 4963 }
4519 4964 list_remove(buflist, ab);
4520 4965
4521 4966 /*
4522 4967 * This may have been leftover after a
4523 4968 * failed write.
4524 4969 */
4525 4970 ab->b_flags &= ~ARC_L2_WRITING;
4526 4971 }
4527 4972 mutex_exit(hash_lock);
4528 4973 }
4529 4974 mutex_exit(&l2arc_buflist_mtx);
4530 4975
4531 4976 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4532 4977 dev->l2ad_evict = taddr;
4533 4978 }
4534 4979
4535 4980 /*
4536 4981 * Find and write ARC buffers to the L2ARC device.
4537 4982 *
4538 4983 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4539 4984 * for reading until they have completed writing.
4540 4985 * The headroom_boost is an in-out parameter used to maintain headroom boost
4541 4986 * state between calls to this function.
4542 4987 *
4543 4988 * Returns the number of bytes actually written (which may be smaller than
4544 4989 * the delta by which the device hand has changed due to alignment).
4545 4990 */
4546 4991 static uint64_t
4547 4992 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4548 4993 boolean_t *headroom_boost)
4549 4994 {
4550 4995 arc_buf_hdr_t *ab, *ab_prev, *head;
4551 4996 list_t *list;
4552 - uint64_t write_asize, write_psize, write_sz, headroom,
4997 + /*
4998 + * These variables mean:
4999 + * - write_size: in-memory size of ARC buffers we've written (before
5000 + * compression).
5001 + * - write_asize: actual on-disk size of ARC buffers we've written
5002 + * (after compression).
5003 + * - write_aligned_asize: actual sum of space taken by ARC buffers
5004 + * on the device (after compression and alignment, so that
5005 + * every buffer starts on a multiple of the device block size).
5006 + * - headroom: L2ARC scanning headroom (we won't scan beyond this
5007 + * distance from the list tail).
5008 + * - buf_compress_minsz: minimum in-memory ARC buffer size for us
5009 + * to try compressing it.
5010 + */
5011 + uint64_t write_size, write_asize, write_aligned_asize, headroom,
4553 5012 buf_compress_minsz;
4554 5013 void *buf_data;
4555 5014 kmutex_t *list_lock;
4556 5015 boolean_t full;
4557 5016 l2arc_write_callback_t *cb;
4558 5017 zio_t *pio, *wzio;
4559 5018 uint64_t guid = spa_load_guid(spa);
4560 5019 const boolean_t do_headroom_boost = *headroom_boost;
5020 + boolean_t dev_hdr_update = B_FALSE;
4561 5021
4562 5022 ASSERT(dev->l2ad_vdev != NULL);
4563 5023
4564 5024 /* Lower the flag now, we might want to raise it again later. */
4565 5025 *headroom_boost = B_FALSE;
4566 5026
4567 5027 pio = NULL;
4568 - write_sz = write_asize = write_psize = 0;
5028 + cb = NULL;
5029 + write_size = write_asize = write_aligned_asize = 0;
4569 5030 full = B_FALSE;
4570 5031 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4571 5032 head->b_flags |= ARC_L2_WRITE_HEAD;
4572 5033
4573 5034 /*
4574 5035 * We will want to try to compress buffers that are at least 2x the
4575 5036 * device sector size.
4576 5037 */
4577 5038 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4578 5039
4579 5040 /*
4580 5041 * Copy buffers for L2ARC writing.
4581 5042 */
4582 5043 mutex_enter(&l2arc_buflist_mtx);
4583 5044 for (int try = 0; try <= 3; try++) {
4584 5045 uint64_t passed_sz = 0;
4585 5046
4586 5047 list = l2arc_list_locked(try, &list_lock);
4587 5048
4588 5049 /*
4589 5050 * L2ARC fast warmup.
4590 5051 *
4591 5052 * Until the ARC is warm and starts to evict, read from the
4592 5053 * head of the ARC lists rather than the tail.
4593 5054 */
4594 5055 if (arc_warm == B_FALSE)
4595 5056 ab = list_head(list);
4596 5057 else
4597 5058 ab = list_tail(list);
4598 5059
4599 5060 headroom = target_sz * l2arc_headroom;
4600 5061 if (do_headroom_boost)
4601 5062 headroom = (headroom * l2arc_headroom_boost) / 100;
4602 5063
4603 5064 for (; ab; ab = ab_prev) {
4604 5065 l2arc_buf_hdr_t *l2hdr;
4605 5066 kmutex_t *hash_lock;
4606 - uint64_t buf_sz;
5067 + uint64_t buf_aligned_size;
4607 5068
4608 5069 if (arc_warm == B_FALSE)
4609 5070 ab_prev = list_next(list, ab);
4610 5071 else
4611 5072 ab_prev = list_prev(list, ab);
4612 5073
4613 5074 hash_lock = HDR_LOCK(ab);
4614 5075 if (!mutex_tryenter(hash_lock)) {
4615 5076 /*
4616 5077 * Skip this buffer rather than waiting.
4617 5078 */
4618 5079 continue;
4619 5080 }
4620 5081
4621 - passed_sz += ab->b_size;
5082 + /*
5083 + * When examining whether we've met our write target,
5084 + * we must always use the aligned size of the buffer,
5085 + * since that's the maximum amount of space a buffer
5086 + * can take up on the L2ARC device.
5087 + */
5088 + buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
5089 + ab->b_size);
5090 + passed_sz += buf_aligned_size;
4622 5091 if (passed_sz > headroom) {
4623 5092 /*
4624 5093 * Searched too far.
4625 5094 */
4626 5095 mutex_exit(hash_lock);
4627 5096 break;
4628 5097 }
4629 5098
4630 5099 if (!l2arc_write_eligible(guid, ab)) {
4631 5100 mutex_exit(hash_lock);
4632 5101 continue;
4633 5102 }
4634 5103
4635 - if ((write_sz + ab->b_size) > target_sz) {
5104 + if ((write_size + buf_aligned_size) > target_sz) {
4636 5105 full = B_TRUE;
4637 5106 mutex_exit(hash_lock);
4638 5107 break;
4639 5108 }
4640 5109
4641 5110 if (pio == NULL) {
4642 5111 /*
4643 5112 * Insert a dummy header on the buflist so
4644 5113 * l2arc_write_done() can find where the
4645 5114 * write buffers begin without searching.
4646 5115 */
4647 5116 list_insert_head(dev->l2ad_buflist, head);
4648 5117
4649 - cb = kmem_alloc(
5118 + cb = kmem_zalloc(
4650 5119 sizeof (l2arc_write_callback_t), KM_SLEEP);
4651 5120 cb->l2wcb_dev = dev;
4652 5121 cb->l2wcb_head = head;
5122 + list_create(&cb->l2wcb_log_blk_buf_list,
5123 + sizeof (l2arc_log_blk_buf_t),
5124 + offsetof(l2arc_log_blk_buf_t, l2lbb_node));
4653 5125 pio = zio_root(spa, l2arc_write_done, cb,
4654 5126 ZIO_FLAG_CANFAIL);
4655 5127 }
4656 5128
4657 5129 /*
4658 5130 * Create and add a new L2ARC header.
4659 5131 */
4660 - l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5132 + l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
4661 5133 l2hdr->b_dev = dev;
4662 5134 ab->b_flags |= ARC_L2_WRITING;
4663 5135
4664 5136 /*
4665 5137 * Temporarily stash the data buffer in b_tmp_cdata.
4666 5138 * The subsequent write step will pick it up from
4667 5139 			 * there. This is because we can't access ab->b_buf
4668 5140 * without holding the hash_lock, which we in turn
4669 5141 * can't access without holding the ARC list locks
4670 5142 * (which we want to avoid during compression/writing).
4671 5143 */
4672 5144 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4673 5145 l2hdr->b_asize = ab->b_size;
4674 5146 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4675 5147
4676 - buf_sz = ab->b_size;
4677 5148 ab->b_l2hdr = l2hdr;
4678 5149
4679 5150 list_insert_head(dev->l2ad_buflist, ab);
4680 5151
4681 5152 /*
4682 5153 * Compute and store the buffer cksum before
4683 5154 * writing. On debug the cksum is verified first.
4684 5155 */
4685 5156 arc_cksum_verify(ab->b_buf);
4686 5157 arc_cksum_compute(ab->b_buf, B_TRUE);
4687 5158
4688 5159 mutex_exit(hash_lock);
4689 5160
4690 - write_sz += buf_sz;
5161 + write_size += buf_aligned_size;
4691 5162 }
4692 5163
4693 5164 mutex_exit(list_lock);
4694 5165
4695 5166 if (full == B_TRUE)
4696 5167 break;
4697 5168 }
4698 5169
4699 5170 /* No buffers selected for writing? */
4700 5171 if (pio == NULL) {
4701 - ASSERT0(write_sz);
5172 + ASSERT0(write_size);
4702 5173 mutex_exit(&l2arc_buflist_mtx);
4703 5174 kmem_cache_free(hdr_cache, head);
4704 5175 return (0);
4705 5176 }
4706 5177
4707 5178 /*
4708 5179 * Now start writing the buffers. We're starting at the write head
4709 5180 * and work backwards, retracing the course of the buffer selector
4710 5181 * loop above.
4711 5182 */
4712 5183 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4713 5184 ab = list_prev(dev->l2ad_buflist, ab)) {
4714 5185 l2arc_buf_hdr_t *l2hdr;
4715 5186 uint64_t buf_sz;
4716 5187
4717 5188 /*
4718 5189 * We shouldn't need to lock the buffer here, since we flagged
4719 5190 * it as ARC_L2_WRITING in the previous step, but we must take
4720 5191 * care to only access its L2 cache parameters. In particular,
4721 5192 * ab->b_buf may be invalid by now due to ARC eviction.
4722 5193 */
4723 5194 l2hdr = ab->b_l2hdr;
4724 5195 l2hdr->b_daddr = dev->l2ad_hand;
4725 5196
4726 5197 if ((ab->b_flags & ARC_L2COMPRESS) &&
4727 5198 l2hdr->b_asize >= buf_compress_minsz) {
4728 5199 if (l2arc_compress_buf(l2hdr)) {
4729 5200 /*
4730 5201 * If compression succeeded, enable headroom
4731 5202 * boost on the next scan cycle.
4732 5203 */
4733 5204 *headroom_boost = B_TRUE;
4734 5205 }
4735 5206 }
4736 5207
4737 5208 /*
4738 5209 * Pick up the buffer data we had previously stashed away
4739 5210 * (and now potentially also compressed).
4740 5211 */
4741 5212 buf_data = l2hdr->b_tmp_cdata;
4742 5213 buf_sz = l2hdr->b_asize;
4743 5214
4744 5215 /* Compression may have squashed the buffer to zero length. */
4745 5216 if (buf_sz != 0) {
4746 - uint64_t buf_p_sz;
5217 + uint64_t buf_aligned_asize;
4747 5218
4748 5219 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4749 5220 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4750 5221 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4751 5222 ZIO_FLAG_CANFAIL, B_FALSE);
4752 5223
4753 5224 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4754 5225 zio_t *, wzio);
4755 5226 (void) zio_nowait(wzio);
4756 5227
4757 5228 write_asize += buf_sz;
4758 5229 /*
4759 5230 * Keep the clock hand suitably device-aligned.
4760 5231 */
4761 - buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4762 - write_psize += buf_p_sz;
4763 - dev->l2ad_hand += buf_p_sz;
5232 + buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
5233 + buf_sz);
5234 + write_aligned_asize += buf_aligned_asize;
5235 + dev->l2ad_hand += buf_aligned_asize;
5236 + ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
5237 + dev->l2ad_first);
4764 5238 }
4765 - }
4766 5239
5240 + if (l2arc_log_blk_insert(dev, ab)) {
5241 + l2arc_log_blk_commit(dev, pio, cb);
5242 + dev_hdr_update = B_TRUE;
5243 + }
5244 + }
4767 5245 mutex_exit(&l2arc_buflist_mtx);
4768 5246
4769 - ASSERT3U(write_asize, <=, target_sz);
5247 + if (dev_hdr_update)
5248 + l2arc_dev_hdr_update(dev, pio);
5249 +
5250 + VERIFY3U(write_aligned_asize, <=, target_sz);
4770 5251 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4771 5252 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4772 - ARCSTAT_INCR(arcstat_l2_size, write_sz);
4773 - ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4774 - vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5253 + ARCSTAT_INCR(arcstat_l2_size, write_size);
5254 + ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
5255 + vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
4775 5256
4776 5257 /*
4777 5258 * Bump device hand to the device start if it is approaching the end.
4778 5259 * l2arc_evict() will already have evicted ahead for this case.
4779 5260 */
4780 - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5261 + if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
5262 + dev->l2ad_end) {
4781 5263 vdev_space_update(dev->l2ad_vdev,
4782 5264 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4783 5265 dev->l2ad_hand = dev->l2ad_start;
4784 5266 dev->l2ad_evict = dev->l2ad_start;
4785 5267 dev->l2ad_first = B_FALSE;
4786 5268 }
4787 5269
4788 5270 dev->l2ad_writing = B_TRUE;
4789 5271 (void) zio_wait(pio);
4790 5272 dev->l2ad_writing = B_FALSE;
4791 5273
4792 5274 return (write_asize);
4793 5275 }
4794 5276
4795 5277 /*
4796 5278 * Compresses an L2ARC buffer.
4797 5279 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4798 5280 * size in l2hdr->b_asize. This routine tries to compress the data and
4799 5281 * depending on the compression result there are three possible outcomes:
4800 5282 * *) The buffer was incompressible. The original l2hdr contents were left
4801 5283 * untouched and are ready for writing to an L2 device.
4802 5284 * *) The buffer was all-zeros, so there is no need to write it to an L2
4803 5285 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4804 5286 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4805 5287 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4806 5288 * data buffer which holds the compressed data to be written, and b_asize
4807 5289 * tells us how much data there is. b_compress is set to the appropriate
4808 5290 * compression algorithm. Once writing is done, invoke
4809 5291 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4810 5292 *
4811 5293 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4812 5294 * buffer was incompressible).
4813 5295 */
4814 5296 static boolean_t
4815 5297 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4816 5298 {
4817 5299 void *cdata;
4818 5300 size_t csize, len;
4819 5301
4820 5302 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4821 5303 ASSERT(l2hdr->b_tmp_cdata != NULL);
4822 5304
4823 5305 len = l2hdr->b_asize;
4824 5306 cdata = zio_data_buf_alloc(len);
4825 5307 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4826 5308 cdata, l2hdr->b_asize);
4827 5309
4828 5310 if (csize == 0) {
4829 5311 /* zero block, indicate that there's nothing to write */
4830 5312 zio_data_buf_free(cdata, len);
4831 5313 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4832 5314 l2hdr->b_asize = 0;
4833 5315 l2hdr->b_tmp_cdata = NULL;
4834 5316 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4835 5317 return (B_TRUE);
4836 5318 } else if (csize > 0 && csize < len) {
4837 5319 /*
4838 5320 * Compression succeeded, we'll keep the cdata around for
4839 5321 * writing and release it afterwards.
4840 5322 */
4841 5323 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4842 5324 l2hdr->b_asize = csize;
4843 5325 l2hdr->b_tmp_cdata = cdata;
4844 5326 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4845 5327 return (B_TRUE);
4846 5328 } else {
4847 5329 /*
4848 5330 * Compression failed, release the compressed buffer.
4849 5331 * l2hdr will be left unmodified.
4850 5332 */
4851 5333 zio_data_buf_free(cdata, len);
4852 5334 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4853 5335 return (B_FALSE);
4854 5336 }
4855 5337 }
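
A caller-side sketch of the three-way contract described above; the field
names mirror b_compress, b_asize and b_tmp_cdata, but the types here are toy
stand-ins, not the real l2arc_buf_hdr_t:

	#include <stdint.h>

	enum toy_compress { TOY_OFF, TOY_EMPTY, TOY_LZ4 };

	typedef struct {
		enum toy_compress b_compress;	/* outcome of the attempt */
		uint64_t b_asize;		/* bytes destined for disk */
		void *b_tmp_cdata;		/* data to write, or NULL */
	} toy_l2hdr_t;

	/* How many bytes the write path actually needs to issue. */
	uint64_t
	bytes_to_write(const toy_l2hdr_t *hdr)
	{
		if (hdr->b_compress == TOY_EMPTY)
			return (0);	/* all-zero buffer: nothing to write */
		/* TOY_LZ4: compressed copy in b_tmp_cdata; TOY_OFF: original */
		return (hdr->b_asize);
	}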
4856 5338
4857 5339 /*
4858 5340 * Decompresses a zio read back from an l2arc device. On success, the
4859 5341 * underlying zio's io_data buffer is overwritten by the uncompressed
4860 5342 * version. On decompression error (corrupt compressed stream), the
4861 5343 * zio->io_error value is set to signal an I/O error.
4862 5344 *
4863 5345 * Please note that the compressed data stream is not checksummed, so
4864 5346 * if the underlying device is experiencing data corruption, we may feed
4865 5347 * corrupt data to the decompressor, so the decompressor needs to be
4866 5348 * able to handle this situation (LZ4 does).
4867 5349 */
4868 5350 static void
4869 5351 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4870 5352 {
4871 5353 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4872 5354
4873 5355 if (zio->io_error != 0) {
4874 5356 /*
4875 5357 		 * An io error has occurred, just restore the original io
4876 5358 * size in preparation for a main pool read.
4877 5359 */
4878 5360 zio->io_orig_size = zio->io_size = hdr->b_size;
4879 5361 return;
4880 5362 }
4881 5363
4882 5364 if (c == ZIO_COMPRESS_EMPTY) {
4883 5365 /*
4884 5366 * An empty buffer results in a null zio, which means we
4885 5367 * need to fill its io_data after we're done restoring the
4886 5368 * buffer's contents.
4887 5369 */
4888 5370 ASSERT(hdr->b_buf != NULL);
4889 5371 bzero(hdr->b_buf->b_data, hdr->b_size);
4890 5372 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4891 5373 } else {
4892 5374 ASSERT(zio->io_data != NULL);
4893 5375 /*
4894 5376 * We copy the compressed data from the start of the arc buffer
4895 5377 * (the zio_read will have pulled in only what we need, the
4896 5378 * rest is garbage which we will overwrite at decompression)
4897 5379 * and then decompress back to the ARC data buffer. This way we
4898 5380 * can minimize copying by simply decompressing back over the
4899 5381 * original compressed data (rather than decompressing to an
4900 5382 * aux buffer and then copying back the uncompressed buffer,
4901 5383 * which is likely to be much larger).
4902 5384 */
4903 5385 uint64_t csize;
4904 5386 void *cdata;
4905 5387
4906 5388 csize = zio->io_size;
4907 5389 cdata = zio_data_buf_alloc(csize);
4908 5390 bcopy(zio->io_data, cdata, csize);
4909 5391 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4910 5392 hdr->b_size) != 0)
4911 5393 zio->io_error = EIO;
4912 5394 zio_data_buf_free(cdata, csize);
4913 5395 }
4914 5396
4915 5397 /* Restore the expected uncompressed IO size. */
4916 5398 zio->io_orig_size = zio->io_size = hdr->b_size;
4917 5399 }
4918 5400
4919 5401 /*
4920 5402 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4921 5403 * This buffer serves as a temporary holder of compressed data while
4922 5404 * the buffer entry is being written to an l2arc device. Once that is
4923 5405 * done, we can dispose of it.
4924 5406 */
4925 5407 static void
4926 5408 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4927 5409 {
4928 5410 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4929 5411
4930 5412 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4931 5413 /*
4932 5414 * If the data was compressed, then we've allocated a
4933 5415 * temporary buffer for it, so now we need to release it.
4934 5416 */
4935 5417 ASSERT(l2hdr->b_tmp_cdata != NULL);
4936 5418 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4937 5419 }
4938 5420 l2hdr->b_tmp_cdata = NULL;
4939 5421 }
4940 5422
4941 5423 /*
4942 5424 * This thread feeds the L2ARC at regular intervals. This is the beating
4943 5425 * heart of the L2ARC.
4944 5426 */
4945 5427 static void
4946 5428 l2arc_feed_thread(void)
4947 5429 {
4948 5430 callb_cpr_t cpr;
4949 5431 l2arc_dev_t *dev;
4950 5432 spa_t *spa;
4951 5433 uint64_t size, wrote;
4952 5434 clock_t begin, next = ddi_get_lbolt();
4953 5435 boolean_t headroom_boost = B_FALSE;
4954 5436
4955 5437 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4956 5438
4957 5439 mutex_enter(&l2arc_feed_thr_lock);
4958 5440
4959 5441 while (l2arc_thread_exit == 0) {
4960 5442 CALLB_CPR_SAFE_BEGIN(&cpr);
4961 5443 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4962 5444 next);
4963 5445 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4964 5446 next = ddi_get_lbolt() + hz;
4965 5447
4966 5448 /*
4967 5449 * Quick check for L2ARC devices.
4968 5450 */
4969 5451 mutex_enter(&l2arc_dev_mtx);
4970 5452 if (l2arc_ndev == 0) {
4971 5453 mutex_exit(&l2arc_dev_mtx);
4972 5454 continue;
4973 5455 }
4974 5456 mutex_exit(&l2arc_dev_mtx);
4975 5457 begin = ddi_get_lbolt();
4976 5458
4977 5459 /*
4978 5460 * This selects the next l2arc device to write to, and in
4979 5461 * doing so the next spa to feed from: dev->l2ad_spa. This
4980 5462 * will return NULL if there are now no l2arc devices or if
4981 5463 * they are all faulted.
4982 5464 *
4983 5465 * If a device is returned, its spa's config lock is also
4984 5466 * held to prevent device removal. l2arc_dev_get_next()
4985 5467 * will grab and release l2arc_dev_mtx.
4986 5468 */
4987 5469 if ((dev = l2arc_dev_get_next()) == NULL)
4988 5470 continue;
4989 5471
4990 5472 spa = dev->l2ad_spa;
4991 5473 ASSERT(spa != NULL);
4992 5474
4993 5475 /*
4994 5476 * If the pool is read-only then force the feed thread to
4995 5477 * sleep a little longer.
4996 5478 */
4997 5479 if (!spa_writeable(spa)) {
4998 5480 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4999 5481 spa_config_exit(spa, SCL_L2ARC, dev);
5000 5482 continue;
5001 5483 }
5002 5484
5003 5485 /*
5004 5486 * Avoid contributing to memory pressure.
5005 5487 */
5006 5488 if (arc_reclaim_needed()) {
5007 5489 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5008 5490 spa_config_exit(spa, SCL_L2ARC, dev);
5009 5491 continue;
5010 5492 }
5011 5493
5012 5494 ARCSTAT_BUMP(arcstat_l2_feeds);
5013 5495
5014 5496 size = l2arc_write_size();
5015 5497
5016 5498 /*
5017 5499 * Evict L2ARC buffers that will be overwritten.
5018 5500 */
5019 5501 l2arc_evict(dev, size, B_FALSE);
5020 5502
5021 5503 /*
5022 5504 * Write ARC buffers.
5023 5505 */
5024 5506 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5025 5507
5026 5508 /*
5027 5509 * Calculate interval between writes.
5028 5510 */
5029 5511 next = l2arc_write_interval(begin, size, wrote);
5030 5512 spa_config_exit(spa, SCL_L2ARC, dev);
5031 5513 }
5032 5514
5033 5515 l2arc_thread_exit = 0;
5034 5516 cv_broadcast(&l2arc_feed_thr_cv);
5035 5517 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5036 5518 thread_exit();
5037 5519 }
5038 5520
5039 5521 boolean_t
5040 5522 l2arc_vdev_present(vdev_t *vd)
5041 5523 {
5042 - l2arc_dev_t *dev;
5524 + return (l2arc_vdev_get(vd) != NULL);
5525 +}
5043 5526
5044 - mutex_enter(&l2arc_dev_mtx);
5527 +static l2arc_dev_t *
5528 +l2arc_vdev_get(vdev_t *vd)
5529 +{
5530 + l2arc_dev_t *dev;
5531 + boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
5532 +
5533 + if (!held)
5534 + mutex_enter(&l2arc_dev_mtx);
5045 5535 for (dev = list_head(l2arc_dev_list); dev != NULL;
5046 5536 dev = list_next(l2arc_dev_list, dev)) {
5047 5537 if (dev->l2ad_vdev == vd)
5048 5538 break;
5049 5539 }
5050 - mutex_exit(&l2arc_dev_mtx);
5540 + if (!held)
5541 + mutex_exit(&l2arc_dev_mtx);
5051 5542
5052 - return (dev != NULL);
5543 + return (dev);
5053 5544 }
5054 5545
5055 5546 /*
5056 5547 * Add a vdev for use by the L2ARC. By this point the spa has already
5057 - * validated the vdev and opened it.
5548 + * validated the vdev and opened it. The `rebuild' flag indicates whether
5549 + * we should attempt an L2ARC persistency rebuild.
5058 5550 */
5059 5551 void
5060 -l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5552 +l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5061 5553 {
5062 5554 l2arc_dev_t *adddev;
5063 5555
5064 5556 ASSERT(!l2arc_vdev_present(vd));
5065 5557
5066 5558 /*
5067 5559 * Create a new l2arc device entry.
5068 5560 */
5069 5561 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5070 5562 adddev->l2ad_spa = spa;
5071 5563 adddev->l2ad_vdev = vd;
5072 - adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5564 + /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
5565 + adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
5073 5566 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5074 5567 adddev->l2ad_hand = adddev->l2ad_start;
5075 5568 adddev->l2ad_evict = adddev->l2ad_start;
5076 5569 adddev->l2ad_first = B_TRUE;
5077 5570 adddev->l2ad_writing = B_FALSE;
5078 5571
5079 5572 /*
5080 5573 * This is a list of all ARC buffers that are still valid on the
5081 5574 * device.
5082 5575 */
5083 5576 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5084 5577 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5085 5578 offsetof(arc_buf_hdr_t, b_l2node));
5086 5579
5087 5580 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5088 5581
5089 5582 /*
5090 5583 * Add device to global list
5091 5584 */
5092 5585 mutex_enter(&l2arc_dev_mtx);
5093 5586 list_insert_head(l2arc_dev_list, adddev);
5094 5587 atomic_inc_64(&l2arc_ndev);
5588 + if (rebuild && l2arc_rebuild_enabled &&
5589 + adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
5590 + /*
5591 + * Just mark the device as pending for a rebuild. We won't
5592 + * be starting a rebuild in line here as it would block pool
5593 + * import. Instead spa_load_impl will hand that off to an
5594 + * async task which will call l2arc_spa_rebuild_start.
5595 + */
5596 + adddev->l2ad_rebuild = B_TRUE;
5597 + }
5095 5598 mutex_exit(&l2arc_dev_mtx);
5096 5599 }
5097 5600
5098 5601 /*
5099 5602 * Remove a vdev from the L2ARC.
5100 5603 */
5101 5604 void
5102 5605 l2arc_remove_vdev(vdev_t *vd)
5103 5606 {
5104 5607 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5105 5608
5106 5609 /*
5107 5610 * Find the device by vdev
5108 5611 */
5109 5612 mutex_enter(&l2arc_dev_mtx);
5110 5613 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5111 5614 nextdev = list_next(l2arc_dev_list, dev);
5112 5615 if (vd == dev->l2ad_vdev) {
5113 5616 remdev = dev;
5114 5617 break;
5115 5618 }
5116 5619 }
5117 5620 ASSERT(remdev != NULL);
5118 5621
5119 5622 /*
5120 5623 * Remove device from global list
5121 5624 */
5122 5625 list_remove(l2arc_dev_list, remdev);
5123 5626 l2arc_dev_last = NULL; /* may have been invalidated */
5124 5627 atomic_dec_64(&l2arc_ndev);
5125 5628 mutex_exit(&l2arc_dev_mtx);
5126 5629
5127 5630 /*
5128 5631 * Clear all buflists and ARC references. L2ARC device flush.
5129 5632 */
5130 5633 l2arc_evict(remdev, 0, B_TRUE);
5131 5634 list_destroy(remdev->l2ad_buflist);
5132 5635 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5133 5636 kmem_free(remdev, sizeof (l2arc_dev_t));
5134 5637 }
5135 5638
5136 5639 void
5137 5640 l2arc_init(void)
5138 5641 {
5139 5642 l2arc_thread_exit = 0;
5140 5643 l2arc_ndev = 0;
5141 5644 l2arc_writes_sent = 0;
5142 5645 l2arc_writes_done = 0;
5143 5646
5144 5647 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5145 5648 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5146 5649 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5147 5650 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5148 5651 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5149 5652
5150 5653 l2arc_dev_list = &L2ARC_dev_list;
5151 5654 l2arc_free_on_write = &L2ARC_free_on_write;
5152 5655 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5153 5656 offsetof(l2arc_dev_t, l2ad_node));
5154 5657 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5155 5658 offsetof(l2arc_data_free_t, l2df_list_node));
5156 5659 }
5157 5660
5158 5661 void
5159 5662 l2arc_fini(void)
5160 5663 {
5161 5664 /*
5162 5665 * This is called from dmu_fini(), which is called from spa_fini();
5163 5666 * Because of this, we can assume that all l2arc devices have
5164 5667 * already been removed when the pools themselves were removed.
5165 5668 */
5166 5669
5167 5670 l2arc_do_free_on_write();
5168 5671
5169 5672 mutex_destroy(&l2arc_feed_thr_lock);
5170 5673 cv_destroy(&l2arc_feed_thr_cv);
5171 5674 mutex_destroy(&l2arc_dev_mtx);
5172 5675 mutex_destroy(&l2arc_buflist_mtx);
5173 5676 mutex_destroy(&l2arc_free_on_write_mtx);
5174 5677
5175 5678 list_destroy(l2arc_dev_list);
5176 5679 list_destroy(l2arc_free_on_write);
5177 5680 }
5178 5681
5179 5682 void
5180 5683 l2arc_start(void)
5181 5684 {
5182 5685 if (!(spa_mode_global & FWRITE))
5183 5686 return;
5184 5687
5185 5688 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5186 5689 TS_RUN, minclsyspri);
5187 5690 }
5188 5691
5189 5692 void
5190 5693 l2arc_stop(void)
5191 5694 {
5192 5695 if (!(spa_mode_global & FWRITE))
5193 5696 return;
5194 5697
5195 5698 mutex_enter(&l2arc_feed_thr_lock);
5196 5699 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5197 5700 l2arc_thread_exit = 1;
5198 5701 while (l2arc_thread_exit != 0)
5199 5702 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5200 5703 mutex_exit(&l2arc_feed_thr_lock);
5704 +}
5705 +
5706 +/*
5707 + * Punches out rebuild threads for the L2ARC devices in a spa. This should
5708 + * be called as one of the final steps of a pool import.
5709 + */
5710 +void
5711 +l2arc_spa_rebuild_start(spa_t *spa)
5712 +{
5713 + l2arc_dev_t *dev;
5714 + /*
5715 + * Locate the spa's l2arc devices and kick off rebuild threads.
5716 + */
5717 + mutex_enter(&l2arc_dev_mtx);
5718 + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
5719 + dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
5720 + ASSERT(dev != NULL);
5721 + if (dev->l2ad_rebuild) {
5722 + (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
5723 + dev, 0, &p0, TS_RUN, minclsyspri);
5724 + }
5725 + }
5726 + mutex_exit(&l2arc_dev_mtx);
5727 +}
5728 +
5729 +/*
5730 + * Main entry point for L2ARC rebuilding.
5731 + */
5732 +static void
5733 +l2arc_dev_rebuild_start(l2arc_dev_t *dev)
5734 +{
5735 + spa_t *spa = dev->l2ad_spa;
5736 + vdev_t *vd = dev->l2ad_vdev;
5737 +
5738 + /* Lock out device removal. */
5739 + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5740 + ASSERT(dev->l2ad_rebuild);
5741 + (void) l2arc_rebuild(dev);
5742 + dev->l2ad_rebuild = B_FALSE;
5743 + spa_config_exit(spa, SCL_L2ARC, vd);
5744 + thread_exit();
5745 +}
5746 +
5747 +/*
5748 + * This function implements the actual L2ARC metadata rebuild. It:
5749 + *
5750 + * 1) reads the device's header
5751 + * 2) if a good device header is found, starts reading the log block chain
5752 + * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
5753 + *
5754 + * Operation stops under any of the following conditions:
5755 + *
5756 + * 1) We reach the end of the log blk chain (the back-reference in the blk is
5757 + * invalid or loops over our starting point).
5758 + * 2) We encounter *any* error condition (cksum errors, io errors, looped
5759 + * blocks, etc.).
5760 + * 3) The l2arc_rebuild_timeout is hit - this is a final resort to protect
5761 + * from making severely fragmented L2ARC log blocks or slow L2ARC devices
5762 + * prevent a machine from finishing a pool import (and thus letting the
5763 + * administrator take corrective action, e.g. by kicking the misbehaving
5764 + * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5765 + * rebuilding disabled).
5766 + */
5767 +static int
5768 +l2arc_rebuild(l2arc_dev_t *dev)
5769 +{
5770 + int err;
5771 + l2arc_log_blk_phys_t *this_lb, *next_lb;
5772 + uint8_t *this_lb_buf, *next_lb_buf;
5773 + zio_t *this_io = NULL, *next_io = NULL;
5774 + int64_t deadline;
5775 + l2arc_log_blk_ptr_t lb_ptrs[2];
5776 + boolean_t first_pass;
5777 + uint64_t load_guid;
5778 +
5779 + load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
5780 + deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5781 + /*
5782 + * Device header processing phase.
5783 + */
5784 + if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
5785 + /* device header corrupted, start a new one */
5786 +		bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
5787 + return (err);
5788 + }
5789 + if (l2arc_check_rebuild_timeout_hit(deadline))
5790 + return (SET_ERROR(ETIMEDOUT));
5791 +
5792 + /* Retrieve the persistent L2ARC device state */
5793 + dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
5794 + dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5795 + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
5796 + LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
5797 + dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
5798 + L2ARC_DEV_HDR_EVICT_FIRST);
5799 +
5800 + /* Prepare the rebuild processing state */
5801 + bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
5802 + this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
5803 + next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
5804 + this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5805 + next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5806 + first_pass = B_TRUE;
5807 +
5808 + /* Start the rebuild process */
5809 + for (;;) {
5810 + if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
5811 + /* We hit an invalid block address, end the rebuild. */
5812 + break;
5813 +
5814 + if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
5815 + this_lb, next_lb, this_lb_buf, next_lb_buf,
5816 + this_io, &next_io)) != 0)
5817 + break;
5818 +
5819 + /* Protection against infinite loops of log blocks. */
5820 + if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
5821 + lb_ptrs[0].l2lbp_daddr,
5822 + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5823 + !first_pass) {
5824 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
5825 + err = SET_ERROR(ELOOP);
5826 + break;
5827 + }
5828 +
5829 + /*
5830 + * Our memory pressure valve. If the system is running low
5831 + * on memory, rather than swamping memory with new ARC buf
5832 + * hdrs, we opt not to rebuild the L2ARC. At this point,
5833 + * however, we have already set up our L2ARC dev to chain in
5834 +		 * new metadata log blks, so the user may choose to re-add the
5835 + * L2ARC dev at a later time to reconstruct it (when there's
5836 + * less memory pressure).
5837 + */
5838 + if (arc_reclaim_needed()) {
5839 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5840 + cmn_err(CE_NOTE, "System running low on memory, "
5841 + "aborting L2ARC rebuild.");
5842 + err = SET_ERROR(ENOMEM);
5843 + break;
5844 + }
5845 +
5846 + /*
5847 + * Now that we know that the next_lb checks out alright, we
5848 + * can start reconstruction from this lb - we can be sure
5849 + * that the L2ARC write hand has not yet reached any of our
5850 + * buffers.
5851 + */
5852 + l2arc_log_blk_restore(dev, load_guid, this_lb,
5853 + LBP_GET_PSIZE(&lb_ptrs[0]));
5854 +
5855 + /*
5856 + * End of list detection. We can look ahead two steps in the
5857 + * blk chain and if the 2nd blk from this_lb dips below the
5858 + * initial chain starting point, then we know two things:
5859 + * 1) it can't be valid, and
5860 + * 2) the next_lb's ARC entries might have already been
5861 + * partially overwritten and so we should stop before
5862 + * we restore it
5863 + */
5864 + if (l2arc_range_check_overlap(
5865 + this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
5866 + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5867 + !first_pass)
5868 + break;
5869 +
5870 + /* log blk restored, continue with next one in the list */
5871 + lb_ptrs[0] = lb_ptrs[1];
5872 + lb_ptrs[1] = this_lb->l2lb_back2_lbp;
5873 + PTR_SWAP(this_lb, next_lb);
5874 + PTR_SWAP(this_lb_buf, next_lb_buf);
5875 + this_io = next_io;
5876 + next_io = NULL;
5877 + first_pass = B_FALSE;
5878 +
5879 + if (l2arc_check_rebuild_timeout_hit(deadline)) {
5880 + err = SET_ERROR(ETIMEDOUT);
5881 + break;
5882 + }
5883 + }
5884 + if (next_io != NULL)
5885 + l2arc_log_blk_prefetch_abort(next_io);
5886 + kmem_free(this_lb, sizeof (*this_lb));
5887 + kmem_free(next_lb, sizeof (*next_lb));
5888 + kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
5889 + kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
5890 + if (err == 0)
5891 + ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5892 +
5893 + return (err);
5894 +}
5895 +
5896 +/*
5897 + * Restores the payload of a log blk to ARC. This creates empty ARC hdr
5898 + * entries which only contain an l2arc hdr, essentially restoring the
5899 + * buffers to their L2ARC evicted state. This function also updates space
5900 + * usage on the L2ARC vdev to make sure it tracks restored buffers.
5901 + */
5902 +static void
5903 +l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
5904 + l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
5905 +{
5906 + uint64_t size = 0, psize = 0;
5907 +
5908 + mutex_enter(&l2arc_buflist_mtx);
5909 +
5910 + for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
5911 + /*
5912 + * Restore goes in the reverse direction to preserve correct
5913 + * temporal ordering of buffers in the l2ad_buflist.
5914 + */
5915 + l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
5916 + size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
5917 + psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
5918 + }
5919 + mutex_exit(&l2arc_buflist_mtx);
5920 +
5921 + /*
5922 + * Record rebuild stats:
5923 + * size In-memory size of restored buffer data in ARC
5924 + * psize Physical size of restored buffers in the L2ARC
5925 + * bufs # of ARC buffer headers restored
5926 + * log_blks # of L2ARC log entries processed during restore
5927 + */
5928 + ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
5929 + ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
5930 + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
5931 + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
5932 + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
5933 + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
5934 + vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5935 +}
5936 +
5937 +/*
5938 + * Restores a single ARC buf hdr from a log block. The ARC buffer is put
5939 + * into a state indicating that it has been evicted to L2ARC.
5940 + */
5941 +static void
5942 +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
5943 + uint64_t load_guid)
5944 +{
5945 + arc_buf_hdr_t *hdr, *exists;
5946 + kmutex_t *hash_lock;
5947 + arc_buf_contents_t type = LE_GET_TYPE(le);
5948 + l2arc_buf_hdr_t *l2hdr;
5949 +
5950 + hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
5951 + hdr->b_dva = le->l2le_dva;
5952 + hdr->b_birth = le->l2le_birth;
5953 + hdr->b_cksum0 = le->l2le_cksum0;
5954 + hdr->b_size = LE_GET_LSIZE(le);
5955 + exists = buf_hash_insert(hdr, &hash_lock);
5956 + if (exists) {
5957 + /* Buffer was already cached, no need to restore it. */
5958 + mutex_exit(hash_lock);
5959 + arc_hdr_destroy(hdr);
5960 + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5961 + return;
5962 + }
5963 + hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
5964 + if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
5965 + hdr->b_flags |= ARC_L2COMPRESS;
5966 + mutex_enter(&hdr->b_freeze_lock);
5967 + ASSERT(hdr->b_freeze_cksum == NULL);
5968 + hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
5969 + *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
5970 + mutex_exit(&hdr->b_freeze_lock);
5971 +
5972 + /* now rebuild the l2arc entry */
5973 + ASSERT(hdr->b_l2hdr == NULL);
5974 + l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5975 + l2hdr->b_dev = dev;
5976 + l2hdr->b_daddr = le->l2le_daddr;
5977 + l2hdr->b_asize = LE_GET_PSIZE(le);
5978 + l2hdr->b_compress = LE_GET_COMPRESS(le);
5979 + hdr->b_l2hdr = l2hdr;
5980 + list_insert_tail(dev->l2ad_buflist, hdr);
5981 + ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5982 + ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5983 +
5984 + arc_change_state(arc_l2c_only, hdr, hash_lock);
5985 + mutex_exit(hash_lock);
5986 +}
5987 +
5988 +/*
5989 + * Attempts to read the device header on the provided L2ARC device and writes
5990 + * it to `hdr'. On success, this function returns 0, otherwise the appropriate
5991 + * error code is returned.
5992 + */
5993 +static int
5994 +l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
5995 +{
5996 + int err;
5997 + uint64_t guid;
5998 + zio_cksum_t cksum;
5999 +
6000 + guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6001 +
6002 + if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
6003 + VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
6004 + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6005 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6006 + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
6007 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6008 + return (err);
6009 + }
6010 +
6011 + if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
6012 + byteswap_uint64_array(hdr, sizeof (*hdr));
6013 +
6014 + if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
6015 + hdr->l2dh_spa_guid != guid) {
6016 + /*
6017 + * Attempt to rebuild a device containing no actual dev hdr
6018 + * or containing a header from some other pool.
6019 + */
6020 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6021 + return (SET_ERROR(ENOTSUP));
6022 + }
6023 +
6024 + l2arc_dev_hdr_checksum(hdr, &cksum);
6025 + if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
6026 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6027 + return (SET_ERROR(EINVAL));
6028 + }
6029 + if (hdr->l2dh_evict_tail < dev->l2ad_start ||
6030 + hdr->l2dh_evict_tail >= dev->l2ad_end) {
6031 + /* Data in dev hdr is invalid for this device. */
6032 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6033 + return (SET_ERROR(EINVAL));
6034 + }
6035 +
6036 + return (0);
6037 +}
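
The magic check above doubles as endianness detection: if the magic reads back
byte-swapped, the header was written by a foreign-endian host and every 64-bit
word is swapped in place before validation continues. A standalone sketch of
that pattern (toy magic value and header layout, not the on-disk format):

	#include <stdint.h>
	#include <stdbool.h>
	#include <stddef.h>

	#define	TOY_MAGIC	0x1234567890abcdefULL

	static uint64_t
	bswap64(uint64_t v)
	{
		v = ((v & 0x00000000ffffffffULL) << 32) | (v >> 32);
		v = ((v & 0x0000ffff0000ffffULL) << 16) |
		    ((v >> 16) & 0x0000ffff0000ffffULL);
		v = ((v & 0x00ff00ff00ff00ffULL) << 8) |
		    ((v >> 8) & 0x00ff00ff00ff00ffULL);
		return (v);
	}

	typedef struct {
		uint64_t magic;
		uint64_t payload[7];	/* all fields are 64-bit words */
	} toy_hdr_t;

	/* Returns false if the magic is wrong in either byte order. */
	bool
	toy_hdr_fixup(toy_hdr_t *hdr)
	{
		if (hdr->magic == bswap64(TOY_MAGIC)) {
			uint64_t *w = (uint64_t *)hdr;
			for (size_t i = 0;
			    i < sizeof (*hdr) / sizeof (uint64_t); i++)
				w[i] = bswap64(w[i]);
		}
		return (hdr->magic == TOY_MAGIC);
	}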
6038 +
6039 +/*
6040 + * Reads L2ARC log blocks from storage and validates their contents.
6041 + *
6042 + * This function implements a simple prefetcher to make sure that while
6043 + * we're processing one buffer the L2ARC is already prefetching the next
6044 + * one in the chain.
6045 + *
6046 + * The arguments this_lbp and next_lbp point to the current and next log blk
6047 + * address in the block chain. Similarly, this_lb and next_lb hold the
6048 + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
6049 + * and next_lb_buf must be buffers of appropriate size to hold a raw
6050 + * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
6051 + * to buffer decompression).
6052 + *
6053 + * The `this_io' and `next_io' arguments are used for block prefetching.
6054 + * When issuing the first blk IO during rebuild, you should pass NULL for
6055 + * `this_io'. This function will then issue a sync IO to read the block and
6056 + * also issue an async IO to fetch the next block in the block chain. The
6057 + * prefetch IO is returned in `next_io'. On subsequent calls to this
6058 + * function, pass the value returned in `next_io' from the previous call
6059 + * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
6060 + * Prior to the call, you should initialize your `next_io' pointer to be
6061 + * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
6062 + *
6063 + * On success, this function returns 0, otherwise it returns an appropriate
6064 + * error code. On error the prefetching IO is aborted and cleared before
6065 + * returning from this function, so on failure the caller need not clean
6066 + * anything up; on success the caller owns the IO returned in `next_io'.
6067 + */
6068 +static int
6069 +l2arc_log_blk_read(l2arc_dev_t *dev,
6070 + const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
6071 + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
6072 + uint8_t *this_lb_buf, uint8_t *next_lb_buf,
6073 + zio_t *this_io, zio_t **next_io)
6074 +{
6075 + int err = 0;
6076 + zio_cksum_t cksum;
6077 +
6078 + ASSERT(this_lbp != NULL && next_lbp != NULL);
6079 + ASSERT(this_lb != NULL && next_lb != NULL);
6080 + ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
6081 + ASSERT(next_io != NULL && *next_io == NULL);
6082 + ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
6083 +
6084 + /*
6085 + * Check to see if we have issued the IO for this log blk in a
6086 + * previous run. If not, this is the first call, so issue it now.
6087 + */
6088 + if (this_io == NULL) {
6089 + this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
6090 + this_lb_buf);
6091 + }
6092 +
6093 + /*
6094 + * Peek to see if we can start issuing the next IO immediately.
6095 + */
6096 + if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
6097 + /*
6098 + * Start issuing IO for the next log blk early - this
6099 + * should help keep the L2ARC device busy while we
6100 + * decompress and restore this log blk.
6101 + */
6102 + *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
6103 + next_lb_buf);
6104 + }
6105 +
6106 + /* Wait for the IO to read this log block to complete */
6107 + if ((err = zio_wait(this_io)) != 0) {
6108 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6109 + goto cleanup;
6110 + }
6111 +
6112 + /* Make sure the buffer checks out */
6113 + fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
6114 + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
6115 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6116 + err = SET_ERROR(EINVAL);
6117 + goto cleanup;
6118 + }
6119 +
6120 + /* Now we can take our time decoding this buffer */
6121 + switch (LBP_GET_COMPRESS(this_lbp)) {
6122 + case ZIO_COMPRESS_OFF:
6123 + bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
6124 + break;
6125 + case ZIO_COMPRESS_LZ4:
6126 + if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
6127 + this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
6128 + sizeof (*this_lb))) != 0) {
6129 + err = SET_ERROR(EINVAL);
6130 + goto cleanup;
6131 + }
6132 + break;
6133 + default:
6134 + err = SET_ERROR(EINVAL);
6135 + goto cleanup;
6136 + }
6137 + if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
6138 + byteswap_uint64_array(this_lb, sizeof (*this_lb));
6139 + if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
6140 + err = SET_ERROR(EINVAL);
6141 + goto cleanup;
6142 + }
6143 +cleanup:
6144 + /* Abort an in-flight prefetch I/O in case of error */
6145 + if (err != 0 && *next_io != NULL) {
6146 + l2arc_log_blk_prefetch_abort(*next_io);
6147 + *next_io = NULL;
6148 + }
6149 + return (err);
6150 +}
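/*
 * Illustrative sketch (not part of this diff) of the `this_io'/`next_io'
 * hand-off described above: on the first pass `this_io' is NULL, and on
 * every later pass the prefetch zio returned in `next_io' is fed back in as
 * `this_io'. The helper name and the swap bookkeeping are assumptions; only
 * the l2arc_log_blk_read() contract is taken from this diff.
 */
static void
l2arc_log_blk_chain_sketch(l2arc_dev_t *dev, l2arc_log_blk_ptr_t lbps[2])
{
	l2arc_log_blk_phys_t *this_lb, *next_lb;
	uint8_t *this_lb_buf, *next_lb_buf;
	zio_t *this_io = NULL, *next_io = NULL;

	this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
	next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
	this_lb_buf = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
	next_lb_buf = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);

	while (l2arc_log_blk_ptr_valid(dev, &lbps[0])) {
		if (l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb,
		    next_lb, this_lb_buf, next_lb_buf, this_io,
		    &next_io) != 0)
			break;
		/* ... restore the ARC headers recorded in this_lb ... */
		/* Hand the prefetch over and step down the chain. */
		this_io = next_io;
		next_io = NULL;
		lbps[0] = lbps[1];
		lbps[1] = this_lb->l2lb_back2_lbp;
		/* (A full caller would also swap the lb and lb_buf pairs.) */
	}

	kmem_free(this_lb, sizeof (*this_lb));
	kmem_free(next_lb, sizeof (*next_lb));
	kmem_free(this_lb_buf, sizeof (*this_lb));
	kmem_free(next_lb_buf, sizeof (*next_lb));
}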
6151 +
6152 +/*
6153 + * Validates an L2ARC log blk address to make sure that it can be read
6154 + * from the provided L2ARC device. Returns B_TRUE if the address is within
6155 + * bounds, sanely sized and outside the evicted region, or B_FALSE if not.
6156 + */
6157 +static boolean_t
6158 +l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
6159 +{
6160 + uint64_t psize = LBP_GET_PSIZE(lbp);
6161 + uint64_t end = lbp->l2lbp_daddr + psize;
6162 +
6163 + /*
6164 + * A log block is valid if all of the following conditions are true:
6165 + * - it fits entirely between l2ad_start and l2ad_end
6166 + * - it has a valid size
6167 + * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
6168 + * doesn't sit in the evicted region)
6169 + */
6170 + return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
6171 + psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
6172 + lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
6173 +}
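/*
 * Worked example with made-up geometry (not from this diff): suppose
 * l2ad_start = 4M, l2ad_end = 1G, l2ad_evict = 200M and l2ad_hand = 600M.
 * A pointer at daddr = 300M with a non-zero psize no larger than a log
 * block passes every test; one at 100M fails the daddr > l2ad_evict test
 * (it sits in the evicted region); one at 700M fails end <= l2ad_hand.
 */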
6174 +
6175 +/*
6176 + * Starts an asynchronous read IO to read a log block. This is used in log
6177 + * block reconstruction to start reading the next block before we are done
6178 + * decoding and reconstructing the current block, to keep the l2arc device
6179 + * nice and hot with read IO to process.
6180 + * The read lands in the caller-supplied `lb_buf', which must remain valid
6181 + * until the zio completes; the buffer stays owned by the caller. If you
6182 + * wish to abort this zio, you should do so using
6183 + * l2arc_log_blk_prefetch_abort, which waits for any in-flight IO to finish
6184 + * so that the buffer can safely be reused or freed.
6185 + */
6186 +static zio_t *
6187 +l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
6188 + uint8_t *lb_buf)
6189 +{
6190 + uint32_t psize;
6191 + zio_t *pio;
6192 +
6193 + psize = LBP_GET_PSIZE(lbp);
6194 + ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
6195 + pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
6196 + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6197 + ZIO_FLAG_DONT_RETRY);
6198 + (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
6199 + lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6200 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6201 + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
6202 +
6203 + return (pio);
6204 +}
6205 +
6206 +/*
6207 + * Aborts a zio returned from l2arc_log_blk_prefetch by waiting for its IO
6208 + * to complete; the read buffer remains owned by the caller throughout.
6209 + */
6210 +static void
6211 +l2arc_log_blk_prefetch_abort(zio_t *zio)
6212 +{
6213 + (void) zio_wait(zio);
6214 +}
6215 +
6216 +/*
6217 + * Creates a zio to update the device header on an l2arc device. The zio is
6218 + * initiated as a child of `pio'.
6219 + */
6220 +static void
6221 +l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
6222 +{
6223 + zio_t *wzio;
6224 + vdev_stat_t st;
6225 + l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
6226 +
6227 + vdev_get_stats(dev->l2ad_vdev, &st);
6228 +
6229 + hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
6230 + hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6231 + hdr->l2dh_evict_tail = dev->l2ad_evict;
6232 + hdr->l2dh_alloc_space = st.vs_alloc;
6233 + hdr->l2dh_flags = 0;
6234 + if (dev->l2ad_first)
6235 + hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
6236 +
6237 + /* checksum operation goes last */
6238 + l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
6239 +
6240 + CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
6241 + sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
6242 + wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6243 + sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
6244 + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6245 + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6246 + zio_t *, wzio);
6247 + (void) zio_nowait(wzio);
6248 +}
6249 +
6250 +/*
6251 + * Commits a log block to the L2ARC device. This routine is invoked from
6252 + * l2arc_write_buffers when the log block fills up.
6253 + * This function allocates some memory to temporarily hold the serialized
6254 + * buffer to be written. This is then released in l2arc_write_done.
6255 + */
6256 +static void
6257 +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
6258 + l2arc_write_callback_t *cb)
6259 +{
6260 + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6261 + uint64_t psize, asize;
6262 + l2arc_log_blk_buf_t *lb_buf;
6263 + zio_t *wzio;
6264 +
6265 + VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6266 +
6267 + /* link the buffer into the block chain */
6268 + lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
6269 + lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
6270 +
6271 + /* try to compress the buffer */
6272 + lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
6273 + list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
6274 + VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
6275 + lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
6276 +
6277 + /*
6278 + * Update the start log blk pointer in the device header to point
6279 + * to the log block we're about to write.
6280 + */
6281 + dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
6282 + dev->l2ad_dev_hdr.l2dh_start_lbps[0];
6283 + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
6284 + LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
6285 + LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
6286 + LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6287 + ZIO_CHECKSUM_FLETCHER_4);
6288 + LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
6289 + if (psize < sizeof (*lb)) {
6290 + /* compression succeeded */
6291 + LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6292 + ZIO_COMPRESS_LZ4);
6293 + } else {
6294 + /* compression failed */
6295 + bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
6296 + LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6297 + ZIO_COMPRESS_OFF);
6298 + }
6299 + /* checksum what we're about to write */
6300 + fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
6301 + &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
6302 +
6303 + /* perform the write itself */
6304 + CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
6305 + L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
6306 + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
6307 + psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
6308 + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6309 + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
6310 + (void) zio_nowait(wzio);
6311 +
6312 + /* realign the device hand */
6313 + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
6314 + dev->l2ad_hand += asize;
6315 + VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
6316 + vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
6317 +
6318 + /* bump the kstats */
6319 + ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
6320 + ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
6321 + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
6322 + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
6323 + dev->l2ad_log_blk_payload_asize / asize);
6324 +
6325 + dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
6326 +}
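/*
 * Worked example with made-up numbers (not from this diff): if a full log
 * block describes buffers whose on-disk sizes sum to
 * l2ad_log_blk_payload_asize = 1M, and the compressed log block itself
 * occupies asize = 16K after vdev alignment, then the avg-size kstat is fed
 * 16K and the data-to-meta ratio is fed 1M / 16K = 64, i.e. roughly 64
 * bytes of cached payload tracked per byte of on-disk log metadata.
 */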
6327 +
6328 +/*
6329 + * Computes the checksum of `hdr' and stores it in `cksum'.
6330 + */
6331 +static void
6332 +l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
6333 +{
6334 + fletcher_4_native((uint8_t *)hdr +
6335 + offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6336 + sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6337 + cksum);
6338 +}
6339 +
6340 +/*
6341 + * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
6342 + * The buffer being inserted must be present in L2ARC.
6343 + * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
6344 + * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
6345 + */
6346 +static boolean_t
6347 +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
6348 +{
6349 + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6350 + l2arc_log_ent_phys_t *le;
6351 + const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
6352 + int index = dev->l2ad_log_ent_idx++;
6353 +
6354 + ASSERT(l2hdr != NULL);
6355 + ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
6356 +
6357 + le = &lb->l2lb_entries[index];
6358 + bzero(le, sizeof (*le));
6359 + le->l2le_dva = ab->b_dva;
6360 + le->l2le_birth = ab->b_birth;
6361 + le->l2le_cksum0 = ab->b_cksum0;
6362 + le->l2le_daddr = l2hdr->b_daddr;
6363 + LE_SET_LSIZE(le, ab->b_size);
6364 + LE_SET_PSIZE(le, l2hdr->b_asize);
6365 + LE_SET_COMPRESS(le, l2hdr->b_compress);
6366 + le->l2le_freeze_cksum = *ab->b_freeze_cksum;
6367 + LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
6368 + LE_SET_TYPE(le, ab->b_type);
6369 + dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
6370 +
6371 + return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6372 +}
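/*
 * Illustrative sketch (not part of this diff) of the insert/commit pairing
 * described above. The wrapper below is a simplified stand-in for the
 * relevant piece of l2arc_write_buffers(), not the actual caller.
 */
static void
l2arc_log_one_buf_sketch(l2arc_dev_t *dev, arc_buf_hdr_t *ab, zio_t *pio,
    l2arc_write_callback_t *cb)
{
	/*
	 * After the data write for `ab' has been issued to the device,
	 * record it in the open log block; once that block fills up,
	 * commit it behind the same parent zio.
	 */
	if (l2arc_log_blk_insert(dev, ab))
		l2arc_log_blk_commit(dev, pio, cb);
}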
6373 +
6374 +/*
6375 + * Checks whether a given L2ARC device address sits in a time-sequential
6376 + * range. The trick here is that the L2ARC is a rotary buffer, so we can't
6377 + * just do a range comparison, we need to handle the situation in which the
6378 + * range wraps around the end of the L2ARC device. Arguments:
6379 + * bottom Lower end of the range to check (written to earlier).
6380 + * top Upper end of the range to check (written to later).
6381 + * check The address for which we want to determine if it sits in
6382 + * between the top and bottom.
6383 + *
6384 + * The 3-way conditional below represents the following cases:
6385 + *
6386 + * bottom < top : Sequentially ordered case:
6387 + * <check>--------+-------------------+
6388 + * | (overlap here?) |
6389 + * L2ARC dev V V
6390 + * |---------------<bottom>============<top>--------------|
6391 + *
6392 + * bottom > top: Looped-around case:
6393 + * <check>--------+------------------+
6394 + * | (overlap here?) |
6395 + * L2ARC dev V V
6396 + * |===============<top>---------------<bottom>===========|
6397 + * ^ ^
6398 + * | (or here?) |
6399 + * +---------------+---------<check>
6400 + *
6401 + * top == bottom : Just a single address comparison.
6402 + */
6403 +static inline boolean_t
6404 +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
6405 +{
6406 + if (bottom < top)
6407 + return (bottom <= check && check <= top);
6408 + else if (bottom > top)
6409 + return (check <= top || bottom <= check);
6410 + else
6411 + return (check == top);
6412 +}
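/*
 * Illustrative self-check with small made-up offsets (not part of the
 * patch), covering both the ordered and the wrapped case:
 *
 *	ASSERT(l2arc_range_check_overlap(100, 900, 500));   ordered, inside
 *	ASSERT(!l2arc_range_check_overlap(100, 900, 950));  ordered, outside
 *	ASSERT(l2arc_range_check_overlap(900, 100, 950));   wrapped, after bottom
 *	ASSERT(l2arc_range_check_overlap(900, 100, 50));    wrapped, before top
 *	ASSERT(!l2arc_range_check_overlap(900, 100, 500));  wrapped, in the gap
 */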
6413 +
6414 +/*
6415 + * Checks whether a rebuild timeout deadline has been hit and if it has,
6416 + * increments the appropriate error counters.
6417 + */
6418 +static boolean_t
6419 +l2arc_check_rebuild_timeout_hit(int64_t deadline)
6420 +{
6421 + if (deadline != 0 && deadline < ddi_get_lbolt64()) {
6422 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
6423 + cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
6424 + "dropping remaining L2ARC metadata.");
6425 + return (B_TRUE);
6426 + } else {
6427 + return (B_FALSE);
6428 + }
5201 6429 }
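/*
 * Illustrative sketch (not part of this diff): how a rebuild loop might
 * derive and poll the deadline. The timeout value, the helper name and the
 * use of SEC_TO_TICK() below are assumptions, not taken from this patch.
 */
static void
l2arc_rebuild_loop_sketch(void)
{
	const int l2arc_rebuild_timeout = 60;	/* seconds; made-up value */
	int64_t deadline;

	deadline = ddi_get_lbolt64() + SEC_TO_TICK(l2arc_rebuild_timeout);
	for (;;) {
		if (l2arc_check_rebuild_timeout_hit(deadline))
			break;	/* drop the rest of the log blk chain */
		/* ... read and restore the next log block, breaking out */
		/* when the chain ends ... */
	}
}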