3525 Persistent L2ARC

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
↓ open down ↓ 128 lines elided ↑ open up ↑
 129  129  #include <sys/vdev_impl.h>
 130  130  #ifdef _KERNEL
 131  131  #include <sys/vmsystm.h>
 132  132  #include <vm/anon.h>
 133  133  #include <sys/fs/swapnode.h>
 134  134  #include <sys/dnlc.h>
 135  135  #endif
 136  136  #include <sys/callb.h>
 137  137  #include <sys/kstat.h>
 138  138  #include <zfs_fletcher.h>
      139 +#include <sys/byteorder.h>
 139  140  
 140  141  #ifndef _KERNEL
 141  142  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 142  143  boolean_t arc_watch = B_FALSE;
 143  144  int arc_procfd;
 144  145  #endif
 145  146  
 146  147  static kmutex_t         arc_reclaim_thr_lock;
 147  148  static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 148  149  static uint8_t          arc_thread_exit;
↓ open down ↓ 151 lines elided ↑ open up ↑
 300  301          kstat_named_t arcstat_l2_free_on_write;
 301  302          kstat_named_t arcstat_l2_abort_lowmem;
 302  303          kstat_named_t arcstat_l2_cksum_bad;
 303  304          kstat_named_t arcstat_l2_io_error;
 304  305          kstat_named_t arcstat_l2_size;
 305  306          kstat_named_t arcstat_l2_asize;
 306  307          kstat_named_t arcstat_l2_hdr_size;
 307  308          kstat_named_t arcstat_l2_compress_successes;
 308  309          kstat_named_t arcstat_l2_compress_zeros;
 309  310          kstat_named_t arcstat_l2_compress_failures;
      311 +        kstat_named_t arcstat_l2_meta_writes;
      312 +        kstat_named_t arcstat_l2_meta_avg_size;
      313 +        kstat_named_t arcstat_l2_meta_avg_asize;
      314 +        kstat_named_t arcstat_l2_asize_to_meta_ratio;
      315 +        kstat_named_t arcstat_l2_rebuild_attempts;
      316 +        kstat_named_t arcstat_l2_rebuild_successes;
      317 +        kstat_named_t arcstat_l2_rebuild_unsupported;
      318 +        kstat_named_t arcstat_l2_rebuild_timeout;
      319 +        kstat_named_t arcstat_l2_rebuild_arc_bytes;
      320 +        kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
      321 +        kstat_named_t arcstat_l2_rebuild_bufs;
      322 +        kstat_named_t arcstat_l2_rebuild_bufs_precached;
      323 +        kstat_named_t arcstat_l2_rebuild_metabufs;
      324 +        kstat_named_t arcstat_l2_rebuild_uberblk_errors;
      325 +        kstat_named_t arcstat_l2_rebuild_io_errors;
      326 +        kstat_named_t arcstat_l2_rebuild_cksum_errors;
      327 +        kstat_named_t arcstat_l2_rebuild_loop_errors;
      328 +        kstat_named_t arcstat_l2_rebuild_abort_lowmem;
 310  329          kstat_named_t arcstat_memory_throttle_count;
 311  330          kstat_named_t arcstat_duplicate_buffers;
 312  331          kstat_named_t arcstat_duplicate_buffers_size;
 313  332          kstat_named_t arcstat_duplicate_reads;
 314  333          kstat_named_t arcstat_meta_used;
 315  334          kstat_named_t arcstat_meta_limit;
 316  335          kstat_named_t arcstat_meta_max;
 317  336  } arc_stats_t;
 318  337  
 319  338  static arc_stats_t arc_stats = {
↓ open down ↓ 46 lines elided ↑ open up ↑
 366  385          { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 367  386          { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 368  387          { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 369  388          { "l2_io_error",                KSTAT_DATA_UINT64 },
 370  389          { "l2_size",                    KSTAT_DATA_UINT64 },
 371  390          { "l2_asize",                   KSTAT_DATA_UINT64 },
 372  391          { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 373  392          { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 374  393          { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 375  394          { "l2_compress_failures",       KSTAT_DATA_UINT64 },
      395 +        { "l2_meta_writes",             KSTAT_DATA_UINT64 },
      396 +        { "l2_meta_avg_size",           KSTAT_DATA_UINT64 },
      397 +        { "l2_meta_avg_asize",          KSTAT_DATA_UINT64 },
      398 +        { "l2_asize_to_meta_ratio",     KSTAT_DATA_UINT64 },
      399 +        { "l2_rebuild_attempts",        KSTAT_DATA_UINT64 },
      400 +        { "l2_rebuild_successes",       KSTAT_DATA_UINT64 },
      401 +        { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
      402 +        { "l2_rebuild_timeout",         KSTAT_DATA_UINT64 },
      403 +        { "l2_rebuild_arc_bytes",       KSTAT_DATA_UINT64 },
      404 +        { "l2_rebuild_l2arc_bytes",     KSTAT_DATA_UINT64 },
      405 +        { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
       406 +        { "l2_rebuild_bufs_precached",  KSTAT_DATA_UINT64 },
      407 +        { "l2_rebuild_metabufs",        KSTAT_DATA_UINT64 },
      408 +        { "l2_rebuild_uberblk_errors",  KSTAT_DATA_UINT64 },
      409 +        { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
      410 +        { "l2_rebuild_cksum_errors",    KSTAT_DATA_UINT64 },
      411 +        { "l2_rebuild_loop_errors",     KSTAT_DATA_UINT64 },
      412 +        { "l2_rebuild_abort_lowmem",    KSTAT_DATA_UINT64 },
 376  413          { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 377  414          { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 378  415          { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 379  416          { "duplicate_reads",            KSTAT_DATA_UINT64 },
 380  417          { "arc_meta_used",              KSTAT_DATA_UINT64 },
 381  418          { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 382  419          { "arc_meta_max",               KSTAT_DATA_UINT64 }
 383  420  };
 384  421  
 385  422  #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
↓ open down ↓ 27 lines elided ↑ open up ↑
 413  450                          ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 414  451                  }                                                       \
 415  452          } else {                                                        \
 416  453                  if (cond2) {                                            \
 417  454                          ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 418  455                  } else {                                                \
 419  456                          ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 420  457                  }                                                       \
 421  458          }
 422  459  
      460 +/*
      461 + * This macro allows us to use kstats as floating averages. Each time we
      462 + * update this kstat, we first factor it and the update value by
       463 + * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
      464 + * average. This macro assumes that integer loads and stores are atomic, but
      465 + * is not safe for multiple writers updating the kstat in parallel (only the
      466 + * last writer's update will remain).
      467 + */
      468 +#define ARCSTAT_F_AVG_FACTOR    3
      469 +#define ARCSTAT_F_AVG(stat, value) \
      470 +        do { \
      471 +                uint64_t x = ARCSTAT(stat); \
      472 +                x = x - x / ARCSTAT_F_AVG_FACTOR + \
      473 +                    (value) / ARCSTAT_F_AVG_FACTOR; \
      474 +                ARCSTAT(stat) = x; \
      475 +                _NOTE(NOTREACHED) \
      476 +                _NOTE(CONSTCOND) \
      477 +        } while (0)
      478 +
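
To make the decay rate concrete, here is a minimal userland C sketch (illustrative only, mirroring the update rule above with the factor of 3): each new sample contributes one third of its value, while the running average retains two thirds of its previous value.

	#include <stdio.h>
	#include <stdint.h>

	#define	F	3	/* mirrors ARCSTAT_F_AVG_FACTOR above */

	int
	main(void)
	{
		uint64_t avg = 0;
		uint64_t samples[] = { 9000, 9000, 9000, 300 };

		for (int i = 0; i < 4; i++) {
			/* same update rule as ARCSTAT_F_AVG */
			avg = avg - avg / F + samples[i] / F;
			printf("sample %llu -> avg %llu\n",
			    (unsigned long long)samples[i],
			    (unsigned long long)avg);
		}
		return (0);
	}

After three 9000-byte samples the average has climbed to 6334; a single 300-byte outlier then only pulls it down to 4323, which is the smoothing the kstats above rely on.
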
 423  479  kstat_t                 *arc_ksp;
 424  480  static arc_state_t      *arc_anon;
 425  481  static arc_state_t      *arc_mru;
 426  482  static arc_state_t      *arc_mru_ghost;
 427  483  static arc_state_t      *arc_mfu;
 428  484  static arc_state_t      *arc_mfu_ghost;
 429  485  static arc_state_t      *arc_l2c_only;
 430  486  
 431  487  /*
 432  488   * There are several ARC variables that are critical to export as kstats --
↓ open down ↓ 187 lines elided ↑ open up ↑
 620  676  uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 621  677  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 622  678  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 623  679  boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 624  680  boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 625  681  boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 626  682  
 627  683  /*
 628  684   * L2ARC Internals
 629  685   */
 630      -typedef struct l2arc_dev {
 631      -        vdev_t                  *l2ad_vdev;     /* vdev */
 632      -        spa_t                   *l2ad_spa;      /* spa */
 633      -        uint64_t                l2ad_hand;      /* next write location */
 634      -        uint64_t                l2ad_start;     /* first addr on device */
 635      -        uint64_t                l2ad_end;       /* last addr on device */
 636      -        uint64_t                l2ad_evict;     /* last addr eviction reached */
 637      -        boolean_t               l2ad_first;     /* first sweep through */
 638      -        boolean_t               l2ad_writing;   /* currently writing */
 639      -        list_t                  *l2ad_buflist;  /* buffer list */
 640      -        list_node_t             l2ad_node;      /* device list node */
 641      -} l2arc_dev_t;
 642      -
      686 +typedef struct l2arc_dev l2arc_dev_t;
 643  687  static list_t L2ARC_dev_list;                   /* device list */
 644  688  static list_t *l2arc_dev_list;                  /* device list pointer */
 645  689  static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 646  690  static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 647  691  static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 648  692  static list_t L2ARC_free_on_write;              /* free after write buf list */
 649  693  static list_t *l2arc_free_on_write;             /* free after write list ptr */
 650  694  static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 651  695  static uint64_t l2arc_ndev;                     /* number of devices */
 652  696  
↓ open down ↓ 2 lines elided ↑ open up ↑
 655  699          spa_t                   *l2rcb_spa;             /* spa */
 656  700          blkptr_t                l2rcb_bp;               /* original blkptr */
 657  701          zbookmark_t             l2rcb_zb;               /* original bookmark */
 658  702          int                     l2rcb_flags;            /* original flags */
 659  703          enum zio_compress       l2rcb_compress;         /* applied compress */
 660  704  } l2arc_read_callback_t;
 661  705  
 662  706  typedef struct l2arc_write_callback {
 663  707          l2arc_dev_t     *l2wcb_dev;             /* device info */
 664  708          arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
      709 +        uint8_t         *l2wcb_pbuf;            /* pbuf sent in this write */
      710 +        uint32_t        l2wcb_pbuf_size;        /* size of committed pbuf */
      711 +        uint8_t         *l2wcb_ub_buf;          /* uberblock in this write */
 665  712  } l2arc_write_callback_t;
 666  713  
 667  714  struct l2arc_buf_hdr {
 668  715          /* protected by arc_buf_hdr  mutex */
 669  716          l2arc_dev_t             *b_dev;         /* L2ARC device */
 670  717          uint64_t                b_daddr;        /* disk address, offset byte */
 671  718          /* compression applied to buffer data */
 672  719          enum zio_compress       b_compress;
 673  720          /* real alloc'd buffer size depending on b_compress applied */
 674  721          int                     b_asize;
↓ open down ↓ 7 lines elided ↑ open up ↑
 682  729          size_t          l2df_size;
 683  730          void            (*l2df_func)(void *, size_t);
 684  731          list_node_t     l2df_list_node;
 685  732  } l2arc_data_free_t;
 686  733  
 687  734  static kmutex_t l2arc_feed_thr_lock;
 688  735  static kcondvar_t l2arc_feed_thr_cv;
 689  736  static uint8_t l2arc_thread_exit;
 690  737  
 691  738  static void l2arc_read_done(zio_t *zio);
 692      -static void l2arc_hdr_stat_add(void);
      739 +static void l2arc_hdr_stat_add(boolean_t from_arc);
 693  740  static void l2arc_hdr_stat_remove(void);
 694  741  
 695  742  static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 696  743  static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 697  744      enum zio_compress c);
 698  745  static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 699  746  
      747 +typedef enum {
      748 +        L2UBLK_BIG_ENDIAN = (1 << 0),   /* little endian assumed otherwise */
      749 +        L2UBLK_EVICT_FIRST = (1 << 1)   /* mirror of l2ad_first in l2dev */
      750 +} l2uberblock_flags_t;
      751 +
      752 +typedef struct l2uberblock {
      753 +        uint32_t                ub_magic;
      754 +        uint8_t                 ub_version;
      755 +        l2uberblock_flags_t     ub_flags;
      756 +
      757 +        uint64_t                ub_spa_guid;
      758 +        uint64_t                ub_birth;
      759 +        uint64_t                ub_evict_tail;  /* current evict pointer */
      760 +        uint64_t                ub_alloc_space; /* vdev space alloc status */
      761 +        uint64_t                ub_pbuf_daddr;  /* address of newest pbuf */
      762 +        uint32_t                ub_pbuf_asize;  /* size of newest pbuf */
      763 +        zio_cksum_t             ub_pbuf_cksum;  /* fletcher4 of newest pbuf */
      764 +
      765 +        zio_cksum_t             ub_cksum;       /* cksum of uberblock */
      766 +} l2uberblock_t;
      767 +
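
The trailing ub_cksum self-checksums the uberblock, which is how a torn uberblock write is detected on import. A hedged sketch of that check follows (l2arc_uberblock_cksum_ok is a hypothetical helper; fletcher_4_native is the existing routine from zfs_fletcher.h, and the sizes follow the 4k on-disk layout described further below):

	/*
	 * Illustrative only: the last 32 bytes of the 4k uberblock hold a
	 * fletcher4 checksum of everything that precedes them, so a partial
	 * (torn) uberblock write fails this comparison and is dropped.
	 */
	static boolean_t
	l2arc_uberblock_cksum_ok(const uint8_t *buf)
	{
		zio_cksum_t computed, stored;
		size_t body = L2UBERBLOCK_SIZE - sizeof (zio_cksum_t);

		fletcher_4_native(buf, body, &computed);
		bcopy(buf + body, &stored, sizeof (zio_cksum_t));
		return (ZIO_CHECKSUM_EQUAL(computed, stored));
	}
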
      768 +typedef enum {
      769 +        L2PBUF_BIG_ENDIAN = (1 << 0),   /* little endian assumed otherwise */
      770 +        L2PBUF_COMPRESSED = (1 << 1)    /* pbuf data items are compressed */
      771 +} l2pbuf_flags_t;
      772 +
      773 +typedef struct l2pbuf {
      774 +        uint32_t                pb_magic;
      775 +        unsigned int            pb_version;
      776 +        l2pbuf_flags_t          pb_flags;
      777 +
      778 +        uint64_t                pb_prev_daddr;  /* address of previous pbuf */
      779 +        uint32_t                pb_prev_asize;  /* size of previous pbuf */
      780 +        zio_cksum_t             pb_prev_cksum;  /* fletcher4 of prev. pbuf */
      781 +
      782 +        /*
      783 +         * This is a set of item lists that are contained in this pbuf. Each
      784 +         * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
      785 +         * This serves as a soft timeout feature - once the limit of the
      786 +         * number of item lists that a pbuf can hold is reached, the pbuf is
      787 +         * flushed to stable storage, regardless of its total size.
      788 +         */
      789 +        list_t                  *pb_buflists_list;
      790 +
      791 +        /*
      792 +         * Number of compressed bytes referenced by items in this pbuf and
      793 +         * the number of lists present.
      794 +         * This is not actually written to storage, it is only used by
      795 +         * internal algorithms which check for when a pbuf reaches a
      796 +         * certain size limit, after which it is flushed in a write.
      797 +         */
      798 +        uint64_t                pb_payload_asz;
      799 +        /* Same thing for number of buflists */
      800 +        int                     pb_nbuflists;
      801 +
      802 +        /*
      803 +         * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
      804 +         * This is then used by l2arc_pbuf_restore to update used space
      805 +         * on the L2ARC vdev.
      806 +         */
      807 +        size_t                  pb_asize;
      808 +} l2pbuf_t;
      809 +
      810 +typedef struct l2pbuf_buf l2pbuf_buf_t;
      811 +typedef struct l2pbuf_buflist {
      812 +        uint32_t                l2pbl_nbufs;
      813 +        l2pbuf_buf_t            *l2pbl_bufs;
      814 +        list_node_t             l2pbl_node;
      815 +} l2pbuf_buflist_t;
      816 +
      817 +struct l2pbuf_buf {
      818 +        dva_t                   b_dva;          /* dva of buffer */
      819 +        uint64_t                b_birth;        /* birth txg of buffer */
      820 +        uint64_t                b_cksum0;
      821 +        zio_cksum_t             b_freeze_cksum;
      822 +        uint32_t                b_size;         /* uncompressed buf size */
      823 +        uint64_t                b_l2daddr;      /* buf location on l2dev */
      824 +        uint32_t                b_l2asize;      /* actual buf data size */
      825 +        enum zio_compress       b_l2compress;   /* compression applied */
      826 +        uint16_t                b_contents_type;
      827 +        uint32_t                b_flags;
      828 +};
      829 +
      830 +struct l2arc_dev {
      831 +        vdev_t                  *l2ad_vdev;     /* vdev */
      832 +        spa_t                   *l2ad_spa;      /* spa */
      833 +        uint64_t                l2ad_hand;      /* next write location */
      834 +        uint64_t                l2ad_start;     /* first addr on device */
      835 +        uint64_t                l2ad_end;       /* last addr on device */
      836 +        uint64_t                l2ad_evict;     /* last addr eviction reached */
      837 +        boolean_t               l2ad_first;     /* first sweep through */
      838 +        boolean_t               l2ad_writing;   /* currently writing */
      839 +        list_t                  *l2ad_buflist;  /* buffer list */
      840 +        list_node_t             l2ad_node;      /* device list node */
      841 +        l2pbuf_t                l2ad_pbuf;      /* currently open pbuf */
      842 +        uint64_t                l2ad_pbuf_daddr;        /* prev pbuf daddr */
      843 +        uint64_t                l2ad_pbuf_asize;        /* prev pbuf asize */
      844 +        zio_cksum_t             l2ad_pbuf_cksum;        /* prev pbuf cksum */
      845 +        /* uberblock birth counter - incremented for each committed uberblk */
      846 +        uint64_t                l2ad_uberblock_birth;
      847 +        /* flag indicating whether a rebuild is currently going on */
      848 +        boolean_t               l2ad_rebuilding;
      849 +};
      850 +
      851 +/* Stores information about an L2ARC prefetch zio */
      852 +typedef struct l2arc_prefetch_info {
      853 +        uint8_t                 *pi_buf;        /* where the zio writes to */
      854 +        uint64_t                pi_buflen;      /* length of `buf' */
      855 +        zio_t                   *pi_hdr_io;     /* see l2arc_pbuf_read below */
      856 +} l2arc_prefetch_info_t;
      857 +
       858 +/* one 4k l2uberblock reserved at the start of each L2ARC device */
      859 +#define L2UBERBLOCK_SIZE        4096
      860 +#define L2UBERBLOCK_MAGIC       0x12bab10c
      861 +#define L2UBERBLOCK_MAX_VERSION 1       /* our maximum uberblock version */
      862 +#define L2PBUF_MAGIC            0xdb0faba6
      863 +#define L2PBUF_MAX_VERSION      1       /* our maximum pbuf version */
      864 +#define L2PBUF_BUF_SIZE         88      /* size of one pbuf buf entry */
      865 +#define L2PBUF_HDR_SIZE         56      /* pbuf header excluding any payload */
      866 +#define L2PBUF_ENCODED_SIZE(_pb) \
      867 +        (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
      868 +/*
      869 + * Allocation limit for the payload of a pbuf. This also fundamentally
      870 + * limits the number of bufs we can reference in a pbuf.
      871 + */
      872 +#define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
      873 +#define L2PBUF_MAX_BUFS         (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
      874 +#define L2PBUF_COMPRESS_MINSZ   8192    /* minimum size to compress a pbuf */
       875 +#define L2PBUF_MAXSZ            (100 * 1024 * 1024)     /* maximum pbuf size */
      876 +#define L2PBUF_MAX_BUFLISTS     128     /* max number of buflists per pbuf */
      877 +#define L2ARC_REBUILD_TIMEOUT   60      /* a rebuild may take at most 60s */
      878 +#define L2PBUF_IS_FULL(_pb) \
      879 +        ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
      880 +        (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
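
A quick sanity check on these constants (illustrative userland C, not part of the patch): an 88-byte entry under the 24 MB payload cap yields L2PBUF_MAX_BUFS = 25165824 / 88 = 285975, so a single pbuf can reference roughly 286 thousand L2ARC buffers.

	#include <assert.h>

	/* Recompute the limits defined above, for illustration. */
	#define	PBUF_BUF_SIZE		88
	#define	PBUF_MAX_PAYLOAD	(24 * 1024 * 1024)

	int
	main(void)
	{
		/* 25165824 / 88 == 285975 (integer division, remainder 24) */
		assert(PBUF_MAX_PAYLOAD / PBUF_BUF_SIZE == 285975);
		return (0);
	}
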
      881 +/*
      882 + * These are the flags we allow to persist in L2ARC pbufs. The other flags
      883 + * of an ARC buffer pertain to the buffer's runtime behavior.
      884 + */
      885 +#define L2ARC_PERSIST_FLAGS \
      886 +        (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
      887 +
      888 +/*
      889 + * Used during L2ARC rebuild after each read operation to check whether we
      890 + * haven't exceeded the rebuild timeout value.
      891 + */
      892 +#define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
      893 +        do { \
      894 +                if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
      895 +                        __VA_ARGS__; \
      896 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
      897 +                        cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
      898 +                            "dropping remaining L2ARC metadata."); \
      899 +                        return; \
      900 +                } \
      901 +                _NOTE(NOTREACHED) \
      902 +                _NOTE(CONSTCOND) \
      903 +        } while (0)
      904 +
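
A minimal usage sketch of this macro (hypothetical caller, for illustration; the real call sites are in l2arc_rebuild below). The deadline is precomputed in lbolt ticks and the variadic arguments carry any cleanup to run before the early return:

	/* Hypothetical sketch of a rebuild step guarded by the timeout check. */
	static void
	l2arc_rebuild_step_sketch(zio_t *this_io)
	{
		int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;

		/* ... issue and wait for a pbuf read into `this_io' ... */

		/*
		 * If the deadline has passed, abort the in-flight prefetch
		 * (passed as the varargs), bump the timeout kstat and return.
		 */
		L2ARC_CHK_REBUILD_TIMEOUT(deadline,
		    l2arc_pbuf_prefetch_abort(this_io));
	}
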
      905 +/*
      906 + * Performance tuning of L2ARC persistency:
      907 + *
      908 + * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
      909 + *              compressing it.
      910 + * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
      911 + *              referenced from a pbuf. Once a pbuf reaches this size, it is
      912 + *              committed to stable storage. Ideally, there should be approx.
      913 + *              l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
      914 + * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
      915 + *              be buffered in a pbuf before it is committed to L2ARC. This
      916 + *              puts a soft temporal upper bound on pbuf commit intervals.
      917 + * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
      918 + *              pool import or when adding one manually later) will attempt
      919 + *              to rebuild L2ARC buffer contents. In special circumstances,
      920 + *              the administrator may want to set this to B_FALSE, if they
      921 + *              are having trouble importing a pool or attaching an L2ARC
      922 + *              device (e.g. the L2ARC device is slow to read in stored pbuf
      923 + *              metadata, or the metadata has become somehow
      924 + *              fragmented/unusable).
      925 + * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
      926 + *              avoid a slow L2ARC device from preventing pool import. If we
      927 + *              are not done rebuilding an L2ARC device by this time, we
      928 + *              stop the rebuild and return immediately.
      929 + */
      930 +uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
      931 +uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
      932 +uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
      933 +boolean_t l2arc_rebuild_enabled = B_TRUE;
      934 +uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
      935 +
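
These are ordinary global variables in the zfs module, so they can presumably be adjusted through the usual illumos tunable mechanisms (an assumption about standard practice, not something this change adds), e.g. in /etc/system:

	* Hypothetical /etc/system entries: disable L2ARC rebuild on next boot
	set zfs:l2arc_rebuild_enabled = 0
	* cap a rebuild at 30 seconds instead of the default 60
	set zfs:l2arc_rebuild_timeout = 30
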
      936 +static void l2arc_rebuild_start(l2arc_dev_t *dev);
      937 +static void l2arc_rebuild(l2arc_dev_t *dev);
      938 +static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
      939 +static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
      940 +    uint64_t guid);
      941 +
      942 +static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
      943 +static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
      944 +    zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
      945 +static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
      946 +    uint32_t asize);
      947 +static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
      948 +static void l2arc_pbuf_prefetch_abort(zio_t *zio);
      949 +
      950 +static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
      951 +static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
      952 +static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
      953 +    uint64_t guid);
      954 +static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
      955 +    l2arc_write_callback_t *cb);
      956 +
      957 +static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
      958 +static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
      959 +    l2pbuf_t *pbuf);
      960 +static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
      961 +    uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
      962 +static void l2arc_pbuf_init(l2pbuf_t *pb);
      963 +static void l2arc_pbuf_destroy(l2pbuf_t *pb);
      964 +static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
      965 +    l2arc_write_callback_t *cb);
      966 +static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
      967 +static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
      968 +    const arc_buf_hdr_t *ab, int index);
      969 +static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
      970 +
 700  971  static uint64_t
 701  972  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 702  973  {
 703  974          uint8_t *vdva = (uint8_t *)dva;
 704  975          uint64_t crc = -1ULL;
 705  976          int i;
 706  977  
 707  978          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 708  979  
 709  980          for (i = 0; i < sizeof (dva_t); i++)
↓ open down ↓ 520 lines elided ↑ open up ↑
1230 1501          if (to_delta)
1231 1502                  atomic_add_64(&new_state->arcs_size, to_delta);
1232 1503          if (from_delta) {
1233 1504                  ASSERT3U(old_state->arcs_size, >=, from_delta);
1234 1505                  atomic_add_64(&old_state->arcs_size, -from_delta);
1235 1506          }
1236 1507          ab->b_state = new_state;
1237 1508  
1238 1509          /* adjust l2arc hdr stats */
1239 1510          if (new_state == arc_l2c_only)
1240      -                l2arc_hdr_stat_add();
     1511 +                l2arc_hdr_stat_add(old_state != arc_anon);
1241 1512          else if (old_state == arc_l2c_only)
1242 1513                  l2arc_hdr_stat_remove();
1243 1514  }
1244 1515  
1245 1516  void
1246 1517  arc_space_consume(uint64_t space, arc_space_type_t type)
1247 1518  {
1248 1519          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1249 1520  
1250 1521          switch (type) {
↓ open down ↓ 83 lines elided ↑ open up ↑
1334 1605          hdr->b_buf = buf;
1335 1606          arc_get_data_buf(buf);
1336 1607          hdr->b_datacnt = 1;
1337 1608          hdr->b_flags = 0;
1338 1609          ASSERT(refcount_is_zero(&hdr->b_refcnt));
1339 1610          (void) refcount_add(&hdr->b_refcnt, tag);
1340 1611  
1341 1612          return (buf);
1342 1613  }
1343 1614  
     1615 +/*
     1616 + * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
     1617 + * This is used during l2arc reconstruction to make empty ARC buffers
     1618 + * which circumvent the regular disk->arc->l2arc path and instead come
     1619 + * into being in the reverse order, i.e. l2arc->arc->(disk).
     1620 + */
     1621 +arc_buf_hdr_t *
     1622 +arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
     1623 +{
     1624 +        arc_buf_hdr_t *hdr;
     1625 +
     1626 +        ASSERT3U(size, >, 0);
     1627 +        hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
     1628 +        ASSERT(BUF_EMPTY(hdr));
     1629 +        hdr->b_size = size;
     1630 +        hdr->b_type = type;
     1631 +        hdr->b_spa = guid;
     1632 +        hdr->b_state = arc_anon;
     1633 +        hdr->b_arc_access = 0;
     1634 +        hdr->b_buf = NULL;
     1635 +        hdr->b_datacnt = 0;
     1636 +        hdr->b_flags = 0;
     1637 +        ASSERT(refcount_is_zero(&hdr->b_refcnt));
     1638 +
     1639 +        return (hdr);
     1640 +}
     1641 +
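
As a hedged sketch of how the rebuild path might use this allocator (l2arc_hdr_restore_sketch is hypothetical; the real logic is in l2arc_hdr_restore, declared earlier), resurrecting a header from a decoded on-disk l2pbuf_buf_t entry:

	/*
	 * Illustrative only: rebuild an ARC header from a pbuf entry and
	 * attach its L2ARC location, without any associated data buffer.
	 */
	static void
	l2arc_hdr_restore_sketch(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
	    uint64_t guid)
	{
		arc_buf_hdr_t *hdr;
		l2arc_buf_hdr_t *l2hdr;

		hdr = arc_buf_hdr_alloc(guid, buf->b_size, buf->b_contents_type);
		hdr->b_dva = buf->b_dva;
		hdr->b_birth = buf->b_birth;
		hdr->b_cksum0 = buf->b_cksum0;
		/* only persistent flags survive the round trip */
		hdr->b_flags = buf->b_flags & L2ARC_PERSIST_FLAGS;

		l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
		l2hdr->b_dev = dev;
		l2hdr->b_daddr = buf->b_l2daddr;
		l2hdr->b_asize = buf->b_l2asize;
		l2hdr->b_compress = buf->b_l2compress;
		hdr->b_l2hdr = l2hdr;
		/* ... then hash the header in and move it to arc_l2c_only ... */
	}
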
1344 1642  static char *arc_onloan_tag = "onloan";
1345 1643  
1346 1644  /*
1347 1645   * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1348 1646   * flight data by arc_tempreserve_space() until they are "returned". Loaned
1349 1647   * buffers must be returned to the arc before they can be used by the DMU or
1350 1648   * freed.
1351 1649   */
1352 1650  arc_buf_t *
1353 1651  arc_loan_buf(spa_t *spa, int size)
↓ open down ↓ 2612 lines elided ↑ open up ↑
3966 4264   * integrated, and also may become zpool properties.
3967 4265   *
3968 4266   * There are three key functions that control how the L2ARC warms up:
3969 4267   *
3970 4268   *      l2arc_write_eligible()  check if a buffer is eligible to cache
3971 4269   *      l2arc_write_size()      calculate how much to write
3972 4270   *      l2arc_write_interval()  calculate sleep delay between writes
3973 4271   *
3974 4272   * These three functions determine what to write, how much, and how quickly
3975 4273   * to send writes.
     4274 + *
     4275 + * L2ARC persistency:
     4276 + *
     4277 + * When writing buffers to L2ARC, we periodically add some metadata to
     4278 + * make sure we can pick them up after reboot, thus dramatically reducing
     4279 + * the impact that any downtime has on the performance of storage systems
     4280 + * with large caches.
     4281 + *
     4282 + * The implementation works fairly simply by integrating the following two
     4283 + * modifications:
     4284 + *
     4285 + * *) Every now and then, at end of an L2ARC feed cycle, we append a piece
     4286 + *    of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
      4287 + *    write. This allows us to understand what's been written, so that
     4288 + *    we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
     4289 + *    The pbuf also includes a "back-reference" pointer to the previous
     4290 + *    pbuf, forming a linked list of pbufs on the L2ARC device.
     4291 + *
     4292 + * *) We reserve 4k of space at the start of each L2ARC device for our
     4293 + *    header bookkeeping purposes. This contains a single 4k uberblock, which
     4294 + *    contains our top-level reference structures. We update it on each pbuf
     4295 + *    write. If this write results in an inconsistent uberblock (e.g. due to
     4296 + *    power failure), we detect this by verifying the uberblock's checksum
     4297 + *    and simply drop the entries from L2ARC. Once an L2ARC pbuf update
     4298 + *    completes, we update the uberblock to point to it.
     4299 + *
     4300 + * Implementation diagram:
     4301 + *
     4302 + * +=== L2ARC device (not to scale) ======================================+
     4303 + * |       ____________newest pbuf pointer_____________                   |
     4304 + * |      /                                            \                  |
     4305 + * |     /                                              V                 |
     4306 + * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
     4307 + * |                       ^       / ^       / ^       /                  |
     4308 + * |                       `-prev-'  `-prev-'  `-prev-'                   |
     4309 + * |                         pbuf      pbuf      pbuf                     |
     4310 + * +======================================================================+
     4311 + *
     4312 + * On-device data structures:
     4313 + *
     4314 + * (L2ARC persistent uberblock)
     4315 + * struct l2uberblock {
     4316 + *      (these fields are in network byte order)
     4317 + *      uint32_t magic = 0x12bab10c;    l2-ber-block
     4318 + *      uint8_t  version = 0x1;
     4319 + *      uint8_t  reserved = 0x0;
     4320 + *      uint16_t ublk_flags;            see l2uberblock_flags_t
     4321 + *
     4322 + *      (byte order of fields below determined by `ublk_flags')
     4323 + *      uint64_t spa_guid;              what pool this l2arc dev belongs to
     4324 + *      uint64_t birth_txg;             ublk with highest birth_txg is newest
     4325 + *      uint64_t evict_tail;            current evict pointer on l2arc dev
     4326 + *      uint64_t alloc_space;           how much space is alloc'd on the dev
     4327 + *      uint64_t pbuf_daddr;            dev addr of the newest l2pbuf_t
     4328 + *      uint32_t pbuf_asize;            size of newest pbuf
     4329 + *      uint64_t pbuf_cksum[4];         fletcher4 of newest pbuf
     4330 + *
      4331 + *      uint8_t  reserved[3980] = {0x0, 0x0, ... 0x0};
     4332 + *
     4333 + *      uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
     4334 + * } l2dev_uberblock;
     4335 + *
     4336 + * (L2ARC persistent buffer list)
     4337 + * typedef struct l2pbuf_t {
     4338 + *      (these fields are in network byte order)
     4339 + *      uint32_t magic = 0xdb0faba6;    the-buffer-bag
     4340 + *      uint8_t  version = 0x1;
     4341 + *      uint8_t  reserved = 0x0;
     4342 + *      uint16_t pbuf_flags;            see l2pbuf_flags_t
     4343 + *
     4344 + *      (byte order of fields below determined by `pbuf_flags')
     4345 + *      uint64_t prev_pbuf_daddr;       previous pbuf dev addr
     4346 + *      uint32_t prev_pbuf_asize;       previous pbuf size
     4347 + *      uint64_t prev_pbuf_cksum[4];    fletcher4(of previous pbuf)
     4348 + *
     4349 + *      uint32_t items_size;            uncompressed size of `items' below
     4350 + *      (if (pbuf_flags & compress) decompress `items' prior to decoding)
     4351 + *      struct l2pbuf_buf_item {
     4352 + *              (these fields mirror [l2]arc_buf_hdr fields)
     4353 + *              uint64_t dva[2];                buffer's DVA
     4354 + *              uint64_t birth;                 buffer's birth TXG in ARC
     4355 + *              uint64_t cksum0;                lower 64-bits of buffer's cksum
     4356 + *              uint64_t freeze_cksum[4];       buffer's freeze cksum
     4357 + *              uint32_t size;                  uncompressed buffer data size
     4358 + *              uint64_t l2daddr;               device address (offset) of buf
     4359 + *              uint32_t l2asize;               actual space occupied by buf
     4360 + *              uint8_t  compress;              compress algo used on data
     4361 + *              uint8_t  contents_type;         buffer's contents type
     4362 + *              uint16_t reserved = 0x0;        for alignment and future use
     4363 + *              uint32_t flags;                 buffer's persistent flags
     4364 + *      } items[];                              continues for remainder of pbuf
     4365 + * } l2pbuf_t;
     4366 + *
     4367 + * L2ARC reconstruction:
     4368 + *
     4369 + * When writing data, we simply write in the standard rotary fashion,
     4370 + * evicting buffers as we go and simply writing new data over them (appending
     4371 + * an updated l2pbuf_t every now and then). This obviously means that once we
     4372 + * loop around the end of the device, we will start cutting into an already
     4373 + * committed l2pbuf (and its referenced data buffers), like so:
     4374 + *
     4375 + *    current write head__       __old tail
     4376 + *                        \     /
     4377 + *                        V    V
     4378 + * <--|bufs|pbuf|bufs|pbuf|    |bufs|pbuf|bufs|pbuf|-->
     4379 + *                         ^    ^^^^^^^^^_____________________________
     4380 + *                         |                                          \
     4381 + *                         <<nextwrite>> - will overwrite this pbuf --/
     4382 + *
     4383 + * When importing the pool, we detect this situation and use it to stop
     4384 + * our scanning process:
     4385 + * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
     4386 + *      previous one.
     4387 + * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
     4388 + *      then the pbuf is invalid and stop scanning (goto step 3 below).
     4389 + * 3) if (this is the last valid pbuf)
     4390 + *      discard this pbuf as well (its ARC bufs may have been damaged by a
     4391 + *      partial overwrite).
     4392 + * (We could potentially salvage the remaining good arc bufs above in step 3,
      4393 + * but the cost of doing so probably outweighs the value of the entire pbuf).
     4394 + *
     4395 + * There is one significant caveat to consider when rebuilding ARC contents
     4396 + * from an L2ARC device: what about invalidated buffers? Given the above
     4397 + * construction, we cannot update pbufs which we've already written to amend
     4398 + * them to remove buffers which were invalidated. Thus, during reconstruction,
     4399 + * we might be populating the cache with buffers for data that's not on the
     4400 + * main pool anymore, or may have been overwritten!
     4401 + *
     4402 + * As it turns out, this isn't a problem. Every arc_read request includes
     4403 + * both the DVA and, crucially, the birth TXG of the BP the caller is
     4404 + * looking for. So even if the cache were populated by completely rotten
     4405 + * blocks for data that had been long deleted and/or overwritten, we'll
     4406 + * never actually return bad data from the cache, since the DVA with the
     4407 + * birth TXG uniquely identify a block in space and time - once created,
     4408 + * a block is immutable on disk. The worst thing we have done is wasted
     4409 + * some time and memory at l2arc rebuild to reconstruct outdated ARC
     4410 + * entries that will get dropped from the l2arc as it is being updated
     4411 + * with new blocks.
3976 4412   */
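
To make the scan-termination rule above concrete, here is a hedged sketch of one backwards step (hypothetical helper; the real walk lives in l2arc_rebuild below, and l2arc_pbuf_decode_prev_ptr is the decoder declared earlier):

	/*
	 * Illustrative only: verify the pbuf we just read against the
	 * back-reference checksum stored in its successor (step 2 above),
	 * then extract the pointer for the next step backwards.
	 */
	static boolean_t
	l2arc_scan_step_sketch(const uint8_t *pbuf_buf, uint32_t asize,
	    zio_cksum_t expected_cksum, uint64_t *prev_daddr,
	    uint32_t *prev_asize, zio_cksum_t *prev_cksum)
	{
		zio_cksum_t actual_cksum;

		fletcher_4_native(pbuf_buf, asize, &actual_cksum);
		if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
			/* we have crossed into overwritten space; stop scanning */
			return (B_FALSE);
		}
		/* pull out prev_pbuf_daddr/asize/cksum and keep walking */
		return (l2arc_pbuf_decode_prev_ptr(pbuf_buf, asize, prev_daddr,
		    prev_asize, prev_cksum) == 0);
	}
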
3977 4413  
3978 4414  static boolean_t
3979 4415  l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3980 4416  {
3981 4417          /*
3982 4418           * A buffer is *not* eligible for the L2ARC if it:
3983 4419           * 1. belongs to a different spa.
3984 4420           * 2. is already cached on the L2ARC.
3985 4421           * 3. has an I/O in progress (it may be an incomplete read).
↓ open down ↓ 46 lines elided ↑ open up ↑
4032 4468          else
4033 4469                  interval = hz * l2arc_feed_secs;
4034 4470  
4035 4471          now = ddi_get_lbolt();
4036 4472          next = MAX(now, MIN(now + interval, began + interval));
4037 4473  
4038 4474          return (next);
4039 4475  }
4040 4476  
4041 4477  static void
4042      -l2arc_hdr_stat_add(void)
     4478 +l2arc_hdr_stat_add(boolean_t from_arc)
4043 4479  {
4044 4480          ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4045      -        ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
     4481 +        if (from_arc)
     4482 +                ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4046 4483  }
4047 4484  
4048 4485  static void
4049 4486  l2arc_hdr_stat_remove(void)
4050 4487  {
4051 4488          ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4052 4489          ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4053 4490  }
4054 4491  
4055 4492  /*
↓ open down ↓ 13 lines elided ↑ open up ↑
4069 4506          mutex_enter(&spa_namespace_lock);
4070 4507          mutex_enter(&l2arc_dev_mtx);
4071 4508  
4072 4509          /* if there are no vdevs, there is nothing to do */
4073 4510          if (l2arc_ndev == 0)
4074 4511                  goto out;
4075 4512  
4076 4513          first = NULL;
4077 4514          next = l2arc_dev_last;
4078 4515          do {
4079      -                /* loop around the list looking for a non-faulted vdev */
     4516 +                /*
     4517 +                 * Loop around the list looking for a non-faulted vdev
     4518 +                 * and one that isn't currently doing an L2ARC rebuild.
     4519 +                 */
4080 4520                  if (next == NULL) {
4081 4521                          next = list_head(l2arc_dev_list);
4082 4522                  } else {
4083 4523                          next = list_next(l2arc_dev_list, next);
4084 4524                          if (next == NULL)
4085 4525                                  next = list_head(l2arc_dev_list);
4086 4526                  }
4087 4527  
4088 4528                  /* if we have come back to the start, bail out */
4089 4529                  if (first == NULL)
4090 4530                          first = next;
4091 4531                  else if (next == first)
4092 4532                          break;
4093 4533  
4094      -        } while (vdev_is_dead(next->l2ad_vdev));
     4534 +        } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4095 4535  
4096 4536          /* if we were unable to find any usable vdevs, return NULL */
4097      -        if (vdev_is_dead(next->l2ad_vdev))
     4537 +        if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4098 4538                  next = NULL;
4099 4539  
4100 4540          l2arc_dev_last = next;
4101 4541  
4102 4542  out:
4103 4543          mutex_exit(&l2arc_dev_mtx);
4104 4544  
4105 4545          /*
4106 4546           * Grab the config lock to prevent the 'next' device from being
4107 4547           * removed while we are writing to it.
↓ open down ↓ 57 lines elided ↑ open up ↑
4165 4605          if (zio->io_error != 0)
4166 4606                  ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 4607  
4168 4608          mutex_enter(&l2arc_buflist_mtx);
4169 4609  
4170 4610          /*
4171 4611           * All writes completed, or an error was hit.
4172 4612           */
4173 4613          for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174 4614                  ab_prev = list_prev(buflist, ab);
     4615 +                abl2 = ab->b_l2hdr;
4175 4616  
     4617 +                /*
     4618 +                 * Release the temporary compressed buffer as soon as possible.
     4619 +                 */
     4620 +                if (abl2->b_compress != ZIO_COMPRESS_OFF)
     4621 +                        l2arc_release_cdata_buf(ab);
     4622 +
4176 4623                  hash_lock = HDR_LOCK(ab);
4177 4624                  if (!mutex_tryenter(hash_lock)) {
4178 4625                          /*
4179 4626                           * This buffer misses out.  It may be in a stage
4180 4627                           * of eviction.  Its ARC_L2_WRITING flag will be
4181 4628                           * left set, denying reads to this buffer.
4182 4629                           */
4183 4630                          ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184 4631                          continue;
4185 4632                  }
4186 4633  
4187      -                abl2 = ab->b_l2hdr;
4188      -
4189      -                /*
4190      -                 * Release the temporary compressed buffer as soon as possible.
4191      -                 */
4192      -                if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193      -                        l2arc_release_cdata_buf(ab);
4194      -
4195 4634                  if (zio->io_error != 0) {
4196 4635                          /*
4197 4636                           * Error - drop L2ARC entry.
4198 4637                           */
4199 4638                          list_remove(buflist, ab);
4200 4639                          ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201 4640                          ab->b_l2hdr = NULL;
4202 4641                          kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203 4642                          ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204 4643                  }
↓ open down ↓ 6 lines elided ↑ open up ↑
4211 4650                  mutex_exit(hash_lock);
4212 4651          }
4213 4652  
4214 4653          atomic_inc_64(&l2arc_writes_done);
4215 4654          list_remove(buflist, head);
4216 4655          kmem_cache_free(hdr_cache, head);
4217 4656          mutex_exit(&l2arc_buflist_mtx);
4218 4657  
4219 4658          l2arc_do_free_on_write();
4220 4659  
     4660 +        if (cb->l2wcb_pbuf)
     4661 +                kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
     4662 +        if (cb->l2wcb_ub_buf)
     4663 +                kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4221 4664          kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 4665  }
4223 4666  
4224 4667  /*
4225 4668   * A read to a cache device completed.  Validate buffer contents before
4226 4669   * handing over to the regular ARC routines.
4227 4670   */
4228 4671  static void
4229 4672  l2arc_read_done(zio_t *zio)
4230 4673  {
↓ open down ↓ 261 lines elided ↑ open up ↑
4492 4935          uint64_t write_asize, write_psize, write_sz, headroom,
4493 4936              buf_compress_minsz;
4494 4937          void *buf_data;
4495 4938          kmutex_t *list_lock;
4496 4939          boolean_t full;
4497 4940          l2arc_write_callback_t *cb;
4498 4941          zio_t *pio, *wzio;
4499 4942          uint64_t guid = spa_load_guid(spa);
4500 4943          const boolean_t do_headroom_boost = *headroom_boost;
4501 4944  
     4945 +        /* persistency-related */
     4946 +        l2pbuf_t *pb;
     4947 +        l2pbuf_buflist_t *pb_buflist;
     4948 +        int num_bufs, buf_index;
     4949 +
4502 4950          ASSERT(dev->l2ad_vdev != NULL);
4503 4951  
4504 4952          /* Lower the flag now, we might want to raise it again later. */
4505 4953          *headroom_boost = B_FALSE;
4506 4954  
4507 4955          pio = NULL;
     4956 +        cb = NULL;
4508 4957          write_sz = write_asize = write_psize = 0;
4509 4958          full = B_FALSE;
4510 4959          head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511 4960          head->b_flags |= ARC_L2_WRITE_HEAD;
4512 4961  
4513 4962          /*
4514 4963           * We will want to try to compress buffers that are at least 2x the
4515 4964           * device sector size.
4516 4965           */
4517 4966          buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518 4967  
     4968 +        pb = &dev->l2ad_pbuf;
     4969 +        num_bufs = 0;
     4970 +
4519 4971          /*
4520 4978           * Copy buffers for L2ARC writing.
4521 4979           */
4522 4980          mutex_enter(&l2arc_buflist_mtx);
4523 4981          for (int try = 0; try <= 3; try++) {
4524 4982                  uint64_t passed_sz = 0;
4525 4983  
4526 4984                  list = l2arc_list_locked(try, &list_lock);
4527 4985  
4528 4986                  /*
4529 4987                   * L2ARC fast warmup.
↓ open down ↓ 49 lines elided ↑ open up ↑
4579 5037                          }
4580 5038  
4581 5039                          if (pio == NULL) {
4582 5040                                  /*
4583 5041                                   * Insert a dummy header on the buflist so
4584 5042                                   * l2arc_write_done() can find where the
4585 5043                                   * write buffers begin without searching.
4586 5044                                   */
4587 5045                                  list_insert_head(dev->l2ad_buflist, head);
4588 5046  
4589      -                                cb = kmem_alloc(
     5047 +                                cb = kmem_zalloc(
4590 5048                                      sizeof (l2arc_write_callback_t), KM_SLEEP);
4591 5049                                  cb->l2wcb_dev = dev;
4592 5050                                  cb->l2wcb_head = head;
4593 5051                                  pio = zio_root(spa, l2arc_write_done, cb,
4594 5052                                      ZIO_FLAG_CANFAIL);
4595 5053                          }
4596 5054  
4597 5055                          /*
4598 5056                           * Create and add a new L2ARC header.
4599 5057                           */
↓ open down ↓ 21 lines elided ↑ open up ↑
4621 5079                          /*
4622 5080                           * Compute and store the buffer cksum before
4623 5081                           * writing.  On debug the cksum is verified first.
4624 5082                           */
4625 5083                          arc_cksum_verify(ab->b_buf);
4626 5084                          arc_cksum_compute(ab->b_buf, B_TRUE);
4627 5085  
4628 5086                          mutex_exit(hash_lock);
4629 5087  
4630 5088                          write_sz += buf_sz;
     5089 +                        num_bufs++;
4631 5090                  }
4632 5091  
4633 5092                  mutex_exit(list_lock);
4634 5093  
4635 5094                  if (full == B_TRUE)
4636 5095                          break;
4637 5096          }
4638 5097  
4639 5098          /* No buffers selected for writing? */
4640 5099          if (pio == NULL) {
4641 5100                  ASSERT0(write_sz);
4642 5101                  mutex_exit(&l2arc_buflist_mtx);
4643 5102                  kmem_cache_free(hdr_cache, head);
4644 5103                  return (0);
4645 5104          }
4646 5105  
     5106 +        /* expand the pbuf to include a new list */
     5107 +        pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
     5108 +
4647 5109          /*
4648 5110           * Now start writing the buffers. We're starting at the write head
4649 5111           * and work backwards, retracing the course of the buffer selector
4650 5112           * loop above.
4651 5113           */
4652      -        for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653      -            ab = list_prev(dev->l2ad_buflist, ab)) {
     5114 +        for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
     5115 +            ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
4654 5116                  l2arc_buf_hdr_t *l2hdr;
4655 5117                  uint64_t buf_sz;
4656 5118  
4657 5119                  /*
4658 5120                   * We shouldn't need to lock the buffer here, since we flagged
4659 5121                   * it as ARC_L2_WRITING in the previous step, but we must take
4660 5122                   * care to only access its L2 cache parameters. In particular,
4661 5123                   * ab->b_buf may be invalid by now due to ARC eviction.
4662 5124                   */
4663 5125                  l2hdr = ab->b_l2hdr;
↓ open down ↓ 31 lines elided ↑ open up ↑
4695 5157                          (void) zio_nowait(wzio);
4696 5158  
4697 5159                          write_asize += buf_sz;
4698 5160                          /*
4699 5161                           * Keep the clock hand suitably device-aligned.
4700 5162                           */
4701 5163                          buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702 5164                          write_psize += buf_p_sz;
4703 5165                          dev->l2ad_hand += buf_p_sz;
4704 5166                  }
4705      -        }
4706 5167  
     5168 +                l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
     5169 +        }
     5170 +        ASSERT(buf_index == num_bufs);
4707 5171          mutex_exit(&l2arc_buflist_mtx);
4708 5172  
4709 5173          ASSERT3U(write_asize, <=, target_sz);
4710 5174          ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711 5175          ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712 5176          ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713 5177          ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714 5178          vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715 5179  
     5180 +        /* Is it time to commit this pbuf? */
     5181 +        if (L2PBUF_IS_FULL(pb) &&
     5182 +            dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
     5183 +                l2arc_pbuf_commit(dev, pio, cb);
     5184 +                l2arc_pbuf_destroy(pb);
     5185 +                l2arc_pbuf_init(pb);
     5186 +        }
     5187 +
4716 5188          /*
4717 5189           * Bump device hand to the device start if it is approaching the end.
4718 5190           * l2arc_evict() will already have evicted ahead for this case.
4719 5191           */
4720 5192          if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721 5193                  vdev_space_update(dev->l2ad_vdev,
4722 5194                      dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723 5195                  dev->l2ad_hand = dev->l2ad_start;
4724 5196                  dev->l2ad_evict = dev->l2ad_start;
4725 5197                  dev->l2ad_first = B_FALSE;
↓ open down ↓ 261 lines elided ↑ open up ↑
4987 5459                  if (dev->l2ad_vdev == vd)
4988 5460                          break;
4989 5461          }
4990 5462          mutex_exit(&l2arc_dev_mtx);
4991 5463  
4992 5464          return (dev != NULL);
4993 5465  }
4994 5466  
4995 5467  /*
4996 5468   * Add a vdev for use by the L2ARC.  By this point the spa has already
4997      - * validated the vdev and opened it.
     5469 + * validated the vdev and opened it. The `rebuild' flag indicates whether
     5470 + * we should attempt an L2ARC persistency rebuild.
4998 5471   */
4999 5472  void
5000      -l2arc_add_vdev(spa_t *spa, vdev_t *vd)
     5473 +l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5001 5474  {
5002 5475          l2arc_dev_t *adddev;
5003 5476  
5004 5477          ASSERT(!l2arc_vdev_present(vd));
5005 5478  
5006 5479          /*
5007 5480           * Create a new l2arc device entry.
5008 5481           */
5009 5482          adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5010 5483          adddev->l2ad_spa = spa;
5011 5484          adddev->l2ad_vdev = vd;
5012      -        adddev->l2ad_start = VDEV_LABEL_START_SIZE;
     5485 +        adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5013 5486          adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5014 5487          adddev->l2ad_hand = adddev->l2ad_start;
5015 5488          adddev->l2ad_evict = adddev->l2ad_start;
5016 5489          adddev->l2ad_first = B_TRUE;
5017 5490          adddev->l2ad_writing = B_FALSE;
     5491 +        l2arc_pbuf_init(&adddev->l2ad_pbuf);
5018 5492  
5019 5493          /*
5020 5494           * This is a list of all ARC buffers that are still valid on the
5021 5495           * device.
5022 5496           */
5023 5497          adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5024 5498          list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5025 5499              offsetof(arc_buf_hdr_t, b_l2node));
5026 5500  
5027 5501          vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5028 5502  
5029 5503          /*
5030 5504           * Add device to global list
5031 5505           */
5032 5506          mutex_enter(&l2arc_dev_mtx);
5033 5507          list_insert_head(l2arc_dev_list, adddev);
5034 5508          atomic_inc_64(&l2arc_ndev);
     5509 +        if (rebuild && l2arc_rebuild_enabled) {
     5510 +                adddev->l2ad_rebuilding = B_TRUE;
     5511 +                (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
     5512 +                    0, &p0, TS_RUN, minclsyspri);
     5513 +        }
5035 5514          mutex_exit(&l2arc_dev_mtx);
5036 5515  }
5037 5516  
5038 5517  /*
5039 5518   * Remove a vdev from the L2ARC.
5040 5519   */
5041 5520  void
5042 5521  l2arc_remove_vdev(vdev_t *vd)
5043 5522  {
5044 5523          l2arc_dev_t *dev, *nextdev, *remdev = NULL;
↓ open down ↓ 15 lines elided ↑ open up ↑
5060 5539           * Remove device from global list
5061 5540           */
5062 5541          list_remove(l2arc_dev_list, remdev);
5063 5542          l2arc_dev_last = NULL;          /* may have been invalidated */
5064 5543          atomic_dec_64(&l2arc_ndev);
5065 5544          mutex_exit(&l2arc_dev_mtx);
5066 5545  
5067 5546          /*
5068 5547           * Clear all buflists and ARC references.  L2ARC device flush.
5069 5548           */
     5549 +        l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5070 5550          l2arc_evict(remdev, 0, B_TRUE);
5071 5551          list_destroy(remdev->l2ad_buflist);
5072 5552          kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5073 5553          kmem_free(remdev, sizeof (l2arc_dev_t));
5074 5554  }
5075 5555  
5076 5556  void
5077 5557  l2arc_init(void)
5078 5558  {
5079 5559          l2arc_thread_exit = 0;
↓ open down ↓ 51 lines elided ↑ open up ↑
5131 5611  {
5132 5612          if (!(spa_mode_global & FWRITE))
5133 5613                  return;
5134 5614  
5135 5615          mutex_enter(&l2arc_feed_thr_lock);
5136 5616          cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
5137 5617          l2arc_thread_exit = 1;
5138 5618          while (l2arc_thread_exit != 0)
5139 5619                  cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5140 5620          mutex_exit(&l2arc_feed_thr_lock);
     5621 +}
     5622 +
     5623 +/*
     5624 + * Main entry point for L2ARC metadata rebuilding. This function must be
     5625 + * called via thread_create so that the L2ARC metadata rebuild doesn't block
     5626 + * pool import and may proceed in parallel on all available L2ARC devices.
     5627 + */
     5628 +static void
     5629 +l2arc_rebuild_start(l2arc_dev_t *dev)
     5630 +{
     5631 +        vdev_t *vd = dev->l2ad_vdev;
     5632 +        spa_t *spa = dev->l2ad_spa;
     5633 +
     5634 +        /* Lock out device removal. */
     5635 +        spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
     5636 +        ASSERT(dev->l2ad_rebuilding == B_TRUE);
     5637 +        l2arc_rebuild(dev);
     5638 +        dev->l2ad_rebuilding = B_FALSE;
     5639 +        spa_config_exit(spa, SCL_L2ARC, vd);
     5640 +        thread_exit();
     5641 +}
     5642 +
     5643 +/*
     5644 + * This function implements the actual L2ARC metadata rebuild. It:
     5645 + *
     5646 + * 1) scans the device for valid l2uberblocks
     5647 + * 2) if it finds a good uberblock, starts reading the pbuf chain
     5648 + * 3) restores each pbuf's contents to memory
     5649 + *
     5650 + * Operation stops under any of the following conditions:
     5651 + *
     5652 + * 1) We reach the end of the pbuf chain (the previous-buffer reference
     5653 + *    in the pbuf is zero).
     5654 + * 2) We encounter *any* error condition (cksum errors, io errors, looped
     5655 + *    pbufs, etc.).
      5656 + * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
      5657 + *    to keep severely fragmented L2ARC pbufs or slow L2ARC devices from
      5658 + *    stalling pool import indefinitely, and to let the administrator
      5659 + *    take corrective action, e.g. by kicking the misbehaving L2ARC
      5660 + *    device out of the pool, or by reimporting the pool with L2ARC
      5661 + *    rebuilding disabled.
     5662 + */
     5663 +static void
     5664 +l2arc_rebuild(l2arc_dev_t *dev)
     5665 +{
     5666 +        int err;
     5667 +        l2uberblock_t ub;
     5668 +        l2pbuf_t pb;
     5669 +        zio_t *this_io = NULL, *next_io = NULL;
     5670 +        int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
     5671 +
     5672 +        if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
     5673 +                return;
     5674 +        L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
     5675 +
     5676 +        /* set up uberblock update info */
     5677 +        dev->l2ad_uberblock_birth = ub.ub_birth + 1;
     5678 +
     5679 +        /* initial sanity checks */
     5680 +        l2arc_pbuf_init(&pb);
     5681 +        if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
     5682 +            ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
     5683 +                /* root pbuf is bad, we can't do anything about that */
     5684 +                if (err == EINVAL) {
     5685 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
     5686 +                } else {
     5687 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
     5688 +                }
     5689 +                l2arc_pbuf_destroy(&pb);
     5690 +                return;
     5691 +        }
     5692 +        L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
     5693 +
     5694 +        dev->l2ad_evict = ub.ub_evict_tail;
     5695 +
     5696 +        /* keep on chaining in new blocks */
     5697 +        dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
     5698 +        dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
     5699 +        dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
     5700 +        dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
     5701 +            ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
     5702 +        dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
     5703 +
     5704 +        /* start the rebuild process */
     5705 +        for (;;) {
     5706 +                l2pbuf_t pb_prev;
     5707 +
     5708 +                l2arc_pbuf_init(&pb_prev);
     5709 +                if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
     5710 +                    pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
     5711 +                    &next_io)) != 0) {
     5712 +                        /*
     5713 +                         * We are done reading, discard the last good buffer.
     5714 +                         */
     5715 +                        if (pb.pb_prev_daddr > dev->l2ad_hand &&
     5716 +                            pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
     5717 +                                /* this is an error, we stopped too early */
     5718 +                                if (err == EINVAL) {
     5719 +                                        ARCSTAT_BUMP(
     5720 +                                            arcstat_l2_rebuild_cksum_errors);
     5721 +                                } else {
     5722 +                                        ARCSTAT_BUMP(
     5723 +                                            arcstat_l2_rebuild_io_errors);
     5724 +                                }
     5725 +                        }
     5726 +                        l2arc_pbuf_destroy(&pb_prev);
     5727 +                        l2arc_pbuf_destroy(&pb);
     5728 +                        break;
     5729 +                }
     5730 +
     5731 +                /*
     5732 +                 * Protection against infinite loops of pbufs. This is also
     5733 +                 * our primary termination mechanism - once the buffer list
     5734 +                 * loops around our starting pbuf, we can stop.
     5735 +                 */
     5736 +                if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
     5737 +                    pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
     5738 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
     5739 +                        l2arc_pbuf_destroy(&pb);
     5740 +                        l2arc_pbuf_destroy(&pb_prev);
     5741 +                        if (next_io)
     5742 +                                l2arc_pbuf_prefetch_abort(next_io);
     5743 +                        return;
     5744 +                }
     5745 +
     5746 +                /*
     5747 +                 * Our memory pressure valve. If the system is running low
     5748 +                 * on memory, rather than swamping memory with new ARC buf
     5749 +                 * hdrs, we opt not to reconstruct the L2ARC. At this point,
     5750 +                 * however, we have already set up our L2ARC dev to chain in
     5751 +                 * new metadata pbufs, so the user may choose to re-add the
     5752 +                 * L2ARC dev at a later time to reconstruct it (when there's
     5753 +                 * less memory pressure).
     5754 +                 */
     5755 +                if (arc_reclaim_needed()) {
     5756 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
     5757 +                        cmn_err(CE_NOTE, "System running low on memory, "
     5758 +                            "aborting L2ARC rebuild.");
     5759 +                        l2arc_pbuf_destroy(&pb);
     5760 +                        l2arc_pbuf_destroy(&pb_prev);
     5761 +                        if (next_io)
     5762 +                                l2arc_pbuf_prefetch_abort(next_io);
     5763 +                        break;
     5764 +                }
     5765 +
     5766 +                /*
     5767 +                 * Now that we know that the prev_pbuf checks out alright, we
     5768 +                 * can start reconstruction from this pbuf - we can be sure
     5769 +                 * that the L2ARC write hand has not yet reached any of our
     5770 +                 * buffers.
     5771 +                 */
     5772 +                l2arc_pbuf_restore(dev, &pb);
     5773 +
     5774 +                /* pbuf restored, continue with next one in the list */
     5775 +                l2arc_pbuf_destroy(&pb);
     5776 +                pb = pb_prev;
     5777 +                this_io = next_io;
     5778 +                next_io = NULL;
     5779 +
     5780 +                L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
     5781 +        }
     5782 +
     5783 +        ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
     5784 +}
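           +
           +/*
           + * A sketch of the chain walk performed above (newest to oldest; the
           + * uberblock sits at a fixed device offset, pbufs float in the data
           + * area):
           + *
           + *   l2uberblock --ub_pbuf_daddr--> pbuf[N] --pb_prev_daddr--> pbuf[N-1]
           + *       --pb_prev_daddr--> ... --> pbuf[0] (prev reference invalid)
           + *
           + * Each iteration first reads and verifies the next-older pbuf, then
           + * restores the current one via l2arc_pbuf_restore.
           + */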
     5785 +
     5786 +/*
     5787 + * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
     5788 + * which only contain an l2arc hdr, essentially restoring the buffers to
     5789 + * their L2ARC evicted state. This function also updates space usage on the
     5790 + * L2ARC vdev to make sure it tracks restored buffers.
     5791 + */
     5792 +static void
     5793 +l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
     5794 +{
     5795 +        spa_t *spa;
     5796 +        uint64_t guid;
     5797 +        list_t *buflists_list;
     5798 +        l2pbuf_buflist_t *buflist;
     5799 +
     5800 +        mutex_enter(&l2arc_buflist_mtx);
     5801 +        spa = dev->l2ad_vdev->vdev_spa;
     5802 +        guid = spa_load_guid(spa);
     5803 +        buflists_list = pb->pb_buflists_list;
     5804 +        for (buflist = list_head(buflists_list); buflist;
     5805 +            buflist = list_next(buflists_list, buflist)) {
     5806 +                int i;
     5807 +                uint64_t size, asize, psize;
     5808 +
     5809 +                size = asize = psize = 0;
     5810 +                for (i = 0; i < buflist->l2pbl_nbufs; i++) {
     5811 +                        l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
     5812 +                            guid);
     5813 +                        size += buflist->l2pbl_bufs[i].b_size;
     5814 +                        asize += buflist->l2pbl_bufs[i].b_l2asize;
     5815 +                        psize += vdev_psize_to_asize(dev->l2ad_vdev,
     5816 +                            buflist->l2pbl_bufs[i].b_l2asize);
     5817 +                }
     5818 +                ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
     5819 +                ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
     5820 +                ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
     5821 +                vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
     5822 +        }
     5823 +        mutex_exit(&l2arc_buflist_mtx);
     5824 +        ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
     5825 +        vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
     5826 +            pb->pb_asize), 0, 0);
     5827 +}
     5828 +
     5829 +/*
     5830 + * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
     5831 + * a state indicating that it has been evicted to L2ARC.
     5832 + * The `guid' here is the ARC-load-guid from spa_load_guid.
     5833 + */
     5834 +static void
     5835 +l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
     5836 +{
     5837 +        arc_buf_hdr_t *hdr;
     5838 +        kmutex_t *hash_lock;
     5839 +        dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
     5840 +
     5841 +        hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
     5842 +        if (hdr == NULL) {
     5843 +                /* not in cache, try to insert */
     5844 +                arc_buf_hdr_t *exists;
     5845 +                arc_buf_contents_t type = buf->b_contents_type;
     5846 +                l2arc_buf_hdr_t *l2hdr;
     5847 +
     5848 +                hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
     5849 +                hdr->b_dva = buf->b_dva;
     5850 +                hdr->b_birth = buf->b_birth;
     5851 +                hdr->b_cksum0 = buf->b_cksum0;
     5852 +                hdr->b_size = buf->b_size;
     5853 +                exists = buf_hash_insert(hdr, &hash_lock);
     5854 +                if (exists) {
     5855 +                        /* somebody beat us to the hash insert */
     5856 +                        mutex_exit(hash_lock);
     5857 +                        arc_hdr_destroy(hdr);
     5858 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
     5859 +                        return;
     5860 +                }
     5861 +                hdr->b_flags = buf->b_flags;
     5862 +                mutex_enter(&hdr->b_freeze_lock);
     5863 +                ASSERT(hdr->b_freeze_cksum == NULL);
     5864 +                hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
     5865 +                    KM_SLEEP);
     5866 +                *hdr->b_freeze_cksum = buf->b_freeze_cksum;
     5867 +                mutex_exit(&hdr->b_freeze_lock);
     5868 +
     5869 +                /* now rebuild the l2arc entry */
     5870 +                ASSERT(hdr->b_l2hdr == NULL);
     5871 +                l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
     5872 +                l2hdr->b_dev = dev;
     5873 +                l2hdr->b_daddr = buf->b_l2daddr;
     5874 +                l2hdr->b_asize = buf->b_l2asize;
     5875 +                l2hdr->b_compress = buf->b_l2compress;
     5876 +                hdr->b_l2hdr = l2hdr;
     5877 +                list_insert_head(dev->l2ad_buflist, hdr);
     5878 +                ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
     5879 +                ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
     5880 +
     5881 +                arc_change_state(arc_l2c_only, hdr, hash_lock);
     5882 +        }
     5883 +        mutex_exit(hash_lock);
     5884 +}
     5885 +
     5886 +/*
     5887 + * Attempts to locate and read the newest valid uberblock on the provided
     5888 + * L2ARC device and writes it to `ub'. On success, this function returns 0,
     5889 + * otherwise the appropriate error code is returned.
     5890 + */
     5891 +static int
     5892 +l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
     5893 +{
     5894 +        int err = 0;
     5895 +        uint8_t *ub_buf;
     5896 +        uint64_t guid;
     5897 +
     5898 +        ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
     5899 +        ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
     5900 +        guid = spa_guid(dev->l2ad_vdev->vdev_spa);
     5901 +
     5902 +        if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
     5903 +            VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
     5904 +            ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
     5905 +            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
     5906 +            ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
     5907 +                ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
     5908 +                goto cleanup;
     5909 +        }
     5910 +
     5911 +        /*
     5912 +         * Initial peek - does the device even have any usable uberblocks?
     5913 +         * If not, don't bother continuing.
     5914 +         */
     5915 +        l2arc_uberblock_decode(ub_buf, ub);
     5916 +        if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
     5917 +            ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
     5918 +            ub->ub_spa_guid != guid) {
     5919 +                err = ENOTSUP;
     5920 +                ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
     5921 +                goto cleanup;
     5922 +        }
     5923 +
     5924 +        /* now check to make sure that what we selected is okay */
     5925 +        if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
     5926 +                if (err == EINVAL) {
     5927 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
     5928 +                } else {
     5929 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
     5930 +                }
     5931 +                goto cleanup;
     5932 +        }
     5933 +
     5934 +        /* this uberblock is valid */
     5935 +
     5936 +cleanup:
     5937 +        kmem_free(ub_buf, L2UBERBLOCK_SIZE);
     5938 +        return (err);
     5939 +}
     5940 +
     5941 +/*
     5942 + * Reads a pbuf from storage, decodes it and validates its contents against
     5943 + * the provided checksum. The result is placed in `pb'.
     5944 + *
     5945 + * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
     5946 + * When issuing the first pbuf IO during rebuild, you should pass NULL for
     5947 + * `this_io'. This function will then issue a sync IO to read the pbuf and
     5948 + * also issue an async IO to fetch the next pbuf in the pbuf chain. The
      5949 + * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
      5950 + * function, pass the value returned in `prefetch_io' from the previous
      5951 + * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
      5952 + * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
      5953 + * pointer to NULL. If no prefetch IO was issued, the pointer is left
      5954 + * set to NULL.
     5955 + *
     5956 + * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
     5957 + * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
      5958 + * IO is used internally by this function to `peek' at the next buffer's
      5959 + * header before the main IO that reads it in completely has finished.
     5960 + * We can then begin to issue the IO for the next buffer in the chain before
     5961 + * we are done reading, keeping the L2ARC device's pipeline saturated with
     5962 + * reads (rather than issuing an IO, waiting for it to complete, validating
     5963 + * the returned buffer and issuing the next one). This will make sure that
     5964 + * the rebuild proceeds at maximum read throughput.
     5965 + *
     5966 + * On success, this function returns 0, otherwise it returns an appropriate
     5967 + * error code. On error the prefetching IO is aborted and cleared before
      5968 + * returning from this function. Therefore, if we return an error, the
      5969 + * caller can assume that we have taken care of cleaning up prefetch IOs.
     5970 + */
     5971 +static int
     5972 +l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
     5973 +    zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
     5974 +{
     5975 +        int err = 0;
     5976 +        uint64_t prev_pb_start;
     5977 +        uint32_t prev_pb_asize;
     5978 +        zio_cksum_t calc_cksum, prev_pb_cksum;
     5979 +        l2arc_prefetch_info_t *pi = NULL;
     5980 +
     5981 +        ASSERT(dev != NULL);
     5982 +        ASSERT(pb != NULL);
     5983 +        ASSERT(*prefetch_io == NULL);
     5984 +
     5985 +        if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
     5986 +                /* We could not have issued a prefetch IO for this */
     5987 +                ASSERT(this_io == NULL);
     5988 +                return (EINVAL);
     5989 +        }
     5990 +
     5991 +        /*
     5992 +         * Check to see if we have issued the IO for this pbuf in a previous
     5993 +         * run. If not, issue it now.
     5994 +         */
     5995 +        if (this_io == NULL)
     5996 +                this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
     5997 +
     5998 +        /* Pick up the prefetch info buffer and read its contents */
     5999 +        pi = this_io->io_private;
     6000 +        ASSERT(pi != NULL);
     6001 +        ASSERT(asize <= pi->pi_buflen);
     6002 +
     6003 +        /* Wait for the IO to read this pbuf's header to complete */
     6004 +        if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
     6005 +                (void) zio_wait(this_io);
     6006 +                goto cleanup;
     6007 +        }
     6008 +
     6009 +        /*
     6010 +         * Peek to see if we can start issuing the next pbuf IO immediately.
     6011 +         * At this point, only the current pbuf's header has been read.
     6012 +         */
     6013 +        if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
     6014 +            &prev_pb_asize, &prev_pb_cksum) == 0) {
     6015 +                uint64_t this_pb_start, this_pb_end, prev_pb_end;
     6016 +                /* Detect malformed pbuf references and loops */
     6017 +                this_pb_start = daddr;
     6018 +                this_pb_end = daddr + asize;
     6019 +                prev_pb_end = prev_pb_start + prev_pb_asize;
     6020 +                if ((prev_pb_start >= this_pb_start && prev_pb_start <
     6021 +                    this_pb_end) ||
     6022 +                    (prev_pb_end >= this_pb_start && prev_pb_end <
     6023 +                    this_pb_end)) {
     6024 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
     6025 +                        cmn_err(CE_WARN, "Looping L2ARC metadata reference "
     6026 +                            "detected, aborting rebuild.");
     6027 +                        err = EINVAL;
     6028 +                        goto cleanup;
     6029 +                }
     6030 +                /*
     6031 +                 * Start issuing IO for the next pbuf early - this should
     6032 +                 * help keep the L2ARC device busy while we read, decode
     6033 +                 * and restore this pbuf.
     6034 +                 */
     6035 +                if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
     6036 +                        *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
     6037 +                            prev_pb_start, prev_pb_asize);
     6038 +        }
     6039 +
     6040 +        /* Wait for the main pbuf IO to complete */
     6041 +        if ((err = zio_wait(this_io)) != 0)
     6042 +                goto cleanup;
     6043 +
     6044 +        /* Make sure the buffer checks out ok */
     6045 +        fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
     6046 +        if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
     6047 +                err = EINVAL;
     6048 +                goto cleanup;
     6049 +        }
     6050 +
     6051 +        /* Now we can take our time decoding this buffer */
     6052 +        if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
     6053 +                goto cleanup;
     6054 +
     6055 +        /* This will be used in l2arc_pbuf_restore for space accounting */
     6056 +        pb->pb_asize = asize;
     6057 +
     6058 +        ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
     6059 +        ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
     6060 +        ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
     6061 +            pb->pb_payload_asz / asize);
     6062 +
     6063 +cleanup:
     6064 +        kmem_free(pi->pi_buf, pi->pi_buflen);
     6065 +        pi->pi_buf = NULL;
     6066 +        kmem_free(pi, sizeof (l2arc_prefetch_info_t));
     6067 +        /* Abort an in-flight prefetch in case of error */
     6068 +        if (err != 0 && *prefetch_io != NULL) {
     6069 +                l2arc_pbuf_prefetch_abort(*prefetch_io);
     6070 +                *prefetch_io = NULL;
     6071 +        }
     6072 +        return (err);
     6073 +}
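           +
           +/*
           + * A rough timeline of the read pipeline this creates (an illustration
           + * of the behavior described above, not exact timing):
           + *
           + *   hdr IO, this pbuf    [==]
           + *   main IO, this pbuf   [=========]
           + *   hdr IO, next pbuf        [==]         (issued after the hdr peek)
           + *   main IO, next pbuf       [=========]
           + *   cksum/decode/restore           [====]
           + */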
     6074 +
     6075 +/*
      6076 + * Validates a pbuf device address and size to make sure that the pbuf
      6077 + * can be read from the provided L2ARC device. Returns 1 if the pointer
      6078 + * is within the device's bounds and correctly aligned, or 0 if not.
     6079 + */
     6080 +static int
     6081 +l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
     6082 +{
     6083 +        uint32_t psize;
     6084 +        uint64_t end;
     6085 +
     6086 +        psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
     6087 +        end = daddr + psize;
     6088 +
     6089 +        if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
     6090 +            asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
     6091 +            /* check that the buffer address is correctly aligned */
     6092 +            (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
     6093 +            SPA_MINBLOCKSIZE) - 1)) != 0)
     6094 +                return (0);
     6095 +        else
     6096 +                return (1);
     6097 +}
     6098 +
     6099 +/*
     6100 + * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
     6101 + * reconstruction to start reading the next pbuf before we are done
     6102 + * decoding and reconstructing the current pbuf, to keep the l2arc device
     6103 + * nice and hot with read IO to process.
      6104 + * The returned zio will contain a newly allocated memory buffer for the IO
     6105 + * data which should then be freed by the caller once the zio is no longer
     6106 + * needed (i.e. due to it having completed). If you wish to abort this
     6107 + * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
     6108 + * of disposing of the allocated buffers correctly.
     6109 + */
     6110 +static zio_t *
     6111 +l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
     6112 +{
     6113 +        uint32_t i, psize;
     6114 +        zio_t *pio, *hdr_io;
     6115 +        uint64_t hdr_rsize;
     6116 +        uint8_t *buf;
     6117 +        l2arc_prefetch_info_t *pinfo;
     6118 +
     6119 +        psize = vdev_psize_to_asize(vd, asize);
     6120 +        buf = kmem_alloc(psize, KM_SLEEP);
     6121 +        pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
     6122 +        pinfo->pi_buf = buf;
     6123 +        pinfo->pi_buflen = psize;
     6124 +
     6125 +        /*
     6126 +         * We start issuing the IO for the pbuf header early. This
     6127 +         * allows l2arc_pbuf_read to start issuing IO for the next
     6128 +         * buffer before the current pbuf is read in completely.
     6129 +         */
     6130 +
     6131 +        hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
     6132 +        ASSERT(hdr_rsize <= psize);
     6133 +        pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
     6134 +            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
     6135 +            ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
     6136 +        hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
     6137 +            ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
     6138 +            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
     6139 +            ZIO_FLAG_DONT_RETRY, B_FALSE);
     6140 +        (void) zio_nowait(hdr_io);
     6141 +
     6142 +        /*
     6143 +         * Read in the rest of the pbuf - this can take longer than just
     6144 +         * having a peek at the header.
     6145 +         */
     6146 +        pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
     6147 +            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
     6148 +            ZIO_FLAG_DONT_RETRY);
     6149 +        for (i = hdr_rsize; i < psize; ) {
     6150 +                uint64_t rsize = psize - i;
     6151 +                zio_t *rzio;
     6152 +
     6153 +                if (psize - i > SPA_MAXBLOCKSIZE)
     6154 +                        rsize = SPA_MAXBLOCKSIZE;
     6155 +                ASSERT(rsize >= SPA_MINBLOCKSIZE);
     6156 +                rzio = zio_read_phys(pio, vd, daddr + i,
     6157 +                    rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
     6158 +                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
     6159 +                    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
     6160 +                    ZIO_FLAG_DONT_RETRY, B_FALSE);
     6161 +                (void) zio_nowait(rzio);
     6162 +                i += rsize;
     6163 +        }
     6164 +
     6165 +        return (pio);
     6166 +}
     6167 +
     6168 +/*
     6169 + * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
     6170 + * buffers allocated for it.
     6171 + */
     6172 +static void
     6173 +l2arc_pbuf_prefetch_abort(zio_t *zio)
     6174 +{
     6175 +        l2arc_prefetch_info_t *pi;
     6176 +
     6177 +        pi = zio->io_private;
     6178 +        ASSERT(pi != NULL);
     6179 +        if (pi->pi_hdr_io != NULL)
     6180 +                (void) zio_wait(pi->pi_hdr_io);
     6181 +        (void) zio_wait(zio);
     6182 +        kmem_free(pi->pi_buf, pi->pi_buflen);
     6183 +        pi->pi_buf = NULL;
     6184 +        kmem_free(pi, sizeof (l2arc_prefetch_info_t));
     6185 +}
     6186 +
     6187 +/*
     6188 + * Encodes an l2uberblock_t structure into a destination buffer. This
     6189 + * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
     6190 + * uberblock is always of this constant size.
     6191 + */
     6192 +static void
     6193 +l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
     6194 +{
     6195 +        zio_cksum_t cksum;
     6196 +
     6197 +        bzero(buf, L2UBERBLOCK_SIZE);
     6198 +
     6199 +#if defined(_BIG_ENDIAN)
      6200 +        *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
      6201 +        *(uint16_t *)(buf + 6) = ub->ub_flags | L2UBLK_BIG_ENDIAN;
      6202 +#else   /* !defined(_BIG_ENDIAN) */
      6203 +        *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
      6204 +        *(uint16_t *)(buf + 6) = BSWAP_16(ub->ub_flags);
      6205 +#endif  /* !defined(_BIG_ENDIAN) */
     6206 +        buf[4] = L2UBERBLOCK_MAX_VERSION;
     6207 +
     6208 +        /* rest in native byte order */
     6209 +        *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
     6210 +        *(uint64_t *)(buf + 16) = ub->ub_birth;
     6211 +        *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
     6212 +        *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
     6213 +        *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
     6214 +        *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
     6215 +        bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
     6216 +
     6217 +        fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
     6218 +        bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
     6219 +}
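           +
           +/*
           + * For reference, the on-disk l2uberblock layout produced by the
           + * encoder above (byte offsets; magic and flags are stored big-endian,
           + * version is a single byte, all remaining fields are in the writer's
           + * native byte order):
           + *
           + *       0  uint32_t     ub_magic
           + *       4  uint8_t      ub_version
           + *       6  uint16_t     ub_flags
           + *       8  uint64_t     ub_spa_guid
           + *      16  uint64_t     ub_birth
           + *      24  uint64_t     ub_evict_tail
           + *      32  uint64_t     ub_alloc_space
           + *      40  uint64_t     ub_pbuf_daddr
           + *      48  uint32_t     ub_pbuf_asize
           + *      52  zio_cksum_t  ub_pbuf_cksum   (32 bytes)
           + *      .. zero padding ..
           + *      L2UBERBLOCK_SIZE - 32  zio_cksum_t  ub_cksum
           + *          (fletcher-4 of all preceding bytes)
           + */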
     6220 +
     6221 +/*
     6222 + * Decodes an l2uberblock_t from an on-disk representation. Please note
     6223 + * that this function does not perform any uberblock validation and
     6224 + * checksumming - call l2arc_uberblock_verify() for that.
     6225 + */
     6226 +static void
     6227 +l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
     6228 +{
     6229 +        boolean_t bswap_needed;
     6230 +
     6231 +        /* these always come in big endian */
     6232 +#if defined(_BIG_ENDIAN)
     6233 +        ub->ub_magic = *(uint32_t *)buf;
     6234 +        ub->ub_flags = *(uint16_t *)(buf + 6);
      6235 +        bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) == 0);
     6236 +#else   /* !defined(_BIG_ENDIAN) */
     6237 +        ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
     6238 +        ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
     6239 +        bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
     6240 +#endif  /* !defined(_BIG_ENDIAN) */
     6241 +        ub->ub_version = buf[4];
     6242 +
     6243 +        ub->ub_spa_guid = *(uint64_t *)(buf + 8);
     6244 +        ub->ub_birth = *(uint64_t *)(buf + 16);
     6245 +        ub->ub_evict_tail = *(uint64_t *)(buf + 24);
     6246 +        ub->ub_alloc_space = *(uint64_t *)(buf + 32);
     6247 +        ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
     6248 +        ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
      6249 +        bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
     6250 +        bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
     6251 +
     6252 +        /* swap the rest if endianness doesn't match us */
     6253 +        if (bswap_needed) {
     6254 +                ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
     6255 +                ub->ub_birth = BSWAP_64(ub->ub_birth);
     6256 +                ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
     6257 +                ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
     6258 +                ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
     6259 +                ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
     6260 +                ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
     6261 +                ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
     6262 +        }
     6263 +}
     6264 +
     6265 +/*
     6266 + * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
     6267 + * valid and matches its checksum.
     6268 + */
     6269 +static int
     6270 +l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
     6271 +    uint64_t guid)
     6272 +{
     6273 +        zio_cksum_t cksum;
     6274 +
     6275 +        if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
     6276 +            ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
     6277 +                /*
     6278 +                 * bad magic or invalid version => persistent l2arc not
     6279 +                 * supported
     6280 +                 */
     6281 +                return (ENOTSUP);
     6282 +
     6283 +        if (ub->ub_spa_guid != guid)
     6284 +                /* this l2arc dev isn't ours */
     6285 +                return (EINVAL);
     6286 +
     6287 +        fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
     6288 +        if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
     6289 +                /* bad checksum, corrupt uberblock */
     6290 +                return (EINVAL);
     6291 +
     6292 +        return (0);
     6293 +}
     6294 +
     6295 +/*
     6296 + * Schedules a zio to update the uberblock on an l2arc device. The zio is
     6297 + * initiated as a child of `pio' and `cb' is filled with the information
     6298 + * needed to free the uberblock data buffer after writing.
     6299 + */
     6300 +static void
     6301 +l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
     6302 +{
     6303 +        uint8_t *ub_buf;
     6304 +        l2uberblock_t ub;
     6305 +        zio_t *wzio;
     6306 +        vdev_stat_t st;
     6307 +
     6308 +        ASSERT(cb->l2wcb_ub_buf == NULL);
     6309 +        vdev_get_stats(dev->l2ad_vdev, &st);
     6310 +
     6311 +        bzero(&ub, sizeof (ub));
     6312 +        ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
     6313 +        ub.ub_birth = dev->l2ad_uberblock_birth++;
     6314 +        ub.ub_evict_tail = dev->l2ad_evict;
     6315 +        ub.ub_alloc_space = st.vs_alloc;
     6316 +        ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
     6317 +        ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
     6318 +        ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
     6319 +        if (dev->l2ad_first)
     6320 +                ub.ub_flags |= L2UBLK_EVICT_FIRST;
     6321 +
     6322 +        ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
     6323 +        cb->l2wcb_ub_buf = ub_buf;
     6324 +        l2arc_uberblock_encode(&ub, ub_buf);
     6325 +        wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
     6326 +            L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
     6327 +            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
     6328 +        DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
     6329 +            zio_t *, wzio);
     6330 +        (void) zio_nowait(wzio);
     6331 +}
     6332 +
     6333 +/*
     6334 + * Encodes a l2pbuf_t structure into the portable on-disk format. The
     6335 + * `buf' buffer must be suitably sized to hold the entire uncompressed
     6336 + * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
     6337 + * also compresses the buffer.
     6338 + *
     6339 + * The return value is the length of the resulting encoded pbuf structure.
     6340 + * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
     6341 + * was applied, or smaller if compression was applied. In either case,
     6342 + * prior to writing to disk, the caller must suitably pad the output
     6343 + * buffer so that it is aligned on a multiple of the underlying storage
     6344 + * system's block size.
     6345 + */
     6346 +static uint32_t
     6347 +l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
     6348 +{
     6349 +        uint16_t flags = 0;
     6350 +        uint8_t *dst_buf;
     6351 +        uint32_t enclen;
     6352 +        l2pbuf_buflist_t *buflist;
     6353 +
     6354 +        enclen = L2PBUF_ENCODED_SIZE(pb);
     6355 +        ASSERT(buflen >= enclen);
     6356 +        bzero(buf, enclen);
     6357 +
      6358 +        /* fields after the magic/version/flags are in native byte order */
     6359 +        *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
     6360 +        *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
     6361 +        bcopy(&pb->pb_prev_cksum, buf + 20, 32);
     6362 +        *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
     6363 +
     6364 +        /* first we encode the buflists uncompressed */
     6365 +        dst_buf = buf + L2PBUF_HDR_SIZE;
     6366 +        for (buflist = list_head(pb->pb_buflists_list); buflist;
     6367 +            buflist = list_next(pb->pb_buflists_list, buflist)) {
     6368 +                int i;
     6369 +
     6370 +                ASSERT(buflist->l2pbl_nbufs != 0);
     6371 +                for (i = 0; i < buflist->l2pbl_nbufs; i++) {
     6372 +                        l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
     6373 +
     6374 +                        ASSERT(pbl_buf->b_size != 0);
     6375 +                        *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
     6376 +                        *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
     6377 +                        *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
     6378 +                        *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
     6379 +                        bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
     6380 +                        *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
     6381 +                        *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
     6382 +                        *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
     6383 +                        dst_buf[80] = pbl_buf->b_l2compress;
     6384 +                        dst_buf[81] = pbl_buf->b_contents_type;
     6385 +                        *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
     6386 +                        dst_buf += L2PBUF_BUF_SIZE;
     6387 +                }
     6388 +        }
     6389 +        ASSERT((uint32_t)(dst_buf - buf) == enclen);
     6390 +
     6391 +        /* and then compress them if necessary */
     6392 +        if (enclen >= l2arc_pbuf_compress_minsz) {
     6393 +                uint8_t *cbuf;
     6394 +                size_t slen, clen;
     6395 +
     6396 +                slen = l2arc_pbuf_items_encoded_size(pb);
     6397 +                cbuf = kmem_alloc(slen, KM_SLEEP);
     6398 +                clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
     6399 +                ASSERT(clen != 0);
     6400 +                if (clen < slen) {
     6401 +                        bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
     6402 +                        flags |= L2PBUF_COMPRESSED;
     6403 +                        /* zero out the rest of the input buffer */
     6404 +                        bzero(buf + L2PBUF_HDR_SIZE + clen,
     6405 +                            buflen - (L2PBUF_HDR_SIZE + clen));
     6406 +                        /* adjust our buffer length now that it's shortened */
     6407 +                        enclen = L2PBUF_HDR_SIZE + clen;
     6408 +                }
     6409 +                kmem_free(cbuf, slen);
     6410 +        }
     6411 +
     6412 +        /* the header goes last since `flags' may change due to compression */
     6413 +#if defined(_BIG_ENDIAN)
     6414 +        *(uint32_t *)buf = L2PBUF_MAGIC;
     6415 +        flags |= L2PBUF_BIG_ENDIAN;
     6416 +        *(uint16_t *)(buf + 6) = flags;
     6417 +#else   /* !defined(_BIG_ENDIAN) */
     6418 +        *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
     6419 +        *(uint16_t *)(buf + 6) = BSWAP_16(flags);
     6420 +#endif  /* !defined(_BIG_ENDIAN) */
     6421 +        buf[4] = L2PBUF_MAX_VERSION;
     6422 +
     6423 +        return (enclen);
     6424 +}
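           +
           +/*
           + * For reference, the encoded pbuf layout produced above (byte offsets;
           + * magic and flags are stored big-endian, version is a single byte, the
           + * rest is in the writer's native byte order). The header occupies
           + * L2PBUF_HDR_SIZE bytes:
           + *
           + *       0  uint32_t     pb_magic
           + *       4  uint8_t      pb_version
           + *       6  uint16_t     pb_flags
           + *       8  uint64_t     pb_prev_daddr
           + *      16  uint32_t     pb_prev_asize
           + *      20  zio_cksum_t  pb_prev_cksum   (32 bytes)
           + *      52  uint32_t     uncompressed payload length
           + *
           + * It is followed by the (possibly LZ4-compressed) payload, one record
           + * of L2PBUF_BUF_SIZE bytes per buffer:
           + *
           + *       0  uint64_t     b_dva.dva_word[0]
           + *       8  uint64_t     b_dva.dva_word[1]
           + *      16  uint64_t     b_birth
           + *      24  uint64_t     b_cksum0
           + *      32  zio_cksum_t  b_freeze_cksum  (32 bytes)
           + *      64  uint32_t     b_size
           + *      68  uint64_t     b_l2daddr
           + *      76  uint32_t     b_l2asize
           + *      80  uint8_t      b_l2compress
           + *      81  uint8_t      b_contents_type
           + *      84  uint32_t     b_flags
           + */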
     6425 +
     6426 +/*
     6427 + * Decodes a stored l2pbuf_t structure previously encoded using
     6428 + * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
     6429 + * must be initialized by l2arc_pbuf_init by the caller beforehand, but
     6430 + * must not have been used to store any buffers yet.
     6431 + *
     6432 + * Please note that we don't do checksum verification here, as we don't
      6433 + * know our own checksum (that's known by the previous block in the linked
     6434 + * list, or by the uberblock). This should be performed by the caller
     6435 + * prior to calling l2arc_pbuf_decode.
     6436 + */
     6437 +static int
     6438 +l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
     6439 +{
     6440 +        boolean_t bswap_needed;
     6441 +        uint32_t payload_sz, payload_asz;
     6442 +        uint8_t *src_bufs;
     6443 +        l2pbuf_buflist_t *buflist;
     6444 +        int i, nbufs;
     6445 +
     6446 +        ASSERT(input_buf != NULL);
     6447 +        ASSERT(pb != NULL);
     6448 +        ASSERT(pb->pb_version != 0);
     6449 +        ASSERT(pb->pb_nbuflists == 0);
     6450 +
     6451 +        /* no valid buffer can be this small */
     6452 +        if (buflen < L2PBUF_HDR_SIZE)
     6453 +                return (EINVAL);
     6454 +
     6455 +        /* these always come in big endian */
     6456 +#if defined(_BIG_ENDIAN)
     6457 +        pb->pb_magic = *(uint32_t *)input_buf;
     6458 +        pb->pb_flags = *(uint16_t *)(input_buf + 6);
      6459 +        bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
     6460 +#else   /* !defined(_BIG_ENDIAN) */
     6461 +        pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
     6462 +        pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
     6463 +        bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
     6464 +#endif  /* !defined(_BIG_ENDIAN) */
     6465 +        pb->pb_version = input_buf[4];
     6466 +
     6467 +        if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
     6468 +                return (EINVAL);
     6469 +        if (pb->pb_version > L2PBUF_MAX_VERSION)
     6470 +                return (ENOTSUP);
     6471 +
     6472 +        /* remainder of pbuf may need bswap'ping */
     6473 +        pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
      6474 +        pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
     6475 +        bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
     6476 +        payload_sz = *(uint32_t *)(input_buf + 52);
     6477 +        payload_asz = buflen - L2PBUF_HDR_SIZE;
     6478 +
     6479 +        if (bswap_needed) {
     6480 +                pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
      6481 +                pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
     6482 +                ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
     6483 +                payload_sz = BSWAP_32(payload_sz);
     6484 +        }
     6485 +
     6486 +        /* check for sensible buffer allocation limits */
     6487 +        if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
     6488 +            (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
     6489 +            (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
     6490 +                return (EINVAL);
     6491 +        nbufs = payload_sz / L2PBUF_BUF_SIZE;
     6492 +
     6493 +        /* decompression might be needed */
     6494 +        if (pb->pb_flags & L2PBUF_COMPRESSED) {
     6495 +                src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
     6496 +                if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
     6497 +                    payload_asz, payload_sz, 0) != 0) {
     6498 +                        kmem_free(src_bufs, payload_sz);
     6499 +                        return (EINVAL);
     6500 +                }
     6501 +        } else {
     6502 +                src_bufs = input_buf + L2PBUF_HDR_SIZE;
     6503 +        }
     6504 +
     6505 +        /* Decode individual pbuf items from our source buffer. */
     6506 +        buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
     6507 +        for (i = 0; i < nbufs; i++) {
     6508 +                l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
     6509 +                const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
     6510 +
     6511 +                pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
     6512 +                pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
     6513 +                pbl_buf->b_birth = *(uint64_t *)(src + 16);
     6514 +                pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
     6515 +                bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
     6516 +                pbl_buf->b_size = *(uint32_t *)(src + 64);
     6517 +                pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
     6518 +                pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
     6519 +                pbl_buf->b_l2compress = src[80];
     6520 +                pbl_buf->b_contents_type = src[81];
     6521 +                pbl_buf->b_flags = *(uint32_t *)(src + 84);
     6522 +
     6523 +                if (bswap_needed) {
     6524 +                        pbl_buf->b_dva.dva_word[0] =
     6525 +                            BSWAP_64(pbl_buf->b_dva.dva_word[0]);
     6526 +                        pbl_buf->b_dva.dva_word[1] =
     6527 +                            BSWAP_64(pbl_buf->b_dva.dva_word[1]);
     6528 +                        pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
     6529 +                        pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
     6530 +                        ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
     6531 +                        pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
     6532 +                        pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
     6533 +                        pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
     6534 +                        pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
     6535 +                }
     6536 +
     6537 +                pb->pb_payload_asz += pbl_buf->b_l2asize;
     6538 +        }
     6539 +
     6540 +        if (pb->pb_flags & L2PBUF_COMPRESSED)
     6541 +                kmem_free(src_bufs, payload_sz);
     6542 +
     6543 +        return (0);
     6544 +}
     6545 +
     6546 +/*
     6547 + * Decodes the previous buffer pointer encoded in a pbuf. This is used
     6548 + * during L2ARC reconstruction to "peek" at the next buffer and start
     6549 + * issuing IO to fetch it early, before decoding of the current buffer
     6550 + * is done (which can take time due to decompression).
     6551 + * Returns 0 on success (and fills in the return parameters `daddr',
     6552 + * `asize' and `cksum' with the info of the previous pbuf), and an errno
     6553 + * on error.
     6554 + */
     6555 +static int
     6556 +l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
     6557 +    uint32_t *asize, zio_cksum_t *cksum)
     6558 +{
     6559 +        boolean_t bswap_needed;
     6560 +        uint16_t version, flags;
     6561 +        uint32_t magic;
     6562 +
     6563 +        ASSERT(buf != NULL);
     6564 +
     6565 +        /* no valid buffer can be this small */
     6566 +        if (buflen <= L2PBUF_HDR_SIZE)
     6567 +                return (EINVAL);
     6568 +
     6569 +        /* these always come in big endian */
     6570 +#if defined(_BIG_ENDIAN)
     6571 +        magic = *(uint32_t *)buf;
     6572 +        flags = *(uint16_t *)(buf + 6);
      6573 +        bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
     6574 +#else   /* !defined(_BIG_ENDIAN) */
     6575 +        magic = BSWAP_32(*(uint32_t *)buf);
     6576 +        flags = BSWAP_16(*(uint16_t *)(buf + 6));
     6577 +        bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
     6578 +#endif  /* !defined(_BIG_ENDIAN) */
     6579 +        version = buf[4];
     6580 +
     6581 +        if (magic != L2PBUF_MAGIC || version == 0)
     6582 +                return (EINVAL);
     6583 +        if (version > L2PBUF_MAX_VERSION)
     6584 +                return (ENOTSUP);
     6585 +
      6586 +        *daddr = *(uint64_t *)(buf + 8);
      6587 +        *asize = *(uint32_t *)(buf + 16);
      6588 +        bcopy(buf + 20, cksum, 32);
     6589 +
     6590 +        if (bswap_needed) {
     6591 +                *daddr = BSWAP_64(*daddr);
      6592 +                *asize = BSWAP_32(*asize);
     6593 +                ZIO_CHECKSUM_BSWAP(cksum);
     6594 +        }
     6595 +
     6596 +        return (0);
     6597 +}
     6598 +
     6599 +/*
     6600 + * Initializes a pbuf structure into a clean state. All version and flags
     6601 + * fields are filled in as appropriate for this architecture.
     6602 + * If the structure was used before, first call l2arc_pbuf_destroy on it,
     6603 + * as this function assumes the structure is uninitialized.
     6604 + */
     6605 +static void
     6606 +l2arc_pbuf_init(l2pbuf_t *pb)
     6607 +{
     6608 +        bzero(pb, sizeof (l2pbuf_t));
     6609 +        pb->pb_version = L2PBUF_MAX_VERSION;
     6610 +#if defined(_BIG_ENDIAN)
      6611 +        pb->pb_flags |= L2PBUF_BIG_ENDIAN;
     6612 +#endif
     6613 +        pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
     6614 +        list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
     6615 +            offsetof(l2pbuf_buflist_t, l2pbl_node));
     6616 +}
     6617 +
     6618 +/*
     6619 + * Destroys a pbuf structure and puts it into a clean state ready to be
     6620 + * initialized by l2arc_pbuf_init. All buflists created by
     6621 + * l2arc_pbuf_buflist_alloc are released as well.
     6622 + */
     6623 +static void
     6624 +l2arc_pbuf_destroy(l2pbuf_t *pb)
     6625 +{
     6626 +        list_t *buflist_list = pb->pb_buflists_list;
     6627 +        l2pbuf_buflist_t *buflist;
     6628 +
     6629 +        while ((buflist = list_head(buflist_list)) != NULL) {
     6630 +                ASSERT(buflist->l2pbl_nbufs > 0);
     6631 +                kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
     6632 +                    buflist->l2pbl_nbufs);
     6633 +                list_remove(buflist_list, buflist);
     6634 +                kmem_free(buflist, sizeof (l2pbuf_buflist_t));
     6635 +        }
     6636 +        pb->pb_nbuflists = 0;
     6637 +        list_destroy(pb->pb_buflists_list);
     6638 +        kmem_free(pb->pb_buflists_list, sizeof (list_t));
     6639 +        bzero(pb, sizeof (l2pbuf_t));
     6640 +}
     6641 +
     6642 +/*
     6643 + * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
     6644 + * buffers. This is used during the buffer write cycle - each cycle allocates
     6645 + * a new buflist and fills it with buffers it writes. Then, when the pbuf
      6646 + * reaches its buflist limit, it is committed to stable storage.
     6647 + */
     6648 +static l2pbuf_buflist_t *
     6649 +l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
     6650 +{
     6651 +        l2pbuf_buflist_t *buflist;
     6652 +
     6653 +        ASSERT(pb->pb_buflists_list != NULL);
     6654 +        buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
     6655 +        buflist->l2pbl_nbufs = nbufs;
     6656 +        buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
     6657 +            KM_SLEEP);
     6658 +        list_insert_tail(pb->pb_buflists_list, buflist);
     6659 +        pb->pb_nbuflists++;
     6660 +
     6661 +        return (buflist);
     6662 +}
     6663 +
     6664 +/*
     6665 + * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `idx'.
     6666 + * The buffer being inserted must be present in L2ARC.
     6667 + */
     6668 +static void
     6669 +l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
     6670 +    const arc_buf_hdr_t *ab, int index)
     6671 +{
     6672 +        l2pbuf_buf_t *pb_buf;
     6673 +        const l2arc_buf_hdr_t *l2hdr;
     6674 +
     6675 +        l2hdr = ab->b_l2hdr;
     6676 +        ASSERT(l2hdr != NULL);
     6677 +        ASSERT(pbl->l2pbl_nbufs > index);
     6678 +
     6679 +        pb_buf = &pbl->l2pbl_bufs[index];
     6680 +        pb_buf->b_dva = ab->b_dva;
     6681 +        pb_buf->b_birth = ab->b_birth;
     6682 +        pb_buf->b_cksum0 = ab->b_cksum0;
     6683 +        pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
     6684 +        pb_buf->b_size = ab->b_size;
     6685 +        pb_buf->b_l2daddr = l2hdr->b_daddr;
     6686 +        pb_buf->b_l2asize = l2hdr->b_asize;
     6687 +        pb_buf->b_l2compress = l2hdr->b_compress;
     6688 +        pb_buf->b_contents_type = ab->b_type;
     6689 +        pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
     6690 +        pb->pb_payload_asz += l2hdr->b_asize;
     6691 +}
     6692 +
     6693 +/*
     6694 + * Commits a pbuf to stable storage. This routine is invoked when writing
     6695 + * ARC buffers to an L2ARC device. When the pbuf associated with the device
     6696 + * has reached its limits (either in size or in number of writes), it is
     6697 + * scheduled here for writing.
     6698 + * This function allocates some memory to temporarily hold the serialized
     6699 + * buffer to be written. This is then released in l2arc_write_done.
     6700 + */
     6701 +static void
     6702 +l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
     6703 +{
     6704 +        l2pbuf_t *pb = &dev->l2ad_pbuf;
     6705 +        uint64_t i, est_encsize, bufsize, encsize, io_size;
     6706 +        uint8_t *pb_buf;
     6707 +
     6708 +        pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
     6709 +        pb->pb_prev_asize = dev->l2ad_pbuf_asize;
     6710 +        pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
     6711 +
     6712 +        est_encsize = L2PBUF_ENCODED_SIZE(pb);
     6713 +        bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
     6714 +        pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
     6715 +        encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
     6716 +        cb->l2wcb_pbuf = pb_buf;
     6717 +        cb->l2wcb_pbuf_size = bufsize;
     6718 +
     6719 +        dev->l2ad_pbuf_daddr = dev->l2ad_hand;
     6720 +        dev->l2ad_pbuf_asize = encsize;
     6721 +        fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
     6722 +
     6723 +        io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
     6724 +        for (i = 0; i < io_size; ) {
     6725 +                zio_t *wzio;
     6726 +                uint64_t wsize = io_size - i;
     6727 +
     6728 +                if (wsize > SPA_MAXBLOCKSIZE)
     6729 +                        wsize = SPA_MAXBLOCKSIZE;
     6730 +                ASSERT(wsize >= SPA_MINBLOCKSIZE);
     6731 +                wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
     6732 +                    wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
     6733 +                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
     6734 +                DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
     6735 +                    zio_t *, wzio);
     6736 +                (void) zio_nowait(wzio);
     6737 +                i += wsize;
     6738 +        }
     6739 +
     6740 +        dev->l2ad_hand += io_size;
     6741 +        vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
     6742 +        l2arc_uberblock_update(dev, pio, cb);
     6743 +
     6744 +        ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
     6745 +        ARCSTAT_BUMP(arcstat_l2_meta_writes);
     6746 +        ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
     6747 +        ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
     6748 +        ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
     6749 +            pb->pb_payload_asz / encsize);
     6750 +}
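           +
           +/*
           + * Note that the pbuf writes above and the uberblock update are all
           + * issued as children of the same parent zio `pio', so the temporary
           + * buffers recorded in `cb' remain allocated until the whole write
           + * cycle completes and l2arc_write_done releases them.
           + */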
     6751 +
     6752 +/*
     6753 + * Returns the number of bytes occupied by the payload buffer items of
     6754 + * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
     6755 + * L2PBUF_HDR_SIZE.
     6756 + */
     6757 +static uint32_t
     6758 +l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
     6759 +{
     6760 +        uint32_t size = 0;
     6761 +        l2pbuf_buflist_t *buflist;
     6762 +
     6763 +        for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
     6764 +            buflist = list_next(pb->pb_buflists_list, buflist))
     6765 +                size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
     6766 +
     6767 +        return (size);
5141 6768  }
    