3525 Persistent L2ARC

@@ -134,10 +134,11 @@
 #include <sys/dnlc.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <zfs_fletcher.h>
+#include <sys/byteorder.h>
 
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 int arc_procfd;

@@ -305,10 +306,28 @@
         kstat_named_t arcstat_l2_asize;
         kstat_named_t arcstat_l2_hdr_size;
         kstat_named_t arcstat_l2_compress_successes;
         kstat_named_t arcstat_l2_compress_zeros;
         kstat_named_t arcstat_l2_compress_failures;
+        kstat_named_t arcstat_l2_meta_writes;
+        kstat_named_t arcstat_l2_meta_avg_size;
+        kstat_named_t arcstat_l2_meta_avg_asize;
+        kstat_named_t arcstat_l2_asize_to_meta_ratio;
+        kstat_named_t arcstat_l2_rebuild_attempts;
+        kstat_named_t arcstat_l2_rebuild_successes;
+        kstat_named_t arcstat_l2_rebuild_unsupported;
+        kstat_named_t arcstat_l2_rebuild_timeout;
+        kstat_named_t arcstat_l2_rebuild_arc_bytes;
+        kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
+        kstat_named_t arcstat_l2_rebuild_bufs;
+        kstat_named_t arcstat_l2_rebuild_bufs_precached;
+        kstat_named_t arcstat_l2_rebuild_metabufs;
+        kstat_named_t arcstat_l2_rebuild_uberblk_errors;
+        kstat_named_t arcstat_l2_rebuild_io_errors;
+        kstat_named_t arcstat_l2_rebuild_cksum_errors;
+        kstat_named_t arcstat_l2_rebuild_loop_errors;
+        kstat_named_t arcstat_l2_rebuild_abort_lowmem;
         kstat_named_t arcstat_memory_throttle_count;
         kstat_named_t arcstat_duplicate_buffers;
         kstat_named_t arcstat_duplicate_buffers_size;
         kstat_named_t arcstat_duplicate_reads;
         kstat_named_t arcstat_meta_used;

@@ -371,10 +390,28 @@
         { "l2_asize",                   KSTAT_DATA_UINT64 },
         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
+        { "l2_meta_writes",             KSTAT_DATA_UINT64 },
+        { "l2_meta_avg_size",           KSTAT_DATA_UINT64 },
+        { "l2_meta_avg_asize",          KSTAT_DATA_UINT64 },
+        { "l2_asize_to_meta_ratio",     KSTAT_DATA_UINT64 },
+        { "l2_rebuild_attempts",        KSTAT_DATA_UINT64 },
+        { "l2_rebuild_successes",       KSTAT_DATA_UINT64 },
+        { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
+        { "l2_rebuild_timeout",         KSTAT_DATA_UINT64 },
+        { "l2_rebuild_arc_bytes",       KSTAT_DATA_UINT64 },
+        { "l2_rebuild_l2arc_bytes",     KSTAT_DATA_UINT64 },
+        { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
+        { "l2_rebuild_precached",       KSTAT_DATA_UINT64 },
+        { "l2_rebuild_metabufs",        KSTAT_DATA_UINT64 },
+        { "l2_rebuild_uberblk_errors",  KSTAT_DATA_UINT64 },
+        { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
+        { "l2_rebuild_cksum_errors",    KSTAT_DATA_UINT64 },
+        { "l2_rebuild_loop_errors",     KSTAT_DATA_UINT64 },
+        { "l2_rebuild_abort_lowmem",    KSTAT_DATA_UINT64 },
         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
         { "duplicate_reads",            KSTAT_DATA_UINT64 },
         { "arc_meta_used",              KSTAT_DATA_UINT64 },

@@ -418,10 +455,29 @@
                 } else {                                                \
                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
                 }                                                       \
         }
 
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR    3
+#define ARCSTAT_F_AVG(stat, value) \
+        do { \
+                uint64_t x = ARCSTAT(stat); \
+                x = x - x / ARCSTAT_F_AVG_FACTOR + \
+                    (value) / ARCSTAT_F_AVG_FACTOR; \
+                ARCSTAT(stat) = x; \
+                _NOTE(NOTREACHED) \
+                _NOTE(CONSTCOND) \
+        } while (0)
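+
+/*
+ * For example, with ARCSTAT_F_AVG_FACTOR of 3, a stored average of 90 and
+ * an update value of 30 yield 90 - 90/3 + 30/3 = 70, i.e. each update
+ * replaces roughly one third of the running average. Note that the
+ * divisions are integer divisions, so update values smaller than
+ * ARCSTAT_F_AVG_FACTOR contribute nothing to the average.
+ */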
+
 kstat_t                 *arc_ksp;
 static arc_state_t      *arc_anon;
 static arc_state_t      *arc_mru;
 static arc_state_t      *arc_mru_ghost;
 static arc_state_t      *arc_mfu;

@@ -625,23 +681,11 @@
 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 
 /*
  * L2ARC Internals
  */
-typedef struct l2arc_dev {
-        vdev_t                  *l2ad_vdev;     /* vdev */
-        spa_t                   *l2ad_spa;      /* spa */
-        uint64_t                l2ad_hand;      /* next write location */
-        uint64_t                l2ad_start;     /* first addr on device */
-        uint64_t                l2ad_end;       /* last addr on device */
-        uint64_t                l2ad_evict;     /* last addr eviction reached */
-        boolean_t               l2ad_first;     /* first sweep through */
-        boolean_t               l2ad_writing;   /* currently writing */
-        list_t                  *l2ad_buflist;  /* buffer list */
-        list_node_t             l2ad_node;      /* device list node */
-} l2arc_dev_t;
-
+typedef struct l2arc_dev l2arc_dev_t;
 static list_t L2ARC_dev_list;                   /* device list */
 static list_t *l2arc_dev_list;                  /* device list pointer */
 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */

@@ -660,10 +704,13 @@
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
         l2arc_dev_t     *l2wcb_dev;             /* device info */
         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
+        uint8_t         *l2wcb_pbuf;            /* pbuf sent in this write */
+        uint32_t        l2wcb_pbuf_size;        /* size of committed pbuf */
+        uint8_t         *l2wcb_ub_buf;          /* uberblock in this write */
 } l2arc_write_callback_t;
 
 struct l2arc_buf_hdr {
         /* protected by arc_buf_hdr  mutex */
         l2arc_dev_t             *b_dev;         /* L2ARC device */

@@ -687,18 +734,242 @@
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_add(boolean_t from_arc);
 static void l2arc_hdr_stat_remove(void);
 
 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
     enum zio_compress c);
 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 
+typedef enum {
+        L2UBLK_BIG_ENDIAN = (1 << 0),   /* little endian assumed otherwise */
+        L2UBLK_EVICT_FIRST = (1 << 1)   /* mirror of l2ad_first in l2dev */
+} l2uberblock_flags_t;
+
+typedef struct l2uberblock {
+        uint32_t                ub_magic;
+        uint8_t                 ub_version;
+        l2uberblock_flags_t     ub_flags;
+
+        uint64_t                ub_spa_guid;
+        uint64_t                ub_birth;
+        uint64_t                ub_evict_tail;  /* current evict pointer */
+        uint64_t                ub_alloc_space; /* vdev space alloc status */
+        uint64_t                ub_pbuf_daddr;  /* address of newest pbuf */
+        uint32_t                ub_pbuf_asize;  /* size of newest pbuf */
+        zio_cksum_t             ub_pbuf_cksum;  /* fletcher4 of newest pbuf */
+
+        zio_cksum_t             ub_cksum;       /* cksum of uberblock */
+} l2uberblock_t;
+
+typedef enum {
+        L2PBUF_BIG_ENDIAN = (1 << 0),   /* little endian assumed otherwise */
+        L2PBUF_COMPRESSED = (1 << 1)    /* pbuf data items are compressed */
+} l2pbuf_flags_t;
+
+typedef struct l2pbuf {
+        uint32_t                pb_magic;
+        unsigned int            pb_version;
+        l2pbuf_flags_t          pb_flags;
+
+        uint64_t                pb_prev_daddr;  /* address of previous pbuf */
+        uint32_t                pb_prev_asize;  /* size of previous pbuf */
+        zio_cksum_t             pb_prev_cksum;  /* fletcher4 of prev. pbuf */
+
+        /*
+         * This is a set of item lists that are contained in this pbuf. Each
+         * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
+         * This serves as a soft timeout feature - once the limit of the
+         * number of item lists that a pbuf can hold is reached, the pbuf is
+         * flushed to stable storage, regardless of its total size.
+         */
+        list_t                  *pb_buflists_list;
+
+        /*
+         * Number of compressed bytes referenced by items in this pbuf and
+         * the number of lists present.
+         * This is not actually written to storage, it is only used by
+         * internal algorithms which check for when a pbuf reaches a
+         * certain size limit, after which it is flushed in a write.
+         */
+        uint64_t                pb_payload_asz;
+        /* Same thing for number of buflists */
+        int                     pb_nbuflists;
+
+        /*
+         * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
+         * This is then used by l2arc_pbuf_restore to update used space
+         * on the L2ARC vdev.
+         */
+        size_t                  pb_asize;
+} l2pbuf_t;
+
+typedef struct l2pbuf_buf l2pbuf_buf_t;
+typedef struct l2pbuf_buflist {
+        uint32_t                l2pbl_nbufs;
+        l2pbuf_buf_t            *l2pbl_bufs;
+        list_node_t             l2pbl_node;
+} l2pbuf_buflist_t;
+
+struct l2pbuf_buf {
+        dva_t                   b_dva;          /* dva of buffer */
+        uint64_t                b_birth;        /* birth txg of buffer */
+        uint64_t                b_cksum0;
+        zio_cksum_t             b_freeze_cksum;
+        uint32_t                b_size;         /* uncompressed buf size */
+        uint64_t                b_l2daddr;      /* buf location on l2dev */
+        uint32_t                b_l2asize;      /* actual buf data size */
+        enum zio_compress       b_l2compress;   /* compression applied */
+        uint16_t                b_contents_type;
+        uint32_t                b_flags;
+};
+
+struct l2arc_dev {
+        vdev_t                  *l2ad_vdev;     /* vdev */
+        spa_t                   *l2ad_spa;      /* spa */
+        uint64_t                l2ad_hand;      /* next write location */
+        uint64_t                l2ad_start;     /* first addr on device */
+        uint64_t                l2ad_end;       /* last addr on device */
+        uint64_t                l2ad_evict;     /* last addr eviction reached */
+        boolean_t               l2ad_first;     /* first sweep through */
+        boolean_t               l2ad_writing;   /* currently writing */
+        list_t                  *l2ad_buflist;  /* buffer list */
+        list_node_t             l2ad_node;      /* device list node */
+        l2pbuf_t                l2ad_pbuf;      /* currently open pbuf */
+        uint64_t                l2ad_pbuf_daddr;        /* prev pbuf daddr */
+        uint64_t                l2ad_pbuf_asize;        /* prev pbuf asize */
+        zio_cksum_t             l2ad_pbuf_cksum;        /* prev pbuf cksum */
+        /* uberblock birth counter - incremented for each committed uberblk */
+        uint64_t                l2ad_uberblock_birth;
+        /* flag indicating whether a rebuild is currently going on */
+        boolean_t               l2ad_rebuilding;
+};
+
+/* Stores information about an L2ARC prefetch zio */
+typedef struct l2arc_prefetch_info {
+        uint8_t                 *pi_buf;        /* where the zio writes to */
+        uint64_t                pi_buflen;      /* length of `buf' */
+        zio_t                   *pi_hdr_io;     /* see l2arc_pbuf_read below */
+} l2arc_prefetch_info_t;
+
+/* A single 4k l2uberblock is stored at the start of the device */
+#define L2UBERBLOCK_SIZE        4096
+#define L2UBERBLOCK_MAGIC       0x12bab10c
+#define L2UBERBLOCK_MAX_VERSION 1       /* our maximum uberblock version */
+#define L2PBUF_MAGIC            0xdb0faba6
+#define L2PBUF_MAX_VERSION      1       /* our maximum pbuf version */
+#define L2PBUF_BUF_SIZE         88      /* size of one pbuf buf entry */
+#define L2PBUF_HDR_SIZE         56      /* pbuf header excluding any payload */
+#define L2PBUF_ENCODED_SIZE(_pb) \
+        (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
+/*
+ * Allocation limit for the payload of a pbuf. This also fundamentally
+ * limits the number of bufs we can reference in a pbuf.
+ */
+#define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
+#define L2PBUF_MAX_BUFS         (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
+#define L2PBUF_COMPRESS_MINSZ   8192    /* minimum size to compress a pbuf */
+#define L2PBUF_MAXSZ            (100 * 1024 * 1024)     /* maximum pbuf size */
+#define L2PBUF_MAX_BUFLISTS     128     /* max number of buflists per pbuf */
+#define L2ARC_REBUILD_TIMEOUT   60      /* a rebuild may take at most 60s */
+#define L2PBUF_IS_FULL(_pb) \
+        ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
+        (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
+/*
+ * These are the flags we allow to persist in L2ARC pbufs. The other flags
+ * of an ARC buffer pertain to the buffer's runtime behavior.
+ */
+#define L2ARC_PERSIST_FLAGS \
+        (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
+
+/*
+ * Used during L2ARC rebuild after each read operation to check whether we
+ * haven't exceeded the rebuild timeout value.
+ */
+#define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
+        do { \
+                if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
+                        __VA_ARGS__; \
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
+                        cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
+                            "dropping remaining L2ARC metadata."); \
+                        return; \
+                } \
+                _NOTE(NOTREACHED) \
+                _NOTE(CONSTCOND) \
+        } while (0)
+
+/*
+ * Performance tuning of L2ARC persistency:
+ *
+ * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
+ *              compressing it.
+ * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
+ *              referenced from a pbuf. Once a pbuf reaches this size, it is
+ *              committed to stable storage. Ideally, there should be approx.
+ *              l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
+ * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
+ *              be buffered in a pbuf before it is committed to L2ARC. This
+ *              puts a soft temporal upper bound on pbuf commit intervals.
+ * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
+ *              pool import or when adding one manually later) will attempt
+ *              to rebuild L2ARC buffer contents. In special circumstances,
+ *              the administrator may want to set this to B_FALSE, if they
+ *              are having trouble importing a pool or attaching an L2ARC
+ *              device (e.g. the L2ARC device is slow to read in stored pbuf
+ *              metadata, or the metadata has become somehow
+ *              fragmented/unusable).
+ * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
+ *              avoid a slow L2ARC device from preventing pool import. If we
+ *              are not done rebuilding an L2ARC device by this time, we
+ *              stop the rebuild and return immediately.
+ */
+uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
+uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
+uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
+boolean_t l2arc_rebuild_enabled = B_TRUE;
+uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
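+
+/*
+ * These are plain kernel globals, so, like other ZFS tunables, they can
+ * be set at boot via /etc/system (for example
+ * "set zfs:l2arc_rebuild_enabled = 0" to disable rebuilds) or adjusted on
+ * a live system with mdb -kw.
+ */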
+
+static void l2arc_rebuild_start(l2arc_dev_t *dev);
+static void l2arc_rebuild(l2arc_dev_t *dev);
+static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
+static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
+    uint64_t guid);
+
+static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
+static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
+    zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
+static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
+    uint32_t asize);
+static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
+static void l2arc_pbuf_prefetch_abort(zio_t *zio);
+
+static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
+static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
+static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
+    uint64_t guid);
+static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
+    l2arc_write_callback_t *cb);
+
+static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
+static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
+    l2pbuf_t *pbuf);
+static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
+    uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
+static void l2arc_pbuf_init(l2pbuf_t *pb);
+static void l2arc_pbuf_destroy(l2pbuf_t *pb);
+static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
+    l2arc_write_callback_t *cb);
+static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
+static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
+    const arc_buf_hdr_t *ab, int index);
+static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
+
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
         uint8_t *vdva = (uint8_t *)dva;
         uint64_t crc = -1ULL;

@@ -1235,11 +1506,11 @@
         }
         ab->b_state = new_state;
 
         /* adjust l2arc hdr stats */
         if (new_state == arc_l2c_only)
-                l2arc_hdr_stat_add();
+                l2arc_hdr_stat_add(old_state != arc_anon);
         else if (old_state == arc_l2c_only)
                 l2arc_hdr_stat_remove();
 }
 
 void

@@ -1339,10 +1610,37 @@
         (void) refcount_add(&hdr->b_refcnt, tag);
 
         return (buf);
 }
 
+/*
+ * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc->(disk).
+ */
+arc_buf_hdr_t *
+arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
+{
+        arc_buf_hdr_t *hdr;
+
+        ASSERT3U(size, >, 0);
+        hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+        ASSERT(BUF_EMPTY(hdr));
+        hdr->b_size = size;
+        hdr->b_type = type;
+        hdr->b_spa = guid;
+        hdr->b_state = arc_anon;
+        hdr->b_arc_access = 0;
+        hdr->b_buf = NULL;
+        hdr->b_datacnt = 0;
+        hdr->b_flags = 0;
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+        return (hdr);
+}
+
 static char *arc_onloan_tag = "onloan";
 
 /*
  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
  * flight data by arc_tempreserve_space() until they are "returned". Loaned

@@ -3971,10 +4269,148 @@
  *      l2arc_write_size()      calculate how much to write
  *      l2arc_write_interval()  calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
+ *
+ * L2ARC persistency:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) Every now and then, at the end of an L2ARC feed cycle, we append a
+ *    piece of metadata (called a "pbuf", or "persistency buffer") to the
+ *    L2ARC write. This allows us to understand what's been written, so that
+ *    we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
+ *    The pbuf also includes a "back-reference" pointer to the previous
+ *    pbuf, forming a linked list of pbufs on the L2ARC device.
+ *
+ * *) We reserve 4k of space at the start of each L2ARC device for our
+ *    header bookkeeping purposes. This contains a single 4k uberblock, which
+ *    contains our top-level reference structures. We update it on each pbuf
+ *    write. If this write results in an inconsistent uberblock (e.g. due to
+ *    power failure), we detect this by verifying the uberblock's checksum
+ *    and simply drop the entries from L2ARC. Once an L2ARC pbuf update
+ *    completes, we update the uberblock to point to it.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * |       ____________newest pbuf pointer_____________                   |
+ * |      /                                            \                  |
+ * |     /                                              V                 |
+ * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
+ * |                       ^       / ^       / ^       /                  |
+ * |                       `-prev-'  `-prev-'  `-prev-'                   |
+ * |                         pbuf      pbuf      pbuf                     |
+ * +======================================================================+
+ *
+ * On-device data structures:
+ *
+ * (L2ARC persistent uberblock)
+ * struct l2uberblock {
+ *      (these fields are in network byte order)
+ *      uint32_t magic = 0x12bab10c;    l2-ber-block
+ *      uint8_t  version = 0x1;
+ *      uint8_t  reserved = 0x0;
+ *      uint16_t ublk_flags;            see l2uberblock_flags_t
+ *
+ *      (byte order of fields below determined by `ublk_flags')
+ *      uint64_t spa_guid;              what pool this l2arc dev belongs to
+ *      uint64_t birth_txg;             ublk with highest birth_txg is newest
+ *      uint64_t evict_tail;            current evict pointer on l2arc dev
+ *      uint64_t alloc_space;           how much space is alloc'd on the dev
+ *      uint64_t pbuf_daddr;            dev addr of the newest l2pbuf_t
+ *      uint32_t pbuf_asize;            size of newest pbuf
+ *      uint64_t pbuf_cksum[4];         fletcher4 of newest pbuf
+ *
+ *      uint8_t  reserved[3980] = {0x0, 0x0, ... 0x0};
+ *
+ *      uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
+ * } l2dev_uberblock;
+ *
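+ * (Size check: the fields above the reserved area add up to 84 bytes, so
+ * together with the 3980 reserved bytes the region covered by ublk_cksum
+ * is 84 + 3980 = 4064 bytes; the trailing 32-byte checksum then brings
+ * the total to the 4096 bytes of L2UBERBLOCK_SIZE.)
+ *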
+ * (L2ARC persistent buffer list)
+ * typedef struct l2pbuf_t {
+ *      (these fields are in network byte order)
+ *      uint32_t magic = 0xdb0faba6;    the-buffer-bag
+ *      uint8_t  version = 0x1;
+ *      uint8_t  reserved = 0x0;
+ *      uint16_t pbuf_flags;            see l2pbuf_flags_t
+ *
+ *      (byte order of fields below determined by `pbuf_flags')
+ *      uint64_t prev_pbuf_daddr;       previous pbuf dev addr
+ *      uint32_t prev_pbuf_asize;       previous pbuf size
+ *      uint64_t prev_pbuf_cksum[4];    fletcher4(of previous pbuf)
+ *
+ *      uint32_t items_size;            uncompressed size of `items' below
+ *      (if (pbuf_flags & compress) decompress `items' prior to decoding)
+ *      struct l2pbuf_buf_item {
+ *              (these fields mirror [l2]arc_buf_hdr fields)
+ *              uint64_t dva[2];                buffer's DVA
+ *              uint64_t birth;                 buffer's birth TXG in ARC
+ *              uint64_t cksum0;                lower 64-bits of buffer's cksum
+ *              uint64_t freeze_cksum[4];       buffer's freeze cksum
+ *              uint32_t size;                  uncompressed buffer data size
+ *              uint64_t l2daddr;               device address (offset) of buf
+ *              uint32_t l2asize;               actual space occupied by buf
+ *              uint8_t  compress;              compress algo used on data
+ *              uint8_t  contents_type;         buffer's contents type
+ *              uint16_t reserved = 0x0;        for alignment and future use
+ *              uint32_t flags;                 buffer's persistent flags
+ *      } items[];                              continues for remainder of pbuf
+ * } l2pbuf_t;
+ *
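+ * (Size check: the fixed pbuf header fields above occupy L2PBUF_HDR_SIZE =
+ * 56 bytes and each l2pbuf_buf_item takes L2PBUF_BUF_SIZE = 88 bytes, so a
+ * pbuf referencing N buffers encodes to 56 + 88 * N bytes before any
+ * optional compression of the item array.)
+ *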
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (appending
+ * an updated l2pbuf_t every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed l2pbuf (and its referenced data buffers), like so:
+ *
+ *    current write head__       __old tail
+ *                        \     /
+ *                        V    V
+ * <--|bufs|pbuf|bufs|pbuf|    |bufs|pbuf|bufs|pbuf|-->
+ *                         ^    ^^^^^^^^^_____________________________
+ *                         |                                          \
+ *                         <<nextwrite>> - will overwrite this pbuf --/
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process:
+ * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
+ *      previous one.
+ * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
+ *      then prev_pbuf is invalid and we stop scanning (goto step 3 below).
+ * 3) if (this is the last valid pbuf)
+ *      discard this pbuf as well (its ARC bufs may have been damaged by a
+ *      partial overwrite).
+ * (We could potentially salvage the remaining good arc bufs above in step 3,
+ * but the cost of doing so probably outweighs the value of the entire pbuf).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update pbufs which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time - once
+ * created, a block is immutable on disk. The worst that can happen is that
+ * we waste some time and memory during the l2arc rebuild reconstructing
+ * outdated ARC entries that will simply get dropped from the l2arc as it
+ * is updated with new blocks.
  */
 
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
 {

@@ -4037,13 +4473,14 @@
 
         return (next);
 }
 
 static void
-l2arc_hdr_stat_add(void)
+l2arc_hdr_stat_add(boolean_t from_arc)
 {
         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
-        ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+        if (from_arc)
+                ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
 }
 
 static void
 l2arc_hdr_stat_remove(void)

@@ -4074,11 +4511,14 @@
                 goto out;
 
         first = NULL;
         next = l2arc_dev_last;
         do {
-                /* loop around the list looking for a non-faulted vdev */
+                /*
+                 * Loop around the list looking for a non-faulted vdev
+                 * and one that isn't currently doing an L2ARC rebuild.
+                 */
                 if (next == NULL) {
                         next = list_head(l2arc_dev_list);
                 } else {
                         next = list_next(l2arc_dev_list, next);
                         if (next == NULL)

@@ -4089,14 +4529,14 @@
                 if (first == NULL)
                         first = next;
                 else if (next == first)
                         break;
 
-        } while (vdev_is_dead(next->l2ad_vdev));
+        } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
 
         /* if we were unable to find any usable vdevs, return NULL */
-        if (vdev_is_dead(next->l2ad_vdev))
+        if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
                 next = NULL;
 
         l2arc_dev_last = next;
 
 out:

@@ -4170,11 +4610,18 @@
         /*
          * All writes completed, or an error was hit.
          */
         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
                 ab_prev = list_prev(buflist, ab);
+                abl2 = ab->b_l2hdr;
 
+                /*
+                 * Release the temporary compressed buffer as soon as possible.
+                 */
+                if (abl2->b_compress != ZIO_COMPRESS_OFF)
+                        l2arc_release_cdata_buf(ab);
+
                 hash_lock = HDR_LOCK(ab);
                 if (!mutex_tryenter(hash_lock)) {
                         /*
                          * This buffer misses out.  It may be in a stage
                          * of eviction.  Its ARC_L2_WRITING flag will be

@@ -4182,18 +4629,10 @@
                          */
                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
                         continue;
                 }
 
-                abl2 = ab->b_l2hdr;
-
-                /*
-                 * Release the temporary compressed buffer as soon as possible.
-                 */
-                if (abl2->b_compress != ZIO_COMPRESS_OFF)
-                        l2arc_release_cdata_buf(ab);
-
                 if (zio->io_error != 0) {
                         /*
                          * Error - drop L2ARC entry.
                          */
                         list_remove(buflist, ab);

@@ -4216,10 +4655,14 @@
         kmem_cache_free(hdr_cache, head);
         mutex_exit(&l2arc_buflist_mtx);
 
         l2arc_do_free_on_write();
 
+        if (cb->l2wcb_pbuf)
+                kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
+        if (cb->l2wcb_ub_buf)
+                kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
         kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
  * A read to a cache device completed.  Validate buffer contents before

@@ -4497,16 +4940,22 @@
         l2arc_write_callback_t *cb;
         zio_t *pio, *wzio;
         uint64_t guid = spa_load_guid(spa);
         const boolean_t do_headroom_boost = *headroom_boost;
 
+        /* persistency-related */
+        l2pbuf_t *pb;
+        l2pbuf_buflist_t *pb_buflist;
+        int num_bufs, buf_index;
+
         ASSERT(dev->l2ad_vdev != NULL);
 
         /* Lower the flag now, we might want to raise it again later. */
         *headroom_boost = B_FALSE;
 
         pio = NULL;
+        cb = NULL;
         write_sz = write_asize = write_psize = 0;
         full = B_FALSE;
         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
         head->b_flags |= ARC_L2_WRITE_HEAD;
 

@@ -4514,11 +4963,14 @@
          * We will want to try to compress buffers that are at least 2x the
          * device sector size.
          */
         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
 
+        pb = &dev->l2ad_pbuf;
+        num_bufs = 0;
+
         /*
          * Copy buffers for L2ARC writing.
          */
         mutex_enter(&l2arc_buflist_mtx);
         for (int try = 0; try <= 3; try++) {
                 uint64_t passed_sz = 0;

@@ -4584,11 +5042,11 @@
                                  * l2arc_write_done() can find where the
                                  * write buffers begin without searching.
                                  */
                                 list_insert_head(dev->l2ad_buflist, head);
 
-                                cb = kmem_alloc(
+                                cb = kmem_zalloc(
                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
                                 cb->l2wcb_dev = dev;
                                 cb->l2wcb_head = head;
                                 pio = zio_root(spa, l2arc_write_done, cb,
                                     ZIO_FLAG_CANFAIL);

@@ -4626,10 +5084,11 @@
                         arc_cksum_compute(ab->b_buf, B_TRUE);
 
                         mutex_exit(hash_lock);
 
                         write_sz += buf_sz;
+                        num_bufs++;
                 }
 
                 mutex_exit(list_lock);
 
                 if (full == B_TRUE)

@@ -4642,17 +5101,20 @@
                 mutex_exit(&l2arc_buflist_mtx);
                 kmem_cache_free(hdr_cache, head);
                 return (0);
         }
 
+        /* expand the pbuf to include a new list */
+        pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
+
         /*
          * Now start writing the buffers. We're starting at the write head
          * and work backwards, retracing the course of the buffer selector
          * loop above.
          */
-        for (ab = list_prev(dev->l2ad_buflist, head); ab;
-            ab = list_prev(dev->l2ad_buflist, ab)) {
+        for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
+            ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
                 l2arc_buf_hdr_t *l2hdr;
                 uint64_t buf_sz;
 
                 /*
                  * We shouldn't need to lock the buffer here, since we flagged

@@ -4700,21 +5162,31 @@
                          */
                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
                         write_psize += buf_p_sz;
                         dev->l2ad_hand += buf_p_sz;
                 }
-        }
 
+                l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
+        }
+        ASSERT(buf_index == num_bufs);
         mutex_exit(&l2arc_buflist_mtx);
 
         ASSERT3U(write_asize, <=, target_sz);
         ARCSTAT_BUMP(arcstat_l2_writes_sent);
         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
         ARCSTAT_INCR(arcstat_l2_size, write_sz);
         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
 
+        /* Is it time to commit this pbuf? */
+        if (L2PBUF_IS_FULL(pb) &&
+            dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
+                l2arc_pbuf_commit(dev, pio, cb);
+                l2arc_pbuf_destroy(pb);
+                l2arc_pbuf_init(pb);
+        }
+
         /*
          * Bump device hand to the device start if it is approaching the end.
          * l2arc_evict() will already have evicted ahead for this case.
          */
         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {

@@ -4992,14 +5464,15 @@
         return (dev != NULL);
 }
 
 /*
  * Add a vdev for use by the L2ARC.  By this point the spa has already
- * validated the vdev and opened it.
+ * validated the vdev and opened it. The `rebuild' flag indicates whether
+ * we should attempt an L2ARC persistency rebuild.
  */
 void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
 {
         l2arc_dev_t *adddev;
 
         ASSERT(!l2arc_vdev_present(vd));
 

@@ -5007,16 +5480,17 @@
          * Create a new l2arc device entry.
          */
         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
         adddev->l2ad_spa = spa;
         adddev->l2ad_vdev = vd;
-        adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+        adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
         adddev->l2ad_hand = adddev->l2ad_start;
         adddev->l2ad_evict = adddev->l2ad_start;
         adddev->l2ad_first = B_TRUE;
         adddev->l2ad_writing = B_FALSE;
+        l2arc_pbuf_init(&adddev->l2ad_pbuf);
 
         /*
          * This is a list of all ARC buffers that are still valid on the
          * device.
          */

@@ -5030,10 +5504,15 @@
          * Add device to global list
          */
         mutex_enter(&l2arc_dev_mtx);
         list_insert_head(l2arc_dev_list, adddev);
         atomic_inc_64(&l2arc_ndev);
+        if (rebuild && l2arc_rebuild_enabled) {
+                adddev->l2ad_rebuilding = B_TRUE;
+                (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
+                    0, &p0, TS_RUN, minclsyspri);
+        }
         mutex_exit(&l2arc_dev_mtx);
 }
 
 /*
  * Remove a vdev from the L2ARC.

@@ -5065,10 +5544,11 @@
         mutex_exit(&l2arc_dev_mtx);
 
         /*
          * Clear all buflists and ARC references.  L2ARC device flush.
          */
+        l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
         l2arc_evict(remdev, 0, B_TRUE);
         list_destroy(remdev->l2ad_buflist);
         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
         kmem_free(remdev, sizeof (l2arc_dev_t));
 }

@@ -5136,6 +5616,1153 @@
         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
         l2arc_thread_exit = 1;
         while (l2arc_thread_exit != 0)
                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
         mutex_exit(&l2arc_feed_thr_lock);
+}
+
+/*
+ * Main entry point for L2ARC metadata rebuilding. This function must be
+ * called via thread_create so that the L2ARC metadata rebuild doesn't block
+ * pool import and may proceed in parallel on all available L2ARC devices.
+ */
+static void
+l2arc_rebuild_start(l2arc_dev_t *dev)
+{
+        vdev_t *vd = dev->l2ad_vdev;
+        spa_t *spa = dev->l2ad_spa;
+
+        /* Lock out device removal. */
+        spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+        ASSERT(dev->l2ad_rebuilding == B_TRUE);
+        l2arc_rebuild(dev);
+        dev->l2ad_rebuilding = B_FALSE;
+        spa_config_exit(spa, SCL_L2ARC, vd);
+        thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ *
+ * 1) scans the device for valid l2uberblocks
+ * 2) if it finds a good uberblock, starts reading the pbuf chain
+ * 3) restores each pbuf's contents to memory
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the pbuf chain (the previous-buffer reference
+ *    in the pbuf is zero).
+ * 2) We encounter *any* error condition (cksum errors, io errors, looped
+ *    pbufs, etc.).
+ * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
+ *    severely fragmented L2ARC pbufs or slow L2ARC devices from preventing
+ *    a machine from importing the pool (and to let the administrator take
+ *    corrective action, e.g. by kicking the misbehaving L2ARC device out
+ *    of the pool, or by reimporting the pool with L2ARC rebuilding
+ *    disabled).
+ */
+static void
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+        int err;
+        l2uberblock_t ub;
+        l2pbuf_t pb;
+        zio_t *this_io = NULL, *next_io = NULL;
+        int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
+
+        if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
+                return;
+        L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
+
+        /* set up uberblock update info */
+        dev->l2ad_uberblock_birth = ub.ub_birth + 1;
+
+        /* initial sanity checks */
+        l2arc_pbuf_init(&pb);
+        if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
+            ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
+                /* root pbuf is bad, we can't do anything about that */
+                if (err == EINVAL) {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
+                } else {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
+                }
+                l2arc_pbuf_destroy(&pb);
+                return;
+        }
+        L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
+
+        dev->l2ad_evict = ub.ub_evict_tail;
+
+        /* keep on chaining in new blocks */
+        dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
+        dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
+        dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
+        dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
+            ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
+        dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
+
+        /* start the rebuild process */
+        for (;;) {
+                l2pbuf_t pb_prev;
+
+                l2arc_pbuf_init(&pb_prev);
+                if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
+                    pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
+                    &next_io)) != 0) {
+                        /*
+                         * We are done reading, discard the last good buffer.
+                         */
+                        if (pb.pb_prev_daddr > dev->l2ad_hand &&
+                            pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
+                                /* this is an error, we stopped too early */
+                                if (err == EINVAL) {
+                                        ARCSTAT_BUMP(
+                                            arcstat_l2_rebuild_cksum_errors);
+                                } else {
+                                        ARCSTAT_BUMP(
+                                            arcstat_l2_rebuild_io_errors);
+                                }
+                        }
+                        l2arc_pbuf_destroy(&pb_prev);
+                        l2arc_pbuf_destroy(&pb);
+                        break;
+                }
+
+                /*
+                 * Protection against infinite loops of pbufs. This is also
+                 * our primary termination mechanism - once the buffer list
+                 * loops around our starting pbuf, we can stop.
+                 */
+                if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
+                    pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
+                        l2arc_pbuf_destroy(&pb);
+                        l2arc_pbuf_destroy(&pb_prev);
+                        if (next_io)
+                                l2arc_pbuf_prefetch_abort(next_io);
+                        return;
+                }
+
+                /*
+                 * Our memory pressure valve. If the system is running low
+                 * on memory, rather than swamping memory with new ARC buf
+                 * hdrs, we opt not to reconstruct the L2ARC. At this point,
+                 * however, we have already set up our L2ARC dev to chain in
+                 * new metadata pbufs, so the user may choose to re-add the
+                 * L2ARC dev at a later time to reconstruct it (when there's
+                 * less memory pressure).
+                 */
+                if (arc_reclaim_needed()) {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+                        cmn_err(CE_NOTE, "System running low on memory, "
+                            "aborting L2ARC rebuild.");
+                        l2arc_pbuf_destroy(&pb);
+                        l2arc_pbuf_destroy(&pb_prev);
+                        if (next_io)
+                                l2arc_pbuf_prefetch_abort(next_io);
+                        break;
+                }
+
+                /*
+                 * Now that we know that the prev_pbuf checks out alright, we
+                 * can start reconstruction from this pbuf - we can be sure
+                 * that the L2ARC write hand has not yet reached any of our
+                 * buffers.
+                 */
+                l2arc_pbuf_restore(dev, &pb);
+
+                /* pbuf restored, continue with next one in the list */
+                l2arc_pbuf_destroy(&pb);
+                pb = pb_prev;
+                this_io = next_io;
+                next_io = NULL;
+
+                L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
+        }
+
+        ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
+}
+
+/*
+ * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
+ * which only contain an l2arc hdr, essentially restoring the buffers to
+ * their L2ARC evicted state. This function also updates space usage on the
+ * L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
+{
+        spa_t *spa;
+        uint64_t guid;
+        list_t *buflists_list;
+        l2pbuf_buflist_t *buflist;
+
+        mutex_enter(&l2arc_buflist_mtx);
+        spa = dev->l2ad_vdev->vdev_spa;
+        guid = spa_load_guid(spa);
+        buflists_list = pb->pb_buflists_list;
+        for (buflist = list_head(buflists_list); buflist;
+            buflist = list_next(buflists_list, buflist)) {
+                int i;
+                uint64_t size, asize, psize;
+
+                size = asize = psize = 0;
+                for (i = 0; i < buflist->l2pbl_nbufs; i++) {
+                        l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
+                            guid);
+                        size += buflist->l2pbl_bufs[i].b_size;
+                        asize += buflist->l2pbl_bufs[i].b_l2asize;
+                        psize += vdev_psize_to_asize(dev->l2ad_vdev,
+                            buflist->l2pbl_bufs[i].b_l2asize);
+                }
+                ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
+                ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
+                ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
+                vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
+        }
+        mutex_exit(&l2arc_buflist_mtx);
+        ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
+        vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
+            pb->pb_asize), 0, 0);
+}
+
+/*
+ * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
+ * a state indicating that it has been evicted to L2ARC.
+ * The `guid' here is the ARC-load-guid from spa_load_guid.
+ */
+static void
+l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
+{
+        arc_buf_hdr_t *hdr;
+        kmutex_t *hash_lock;
+        dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
+
+        hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
+        if (hdr == NULL) {
+                /* not in cache, try to insert */
+                arc_buf_hdr_t *exists;
+                arc_buf_contents_t type = buf->b_contents_type;
+                l2arc_buf_hdr_t *l2hdr;
+
+                hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
+                hdr->b_dva = buf->b_dva;
+                hdr->b_birth = buf->b_birth;
+                hdr->b_cksum0 = buf->b_cksum0;
+                hdr->b_size = buf->b_size;
+                exists = buf_hash_insert(hdr, &hash_lock);
+                if (exists) {
+                        /* somebody beat us to the hash insert */
+                        mutex_exit(hash_lock);
+                        arc_hdr_destroy(hdr);
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+                        return;
+                }
+                hdr->b_flags = buf->b_flags;
+                mutex_enter(&hdr->b_freeze_lock);
+                ASSERT(hdr->b_freeze_cksum == NULL);
+                hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+                    KM_SLEEP);
+                *hdr->b_freeze_cksum = buf->b_freeze_cksum;
+                mutex_exit(&hdr->b_freeze_lock);
+
+                /* now rebuild the l2arc entry */
+                ASSERT(hdr->b_l2hdr == NULL);
+                l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+                l2hdr->b_dev = dev;
+                l2hdr->b_daddr = buf->b_l2daddr;
+                l2hdr->b_asize = buf->b_l2asize;
+                l2hdr->b_compress = buf->b_l2compress;
+                hdr->b_l2hdr = l2hdr;
+                list_insert_head(dev->l2ad_buflist, hdr);
+                ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
+                ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
+
+                arc_change_state(arc_l2c_only, hdr, hash_lock);
+        }
+        mutex_exit(hash_lock);
+}
+
+/*
+ * Attempts to locate and read the newest valid uberblock on the provided
+ * L2ARC device and writes it to `ub'. On success, this function returns 0,
+ * otherwise the appropriate error code is returned.
+ */
+static int
+l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
+{
+        int err = 0;
+        uint8_t *ub_buf;
+        uint64_t guid;
+
+        ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
+        ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
+        guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+        if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+            VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
+            ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+            ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
+                ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
+                goto cleanup;
+        }
+
+        /*
+         * Initial peek - does the device even have any usable uberblocks?
+         * If not, don't bother continuing.
+         */
+        l2arc_uberblock_decode(ub_buf, ub);
+        if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
+            ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
+            ub->ub_spa_guid != guid) {
+                err = ENOTSUP;
+                ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
+                goto cleanup;
+        }
+
+        /* now check to make sure that what we selected is okay */
+        if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
+                if (err == EINVAL) {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
+                } else {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
+                }
+                goto cleanup;
+        }
+
+        /* this uberblock is valid */
+
+cleanup:
+        kmem_free(ub_buf, L2UBERBLOCK_SIZE);
+        return (err);
+}
+
+/*
+ * Reads a pbuf from storage, decodes it and validates its contents against
+ * the provided checksum. The result is placed in `pb'.
+ *
+ * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
+ * When issuing the first pbuf IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the pbuf and
+ * also issue an async IO to fetch the next pbuf in the pbuf chain. The
+ * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
+ * function, pass the value returned in `prefetch_io' from the previous
+ * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
+ * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
+ * pointer to be NULL. If no prefetch IO was issued, the pointer is left
+ * set at NULL.
+ *
+ * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
+ * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
+ * IO is used internally in this function to be able to `peek' at the next
+ * buffer's header before the main IO to read it in completely has finished.
+ * We can then begin to issue the IO for the next buffer in the chain before
+ * we are done reading, keeping the L2ARC device's pipeline saturated with
+ * reads (rather than issuing an IO, waiting for it to complete, validating
+ * the returned buffer and issuing the next one). This will make sure that
+ * the rebuild proceeds at maximum read throughput.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the prefetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of prefetch IOs.
+ */
+static int
+l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
+    zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
+{
+        int err = 0;
+        uint64_t prev_pb_start;
+        uint32_t prev_pb_asize;
+        zio_cksum_t calc_cksum, prev_pb_cksum;
+        l2arc_prefetch_info_t *pi = NULL;
+
+        ASSERT(dev != NULL);
+        ASSERT(pb != NULL);
+        ASSERT(*prefetch_io == NULL);
+
+        if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
+                /* We could not have issued a prefetch IO for this */
+                ASSERT(this_io == NULL);
+                return (EINVAL);
+        }
+
+        /*
+         * Check to see if we have issued the IO for this pbuf in a previous
+         * run. If not, issue it now.
+         */
+        if (this_io == NULL)
+                this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
+
+        /* Pick up the prefetch info buffer and read its contents */
+        pi = this_io->io_private;
+        ASSERT(pi != NULL);
+        ASSERT(asize <= pi->pi_buflen);
+
+        /* Wait for the IO to read this pbuf's header to complete */
+        if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
+                (void) zio_wait(this_io);
+                goto cleanup;
+        }
+
+        /*
+         * Peek to see if we can start issuing the next pbuf IO immediately.
+         * At this point, only the current pbuf's header has been read.
+         */
+        if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
+            &prev_pb_asize, &prev_pb_cksum) == 0) {
+                uint64_t this_pb_start, this_pb_end, prev_pb_end;
+                /* Detect malformed pbuf references and loops */
+                this_pb_start = daddr;
+                this_pb_end = daddr + asize;
+                prev_pb_end = prev_pb_start + prev_pb_asize;
+                if ((prev_pb_start >= this_pb_start && prev_pb_start <
+                    this_pb_end) ||
+                    (prev_pb_end >= this_pb_start && prev_pb_end <
+                    this_pb_end)) {
+                        ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
+                        cmn_err(CE_WARN, "Looping L2ARC metadata reference "
+                            "detected, aborting rebuild.");
+                        err = EINVAL;
+                        goto cleanup;
+                }
+                /*
+                 * Start issuing IO for the next pbuf early - this should
+                 * help keep the L2ARC device busy while we read, decode
+                 * and restore this pbuf.
+                 */
+                if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
+                        *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
+                            prev_pb_start, prev_pb_asize);
+        }
+
+        /* Wait for the main pbuf IO to complete */
+        if ((err = zio_wait(this_io)) != 0)
+                goto cleanup;
+
+        /* Make sure the buffer checks out ok */
+        fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
+        if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
+                err = EINVAL;
+                goto cleanup;
+        }
+
+        /* Now we can take our time decoding this buffer */
+        if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
+                goto cleanup;
+
+        /* This will be used in l2arc_pbuf_restore for space accounting */
+        pb->pb_asize = asize;
+
+        ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
+        ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
+        ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
+            pb->pb_payload_asz / asize);
+
+cleanup:
+        kmem_free(pi->pi_buf, pi->pi_buflen);
+        pi->pi_buf = NULL;
+        kmem_free(pi, sizeof (l2arc_prefetch_info_t));
+        /* Abort an in-flight prefetch in case of error */
+        if (err != 0 && *prefetch_io != NULL) {
+                l2arc_pbuf_prefetch_abort(*prefetch_io);
+                *prefetch_io = NULL;
+        }
+        return (err);
+}
+
+/*
+ * Validates a pbuf device address to make sure that it can be read
+ * from the provided L2ARC device. Returns 1 if the address is within
+ * the device's bounds, or 0 if not.
+ */
+static int
+l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
+{
+        uint32_t psize;
+        uint64_t end;
+
+        psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
+        end = daddr + psize;
+
+        if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
+            asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
+            /* check that the buffer address is correctly aligned */
+            (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
+            SPA_MINBLOCKSIZE) - 1)) != 0)
+                return (0);
+        else
+                return (1);
+}
+
+/*
+ * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
+ * reconstruction to start reading the next pbuf before we are done
+ * decoding and reconstructing the current pbuf, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain a newly allocated memory buffer for the IO
+ * data, which should then be freed by the caller once the zio is no longer
+ * needed (i.e. once it has completed). If you wish to abort this
+ * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
+ * of disposing of the allocated buffers correctly.
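+ *
+ * A sketch of the intended calling pattern during rebuild (based on how
+ * l2arc_pbuf_read consumes and hands off these zios; not a verbatim copy
+ * of the caller):
+ *
+ *      this_io = NULL;
+ *      while (more pbufs remain in the on-disk chain) {
+ *              err = l2arc_pbuf_read(..., this_io, &prefetch_io);
+ *              this_io = prefetch_io;  (hand the prefetch to the next pass)
+ *      }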
+ */
+static zio_t *
+l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
+{
+        uint32_t i, psize;
+        zio_t *pio, *hdr_io;
+        uint64_t hdr_rsize;
+        uint8_t *buf;
+        l2arc_prefetch_info_t *pinfo;
+
+        psize = vdev_psize_to_asize(vd, asize);
+        buf = kmem_alloc(psize, KM_SLEEP);
+        pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
+        pinfo->pi_buf = buf;
+        pinfo->pi_buflen = psize;
+
+        /*
+         * We start issuing the IO for the pbuf header early. This
+         * allows l2arc_pbuf_read to start issuing IO for the next
+         * buffer before the current pbuf is read in completely.
+         */
+
+        hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
+        ASSERT(hdr_rsize <= psize);
+        pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
+            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+            ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+        hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
+            ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+            ZIO_FLAG_DONT_RETRY, B_FALSE);
+        (void) zio_nowait(hdr_io);
+
+        /*
+         * Read in the rest of the pbuf - this can take longer than just
+         * having a peek at the header.
+         */
+        pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
+            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+            ZIO_FLAG_DONT_RETRY);
+        for (i = hdr_rsize; i < psize; ) {
+                uint64_t rsize = psize - i;
+                zio_t *rzio;
+
+                if (rsize > SPA_MAXBLOCKSIZE)
+                        rsize = SPA_MAXBLOCKSIZE;
+                ASSERT(rsize >= SPA_MINBLOCKSIZE);
+                rzio = zio_read_phys(pio, vd, daddr + i,
+                    rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
+                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
+                    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+                    ZIO_FLAG_DONT_RETRY, B_FALSE);
+                (void) zio_nowait(rzio);
+                i += rsize;
+        }
+
+        return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_pbuf_prefetch_abort(zio_t *zio)
+{
+        l2arc_prefetch_info_t *pi;
+
+        pi = zio->io_private;
+        ASSERT(pi != NULL);
+        if (pi->pi_hdr_io != NULL)
+                (void) zio_wait(pi->pi_hdr_io);
+        (void) zio_wait(zio);
+        kmem_free(pi->pi_buf, pi->pi_buflen);
+        pi->pi_buf = NULL;
+        kmem_free(pi, sizeof (l2arc_prefetch_info_t));
+}
+
+/*
+ * Encodes an l2uberblock_t structure into a destination buffer. This
+ * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
+ * uberblock is always of this constant size.
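+ *
+ * A rough sketch of the on-disk layout, as implied by the byte offsets
+ * used in the encode/decode routines (magic, version and flags are
+ * stored big-endian; the remaining fields are in the writer's native
+ * byte order, with the flags recording which that was):
+ *
+ *      byte  0: magic (4)          byte  4: version (1)
+ *      byte  6: flags (2)          byte  8: spa_guid (8)
+ *      byte 16: birth (8)          byte 24: evict_tail (8)
+ *      byte 32: alloc_space (8)    byte 40: pbuf_daddr (8)
+ *      byte 48: pbuf_asize (4)     byte 52: pbuf_cksum (32)
+ *      byte L2UBERBLOCK_SIZE - 32: fletcher-4 checksum of all
+ *          preceding bytes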
+ */
+static void
+l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
+{
+        zio_cksum_t cksum;
+
+        bzero(buf, L2UBERBLOCK_SIZE);
+
+#if defined(_BIG_ENDIAN)
+        *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
+        *(uint16_t *)(buf + 6) = L2UBLK_BIG_ENDIAN;
+#else   /* !defined(_BIG_ENDIAN) */
+        *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
+        /* zero flags is ok */
+#endif  /* !defined(_BIG_ENDIAN) */
+        buf[4] = L2UBERBLOCK_MAX_VERSION;
+
+        /* rest in native byte order */
+        *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
+        *(uint64_t *)(buf + 16) = ub->ub_birth;
+        *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
+        *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
+        *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
+        *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
+        bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
+
+        fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
+        bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
+}
+
+/*
+ * Decodes an l2uberblock_t from its on-disk representation. Please note
+ * that this function does not perform any uberblock validation or
+ * checksum verification - call l2arc_uberblock_verify() for that.
+ */
+static void
+l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
+{
+        boolean_t bswap_needed;
+
+        /* these always come in big endian */
+#if defined(_BIG_ENDIAN)
+        ub->ub_magic = *(uint32_t *)buf;
+        ub->ub_flags = *(uint16_t *)(buf + 6);
+        bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) == 0);
+#else   /* !defined(_BIG_ENDIAN) */
+        ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
+        ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
+        bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
+#endif  /* !defined(_BIG_ENDIAN) */
+        ub->ub_version = buf[4];
+
+        ub->ub_spa_guid = *(uint64_t *)(buf + 8);
+        ub->ub_birth = *(uint64_t *)(buf + 16);
+        ub->ub_evict_tail = *(uint64_t *)(buf + 24);
+        ub->ub_alloc_space = *(uint64_t *)(buf + 32);
+        ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
+        ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
+        bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
+        bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
+
+        /* swap the rest if endianness doesn't match us */
+        if (bswap_needed) {
+                ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
+                ub->ub_birth = BSWAP_64(ub->ub_birth);
+                ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
+                ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
+                ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
+                ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
+                ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
+                ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
+        }
+}
+
+/*
+ * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
+ * valid and matches its checksum.
+ */
+static int
+l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
+    uint64_t guid)
+{
+        zio_cksum_t cksum;
+
+        if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
+            ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
+                /*
+                 * bad magic or invalid version => persistent l2arc not
+                 * supported
+                 */
+                return (ENOTSUP);
+
+        if (ub->ub_spa_guid != guid)
+                /* this l2arc dev isn't ours */
+                return (EINVAL);
+
+        fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
+        if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
+                /* bad checksum, corrupt uberblock */
+                return (EINVAL);
+
+        return (0);
+}
+
+/*
+ * Schedules a zio to update the uberblock on an l2arc device. The zio is
+ * initiated as a child of `pio' and `cb' is filled with the information
+ * needed to free the uberblock data buffer after writing.
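+ * The uberblock always lives at a fixed device offset
+ * (VDEV_LABEL_START_SIZE), so each update simply overwrites the
+ * previous copy in place.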
+ */
+static void
+l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+        uint8_t *ub_buf;
+        l2uberblock_t ub;
+        zio_t *wzio;
+        vdev_stat_t st;
+
+        ASSERT(cb->l2wcb_ub_buf == NULL);
+        vdev_get_stats(dev->l2ad_vdev, &st);
+
+        bzero(&ub, sizeof (ub));
+        ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+        ub.ub_birth = dev->l2ad_uberblock_birth++;
+        ub.ub_evict_tail = dev->l2ad_evict;
+        ub.ub_alloc_space = st.vs_alloc;
+        ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
+        ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
+        ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
+        if (dev->l2ad_first)
+                ub.ub_flags |= L2UBLK_EVICT_FIRST;
+
+        ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
+        cb->l2wcb_ub_buf = ub_buf;
+        l2arc_uberblock_encode(&ub, ub_buf);
+        wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
+            L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
+            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+        DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+            zio_t *, wzio);
+        (void) zio_nowait(wzio);
+}
+
+/*
+ * Encodes a l2pbuf_t structure into the portable on-disk format. The
+ * `buf' buffer must be suitably sized to hold the entire uncompressed
+ * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
+ * also compresses the buffer.
+ *
+ * The return value is the length of the resulting encoded pbuf structure.
+ * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
+ * was applied, or smaller if compression was applied. In either case,
+ * prior to writing to disk, the caller must suitably pad the output
+ * buffer so that it is aligned on a multiple of the underlying storage
+ * system's block size.
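+ *
+ * A rough sketch of the encoded layout, as implied by the byte offsets
+ * used in the encode/decode routines (magic, version and flags are
+ * stored big-endian; everything else is in the writer's native byte
+ * order):
+ *
+ * Header (L2PBUF_HDR_SIZE bytes):
+ *      byte  0: magic (4)          byte  4: version (1)
+ *      byte  6: flags (2)          byte  8: prev_daddr (8)
+ *      byte 16: prev_asize (4)     byte 20: prev_cksum (32)
+ *      byte 52: payload size, uncompressed (4)
+ *
+ * Payload: L2PBUF_BUF_SIZE bytes per buffer item (possibly
+ * lz4-compressed as a whole):
+ *      byte  0: dva (16)           byte 16: birth (8)
+ *      byte 24: cksum0 (8)         byte 32: freeze_cksum (32)
+ *      byte 64: size (4)           byte 68: l2daddr (8)
+ *      byte 76: l2asize (4)        byte 80: compress (1)
+ *      byte 81: contents_type (1)  byte 84: flags (4)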
+ */
+static uint32_t
+l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
+{
+        uint16_t flags = 0;
+        uint8_t *dst_buf;
+        uint32_t enclen;
+        l2pbuf_buflist_t *buflist;
+
+        enclen = L2PBUF_ENCODED_SIZE(pb);
+        ASSERT(buflen >= enclen);
+        bzero(buf, enclen);
+
+        /* non-header portions of pbufs are in native byte order */
+        *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
+        *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
+        bcopy(&pb->pb_prev_cksum, buf + 20, 32);
+        *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
+
+        /* first we encode the buflists uncompressed */
+        dst_buf = buf + L2PBUF_HDR_SIZE;
+        for (buflist = list_head(pb->pb_buflists_list); buflist;
+            buflist = list_next(pb->pb_buflists_list, buflist)) {
+                int i;
+
+                ASSERT(buflist->l2pbl_nbufs != 0);
+                for (i = 0; i < buflist->l2pbl_nbufs; i++) {
+                        l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
+
+                        ASSERT(pbl_buf->b_size != 0);
+                        *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
+                        *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
+                        *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
+                        *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
+                        bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
+                        *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
+                        *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
+                        *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
+                        dst_buf[80] = pbl_buf->b_l2compress;
+                        dst_buf[81] = pbl_buf->b_contents_type;
+                        *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
+                        dst_buf += L2PBUF_BUF_SIZE;
+                }
+        }
+        ASSERT((uint32_t)(dst_buf - buf) == enclen);
+
+        /* and then compress them if necessary */
+        if (enclen >= l2arc_pbuf_compress_minsz) {
+                uint8_t *cbuf;
+                size_t slen, clen;
+
+                slen = l2arc_pbuf_items_encoded_size(pb);
+                cbuf = kmem_alloc(slen, KM_SLEEP);
+                clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
+                ASSERT(clen != 0);
+                if (clen < slen) {
+                        bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
+                        flags |= L2PBUF_COMPRESSED;
+                        /* zero out the rest of the input buffer */
+                        bzero(buf + L2PBUF_HDR_SIZE + clen,
+                            buflen - (L2PBUF_HDR_SIZE + clen));
+                        /* adjust our buffer length now that it's shortened */
+                        enclen = L2PBUF_HDR_SIZE + clen;
+                }
+                kmem_free(cbuf, slen);
+        }
+
+        /* the header goes last since `flags' may change due to compression */
+#if defined(_BIG_ENDIAN)
+        *(uint32_t *)buf = L2PBUF_MAGIC;
+        flags |= L2PBUF_BIG_ENDIAN;
+        *(uint16_t *)(buf + 6) = flags;
+#else   /* !defined(_BIG_ENDIAN) */
+        *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
+        *(uint16_t *)(buf + 6) = BSWAP_16(flags);
+#endif  /* !defined(_BIG_ENDIAN) */
+        buf[4] = L2PBUF_MAX_VERSION;
+
+        return (enclen);
+}
+
+/*
+ * Decodes a stored l2pbuf_t structure previously encoded using
+ * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
+ * must be initialized by l2arc_pbuf_init by the caller beforehand, but
+ * must not have been used to store any buffers yet.
+ *
+ * Please note that we don't do checksum verification here, as we don't
+ * know our own checksum (that's known by the previous block in the linked
+ * list, or by the uberblock). This should be performed by the caller
+ * prior to calling l2arc_pbuf_decode.
+ */
+static int
+l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
+{
+        boolean_t bswap_needed;
+        uint32_t payload_sz, payload_asz;
+        uint8_t *src_bufs;
+        l2pbuf_buflist_t *buflist;
+        int i, nbufs;
+
+        ASSERT(input_buf != NULL);
+        ASSERT(pb != NULL);
+        ASSERT(pb->pb_version != 0);
+        ASSERT(pb->pb_nbuflists == 0);
+
+        /* no valid buffer can be this small */
+        if (buflen < L2PBUF_HDR_SIZE)
+                return (EINVAL);
+
+        /* these always come in big endian */
+#if defined(_BIG_ENDIAN)
+        pb->pb_magic = *(uint32_t *)input_buf;
+        pb->pb_flags = *(uint16_t *)(input_buf + 6);
+        bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
+#else   /* !defined(_BIG_ENDIAN) */
+        pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
+        pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
+        bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
+#endif  /* !defined(_BIG_ENDIAN) */
+        pb->pb_version = input_buf[4];
+
+        if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
+                return (EINVAL);
+        if (pb->pb_version > L2PBUF_MAX_VERSION)
+                return (ENOTSUP);
+
+        /* remainder of pbuf may need bswap'ping */
+        pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
+        pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
+        bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
+        payload_sz = *(uint32_t *)(input_buf + 52);
+        payload_asz = buflen - L2PBUF_HDR_SIZE;
+
+        if (bswap_needed) {
+                pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
+                pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
+                ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
+                payload_sz = BSWAP_32(payload_sz);
+        }
+
+        /* check for sensible buffer allocation limits */
+        if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
+            (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
+            (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
+                return (EINVAL);
+        nbufs = payload_sz / L2PBUF_BUF_SIZE;
+
+        /* decompression might be needed */
+        if (pb->pb_flags & L2PBUF_COMPRESSED) {
+                src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
+                if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
+                    payload_asz, payload_sz, 0) != 0) {
+                        kmem_free(src_bufs, payload_sz);
+                        return (EINVAL);
+                }
+        } else {
+                src_bufs = input_buf + L2PBUF_HDR_SIZE;
+        }
+
+        /* Decode individual pbuf items from our source buffer. */
+        buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
+        for (i = 0; i < nbufs; i++) {
+                l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
+                const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
+
+                pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
+                pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
+                pbl_buf->b_birth = *(uint64_t *)(src + 16);
+                pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
+                bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
+                pbl_buf->b_size = *(uint32_t *)(src + 64);
+                pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
+                pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
+                pbl_buf->b_l2compress = src[80];
+                pbl_buf->b_contents_type = src[81];
+                pbl_buf->b_flags = *(uint32_t *)(src + 84);
+
+                if (bswap_needed) {
+                        pbl_buf->b_dva.dva_word[0] =
+                            BSWAP_64(pbl_buf->b_dva.dva_word[0]);
+                        pbl_buf->b_dva.dva_word[1] =
+                            BSWAP_64(pbl_buf->b_dva.dva_word[1]);
+                        pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
+                        pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
+                        ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
+                        pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
+                        pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
+                        pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
+                        pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
+                }
+
+                pb->pb_payload_asz += pbl_buf->b_l2asize;
+        }
+
+        if (pb->pb_flags & L2PBUF_COMPRESSED)
+                kmem_free(src_bufs, payload_sz);
+
+        return (0);
+}
+
+/*
+ * Decodes the previous buffer pointer encoded in a pbuf. This is used
+ * during L2ARC reconstruction to "peek" at the next buffer and start
+ * issuing IO to fetch it early, before decoding of the current buffer
+ * is done (which can take time due to decompression).
+ * Returns 0 on success (and fills in the return parameters `daddr',
+ * `asize' and `cksum' with the info of the previous pbuf), and an errno
+ * on error.
+ */
+static int
+l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
+    uint32_t *asize, zio_cksum_t *cksum)
+{
+        boolean_t bswap_needed;
+        uint16_t version, flags;
+        uint32_t magic;
+
+        ASSERT(buf != NULL);
+
+        /* no valid buffer can be this small */
+        if (buflen <= L2PBUF_HDR_SIZE)
+                return (EINVAL);
+
+        /* these always come in big endian */
+#if defined(_BIG_ENDIAN)
+        magic = *(uint32_t *)buf;
+        flags = *(uint16_t *)(buf + 6);
+        bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
+#else   /* !defined(_BIG_ENDIAN) */
+        magic = BSWAP_32(*(uint32_t *)buf);
+        flags = BSWAP_16(*(uint16_t *)(buf + 6));
+        bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
+#endif  /* !defined(_BIG_ENDIAN) */
+        version = buf[4];
+
+        if (magic != L2PBUF_MAGIC || version == 0)
+                return (EINVAL);
+        if (version > L2PBUF_MAX_VERSION)
+                return (ENOTSUP);
+
+        *daddr = *(uint64_t *)(buf + 8);
+        *asize = *(uint32_t *)(buf + 16);
+        bcopy(buf + 20, cksum, 32);
+
+        if (bswap_needed) {
+                *daddr = BSWAP_64(*daddr);
+                *asize = BSWAP_32(*asize);
+                ZIO_CHECKSUM_BSWAP(cksum);
+        }
+
+        return (0);
+}
+
+/*
+ * Initializes a pbuf structure into a clean state. All version and flags
+ * fields are filled in as appropriate for this architecture.
+ * If the structure was used before, first call l2arc_pbuf_destroy on it,
+ * as this function assumes the structure is uninitialized.
+ */
+static void
+l2arc_pbuf_init(l2pbuf_t *pb)
+{
+        bzero(pb, sizeof (l2pbuf_t));
+        pb->pb_version = L2PBUF_MAX_VERSION;
+#if defined(_BIG_ENDIAN)
+        pb->pb_flags |= L2PBUF_BIG_ENDIAN;
+#endif
+        pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+        list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
+            offsetof(l2pbuf_buflist_t, l2pbl_node));
+}
+
+/*
+ * Destroys a pbuf structure and puts it into a clean state ready to be
+ * initialized by l2arc_pbuf_init. All buflists created by
+ * l2arc_pbuf_buflist_alloc are released as well.
+ */
+static void
+l2arc_pbuf_destroy(l2pbuf_t *pb)
+{
+        list_t *buflist_list = pb->pb_buflists_list;
+        l2pbuf_buflist_t *buflist;
+
+        while ((buflist = list_head(buflist_list)) != NULL) {
+                ASSERT(buflist->l2pbl_nbufs > 0);
+                kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
+                    buflist->l2pbl_nbufs);
+                list_remove(buflist_list, buflist);
+                kmem_free(buflist, sizeof (l2pbuf_buflist_t));
+        }
+        pb->pb_nbuflists = 0;
+        list_destroy(pb->pb_buflists_list);
+        kmem_free(pb->pb_buflists_list, sizeof (list_t));
+        bzero(pb, sizeof (l2pbuf_t));
+}
+
+/*
+ * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
+ * buffers. This is used during the buffer write cycle - each cycle allocates
+ * a new buflist and fills it with buffers it writes. Then, when the pbuf
+ * reaches its buflist limit, it is committed to stable storage.
+ */
+static l2pbuf_buflist_t *
+l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
+{
+        l2pbuf_buflist_t *buflist;
+
+        ASSERT(pb->pb_buflists_list != NULL);
+        buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
+        buflist->l2pbl_nbufs = nbufs;
+        buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
+            KM_SLEEP);
+        list_insert_tail(pb->pb_buflists_list, buflist);
+        pb->pb_nbuflists++;
+
+        return (buflist);
+}
+
+/*
+ * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `idx'.
+ * The buffer being inserted must be present in L2ARC.
+ */
+static void
+l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
+    const arc_buf_hdr_t *ab, int index)
+{
+        l2pbuf_buf_t *pb_buf;
+        const l2arc_buf_hdr_t *l2hdr;
+
+        l2hdr = ab->b_l2hdr;
+        ASSERT(l2hdr != NULL);
+        ASSERT(pbl->l2pbl_nbufs > index);
+
+        pb_buf = &pbl->l2pbl_bufs[index];
+        pb_buf->b_dva = ab->b_dva;
+        pb_buf->b_birth = ab->b_birth;
+        pb_buf->b_cksum0 = ab->b_cksum0;
+        pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
+        pb_buf->b_size = ab->b_size;
+        pb_buf->b_l2daddr = l2hdr->b_daddr;
+        pb_buf->b_l2asize = l2hdr->b_asize;
+        pb_buf->b_l2compress = l2hdr->b_compress;
+        pb_buf->b_contents_type = ab->b_type;
+        pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
+        pb->pb_payload_asz += l2hdr->b_asize;
+}
+
+/*
+ * Commits a pbuf to stable storage. This routine is invoked when writing
+ * ARC buffers to an L2ARC device. When the pbuf associated with the device
+ * has reached its limits (either in size or in number of writes), it is
+ * scheduled here for writing.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
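+ * Each committed pbuf records the address, size and checksum of its
+ * predecessor (pb_prev_*), and the device uberblock is then updated to
+ * point at the newly written pbuf; together these form the on-disk
+ * linked list that is walked backwards during rebuild.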
+ */
+static void
+l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+        l2pbuf_t *pb = &dev->l2ad_pbuf;
+        uint64_t i, est_encsize, bufsize, encsize, io_size;
+        uint8_t *pb_buf;
+
+        pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
+        pb->pb_prev_asize = dev->l2ad_pbuf_asize;
+        pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
+
+        est_encsize = L2PBUF_ENCODED_SIZE(pb);
+        bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
+        pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
+        encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
+        cb->l2wcb_pbuf = pb_buf;
+        cb->l2wcb_pbuf_size = bufsize;
+
+        dev->l2ad_pbuf_daddr = dev->l2ad_hand;
+        dev->l2ad_pbuf_asize = encsize;
+        fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
+
+        io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
+        for (i = 0; i < io_size; ) {
+                zio_t *wzio;
+                uint64_t wsize = io_size - i;
+
+                if (wsize > SPA_MAXBLOCKSIZE)
+                        wsize = SPA_MAXBLOCKSIZE;
+                ASSERT(wsize >= SPA_MINBLOCKSIZE);
+                wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
+                    wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
+                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+                DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+                    zio_t *, wzio);
+                (void) zio_nowait(wzio);
+                i += wsize;
+        }
+
+        dev->l2ad_hand += io_size;
+        vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
+        l2arc_uberblock_update(dev, pio, cb);
+
+        ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
+        ARCSTAT_BUMP(arcstat_l2_meta_writes);
+        ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
+        ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
+        ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
+            pb->pb_payload_asz / encsize);
+}
+
+/*
+ * Returns the number of bytes occupied by the payload buffer items of
+ * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
+ * L2PBUF_HDR_SIZE.
+ */
+static uint32_t
+l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
+{
+        uint32_t size = 0;
+        l2pbuf_buflist_t *buflist;
+
+        for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
+            buflist = list_next(pb->pb_buflists_list, buflist))
+                size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
+
+        return (size);
 }