3525 Persistent L2ARC
@@ -136,10 +136,12 @@
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
+#include <sys/byteorder.h>
+#include <sys/spa_impl.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
@@ -314,10 +316,25 @@
kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_l2_compress_successes;
kstat_named_t arcstat_l2_compress_zeros;
kstat_named_t arcstat_l2_compress_failures;
+ kstat_named_t arcstat_l2_log_blk_writes;
+ kstat_named_t arcstat_l2_log_blk_avg_size;
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ kstat_named_t arcstat_l2_rebuild_successes;
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ kstat_named_t arcstat_l2_rebuild_abort_timeout;
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ kstat_named_t arcstat_l2_rebuild_size;
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ kstat_named_t arcstat_l2_rebuild_psize;
+ kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_duplicate_buffers;
kstat_named_t arcstat_duplicate_buffers_size;
kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_meta_used;
@@ -380,10 +397,25 @@
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "l2_compress_successes", KSTAT_DATA_UINT64 },
{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
{ "l2_compress_failures", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "duplicate_buffers", KSTAT_DATA_UINT64 },
{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
{ "duplicate_reads", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
@@ -427,10 +459,29 @@
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(NOTREACHED) \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
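As a minimal user-space sketch (not part of the patch), the update rule above behaves like an exponential moving average with a weight of 1/ARCSTAT_F_AVG_FACTOR per sample:

/* Standalone illustration of the ARCSTAT_F_AVG update rule. */
#include <stdio.h>
#include <stdint.h>

#define	FACTOR	3	/* mirrors ARCSTAT_F_AVG_FACTOR */

int
main(void)
{
	uint64_t avg = 0;
	uint64_t samples[] = { 131072, 131072, 65536, 65536, 65536 };

	for (int i = 0; i < 5; i++) {
		/* same arithmetic as ARCSTAT_F_AVG(stat, value) */
		avg = avg - avg / FACTOR + samples[i] / FACTOR;
		printf("sample %llu -> avg %llu\n",
		    (unsigned long long)samples[i], (unsigned long long)avg);
	}
	return (0);
}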
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
@@ -635,23 +686,11 @@
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- uint64_t l2ad_evict; /* last addr eviction reached */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
-
+typedef struct l2arc_dev l2arc_dev_t;
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
@@ -670,10 +709,12 @@
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* list of in-flight l2arc_log_blk_buf_t's */
+ list_t l2wcb_log_blk_buf_list;
} l2arc_write_callback_t;
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
@@ -697,19 +738,288 @@
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_add(boolean_t from_arc);
static void l2arc_hdr_stat_remove(void);
+static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
-static uint64_t
+enum {
+ L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
+};
+
+/*
+ * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
+ */
+typedef struct l2arc_log_blk_ptr {
+ uint64_t l2lbp_daddr; /* device address of log */
+ /*
+ * l2lbp_prop is the same format as the blk_prop in blkptr_t:
+ * * logical size (in sectors)
+ * * physical (compressed) size (in sectors)
+ * * compression algorithm (we always LZ4-compress l2arc logs)
+ * * checksum algorithm (used for l2lbp_cksum)
+ * * object type & level (unused for now)
+ */
+ uint64_t l2lbp_prop;
+ zio_cksum_t l2lbp_cksum; /* fletcher4 of log */
+} l2arc_log_blk_ptr_t;
+
+/*
+ * The persistent L2ARC device header.
+ */
+typedef struct l2arc_dev_hdr_phys {
+ uint64_t l2dh_magic;
+ zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */
+
+ /*
+ * Global L2ARC device state and metadata.
+ */
+ uint64_t l2dh_spa_guid;
+ uint64_t l2dh_evict_tail; /* current evict pointer */
+ uint64_t l2dh_alloc_space; /* vdev space alloc status */
+ uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */
+
+ /*
+ * Start of log block chain. [0] -> newest log, [1] -> one older (used
+ * for initiating prefetch).
+ */
+ l2arc_log_blk_ptr_t l2dh_start_lbps[2];
+
+ const uint64_t l2dh_pad[43]; /* pad to 512 bytes */
+} l2arc_dev_hdr_phys_t;
+CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
+
+/*
+ * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
+ */
+typedef struct l2arc_log_ent_phys {
+ dva_t l2le_dva; /* dva of buffer */
+ uint64_t l2le_birth; /* birth txg of buffer */
+ uint64_t l2le_cksum0;
+ zio_cksum_t l2le_freeze_cksum;
+ /*
+ * l2le_prop is the same format as the blk_prop in blkptr_t:
+ * * logical size (in sectors)
+ * * physical (compressed) size (in sectors)
+ * * compression algorithm
+ * * checksum algorithm (used for cksum0)
+ * * object type & level (used to restore arc_buf_contents_t)
+ */
+ uint64_t l2le_prop;
+ uint64_t l2le_daddr; /* buf location on l2dev */
+ const uint64_t l2le_pad[6]; /* resv'd for future use */
+} l2arc_log_ent_phys_t;
+
+/*
+ * These design limits give us the following overhead (before compression):
+ * avg_blk_sz overhead
+ * 1k 12.51 %
+ * 2k 6.26 %
+ * 4k 3.13 %
+ * 8k 1.56 %
+ * 16k 0.78 %
+ * 32k 0.39 %
+ * 64k 0.20 %
+ * 128k 0.10 %
+ * Compression should be able to squeeze these down by about a factor of 2x.
+ */
+#define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
+#define L2ARC_LOG_BLK_HEADER_LEN (128)
+#define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
+ ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
+ sizeof (l2arc_log_ent_phys_t))
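The overhead figures in the table above follow directly from these sizes: each cached buffer costs one 128-byte l2arc_log_ent_phys_t plus its share of the 128-byte block header, i.e. roughly 128 bytes of metadata per buffer. A small sketch (not part of the patch) that reproduces the table:

/* Illustration: per-buffer metadata overhead implied by the sizes above. */
#include <stdio.h>

int
main(void)
{
	const double log_blk_size = 128 * 1024;		/* L2ARC_LOG_BLK_SIZE */
	const double entries = 1023;			/* L2ARC_LOG_BLK_ENTRIES */
	const double per_entry = log_blk_size / entries; /* ~128.1 bytes */

	for (double bsz = 1024; bsz <= 128 * 1024; bsz *= 2)
		printf("%6.0fk buffers: %5.2f %%\n", bsz / 1024,
		    100.0 * per_entry / bsz);
	return (0);
}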
+/*
+ * Maximum amount of data in an l2arc log block (used to terminate rebuilding
+ * before we hit the write head and restore potentially corrupted blocks).
+ */
+#define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
+ (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
+/*
+ * For the persistency and rebuild algorithms to operate reliably we need
+ * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
+ * excessive log block looping might confuse the log chain end detection).
+ * Under normal circumstances this is not a problem, since this comes to
+ * only around 400 MB.
+ */
+#define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
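A quick check of the "400 MB" figure above (illustration only): three maximum-payload log blocks cover 3 * 1023 entries * 128k, or roughly 402 million bytes.

/* Illustration: the persistent L2ARC minimum device size in bytes. */
#include <stdio.h>

int
main(void)
{
	const unsigned long long max_payload = 131072ULL * 1023;
	const unsigned long long min_size = 3 * max_payload;

	/* prints 402259968, i.e. ~400 MB */
	printf("L2ARC_PERSIST_MIN_SIZE = %llu bytes\n", min_size);
	return (0);
}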
+
+/*
+ * A log block of up to 1023 ARC buffer log entries, chained into the
+ * persistent L2ARC metadata linked list.
+ */
+typedef struct l2arc_log_blk_phys {
+ /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
+ uint64_t l2lb_magic;
+ l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */
+ uint64_t l2lb_pad[9]; /* resv'd for future use */
+ /* Payload */
+ l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
+} l2arc_log_blk_phys_t;
+
+CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
+CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
+ offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
+
+/*
+ * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
+ * written to the L2ARC device. They may be compressed, hence the uint8_t[].
+ */
+typedef struct l2arc_log_blk_buf {
+ uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
+ list_node_t l2lbb_node;
+} l2arc_log_blk_buf_t;
+
+/* Macros for manipulating fields in the blk_prop format of blkptr_t */
+#define BLKPROP_GET_LSIZE(_obj, _field) \
+ BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define BLKPROP_SET_LSIZE(_obj, _field, x) \
+ BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+#define BLKPROP_GET_PSIZE(_obj, _field) \
+ BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BLKPROP_SET_PSIZE(_obj, _field, x) \
+ BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+#define BLKPROP_GET_COMPRESS(_obj, _field) \
+ BF64_GET((_obj)->_field, 32, 8)
+#define BLKPROP_SET_COMPRESS(_obj, _field, x) \
+ BF64_SET((_obj)->_field, 32, 8, x)
+#define BLKPROP_GET_CHECKSUM(_obj, _field) \
+ BF64_GET((_obj)->_field, 40, 8)
+#define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
+ BF64_SET((_obj)->_field, 40, 8, x)
+#define BLKPROP_GET_TYPE(_obj, _field) \
+ BF64_GET((_obj)->_field, 48, 8)
+#define BLKPROP_SET_TYPE(_obj, _field, x) \
+ BF64_SET((_obj)->_field, 48, 8, x)
+
+/* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
+#define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop)
+#define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
+#define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop)
+#define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
+#define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
+#define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
+ x)
+#define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
+#define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
+ x)
+#define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop)
+#define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
+
+/* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
+#define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop)
+#define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x)
+#define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop)
+#define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x)
+#define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop)
+#define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
+#define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop)
+#define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
+#define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop)
+#define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x)
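An illustrative round trip through these accessors (a sketch assuming the surrounding ZFS headers; sizes are stored internally in 512-byte sectors but set and read in bytes):

/* Illustration: encode and decode fields of a log block pointer. */
l2arc_log_blk_ptr_t lbp = { 0 };

LBP_SET_PSIZE(&lbp, 4096);
LBP_SET_COMPRESS(&lbp, ZIO_COMPRESS_LZ4);
ASSERT3U(LBP_GET_PSIZE(&lbp), ==, 4096);
ASSERT3U(LBP_GET_COMPRESS(&lbp), ==, ZIO_COMPRESS_LZ4);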
+
+#define PTR_SWAP(x, y) \
+ do { \
+ void *tmp = (x);\
+ x = y; \
+ y = tmp; \
+ _NOTE(CONSTCOND)\
+ } while (0)
+
+#define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU
+#define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU
+#define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */
+
+struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */
+ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
+ int l2ad_log_ent_idx; /* index into cur log blk */
+ /* number of bytes in current log block's payload */
+ uint64_t l2ad_log_blk_payload_asize;
+ /* flag indicating whether a rebuild is scheduled or is going on */
+ boolean_t l2ad_rebuild;
+};
+
+/*
+ * Performance tuning of L2ARC persistency:
+ *
+ * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
+ * pool import or when adding one manually later) will attempt
+ * to rebuild L2ARC buffer contents. In special circumstances,
+ * the administrator may want to set this to B_FALSE, if they
+ * are having trouble importing a pool or attaching an L2ARC
+ * device (e.g. the L2ARC device is slow to read in stored log
+ * metadata, or the metadata has become somehow
+ * fragmented/unusable).
+ * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
+ * avoid a slow L2ARC device from preventing pool import. If we
+ * are not done rebuilding an L2ARC device by this time, we
+ * stop the rebuild and return immediately.
+ */
+boolean_t l2arc_rebuild_enabled = B_TRUE;
+uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
+
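For example, an administrator could disable rebuilds across reboots with an /etc/system entry along these lines (a sketch; the standard illumos module:variable syntax is assumed):

* /etc/system: skip L2ARC rebuild attempts at the next pool import
set zfs:l2arc_rebuild_enabled = 0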
+/*
+ * L2ARC persistency rebuild routines.
+ */
+static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
+ l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev, uint64_t guid);
+
+/*
+ * L2ARC persistency read I/O routines.
+ */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ uint8_t *this_lb_buf, uint8_t *next_lb_buf,
+ zio_t *this_io, zio_t **next_io);
+static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *lp);
+static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
+ const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
+static void l2arc_log_blk_prefetch_abort(zio_t *zio);
+
+/*
+ * L2ARC persistency write I/O routines.
+ */
+static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/*
+ * L2ARC persistency auxiliary routines.
+ */
+static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
+ zio_cksum_t *cksum);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
+
+static inline uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
@@ -1245,11 +1555,11 @@
}
ab->b_state = new_state;
/* adjust l2arc hdr stats */
if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
+ l2arc_hdr_stat_add(old_state != arc_anon);
else if (old_state == arc_l2c_only)
l2arc_hdr_stat_remove();
}
void
@@ -1349,10 +1659,37 @@
(void) refcount_add(&hdr->b_refcnt, tag);
return (buf);
}
+/*
+ * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc->(disk).
+ */
+arc_buf_hdr_t *
+arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = guid;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ hdr->b_buf = NULL;
+ hdr->b_datacnt = 0;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ return (hdr);
+}
+
static char *arc_onloan_tag = "onloan";
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
@@ -1586,11 +1923,11 @@
if (l2hdr != NULL) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
if (hdr->b_state == arc_l2c_only)
l2arc_hdr_stat_remove();
hdr->b_l2hdr = NULL;
}
@@ -3043,10 +3380,13 @@
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
if (hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ /*
+ * Need to stash these before letting go of hash_lock
+ */
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
addr = hdr->b_l2hdr->b_daddr;
b_compress = hdr->b_l2hdr->b_compress;
b_asize = hdr->b_l2hdr->b_asize;
/*
@@ -3416,11 +3756,11 @@
buf->b_efunc = NULL;
buf->b_private = NULL;
if (l2hdr) {
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -buf_size);
mutex_exit(&l2arc_buflist_mtx);
}
}
@@ -4031,10 +4371,88 @@
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistency:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) Every now and then we mix in a piece of metadata (called a log block)
+ * into the L2ARC write. This allows us to understand what's been written,
+ * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
+ * buffers. The log block also includes a "back-reference" pointer to the
+ * previous block, forming a back-linked list of blocks on the L2ARC device.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header, which
+ * contains our top-level reference structures. We update it each time we
+ * write a new log block, so that we're able to locate it in the L2ARC
+ * device. If this write results in an inconsistent device header (e.g. due
+ * to power failure), we detect this by verifying the header's checksum
+ * and simply drop the entries from L2ARC.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | __________newest log block pointers_________ |
+ * | / \1 back \latest |
+ * | / V V |
+ * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * | ^ / ^ / ^ / |
+ * | `-prev-' `-prev-' `-prev-' |
+ * | lb lb lb |
+ * +======================================================================+
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA with the
+ * birth TXG uniquely identify a block in space and time - once created,
+ * a block is immutable on disk. The worst thing we have done is wasted
+ * some time and memory at l2arc rebuild to reconstruct outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
*/
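A minimal sketch of the identity check this argument relies on (hypothetical helper, not part of the patch; the real lookup goes through buf_hash_find() using the buf_hash() routine earlier in this file):

/*
 * A stale rebuilt header can never satisfy a read for live data, because
 * the lookup identity includes the birth TXG as well as the DVA.
 */
static boolean_t
l2arc_hdr_identity_matches(const arc_buf_hdr_t *hdr, uint64_t spa_guid,
    const dva_t *dva, uint64_t birth)
{
	return (hdr->b_spa == spa_guid &&
	    hdr->b_dva.dva_word[0] == dva->dva_word[0] &&
	    hdr->b_dva.dva_word[1] == dva->dva_word[1] &&
	    hdr->b_birth == birth);
}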
static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
{
@@ -4097,13 +4515,14 @@
return (next);
}
static void
-l2arc_hdr_stat_add(void)
+l2arc_hdr_stat_add(boolean_t from_arc)
{
ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ if (from_arc)
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
@@ -4134,11 +4553,14 @@
goto out;
first = NULL;
next = l2arc_dev_last;
do {
- /* loop around the list looking for a non-faulted vdev */
+ /*
+ * Loop around the list looking for a non-faulted vdev
+ * and one that isn't currently doing an L2ARC rebuild.
+ */
if (next == NULL) {
next = list_head(l2arc_dev_list);
} else {
next = list_next(l2arc_dev_list, next);
if (next == NULL)
@@ -4149,14 +4571,14 @@
if (first == NULL)
first = next;
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
next = NULL;
l2arc_dev_last = next;
out:
@@ -4206,12 +4628,13 @@
{
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *ab, *ab_prev;
- l2arc_buf_hdr_t *abl2;
+ l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
+ l2arc_log_blk_buf_t *lb_buf;
cb = zio->io_private;
ASSERT(cb != NULL);
dev = cb->l2wcb_dev;
ASSERT(dev != NULL);
@@ -4230,11 +4653,18 @@
/*
* All writes completed, or an error was hit.
*/
for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
ab_prev = list_prev(buflist, ab);
+ l2hdr = ab->b_l2hdr;
+ /*
+ * Release the temporary compressed buffer as soon as possible.
+ */
+ if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(ab);
+
hash_lock = HDR_LOCK(ab);
if (!mutex_tryenter(hash_lock)) {
/*
* This buffer misses out. It may be in a stage
* of eviction. Its ARC_L2_WRITING flag will be
@@ -4242,26 +4672,18 @@
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
- abl2 = ab->b_l2hdr;
-
- /*
- * Release the temporary compressed buffer as soon as possible.
- */
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(ab);
-
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
/*
* Allow ARC to begin reads to this L2ARC entry.
@@ -4276,10 +4698,16 @@
kmem_cache_free(hdr_cache, head);
mutex_exit(&l2arc_buflist_mtx);
l2arc_do_free_on_write();
+ for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
+ lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
+ (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
+ kmem_free(lb_buf, sizeof (*lb_buf));
+ }
+ list_destroy(&cb->l2wcb_log_blk_buf_list);
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
/*
* A read to a cache device completed. Validate buffer contents before
@@ -4399,20 +4827,33 @@
mutex_enter(*lock);
return (list);
}
/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz)
+{
+ return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
+ L2ARC_LOG_BLK_SIZE;
+}
+
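To get a feel for this bound (illustration only): a pass that writes 8 MB of minimum-sized 512-byte buffers can reference at most 16384 buffers, which needs ceil(16384 / 1023) = 17 log blocks, i.e. about 2.1 MB of extra headroom.

/* Illustration: worst-case log block headroom for a given write size. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t minblock = 512;		/* SPA_MINBLOCKSIZE */
	const uint64_t entries = 1023;		/* L2ARC_LOG_BLK_ENTRIES */
	const uint64_t lb_size = 128 * 1024;	/* L2ARC_LOG_BLK_SIZE */
	uint64_t write_sz = 8ULL << 20;		/* an 8 MB write pass */
	uint64_t overhead = (write_sz / minblock / entries + 1) * lb_size;

	printf("%llu bytes written -> %llu bytes of log block headroom\n",
	    (unsigned long long)write_sz, (unsigned long long)overhead);
	return (0);
}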
+/*
* Evict buffers from the device write hand to the distance specified in
* bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
- l2arc_buf_hdr_t *abl2;
+ l2arc_buf_hdr_t *l2hdr;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
uint64_t taddr;
buflist = dev->l2ad_buflist;
@@ -4426,10 +4867,14 @@
* nothing to evict.
*/
return;
}
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance);
if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
/*
* When nearing the end of the device, evict to the end
* before the device write hand jumps to the start.
*/
@@ -4508,14 +4953,14 @@
/*
* Tell ARC this no longer exists in L2ARC.
*/
if (ab->b_l2hdr != NULL) {
- abl2 = ab->b_l2hdr;
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+ l2hdr = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
/*
@@ -4547,27 +4992,43 @@
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
list_t *list;
- uint64_t write_asize, write_psize, write_sz, headroom,
+ /*
+ * These variables mean:
+ * - write_size: in-memory size of ARC buffers we've written (before
+ * compression).
+ * - write_asize: actual on-disk size of ARC buffers we've written
+ * (after compression).
+ * - write_aligned_asize: actual sum of space taken by ARC buffers
+ * on the device (after compression and alignment, so that
+ * every buffer starts on a multiple of the device block size).
+ * - headroom: L2ARC scanning headroom (we won't scan beyond this
+ * distance from the list tail).
+ * - buf_compress_minsz: minimum in-memory ARC buffer size for us
+ * to try compressing it.
+ */
+ uint64_t write_size, write_asize, write_aligned_asize, headroom,
buf_compress_minsz;
void *buf_data;
kmutex_t *list_lock;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
const boolean_t do_headroom_boost = *headroom_boost;
+ boolean_t dev_hdr_update = B_FALSE;
ASSERT(dev->l2ad_vdev != NULL);
/* Lower the flag now, we might want to raise it again later. */
*headroom_boost = B_FALSE;
pio = NULL;
- write_sz = write_asize = write_psize = 0;
+ cb = NULL;
+ write_size = write_asize = write_aligned_asize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
/*
@@ -4601,11 +5062,11 @@
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; ab; ab = ab_prev) {
l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
- uint64_t buf_sz;
+ uint64_t buf_aligned_size;
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
@@ -4616,11 +5077,19 @@
* Skip this buffer rather than waiting.
*/
continue;
}
- passed_sz += ab->b_size;
+ /*
+ * When examining whether we've met our write target,
+ * we must always use the aligned size of the buffer,
+ * since that's the maximum amount of space a buffer
+ * can take up on the L2ARC device.
+ */
+ buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
+ ab->b_size);
+ passed_sz += buf_aligned_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
@@ -4630,11 +5099,11 @@
if (!l2arc_write_eligible(guid, ab)) {
mutex_exit(hash_lock);
continue;
}
- if ((write_sz + ab->b_size) > target_sz) {
+ if ((write_size + buf_aligned_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
}
@@ -4644,22 +5113,25 @@
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
list_insert_head(dev->l2ad_buflist, head);
- cb = kmem_alloc(
+ cb = kmem_zalloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
+ list_create(&cb->l2wcb_log_blk_buf_list,
+ sizeof (l2arc_log_blk_buf_t),
+ offsetof(l2arc_log_blk_buf_t, l2lbb_node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
/*
* Create and add a new L2ARC header.
*/
- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
l2hdr->b_dev = dev;
ab->b_flags |= ARC_L2_WRITING;
/*
* Temporarily stash the data buffer in b_tmp_cdata.
@@ -4671,11 +5143,10 @@
*/
l2hdr->b_compress = ZIO_COMPRESS_OFF;
l2hdr->b_asize = ab->b_size;
l2hdr->b_tmp_cdata = ab->b_buf->b_data;
- buf_sz = ab->b_size;
ab->b_l2hdr = l2hdr;
list_insert_head(dev->l2ad_buflist, ab);
/*
@@ -4685,11 +5156,11 @@
arc_cksum_verify(ab->b_buf);
arc_cksum_compute(ab->b_buf, B_TRUE);
mutex_exit(hash_lock);
- write_sz += buf_sz;
+ write_size += buf_aligned_size;
}
mutex_exit(list_lock);
if (full == B_TRUE)
@@ -4696,11 +5167,11 @@
break;
}
/* No buffers selected for writing? */
if (pio == NULL) {
- ASSERT0(write_sz);
+ ASSERT0(write_size);
mutex_exit(&l2arc_buflist_mtx);
kmem_cache_free(hdr_cache, head);
return (0);
}
@@ -4741,11 +5212,11 @@
buf_data = l2hdr->b_tmp_cdata;
buf_sz = l2hdr->b_asize;
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
- uint64_t buf_p_sz;
+ uint64_t buf_aligned_asize;
wzio = zio_write_phys(pio, dev->l2ad_vdev,
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
@@ -4756,30 +5227,41 @@
write_asize += buf_sz;
/*
* Keep the clock hand suitably device-aligned.
*/
- buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
- write_psize += buf_p_sz;
- dev->l2ad_hand += buf_p_sz;
+ buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ buf_sz);
+ write_aligned_asize += buf_aligned_asize;
+ dev->l2ad_hand += buf_aligned_asize;
+ ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
+ dev->l2ad_first);
}
- }
+ if (l2arc_log_blk_insert(dev, ab)) {
+ l2arc_log_blk_commit(dev, pio, cb);
+ dev_hdr_update = B_TRUE;
+ }
+ }
mutex_exit(&l2arc_buflist_mtx);
- ASSERT3U(write_asize, <=, target_sz);
+ if (dev_hdr_update)
+ l2arc_dev_hdr_update(dev, pio);
+
+ VERIFY3U(write_aligned_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
- ARCSTAT_INCR(arcstat_l2_size, write_sz);
- ARCSTAT_INCR(arcstat_l2_asize, write_asize);
- vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_size, write_size);
+ ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
+ vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
- if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
+ if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
+ dev->l2ad_end) {
vdev_space_update(dev->l2ad_vdev,
dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
@@ -5037,29 +5519,39 @@
}
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+static l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
l2arc_dev_t *dev;
+ boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
+ if (!held)
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_vdev == vd)
break;
}
+ if (!held)
mutex_exit(&l2arc_dev_mtx);
- return (dev != NULL);
+ return (dev);
}
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
- * validated the vdev and opened it.
+ * validated the vdev and opened it. The `rebuild' flag indicates whether
+ * we should attempt an L2ARC persistency rebuild.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
{
l2arc_dev_t *adddev;
ASSERT(!l2arc_vdev_present(vd));
@@ -5067,11 +5559,12 @@
* Create a new l2arc device entry.
*/
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
@@ -5090,10 +5583,20 @@
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
+ if (rebuild && l2arc_rebuild_enabled &&
+ adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ adddev->l2ad_rebuild = B_TRUE;
+ }
mutex_exit(&l2arc_dev_mtx);
}
/*
* Remove a vdev from the L2ARC.
@@ -5196,6 +5699,731 @@
cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
l2arc_thread_exit = 1;
while (l2arc_thread_exit != 0)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
+}
+
+/*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called as one of the final steps of a pool import.
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+ l2arc_dev_t *dev;
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ ASSERT(dev != NULL);
+ if (dev->l2ad_rebuild) {
+ (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ }
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Main entry point for L2ARC rebuilding.
+ */
+static void
+l2arc_dev_rebuild_start(l2arc_dev_t *dev)
+{
+ spa_t *spa = dev->l2ad_spa;
+ vdev_t *vd = dev->l2ad_vdev;
+
+ /* Lock out device removal. */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ ASSERT(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ dev->l2ad_rebuild = B_FALSE;
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ *
+ * 1) reads the device's header
+ * 2) if a good device header is found, starts reading the log block chain
+ * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log blk chain (the back-reference in the blk is
+ * invalid or loops over our starting point).
+ * 2) We encounter *any* error condition (cksum errors, io errors, looped
+ * blocks, etc.).
+ * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
+ * to keep severely fragmented L2ARC log blocks or slow L2ARC devices
+ * from holding up a pool import indefinitely (and thus letting the
+ * administrator take corrective action, e.g. by kicking the misbehaving
+ * L2ARC device out of the pool, or by reimporting the pool with L2ARC
+ * rebuilding disabled).
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ int err;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ uint8_t *this_lb_buf, *next_lb_buf;
+ zio_t *this_io = NULL, *next_io = NULL;
+ int64_t deadline;
+ l2arc_log_blk_ptr_t lb_ptrs[2];
+ boolean_t first_pass;
+ uint64_t load_guid;
+
+ load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+ deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
+ /*
+ * Device header processing phase.
+ */
+ if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
+ /* device header corrupted, start a new one */
+		bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
+ return (err);
+ }
+ if (l2arc_check_rebuild_timeout_hit(deadline))
+ return (SET_ERROR(ETIMEDOUT));
+
+ /* Retrieve the persistent L2ARC device state */
+ dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
+ dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
+ LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
+ dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
+ L2ARC_DEV_HDR_EVICT_FIRST);
+
+ /* Prepare the rebuild processing state */
+ bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
+ this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+ this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
+ next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
+ first_pass = B_TRUE;
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
+ /* We hit an invalid block address, end the rebuild. */
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
+ this_lb, next_lb, this_lb_buf, next_lb_buf,
+ this_io, &next_io)) != 0)
+ break;
+
+ /* Protection against infinite loops of log blocks. */
+ if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
+ lb_ptrs[0].l2lbp_daddr,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
+ !first_pass) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
+ err = SET_ERROR(ELOOP);
+ break;
+ }
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blk, so the user may choose to re-add the
+ * L2ARC dev at a later time to reconstruct it (when there's
+ * less memory pressure).
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ break;
+ }
+
+ /*
+ * Now that we know that the next_lb checks out alright, we
+ * can start reconstruction from this lb - we can be sure
+ * that the L2ARC write hand has not yet reached any of our
+ * buffers.
+ */
+ l2arc_log_blk_restore(dev, load_guid, this_lb,
+ LBP_GET_PSIZE(&lb_ptrs[0]));
+
+ /*
+ * End of list detection. We can look ahead two steps in the
+ * blk chain and if the 2nd blk from this_lb dips below the
+ * initial chain starting point, then we know two things:
+ * 1) it can't be valid, and
+ * 2) the next_lb's ARC entries might have already been
+ * partially overwritten and so we should stop before
+ * we restore it
+ */
+ if (l2arc_range_check_overlap(
+ this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
+ !first_pass)
+ break;
+
+ /* log blk restored, continue with next one in the list */
+ lb_ptrs[0] = lb_ptrs[1];
+ lb_ptrs[1] = this_lb->l2lb_back2_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ PTR_SWAP(this_lb_buf, next_lb_buf);
+ this_io = next_io;
+ next_io = NULL;
+ first_pass = B_FALSE;
+
+ if (l2arc_check_rebuild_timeout_hit(deadline)) {
+ err = SET_ERROR(ETIMEDOUT);
+ break;
+ }
+ }
+ if (next_io != NULL)
+ l2arc_log_blk_prefetch_abort(next_io);
+ kmem_free(this_lb, sizeof (*this_lb));
+ kmem_free(next_lb, sizeof (*next_lb));
+ kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
+ kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
+ if (err == 0)
+ ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
+
+ return (err);
+}
+
+/*
+ * Restores the payload of a log blk to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
+ l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
+{
+ uint64_t size = 0, psize = 0;
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse direction to preserve correct
+ * temporal ordering of buffers in the l2ad_buflist.
+ */
+ l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
+ size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
+ psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ /*
+ * Record rebuild stats:
+ * size In-memory size of restored buffer data in ARC
+ * psize Physical size of restored buffers in the L2ARC
+ * bufs # of ARC buffer headers restored
+ * log_blks # of L2ARC log blocks processed during restore
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
+ vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log block. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
+ uint64_t load_guid)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = LE_GET_TYPE(le);
+ l2arc_buf_hdr_t *l2hdr;
+
+ hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
+ hdr->b_dva = le->l2le_dva;
+ hdr->b_birth = le->l2le_birth;
+ hdr->b_cksum0 = le->l2le_cksum0;
+ hdr->b_size = LE_GET_LSIZE(le);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(hdr);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ return;
+ }
+ hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
+ if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
+ hdr->b_flags |= ARC_L2COMPRESS;
+ mutex_enter(&hdr->b_freeze_lock);
+ ASSERT(hdr->b_freeze_cksum == NULL);
+ hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
+ mutex_exit(&hdr->b_freeze_lock);
+
+ /* now rebuild the l2arc entry */
+ ASSERT(hdr->b_l2hdr == NULL);
+ l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
+ l2hdr->b_dev = dev;
+ l2hdr->b_daddr = le->l2le_daddr;
+ l2hdr->b_asize = LE_GET_PSIZE(le);
+ l2hdr->b_compress = LE_GET_COMPRESS(le);
+ hdr->b_l2hdr = l2hdr;
+ list_insert_tail(dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
+
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Attempts to read the device header on the provided L2ARC device and writes
+ * it to `hdr'. On success, this function returns 0, otherwise the appropriate
+ * error code is returned.
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
+{
+ int err;
+ uint64_t guid;
+ zio_cksum_t cksum;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
+ ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ return (err);
+ }
+
+ if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(hdr, sizeof (*hdr));
+
+ if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ hdr->l2dh_spa_guid != guid) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ l2arc_dev_hdr_checksum(hdr, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
+ return (SET_ERROR(EINVAL));
+ }
+ if (hdr->l2dh_evict_tail < dev->l2ad_start ||
+ hdr->l2dh_evict_tail >= dev->l2ad_end) {
+ /* Data in dev hdr is invalid for this device. */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple prefetcher to make sure that while
+ * we're processing one buffer the L2ARC is already prefetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log blk
+ * address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
+ * and next_lb_buf must be buffers of appropriate size to hold a raw
+ * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
+ * to buffer decompression).
+ *
+ * The `this_io' and `next_io' arguments are used for block prefetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * prefetch IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the prefetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of prefetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ uint8_t *this_lb_buf, uint8_t *next_lb_buf,
+ zio_t *this_io, zio_t **next_io)
+{
+ int err = 0;
+ zio_cksum_t cksum;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log blk in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
+ this_lb_buf);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log blk early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log blk.
+ */
+ *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
+ next_lb_buf);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ goto cleanup;
+ }
+
+ /* Make sure the buffer checks out */
+ fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (LBP_GET_COMPRESS(this_lbp)) {
+ case ZIO_COMPRESS_OFF:
+ bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
+ break;
+ case ZIO_COMPRESS_LZ4:
+ if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
+ this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
+ sizeof (*this_lb))) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+cleanup:
+ /* Abort an in-flight prefetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_prefetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ return (err);
+}
+
+/*
+ * Validates an L2ARC log blk address to make sure that it can be read
+ * from the provided L2ARC device. Returns B_TRUE if the address is
+ * within the device's bounds, or B_FALSE if not.
+ */
+static boolean_t
+l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
+{
+ uint64_t psize = LBP_GET_PSIZE(lbp);
+ uint64_t end = lbp->l2lbp_daddr + psize;
+
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely between l2ad_start and l2ad_end
+ * - it has a valid size
+ * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
+ * doesn't sit in the evicted region)
+ */
+ return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
+ psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
+ lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain newly allocated memory buffers for the IO
+ * data which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
+ uint8_t *lb_buf)
+{
+ uint32_t psize;
+ zio_t *pio;
+
+ psize = LBP_GET_PSIZE(lbp);
+ ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
+ pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
+ lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_log_blk_prefetch_abort(zio_t *zio)
+{
+ (void) zio_wait(zio);
+}
+
+/*
+ * Creates a zio to update the device header on an l2arc device. The zio is
+ * initiated as a child of `pio'.
+ */
+static void
+l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
+{
+ zio_t *wzio;
+ vdev_stat_t st;
+ l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
+
+ vdev_get_stats(dev->l2ad_vdev, &st);
+
+ hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
+ hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ hdr->l2dh_evict_tail = dev->l2ad_evict;
+ hdr->l2dh_alloc_space = st.vs_alloc;
+ hdr->l2dh_flags = 0;
+ if (dev->l2ad_first)
+ hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ /* checksum operation goes last */
+ l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
+
+ CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
+ sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
+ sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ uint64_t psize, asize;
+ l2arc_log_blk_buf_t *lb_buf;
+ zio_t *wzio;
+
+ VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+
+ /* link the buffer into the block chain */
+ lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
+ lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /* try to compress the buffer */
+ lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
+ list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
+ VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
+ lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
+
+ /*
+ * Update the start log blk pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0];
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
+ LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
+ LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
+ LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_CHECKSUM_FLETCHER_4);
+ LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
+ if (psize < sizeof (*lb)) {
+ /* compression succeeded */
+ LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
+ LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_COMPRESS_OFF);
+ }
+ /* checksum what we're about to write */
+ fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
+ &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
+
+ /* perform the write itself */
+ CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
+ L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ /* realign the device hand */
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ dev->l2ad_hand += asize;
+ VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
+}
+
+/*
+ * Computes the checksum of `hdr' and stores it in `cksum'.
+ */
+static void
+l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
+{
+ fletcher_4_native((uint8_t *)hdr +
+ offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ cksum);
+}
+
+/*
+ * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
+ * The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+ const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT(l2hdr != NULL);
+ ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
+
+ le = &lb->l2lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->l2le_dva = ab->b_dva;
+ le->l2le_birth = ab->b_birth;
+ le->l2le_cksum0 = ab->b_cksum0;
+ le->l2le_daddr = l2hdr->b_daddr;
+ LE_SET_LSIZE(le, ab->b_size);
+ LE_SET_PSIZE(le, l2hdr->b_asize);
+ LE_SET_COMPRESS(le, l2hdr->b_compress);
+ le->l2le_freeze_cksum = *ab->b_freeze_cksum;
+ LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
+ LE_SET_TYPE(le, ab->b_type);
+ dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
+
+ return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+}
+
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom Lower end of the range to check (written to earlier).
+ * top Upper end of the range to check (written to later).
+ * check The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+static inline boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+}
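A few concrete cases (illustration only, using the function above):

/* Ordered range: bottom < top. */
ASSERT(l2arc_range_check_overlap(100, 200, 150));
ASSERT(!l2arc_range_check_overlap(100, 200, 250));
/* Wrapped range: bottom > top, so the range spans the end of the device. */
ASSERT(l2arc_range_check_overlap(800, 200, 900));	/* after bottom */
ASSERT(l2arc_range_check_overlap(800, 200, 100));	/* before top */
ASSERT(!l2arc_range_check_overlap(800, 200, 500));	/* in the gap */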
+
+/*
+ * Checks whether a rebuild timeout deadline has been hit and if it has,
+ * increments the appropriate error counters.
+ */
+static boolean_t
+l2arc_check_rebuild_timeout_hit(int64_t deadline)
+{
+ if (deadline != 0 && deadline < ddi_get_lbolt64()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
+ cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
+ "dropping remaining L2ARC metadata.");
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
}