3525 Persistent L2ARC
@@ -136,10 +136,12 @@
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
+#include <sys/byteorder.h>
+#include <sys/spa_impl.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
@@ -314,10 +316,25 @@
kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_l2_compress_successes;
kstat_named_t arcstat_l2_compress_zeros;
kstat_named_t arcstat_l2_compress_failures;
+ kstat_named_t arcstat_l2_log_blk_writes;
+ kstat_named_t arcstat_l2_log_blk_avg_size;
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ kstat_named_t arcstat_l2_rebuild_successes;
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ kstat_named_t arcstat_l2_rebuild_abort_timeout;
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ kstat_named_t arcstat_l2_rebuild_size;
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ kstat_named_t arcstat_l2_rebuild_psize;
+ kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_duplicate_buffers;
kstat_named_t arcstat_duplicate_buffers_size;
kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_meta_used;
@@ -380,10 +397,25 @@
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "l2_compress_successes", KSTAT_DATA_UINT64 },
{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
{ "l2_compress_failures", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "duplicate_buffers", KSTAT_DATA_UINT64 },
{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
{ "duplicate_reads", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
@@ -427,10 +459,29 @@
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(NOTREACHED) \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
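As a minimal user-space sketch (not part of the patch), the update rule above behaves like an exponential moving average with a weight of 1/ARCSTAT_F_AVG_FACTOR per sample:

/* Standalone illustration of the ARCSTAT_F_AVG update rule. */
#include <stdio.h>
#include <stdint.h>

#define	FACTOR	3	/* mirrors ARCSTAT_F_AVG_FACTOR */

int
main(void)
{
	uint64_t avg = 0;
	uint64_t samples[] = { 131072, 131072, 65536, 65536, 65536 };

	for (int i = 0; i < 5; i++) {
		/* same arithmetic as ARCSTAT_F_AVG(stat, value) */
		avg = avg - avg / FACTOR + samples[i] / FACTOR;
		printf("sample %llu -> avg %llu\n",
		    (unsigned long long)samples[i], (unsigned long long)avg);
	}
	return (0);
}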
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
@@ -635,23 +686,11 @@
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- uint64_t l2ad_evict; /* last addr eviction reached */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
-
+typedef struct l2arc_dev l2arc_dev_t;
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
@@ -670,10 +709,12 @@
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* list of in-flight l2arc_log_blk_buf_t's */
+ list_t l2wcb_log_blk_buf_list;
} l2arc_write_callback_t;
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
@@ -697,19 +738,288 @@
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_add(boolean_t from_arc);
static void l2arc_hdr_stat_remove(void);
+static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
-static uint64_t
+enum {
+ L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
+};
+
+/*
+ * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
+ */
+typedef struct l2arc_log_blk_ptr {
+ uint64_t l2lbp_daddr; /* device address of log */
+ /*
+ * l2lbp_prop is the same format as the blk_prop in blkptr_t:
+ * * logical size (in sectors)
+ * * physical (compressed) size (in sectors)
+ * * compression algorithm (we always LZ4-compress l2arc logs)
+ * * checksum algorithm (used for l2lbp_cksum)
+ * * object type & level (unused for now)
+ */
+ uint64_t l2lbp_prop;
+ zio_cksum_t l2lbp_cksum; /* fletcher4 of log */
+} l2arc_log_blk_ptr_t;
+
+/*
+ * The persistent L2ARC device header.
+ */
+typedef struct l2arc_dev_hdr_phys {
+ uint64_t l2dh_magic;
+ zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */
+
+ /*
+ * Global L2ARC device state and metadata.
+ */
+ uint64_t l2dh_spa_guid;
+ uint64_t l2dh_evict_tail; /* current evict pointer */
+ uint64_t l2dh_alloc_space; /* vdev space alloc status */
+ uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */
+
+ /*
+ * Start of log block chain. [0] -> newest log, [1] -> one older (used
+ * for initiating prefetch).
+ */
+ l2arc_log_blk_ptr_t l2dh_start_lbps[2];
+
+ const uint64_t l2dh_pad[43]; /* pad to 512 bytes */
+} l2arc_dev_hdr_phys_t;
+CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
+
+/*
+ * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
+ */
+typedef struct l2arc_log_ent_phys {
+ dva_t l2le_dva; /* dva of buffer */
+ uint64_t l2le_birth; /* birth txg of buffer */
+ uint64_t l2le_cksum0;
+ zio_cksum_t l2le_freeze_cksum;
+ /*
+ * l2le_prop is the same format as the blk_prop in blkptr_t:
+ * * logical size (in sectors)
+ * * physical (compressed) size (in sectors)
+ * * compression algorithm
+ * * checksum algorithm (used for cksum0)
+ * * object type & level (used to restore arc_buf_contents_t)
+ */
+ uint64_t l2le_prop;
+ uint64_t l2le_daddr; /* buf location on l2dev */
+ const uint64_t l2le_pad[6]; /* resv'd for future use */
+} l2arc_log_ent_phys_t;
+
+/*
+ * These design limits give us the following overhead (before compression):
+ * avg_blk_sz overhead
+ * 1k 12.51 %
+ * 2k 6.26 %
+ * 4k 3.13 %
+ * 8k 1.56 %
+ * 16k 0.78 %
+ * 32k 0.39 %
+ * 64k 0.20 %
+ * 128k 0.10 %
+ * Compression should be able to squeeze these down by about a factor of 2x.
+ */
+#define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
+#define L2ARC_LOG_BLK_HEADER_LEN (128)
+#define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
+ ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
+ sizeof (l2arc_log_ent_phys_t))
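The overhead figures in the table above follow directly from these sizes: each cached buffer costs one 128-byte l2arc_log_ent_phys_t plus its share of the 128-byte block header, i.e. roughly 128 bytes of metadata per buffer. A small sketch (not part of the patch) that reproduces the table:

/* Illustration: per-buffer metadata overhead implied by the sizes above. */
#include <stdio.h>

int
main(void)
{
	const double log_blk_size = 128 * 1024;		/* L2ARC_LOG_BLK_SIZE */
	const double entries = 1023;			/* L2ARC_LOG_BLK_ENTRIES */
	const double per_entry = log_blk_size / entries; /* ~128.1 bytes */

	for (double bsz = 1024; bsz <= 128 * 1024; bsz *= 2)
		printf("%6.0fk buffers: %5.2f %%\n", bsz / 1024,
		    100.0 * per_entry / bsz);
	return (0);
}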
+/*
+ * Maximum amount of data in an l2arc log block (used to terminate rebuilding
+ * before we hit the write head and restore potentially corrupted blocks).
+ */
+#define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
+ (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
+/*
+ * For the persistency and rebuild algorithms to operate reliably we need
+ * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
+ * excessive log block looping might confuse the log chain end detection).
+ * Under normal circumstances this is not a problem, since this comes to
+ * only around 400 MB.
+ */
+#define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
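A quick check of the "400 MB" figure above (illustration only): three maximum-payload log blocks cover 3 * 1023 entries * 128k, or roughly 402 million bytes.

/* Illustration: the persistent L2ARC minimum device size in bytes. */
#include <stdio.h>

int
main(void)
{
	const unsigned long long max_payload = 131072ULL * 1023;
	const unsigned long long min_size = 3 * max_payload;

	/* prints 402259968, i.e. ~400 MB */
	printf("L2ARC_PERSIST_MIN_SIZE = %llu bytes\n", min_size);
	return (0);
}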
+
+/*
+ * A log block of up to 1023 ARC buffer log entries, chained into the
+ * persistent L2ARC metadata linked list.
+ */
+typedef struct l2arc_log_blk_phys {
+ /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
+ uint64_t l2lb_magic;
+ l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */
+ uint64_t l2lb_pad[9]; /* resv'd for future use */
+ /* Payload */
+ l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
+} l2arc_log_blk_phys_t;
+
+CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
+CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
+ offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
+
+/*
+ * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
+ * written to the L2ARC device. They may be compressed, hence the uint8_t[].
+ */
+typedef struct l2arc_log_blk_buf {
+ uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
+ list_node_t l2lbb_node;
+} l2arc_log_blk_buf_t;
+
+/* Macros for manipulating fields in the blk_prop format of blkptr_t */
+#define BLKPROP_GET_LSIZE(_obj, _field) \
+ BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define BLKPROP_SET_LSIZE(_obj, _field, x) \
+ BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+#define BLKPROP_GET_PSIZE(_obj, _field) \
+ BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BLKPROP_SET_PSIZE(_obj, _field, x) \
+ BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+#define BLKPROP_GET_COMPRESS(_obj, _field) \
+ BF64_GET((_obj)->_field, 32, 8)
+#define BLKPROP_SET_COMPRESS(_obj, _field, x) \
+ BF64_SET((_obj)->_field, 32, 8, x)
+#define BLKPROP_GET_CHECKSUM(_obj, _field) \
+ BF64_GET((_obj)->_field, 40, 8)
+#define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
+ BF64_SET((_obj)->_field, 40, 8, x)
+#define BLKPROP_GET_TYPE(_obj, _field) \
+ BF64_GET((_obj)->_field, 48, 8)
+#define BLKPROP_SET_TYPE(_obj, _field, x) \
+ BF64_SET((_obj)->_field, 48, 8, x)
+
+/* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
+#define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop)
+#define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
+#define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop)
+#define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
+#define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
+#define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
+ x)
+#define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
+#define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
+ x)
+#define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop)
+#define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
+
+/* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
+#define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop)
+#define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x)
+#define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop)
+#define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x)
+#define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop)
+#define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
+#define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop)
+#define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
+#define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop)
+#define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x)
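An illustrative round trip through these accessors (a sketch assuming the surrounding ZFS headers; sizes are stored internally in 512-byte sectors but set and read in bytes):

/* Illustration: encode and decode fields of a log block pointer. */
l2arc_log_blk_ptr_t lbp = { 0 };

LBP_SET_PSIZE(&lbp, 4096);
LBP_SET_COMPRESS(&lbp, ZIO_COMPRESS_LZ4);
ASSERT3U(LBP_GET_PSIZE(&lbp), ==, 4096);
ASSERT3U(LBP_GET_COMPRESS(&lbp), ==, ZIO_COMPRESS_LZ4);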
+
+#define PTR_SWAP(x, y) \
+ do { \
+ void *tmp = (x);\
+ x = y; \
+ y = tmp; \
+ _NOTE(CONSTCOND)\
+ } while (0)
+
+#define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU
+#define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU
+#define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */
+
+struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */
+ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
+ int l2ad_log_ent_idx; /* index into cur log blk */
+ /* number of bytes in current log block's payload */
+ uint64_t l2ad_log_blk_payload_asize;
+ /* flag indicating whether a rebuild is scheduled or is going on */
+ boolean_t l2ad_rebuild;
+};
+
+/*
+ * Performance tuning of L2ARC persistency:
+ *
+ * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
+ * pool import or when adding one manually later) will attempt
+ * to rebuild L2ARC buffer contents. In special circumstances,
+ * the administrator may want to set this to B_FALSE, if they
+ * are having trouble importing a pool or attaching an L2ARC
+ * device (e.g. the L2ARC device is slow to read in stored log
+ * metadata, or the metadata has become somehow
+ * fragmented/unusable).
+ * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
+ * avoid a slow L2ARC device from preventing pool import. If we
+ * are not done rebuilding an L2ARC device by this time, we
+ * stop the rebuild and return immediately.
+ */
+boolean_t l2arc_rebuild_enabled = B_TRUE;
+uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
+
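For example, an administrator could disable rebuilds across reboots with an /etc/system entry along these lines (a sketch; the standard illumos module:variable syntax is assumed):

* /etc/system: skip L2ARC rebuild attempts at the next pool import
set zfs:l2arc_rebuild_enabled = 0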
+/*
+ * L2ARC persistency rebuild routines.
+ */
+static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
+ l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev, uint64_t guid);
+
+/*
+ * L2ARC persistency read I/O routines.
+ */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ uint8_t *this_lb_buf, uint8_t *next_lb_buf,
+ zio_t *this_io, zio_t **next_io);
+static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *lp);
+static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
+ const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
+static void l2arc_log_blk_prefetch_abort(zio_t *zio);
+
+/*
+ * L2ARC persistency write I/O routines.
+ */
+static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/*
+ * L2ARC persistency auxiliary routines.
+ */
+static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
+ zio_cksum_t *cksum);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
+
+static inline uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
@@ -1245,11 +1555,11 @@
}
ab->b_state = new_state;
/* adjust l2arc hdr stats */
if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
+ l2arc_hdr_stat_add(old_state != arc_anon);
else if (old_state == arc_l2c_only)
l2arc_hdr_stat_remove();
}
void
@@ -1349,10 +1659,37 @@
(void) refcount_add(&hdr->b_refcnt, tag);
return (buf);
}
+/*
+ * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc->(disk).
+ */
+arc_buf_hdr_t *
+arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = guid;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ hdr->b_buf = NULL;
+ hdr->b_datacnt = 0;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ return (hdr);
+}
+
static char *arc_onloan_tag = "onloan";
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
@@ -1586,11 +1923,11 @@
if (l2hdr != NULL) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
if (hdr->b_state == arc_l2c_only)
l2arc_hdr_stat_remove();
hdr->b_l2hdr = NULL;
}
@@ -3043,10 +3380,13 @@
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
if (hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ /*
+ * Need to stash these before letting go of hash_lock
+ */
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
addr = hdr->b_l2hdr->b_daddr;
b_compress = hdr->b_l2hdr->b_compress;
b_asize = hdr->b_l2hdr->b_asize;
/*
@@ -3416,11 +3756,11 @@
buf->b_efunc = NULL;
buf->b_private = NULL;
if (l2hdr) {
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -buf_size);
mutex_exit(&l2arc_buflist_mtx);
}
}
@@ -4031,10 +4371,88 @@
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistency:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) Every now and then we mix in a piece of metadata (called a log block)
+ * into the L2ARC write. This allows us to understand what's been written,
+ * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
+ * buffers. The log block also includes a "back-reference" pointer to the
+ * previous block, forming a back-linked list of blocks on the L2ARC device.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header, which
+ * contains our top-level reference structures. We update it each time we
+ * write a new log block, so that we're able to locate it in the L2ARC
+ * device. If this write results in an inconsistent device header (e.g. due
+ * to power failure), we detect this by verifying the header's checksum
+ * and simply drop the entries from L2ARC.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | __________newest log block pointers_________ |
+ * | / \1 back \latest |
+ * | / V V |
+ * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * | ^ / ^ / ^ / |
+ * | `-prev-' `-prev-' `-prev-' |
+ * | lb lb lb |
+ * +======================================================================+
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA with the
+ * birth TXG uniquely identify a block in space and time - once created,
+ * a block is immutable on disk. The worst thing we have done is wasted
+ * some time and memory at l2arc rebuild to reconstruct outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
*/
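A minimal sketch of the identity check this argument relies on (hypothetical helper, not part of the patch; the real lookup goes through buf_hash_find() using the buf_hash() routine earlier in this file):

/*
 * A stale rebuilt header can never satisfy a read for live data, because
 * the lookup identity includes the birth TXG as well as the DVA.
 */
static boolean_t
l2arc_hdr_identity_matches(const arc_buf_hdr_t *hdr, uint64_t spa_guid,
    const dva_t *dva, uint64_t birth)
{
	return (hdr->b_spa == spa_guid &&
	    hdr->b_dva.dva_word[0] == dva->dva_word[0] &&
	    hdr->b_dva.dva_word[1] == dva->dva_word[1] &&
	    hdr->b_birth == birth);
}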
static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
{
@@ -4097,13 +4515,14 @@
return (next);
}
static void
-l2arc_hdr_stat_add(void)
+l2arc_hdr_stat_add(boolean_t from_arc)
{
ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ if (from_arc)
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
@@ -4134,11 +4553,14 @@
goto out;
first = NULL;
next = l2arc_dev_last;
do {
- /* loop around the list looking for a non-faulted vdev */
+ /*
+ * Loop around the list looking for a non-faulted vdev
+ * and one that isn't currently doing an L2ARC rebuild.
+ */
if (next == NULL) {
next = list_head(l2arc_dev_list);
} else {
next = list_next(l2arc_dev_list, next);
if (next == NULL)
@@ -4149,14 +4571,14 @@
if (first == NULL)
first = next;
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
next = NULL;
l2arc_dev_last = next;
out:
@@ -4206,12 +4628,13 @@
{
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *ab, *ab_prev;
- l2arc_buf_hdr_t *abl2;
+ l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
+ l2arc_log_blk_buf_t *lb_buf;
cb = zio->io_private;
ASSERT(cb != NULL);
dev = cb->l2wcb_dev;
ASSERT(dev != NULL);
@@ -4230,11 +4653,18 @@
/*
* All writes completed, or an error was hit.
*/
for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
ab_prev = list_prev(buflist, ab);
+ l2hdr = ab->b_l2hdr;
+ /*
+ * Release the temporary compressed buffer as soon as possible.
+ */
+ if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(ab);
+
hash_lock = HDR_LOCK(ab);
if (!mutex_tryenter(hash_lock)) {
/*
* This buffer misses out. It may be in a stage
* of eviction. Its ARC_L2_WRITING flag will be
@@ -4242,26 +4672,18 @@
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
- abl2 = ab->b_l2hdr;
-
- /*
- * Release the temporary compressed buffer as soon as possible.
- */
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(ab);
-
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
/*
* Allow ARC to begin reads to this L2ARC entry.
@@ -4276,10 +4698,16 @@
kmem_cache_free(hdr_cache, head);
mutex_exit(&l2arc_buflist_mtx);
l2arc_do_free_on_write();
+ for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
+ lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
+ (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
+ kmem_free(lb_buf, sizeof (*lb_buf));
+ }
+ list_destroy(&cb->l2wcb_log_blk_buf_list);
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
/*
* A read to a cache device completed. Validate buffer contents before
@@ -4399,20 +4827,33 @@
mutex_enter(*lock);
return (list);
}
/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz)
+{
+ return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
+ L2ARC_LOG_BLK_SIZE;
+}
+
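To get a feel for this bound (illustration only): a pass that writes 8 MB of minimum-sized 512-byte buffers can reference at most 16384 buffers, which needs ceil(16384 / 1023) = 17 log blocks, i.e. about 2.1 MB of extra headroom.

/* Illustration: worst-case log block headroom for a given write size. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t minblock = 512;		/* SPA_MINBLOCKSIZE */
	const uint64_t entries = 1023;		/* L2ARC_LOG_BLK_ENTRIES */
	const uint64_t lb_size = 128 * 1024;	/* L2ARC_LOG_BLK_SIZE */
	uint64_t write_sz = 8ULL << 20;		/* an 8 MB write pass */
	uint64_t overhead = (write_sz / minblock / entries + 1) * lb_size;

	printf("%llu bytes written -> %llu bytes of log block headroom\n",
	    (unsigned long long)write_sz, (unsigned long long)overhead);
	return (0);
}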
+/*
* Evict buffers from the device write hand to the distance specified in
* bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
- l2arc_buf_hdr_t *abl2;
+ l2arc_buf_hdr_t *l2hdr;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
uint64_t taddr;
buflist = dev->l2ad_buflist;
@@ -4426,10 +4867,14 @@
* nothing to evict.
*/
return;
}
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance);
if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
/*
* When nearing the end of the device, evict to the end
* before the device write hand jumps to the start.
*/
@@ -4508,14 +4953,14 @@
/*
* Tell ARC this no longer exists in L2ARC.
*/
if (ab->b_l2hdr != NULL) {
- abl2 = ab->b_l2hdr;
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+ l2hdr = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
/*
@@ -4547,27 +4992,43 @@
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
list_t *list;
- uint64_t write_asize, write_psize, write_sz, headroom,
+ /*
+ * These variables mean:
+ * - write_size: in-memory size of ARC buffers we've written (before
+ * compression).
+ * - write_asize: actual on-disk size of ARC buffers we've written
+ * (after compression).
+ * - write_aligned_asize: actual sum of space taken by ARC buffers
+ * on the device (after compression and alignment, so that
+ * every buffer starts on a multiple of the device block size).
+ * - headroom: L2ARC scanning headroom (we won't scan beyond this
+ * distance from the list tail).
+ * - buf_compress_minsz: minimum in-memory ARC buffer size for us
+ * to try compressing it.
+ */
+ uint64_t write_size, write_asize, write_aligned_asize, headroom,
buf_compress_minsz;
void *buf_data;
kmutex_t *list_lock;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
const boolean_t do_headroom_boost = *headroom_boost;
+ boolean_t dev_hdr_update = B_FALSE;
ASSERT(dev->l2ad_vdev != NULL);
/* Lower the flag now, we might want to raise it again later. */
*headroom_boost = B_FALSE;
pio = NULL;
- write_sz = write_asize = write_psize = 0;
+ cb = NULL;
+ write_size = write_asize = write_aligned_asize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
/*
@@ -4601,11 +5062,11 @@
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; ab; ab = ab_prev) {
l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
- uint64_t buf_sz;
+ uint64_t buf_aligned_size;
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
@@ -4616,11 +5077,19 @@
* Skip this buffer rather than waiting.
*/
continue;
}
- passed_sz += ab->b_size;
+ /*
+ * When examining whether we've met our write target,
+ * we must always use the aligned size of the buffer,
+ * since that's the maximum amount of space a buffer
+ * can take up on the L2ARC device.
+ */
+ buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
+ ab->b_size);
+ passed_sz += buf_aligned_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
@@ -4630,11 +5099,11 @@
if (!l2arc_write_eligible(guid, ab)) {
mutex_exit(hash_lock);
continue;
}
- if ((write_sz + ab->b_size) > target_sz) {
+ if ((write_size + buf_aligned_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
}
@@ -4644,22 +5113,25 @@
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
list_insert_head(dev->l2ad_buflist, head);
- cb = kmem_alloc(
+ cb = kmem_zalloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
+ list_create(&cb->l2wcb_log_blk_buf_list,
+ sizeof (l2arc_log_blk_buf_t),
+ offsetof(l2arc_log_blk_buf_t, l2lbb_node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
/*
* Create and add a new L2ARC header.
*/
- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
l2hdr->b_dev = dev;
ab->b_flags |= ARC_L2_WRITING;
/*
* Temporarily stash the data buffer in b_tmp_cdata.
@@ -4671,11 +5143,10 @@
*/
l2hdr->b_compress = ZIO_COMPRESS_OFF;
l2hdr->b_asize = ab->b_size;
l2hdr->b_tmp_cdata = ab->b_buf->b_data;
- buf_sz = ab->b_size;
ab->b_l2hdr = l2hdr;
list_insert_head(dev->l2ad_buflist, ab);
/*
@@ -4685,11 +5156,11 @@
arc_cksum_verify(ab->b_buf);
arc_cksum_compute(ab->b_buf, B_TRUE);
mutex_exit(hash_lock);
- write_sz += buf_sz;
+ write_size += buf_aligned_size;
}
mutex_exit(list_lock);
if (full == B_TRUE)
@@ -4696,11 +5167,11 @@
break;
}
/* No buffers selected for writing? */
if (pio == NULL) {
- ASSERT0(write_sz);
+ ASSERT0(write_size);
mutex_exit(&l2arc_buflist_mtx);
kmem_cache_free(hdr_cache, head);
return (0);
}
@@ -4741,11 +5212,11 @@
buf_data = l2hdr->b_tmp_cdata;
buf_sz = l2hdr->b_asize;
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
- uint64_t buf_p_sz;
+ uint64_t buf_aligned_asize;
wzio = zio_write_phys(pio, dev->l2ad_vdev,
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
@@ -4756,30 +5227,41 @@
write_asize += buf_sz;
/*
* Keep the clock hand suitably device-aligned.
*/
- buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
- write_psize += buf_p_sz;
- dev->l2ad_hand += buf_p_sz;
+ buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ buf_sz);
+ write_aligned_asize += buf_aligned_asize;
+ dev->l2ad_hand += buf_aligned_asize;
+ ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
+ dev->l2ad_first);
}
- }
+ if (l2arc_log_blk_insert(dev, ab)) {
+ l2arc_log_blk_commit(dev, pio, cb);
+ dev_hdr_update = B_TRUE;
+ }
+ }
mutex_exit(&l2arc_buflist_mtx);
- ASSERT3U(write_asize, <=, target_sz);
+ if (dev_hdr_update)
+ l2arc_dev_hdr_update(dev, pio);
+
+ VERIFY3U(write_aligned_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
- ARCSTAT_INCR(arcstat_l2_size, write_sz);
- ARCSTAT_INCR(arcstat_l2_asize, write_asize);
- vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_size, write_size);
+ ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
+ vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
- if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
+ if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
+ dev->l2ad_end) {
vdev_space_update(dev->l2ad_vdev,
dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
@@ -5037,29 +5519,39 @@
}
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+static l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
l2arc_dev_t *dev;
+ boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
+ if (!held)
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_vdev == vd)
break;
}
+ if (!held)
mutex_exit(&l2arc_dev_mtx);
- return (dev != NULL);
+ return (dev);
}
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
- * validated the vdev and opened it.
+ * validated the vdev and opened it. The `rebuild' flag indicates whether
+ * we should attempt an L2ARC persistency rebuild.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
{
l2arc_dev_t *adddev;
ASSERT(!l2arc_vdev_present(vd));
@@ -5067,11 +5559,12 @@
* Create a new l2arc device entry.
*/
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
@@ -5090,10 +5583,20 @@
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
+ if (rebuild && l2arc_rebuild_enabled &&
+ adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ adddev->l2ad_rebuild = B_TRUE;
+ }
mutex_exit(&l2arc_dev_mtx);
}
/*
* Remove a vdev from the L2ARC.
@@ -5196,6 +5699,731 @@
cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
l2arc_thread_exit = 1;
while (l2arc_thread_exit != 0)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
+}
+
+/*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called as one of the final steps of a pool import.
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+ l2arc_dev_t *dev;
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ ASSERT(dev != NULL);
+ if (dev->l2ad_rebuild) {
+ (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ }
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Main entry point for L2ARC rebuilding.
+ */
+static void
+l2arc_dev_rebuild_start(l2arc_dev_t *dev)
+{
+ spa_t *spa = dev->l2ad_spa;
+ vdev_t *vd = dev->l2ad_vdev;
+
+ /* Lock out device removal. */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ ASSERT(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ dev->l2ad_rebuild = B_FALSE;
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ *
+ * 1) reads the device's header
+ * 2) if a good device header is found, starts reading the log block chain
+ * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log blk chain (the back-reference in the blk is
+ * invalid or loops over our starting point).
+ * 2) We encounter *any* error condition (cksum errors, io errors, looped
+ * blocks, etc.).
+ * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
+ * to keep severely fragmented L2ARC log blocks or slow L2ARC devices
+ * from holding up a pool import indefinitely (and thus letting the
+ * administrator take corrective action, e.g. by kicking the misbehaving
+ * L2ARC device out of the pool, or by reimporting the pool with L2ARC
+ * rebuilding disabled).
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ int err;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ uint8_t *this_lb_buf, *next_lb_buf;
+ zio_t *this_io = NULL, *next_io = NULL;
+ int64_t deadline;
+ l2arc_log_blk_ptr_t lb_ptrs[2];
+ boolean_t first_pass;
+ uint64_t load_guid;
+
+ load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+ deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
+ /*
+ * Device header processing phase.
+ */
+ if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
+ /* device header corrupted, start a new one */
+		bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
+ return (err);
+ }
+ if (l2arc_check_rebuild_timeout_hit(deadline))
+ return (SET_ERROR(ETIMEDOUT));
+
+ /* Retrieve the persistent L2ARC device state */
+ dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
+ dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
+ LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
+ dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
+ L2ARC_DEV_HDR_EVICT_FIRST);
+
+ /* Prepare the rebuild processing state */
+ bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
+ this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+ this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
+ next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
+ first_pass = B_TRUE;
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
+ /* We hit an invalid block address, end the rebuild. */
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
+ this_lb, next_lb, this_lb_buf, next_lb_buf,
+ this_io, &next_io)) != 0)
+ break;
+
+ /* Protection against infinite loops of log blocks. */
+ if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
+ lb_ptrs[0].l2lbp_daddr,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
+ !first_pass) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
+ err = SET_ERROR(ELOOP);
+ break;
+ }
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blk, so the user may choose to re-add the
+ * L2ARC dev at a later time to reconstruct it (when there's
+ * less memory pressure).
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ break;
+ }
+
+ /*
+ * Now that we know that the next_lb checks out alright, we
+ * can start reconstruction from this lb - we can be sure
+ * that the L2ARC write hand has not yet reached any of our
+ * buffers.
+ */
+ l2arc_log_blk_restore(dev, load_guid, this_lb,
+ LBP_GET_PSIZE(&lb_ptrs[0]));
+
+ /*
+ * End of list detection. We can look ahead two steps in the
+ * blk chain and if the 2nd blk from this_lb dips below the
+ * initial chain starting point, then we know two things:
+ * 1) it can't be valid, and
+ * 2) the next_lb's ARC entries might have already been
+ * partially overwritten and so we should stop before
+ * we restore it
+ */
+ if (l2arc_range_check_overlap(
+ this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
+ !first_pass)
+ break;
+
+ /* log blk restored, continue with next one in the list */
+ lb_ptrs[0] = lb_ptrs[1];
+ lb_ptrs[1] = this_lb->l2lb_back2_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ PTR_SWAP(this_lb_buf, next_lb_buf);
+ this_io = next_io;
+ next_io = NULL;
+ first_pass = B_FALSE;
+
+ if (l2arc_check_rebuild_timeout_hit(deadline)) {
+ err = SET_ERROR(ETIMEDOUT);
+ break;
+ }
+ }
+ if (next_io != NULL)
+ l2arc_log_blk_prefetch_abort(next_io);
+ kmem_free(this_lb, sizeof (*this_lb));
+ kmem_free(next_lb, sizeof (*next_lb));
+ kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
+ kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
+ if (err == 0)
+ ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
+
+ return (err);
+}
+
+/*
+ * Restores the payload of a log blk to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
+ l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
+{
+ uint64_t size = 0, psize = 0;
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse direction to preserve correct
+ * temporal ordering of buffers in the l2ad_buflist.
+ */
+ l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
+ size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
+ psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ /*
+ * Record rebuild stats:
+ * size In-memory size of restored buffer data in ARC
+ * psize Physical size of restored buffers in the L2ARC
+ * bufs # of ARC buffer headers restored
+ * log_blks # of L2ARC log blocks processed during restore
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
+ vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log block. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
+ uint64_t load_guid)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = LE_GET_TYPE(le);
+ l2arc_buf_hdr_t *l2hdr;
+
+ hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
+ hdr->b_dva = le->l2le_dva;
+ hdr->b_birth = le->l2le_birth;
+ hdr->b_cksum0 = le->l2le_cksum0;
+ hdr->b_size = LE_GET_LSIZE(le);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(hdr);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ return;
+ }
+ hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
+ if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
+ hdr->b_flags |= ARC_L2COMPRESS;
+ mutex_enter(&hdr->b_freeze_lock);
+ ASSERT(hdr->b_freeze_cksum == NULL);
+ hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
+ mutex_exit(&hdr->b_freeze_lock);
+
+ /* now rebuild the l2arc entry */
+ ASSERT(hdr->b_l2hdr == NULL);
+ l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
+ l2hdr->b_dev = dev;
+ l2hdr->b_daddr = le->l2le_daddr;
+ l2hdr->b_asize = LE_GET_PSIZE(le);
+ l2hdr->b_compress = LE_GET_COMPRESS(le);
+ hdr->b_l2hdr = l2hdr;
+ list_insert_tail(dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
+
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Attempts to read the device header on the provided L2ARC device and writes
+ * it to `hdr'. On success, this function returns 0, otherwise the appropriate
+ * error code is returned.
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
+{
+ int err;
+ uint64_t guid;
+ zio_cksum_t cksum;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
+ ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ return (err);
+ }
+
+ if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(hdr, sizeof (*hdr));
+
+ if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ hdr->l2dh_spa_guid != guid) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ l2arc_dev_hdr_checksum(hdr, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
+ return (SET_ERROR(EINVAL));
+ }
+ if (hdr->l2dh_evict_tail < dev->l2ad_start ||
+ hdr->l2dh_evict_tail >= dev->l2ad_end) {
+ /* Data in dev hdr is invalid for this device. */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple prefetcher to make sure that while
+ * we're processing one buffer the L2ARC is already prefetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log blk
+ * address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
+ * and next_lb_buf must be buffers of appropriate size to hold a raw
+ * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
+ * to buffer decompression).
+ *
+ * The `this_io' and `next_io' arguments are used for block prefetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * prefetch IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the prefetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of prefetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ uint8_t *this_lb_buf, uint8_t *next_lb_buf,
+ zio_t *this_io, zio_t **next_io)
+{
+ int err = 0;
+ zio_cksum_t cksum;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log blk in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
+ this_lb_buf);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log blk early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log blk.
+ */
+ *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
+ next_lb_buf);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ goto cleanup;
+ }
+
+ /* Make sure the buffer checks out */
+ fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (LBP_GET_COMPRESS(this_lbp)) {
+ case ZIO_COMPRESS_OFF:
+ bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
+ break;
+ case ZIO_COMPRESS_LZ4:
+ if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
+ this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
+ sizeof (*this_lb))) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+cleanup:
+ /* Abort an in-flight prefetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_prefetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ return (err);
+}
+
+/*
+ * Validates an L2ARC log blk address to make sure that it can be read
+ * from the provided L2ARC device. Returns B_TRUE if the address is
+ * within the device's bounds, or B_FALSE if not.
+ */
+static boolean_t
+l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
+{
+ uint64_t psize = LBP_GET_PSIZE(lbp);
+ uint64_t end = lbp->l2lbp_daddr + psize;
+
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely between l2ad_start and l2ad_end
+ * - it has a valid size
+ * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
+ * doesn't sit in the evicted region)
+ */
+ return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
+ psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
+ lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain newly allocated memory buffers for the IO
+ * data which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
+ uint8_t *lb_buf)
+{
+ uint32_t psize;
+ zio_t *pio;
+
+ psize = LBP_GET_PSIZE(lbp);
+ ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
+ pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
+ lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_log_blk_prefetch_abort(zio_t *zio)
+{
+ (void) zio_wait(zio);
+}
+
+/*
+ * Creates a zio to update the device header on an l2arc device. The zio is
+ * initiated as a child of `pio'.
+ */
+static void
+l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
+{
+ zio_t *wzio;
+ vdev_stat_t st;
+ l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
+
+ vdev_get_stats(dev->l2ad_vdev, &st);
+
+ hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
+ hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ hdr->l2dh_evict_tail = dev->l2ad_evict;
+ hdr->l2dh_alloc_space = st.vs_alloc;
+ hdr->l2dh_flags = 0;
+ if (dev->l2ad_first)
+ hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ /* checksum operation goes last */
+ l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
+
+ CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
+ sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
+ sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ uint64_t psize, asize;
+ l2arc_log_blk_buf_t *lb_buf;
+ zio_t *wzio;
+
+ VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+
+ /* link the buffer into the block chain */
+ lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
+ lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /* try to compress the buffer */
+ lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
+ list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
+ VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
+ lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
+
+ /*
+ * Update the start log blk pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0];
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
+ LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
+ LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
+ LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_CHECKSUM_FLETCHER_4);
+ LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
+ if (psize < sizeof (*lb)) {
+ /* compression succeeded */
+ LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
+ LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_COMPRESS_OFF);
+ }
+ /* checksum what we're about to write */
+ fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
+ &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
+
+ /* perform the write itself */
+ CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
+ L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ /* realign the device hand */
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ dev->l2ad_hand += asize;
+ VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
+}
+
+/*
+ * Computes the checksum of `hdr' and stores it in `cksum'.
+ */
+static void
+l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
+{
+ fletcher_4_native((uint8_t *)hdr +
+ offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ cksum);
+}
+
+/*
+ * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
+ * The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+ const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT(l2hdr != NULL);
+ ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
+
+ le = &lb->l2lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->l2le_dva = ab->b_dva;
+ le->l2le_birth = ab->b_birth;
+ le->l2le_cksum0 = ab->b_cksum0;
+ le->l2le_daddr = l2hdr->b_daddr;
+ LE_SET_LSIZE(le, ab->b_size);
+ LE_SET_PSIZE(le, l2hdr->b_asize);
+ LE_SET_COMPRESS(le, l2hdr->b_compress);
+ le->l2le_freeze_cksum = *ab->b_freeze_cksum;
+ LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
+ LE_SET_TYPE(le, ab->b_type);
+ dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
+
+ return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+}
+
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom Lower end of the range to check (written to earlier).
+ * top Upper end of the range to check (written to later).
+ * check The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+static inline boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+}
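A few concrete cases (illustration only, using the function above):

/* Ordered range: bottom < top. */
ASSERT(l2arc_range_check_overlap(100, 200, 150));
ASSERT(!l2arc_range_check_overlap(100, 200, 250));
/* Wrapped range: bottom > top, so the range spans the end of the device. */
ASSERT(l2arc_range_check_overlap(800, 200, 900));	/* after bottom */
ASSERT(l2arc_range_check_overlap(800, 200, 100));	/* before top */
ASSERT(!l2arc_range_check_overlap(800, 200, 500));	/* in the gap */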
+
+/*
+ * Checks whether a rebuild timeout deadline has been hit and if it has,
+ * increments the appropriate error counters.
+ */
+static boolean_t
+l2arc_check_rebuild_timeout_hit(int64_t deadline)
+{
+ if (deadline != 0 && deadline < ddi_get_lbolt64()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
+ cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
+ "dropping remaining L2ARC metadata.");
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
}