3525 Persistent L2ARC
@@ -134,10 +134,11 @@
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
+#include <sys/byteorder.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
@@ -305,10 +306,28 @@
kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_l2_compress_successes;
kstat_named_t arcstat_l2_compress_zeros;
kstat_named_t arcstat_l2_compress_failures;
+ kstat_named_t arcstat_l2_meta_writes;
+ kstat_named_t arcstat_l2_meta_avg_size;
+ kstat_named_t arcstat_l2_meta_avg_asize;
+ kstat_named_t arcstat_l2_asize_to_meta_ratio;
+ kstat_named_t arcstat_l2_rebuild_attempts;
+ kstat_named_t arcstat_l2_rebuild_successes;
+ kstat_named_t arcstat_l2_rebuild_unsupported;
+ kstat_named_t arcstat_l2_rebuild_timeout;
+ kstat_named_t arcstat_l2_rebuild_arc_bytes;
+ kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ kstat_named_t arcstat_l2_rebuild_metabufs;
+ kstat_named_t arcstat_l2_rebuild_uberblk_errors;
+ kstat_named_t arcstat_l2_rebuild_io_errors;
+ kstat_named_t arcstat_l2_rebuild_cksum_errors;
+ kstat_named_t arcstat_l2_rebuild_loop_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_duplicate_buffers;
kstat_named_t arcstat_duplicate_buffers_size;
kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_meta_used;
@@ -371,10 +390,28 @@
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "l2_compress_successes", KSTAT_DATA_UINT64 },
{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
{ "l2_compress_failures", KSTAT_DATA_UINT64 },
+ { "l2_meta_writes", KSTAT_DATA_UINT64 },
+ { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
+ { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
+ { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "duplicate_buffers", KSTAT_DATA_UINT64 },
{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
{ "duplicate_reads", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
@@ -418,10 +455,29 @@
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first scale both it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to limit the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(NOTREACHED) \
+ _NOTE(CONSTCOND) \
+ } while (0)
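+/*
+ * Illustrative example (hypothetical values, not taken from the change):
+ * with ARCSTAT_F_AVG_FACTOR == 3, updating a stat whose current value is
+ * 90 with a new sample of 30 yields 90 - 90/3 + 30/3 = 70, i.e. each
+ * update moves the average roughly a third of the way toward the sample.
+ */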
+
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
@@ -625,23 +681,11 @@
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- uint64_t l2ad_evict; /* last addr eviction reached */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
-
+typedef struct l2arc_dev l2arc_dev_t;
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
@@ -660,10 +704,13 @@
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ uint8_t *l2wcb_pbuf; /* pbuf sent in this write */
+ uint32_t l2wcb_pbuf_size; /* size of committed pbuf */
+ uint8_t *l2wcb_ub_buf; /* uberblock in this write */
} l2arc_write_callback_t;
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
@@ -687,18 +734,242 @@
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_add(boolean_t from_arc);
static void l2arc_hdr_stat_remove(void);
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+typedef enum {
+ L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
+ L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */
+} l2uberblock_flags_t;
+
+typedef struct l2uberblock {
+ uint32_t ub_magic;
+ uint8_t ub_version;
+ l2uberblock_flags_t ub_flags;
+
+ uint64_t ub_spa_guid;
+ uint64_t ub_birth;
+ uint64_t ub_evict_tail; /* current evict pointer */
+ uint64_t ub_alloc_space; /* vdev space alloc status */
+ uint64_t ub_pbuf_daddr; /* address of newest pbuf */
+ uint32_t ub_pbuf_asize; /* size of newest pbuf */
+ zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */
+
+ zio_cksum_t ub_cksum; /* cksum of uberblock */
+} l2uberblock_t;
+
+typedef enum {
+ L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
+ L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */
+} l2pbuf_flags_t;
+
+typedef struct l2pbuf {
+ uint32_t pb_magic;
+ unsigned int pb_version;
+ l2pbuf_flags_t pb_flags;
+
+ uint64_t pb_prev_daddr; /* address of previous pbuf */
+ uint32_t pb_prev_asize; /* size of previous pbuf */
+ zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */
+
+ /*
+ * This is a set of item lists that are contained in this pbuf. Each
+ * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
+ * This serves as a soft timeout feature - once the limit of the
+ * number of item lists that a pbuf can hold is reached, the pbuf is
+ * flushed to stable storage, regardless of its total size.
+ */
+ list_t *pb_buflists_list;
+
+ /*
+ * Number of compressed bytes referenced by items in this pbuf and
+ * the number of lists present.
+ * This is not actually written to storage, it is only used by
+ * internal algorithms which check for when a pbuf reaches a
+ * certain size limit, after which it is flushed in a write.
+ */
+ uint64_t pb_payload_asz;
+ /* Same thing for number of buflists */
+ int pb_nbuflists;
+
+ /*
+ * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
+ * This is then used by l2arc_pbuf_restore to update used space
+ * on the L2ARC vdev.
+ */
+ size_t pb_asize;
+} l2pbuf_t;
+
+typedef struct l2pbuf_buf l2pbuf_buf_t;
+typedef struct l2pbuf_buflist {
+ uint32_t l2pbl_nbufs;
+ l2pbuf_buf_t *l2pbl_bufs;
+ list_node_t l2pbl_node;
+} l2pbuf_buflist_t;
+
+struct l2pbuf_buf {
+ dva_t b_dva; /* dva of buffer */
+ uint64_t b_birth; /* birth txg of buffer */
+ uint64_t b_cksum0;
+ zio_cksum_t b_freeze_cksum;
+ uint32_t b_size; /* uncompressed buf size */
+ uint64_t b_l2daddr; /* buf location on l2dev */
+ uint32_t b_l2asize; /* actual buf data size */
+ enum zio_compress b_l2compress; /* compression applied */
+ uint16_t b_contents_type;
+ uint32_t b_flags;
+};
+
+struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ l2pbuf_t l2ad_pbuf; /* currently open pbuf */
+ uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */
+ uint64_t l2ad_pbuf_asize; /* prev pbuf asize */
+ zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */
+ /* uberblock birth counter - incremented for each committed uberblk */
+ uint64_t l2ad_uberblock_birth;
+ /* flag indicating whether a rebuild is currently going on */
+ boolean_t l2ad_rebuilding;
+};
+
+/* Stores information about an L2ARC prefetch zio */
+typedef struct l2arc_prefetch_info {
+ uint8_t *pi_buf; /* where the zio writes to */
+ uint64_t pi_buflen; /* length of `buf' */
+ zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */
+} l2arc_prefetch_info_t;
+
+/* single 4k l2uberblock reserved at the start of the device */
+#define L2UBERBLOCK_SIZE 4096
+#define L2UBERBLOCK_MAGIC 0x12bab10c
+#define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */
+#define L2PBUF_MAGIC 0xdb0faba6
+#define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */
+#define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */
+#define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */
+#define L2PBUF_ENCODED_SIZE(_pb) \
+ (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
+/*
+ * Allocation limit for the payload of a pbuf. This also fundamentally
+ * limits the number of bufs we can reference in a pbuf.
+ */
+#define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
+#define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
+#define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */
+#define L2PBUF_MAXSZ (100 * 1024 * 1024) /* maximum pbuf size */
+#define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */
+#define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */
+#define L2PBUF_IS_FULL(_pb) \
+ ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
+ (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
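+/*
+ * Sizing sketch, derived from the constants above (illustration only):
+ * with an 88-byte on-disk entry (L2PBUF_BUF_SIZE) and a 24 MB payload
+ * limit (L2PBUF_MAX_PAYLOAD_SIZE), a single pbuf can reference at most
+ * 24 * 1024 * 1024 / 88 = 285975 L2ARC buffers (L2PBUF_MAX_BUFS).
+ */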
+/*
+ * These are the flags we allow to persist in L2ARC pbufs. The other flags
+ * of an ARC buffer pertain to the buffer's runtime behavior.
+ */
+#define L2ARC_PERSIST_FLAGS \
+ (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
+
+/*
+ * Used during L2ARC rebuild after each read operation to check whether we
+ * haven't exceeded the rebuild timeout value.
+ */
+#define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
+ do { \
+ if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
+ __VA_ARGS__; \
+ ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
+ cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
+ "dropping remaining L2ARC metadata."); \
+ return; \
+ } \
+ _NOTE(NOTREACHED) \
+ _NOTE(CONSTCOND) \
+ } while (0)
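+/*
+ * Intended usage (see l2arc_rebuild below): compute the deadline once,
+ * e.g. deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout, then
+ * invoke L2ARC_CHK_REBUILD_TIMEOUT(deadline, <cleanup>) after each pbuf
+ * read so that a slow device cannot stall the rebuild indefinitely.
+ */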
+
+/*
+ * Performance tuning of L2ARC persistency:
+ *
+ * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
+ * compressing it.
+ * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
+ * referenced from a pbuf. Once a pbuf reaches this size, it is
+ * committed to stable storage. Ideally, there should be approx.
+ * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
+ * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
+ * be buffered in a pbuf before it is committed to L2ARC. This
+ * puts a soft temporal upper bound on pbuf commit intervals.
+ * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
+ * pool import or when adding one manually later) will attempt
+ * to rebuild L2ARC buffer contents. In special circumstances,
+ * the administrator may want to set this to B_FALSE, if they
+ * are having trouble importing a pool or attaching an L2ARC
+ * device (e.g. the L2ARC device is slow to read in stored pbuf
+ * metadata, or the metadata has become somehow
+ * fragmented/unusable).
+ * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to keep
+ * a slow L2ARC device from delaying pool import. If we
+ * are not done rebuilding an L2ARC device by this time, we
+ * stop the rebuild and return immediately.
+ */
+uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
+uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
+uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
+boolean_t l2arc_rebuild_enabled = B_TRUE;
+uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
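+/*
+ * Illustrative arithmetic only: with the default l2arc_pbuf_max_sz of
+ * 100 MB, a 400 GB cache device would ideally carry on the order of
+ * 400 * 1024 / 100 = ~4096 pbufs (see the sizing note above).
+ */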
+
+static void l2arc_rebuild_start(l2arc_dev_t *dev);
+static void l2arc_rebuild(l2arc_dev_t *dev);
+static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
+static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
+ uint64_t guid);
+
+static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
+static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
+ zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
+static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
+ uint32_t asize);
+static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
+static void l2arc_pbuf_prefetch_abort(zio_t *zio);
+
+static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
+static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
+static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
+ uint64_t guid);
+static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
+static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
+ l2pbuf_t *pbuf);
+static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
+ uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
+static void l2arc_pbuf_init(l2pbuf_t *pb);
+static void l2arc_pbuf_destroy(l2pbuf_t *pb);
+static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
+static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
+ const arc_buf_hdr_t *ab, int index);
+static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
+
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
@@ -1235,11 +1506,11 @@
}
ab->b_state = new_state;
/* adjust l2arc hdr stats */
if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
+ l2arc_hdr_stat_add(old_state != arc_anon);
else if (old_state == arc_l2c_only)
l2arc_hdr_stat_remove();
}
void
@@ -1339,10 +1610,37 @@
(void) refcount_add(&hdr->b_refcnt, tag);
return (buf);
}
+/*
+ * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc->(disk).
+ */
+arc_buf_hdr_t *
+arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = guid;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ hdr->b_buf = NULL;
+ hdr->b_datacnt = 0;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ return (hdr);
+}
+
static char *arc_onloan_tag = "onloan";
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
@@ -3971,10 +4269,148 @@
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistency:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) Every now and then, at the end of an L2ARC feed cycle, we append a
+ * piece of metadata (called a "pbuf", or "persistency buffer") to the
+ * L2ARC write. This allows us to understand what's been written, so that
+ * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
+ * The pbuf also includes a "back-reference" pointer to the previous
+ * pbuf, forming a linked list of pbufs on the L2ARC device.
+ *
+ * *) We reserve 4k of space at the start of each L2ARC device for our
+ * header bookkeeping purposes. This contains a single 4k uberblock, which
+ * contains our top-level reference structures. We update it on each pbuf
+ * write. If this write results in an inconsistent uberblock (e.g. due to
+ * power failure), we detect this by verifying the uberblock's checksum
+ * and simply drop the entries from L2ARC. Once an L2ARC pbuf update
+ * completes, we update the uberblock to point to it.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ____________newest pbuf pointer_____________ |
+ * | / \ |
+ * | / V |
+ * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
+ * | ^ / ^ / ^ / |
+ * | `-prev-' `-prev-' `-prev-' |
+ * | pbuf pbuf pbuf |
+ * +======================================================================+
+ *
+ * On-device data structures:
+ *
+ * (L2ARC persistent uberblock)
+ * struct l2uberblock {
+ * (these fields are in network byte order)
+ * uint32_t magic = 0x12bab10c; l2-ber-block
+ * uint8_t version = 0x1;
+ * uint8_t reserved = 0x0;
+ * uint16_t ublk_flags; see l2uberblock_flags_t
+ *
+ * (byte order of fields below determined by `ublk_flags')
+ * uint64_t spa_guid; what pool this l2arc dev belongs to
+ * uint64_t birth_txg; ublk with highest birth_txg is newest
+ * uint64_t evict_tail; current evict pointer on l2arc dev
+ * uint64_t alloc_space; how much space is alloc'd on the dev
+ * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t
+ * uint32_t pbuf_asize; size of newest pbuf
+ * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf
+ *
+ * uint8_t reserved[3980] = {0x0, 0x0, ... 0x0};
+ *
+ * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
+ * } l2dev_uberblock;
+ *
+ * (L2ARC persistent buffer list)
+ * typedef struct l2pbuf_t {
+ * (these fields are in network byte order)
+ * uint32_t magic = 0xdb0faba6; the-buffer-bag
+ * uint8_t version = 0x1;
+ * uint8_t reserved = 0x0;
+ * uint16_t pbuf_flags; see l2pbuf_flags_t
+ *
+ * (byte order of fields below determined by `pbuf_flags')
+ * uint64_t prev_pbuf_daddr; previous pbuf dev addr
+ * uint32_t prev_pbuf_asize; previous pbuf size
+ * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf)
+ *
+ * uint32_t items_size; uncompressed size of `items' below
+ * (if (pbuf_flags & compress) decompress `items' prior to decoding)
+ * struct l2pbuf_buf_item {
+ * (these fields mirror [l2]arc_buf_hdr fields)
+ * uint64_t dva[2]; buffer's DVA
+ * uint64_t birth; buffer's birth TXG in ARC
+ * uint64_t cksum0; lower 64-bits of buffer's cksum
+ * uint64_t freeze_cksum[4]; buffer's freeze cksum
+ * uint32_t size; uncompressed buffer data size
+ * uint64_t l2daddr; device address (offset) of buf
+ * uint32_t l2asize; actual space occupied by buf
+ * uint8_t compress; compress algo used on data
+ * uint8_t contents_type; buffer's contents type
+ * uint16_t reserved = 0x0; for alignment and future use
+ * uint32_t flags; buffer's persistent flags
+ * } items[]; continues for remainder of pbuf
+ * } l2pbuf_t;
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (appending
+ * an updated l2pbuf_t every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed l2pbuf (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|-->
+ * ^ ^^^^^^^^^_____________________________
+ * | \
+ * <<nextwrite>> - will overwrite this pbuf --/
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process:
+ * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
+ * previous one.
+ * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
+ * then the pbuf is invalid and stop scanning (goto step 3 below).
+ * 3) if (this is the last valid pbuf)
+ * discard this pbuf as well (its ARC bufs may have been damaged by a
+ * partial overwrite).
+ * (We could potentially salvage the remaining good arc bufs above in step 3,
+ * but the cost of doing so probably outweighs the value of the entire pbuf).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update pbufs which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time - once
+ * created, a block is immutable on disk. The worst we will have done is
+ * waste some time and memory during l2arc rebuild reconstructing outdated
+ * ARC entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
*/
static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
{
@@ -4037,13 +4473,14 @@
return (next);
}
static void
-l2arc_hdr_stat_add(void)
+l2arc_hdr_stat_add(boolean_t from_arc)
{
ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ if (from_arc)
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
@@ -4074,11 +4511,14 @@
goto out;
first = NULL;
next = l2arc_dev_last;
do {
- /* loop around the list looking for a non-faulted vdev */
+ /*
+ * Loop around the list looking for a non-faulted vdev
+ * and one that isn't currently doing an L2ARC rebuild.
+ */
if (next == NULL) {
next = list_head(l2arc_dev_list);
} else {
next = list_next(l2arc_dev_list, next);
if (next == NULL)
@@ -4089,14 +4529,14 @@
if (first == NULL)
first = next;
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
next = NULL;
l2arc_dev_last = next;
out:
@@ -4170,11 +4610,18 @@
/*
* All writes completed, or an error was hit.
*/
for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
ab_prev = list_prev(buflist, ab);
+ abl2 = ab->b_l2hdr;
+ /*
+ * Release the temporary compressed buffer as soon as possible.
+ */
+ if (abl2->b_compress != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(ab);
+
hash_lock = HDR_LOCK(ab);
if (!mutex_tryenter(hash_lock)) {
/*
* This buffer misses out. It may be in a stage
* of eviction. Its ARC_L2_WRITING flag will be
@@ -4182,18 +4629,10 @@
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
- abl2 = ab->b_l2hdr;
-
- /*
- * Release the temporary compressed buffer as soon as possible.
- */
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(ab);
-
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
@@ -4216,10 +4655,14 @@
kmem_cache_free(hdr_cache, head);
mutex_exit(&l2arc_buflist_mtx);
l2arc_do_free_on_write();
+ if (cb->l2wcb_pbuf)
+ kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
+ if (cb->l2wcb_ub_buf)
+ kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
/*
* A read to a cache device completed. Validate buffer contents before
@@ -4497,16 +4940,22 @@
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
const boolean_t do_headroom_boost = *headroom_boost;
+ /* persistency-related */
+ l2pbuf_t *pb;
+ l2pbuf_buflist_t *pb_buflist;
+ int num_bufs, buf_index;
+
ASSERT(dev->l2ad_vdev != NULL);
/* Lower the flag now, we might want to raise it again later. */
*headroom_boost = B_FALSE;
pio = NULL;
+ cb = NULL;
write_sz = write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
@@ -4514,11 +4963,20 @@
* We will want to try to compress buffers that are at least 2x the
* device sector size.
*/
buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
+ pb = &dev->l2ad_pbuf;
+ num_bufs = 0;
+
/*
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
for (int try = 0; try <= 3; try++) {
uint64_t passed_sz = 0;
@@ -4584,11 +5042,11 @@
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
list_insert_head(dev->l2ad_buflist, head);
- cb = kmem_alloc(
+ cb = kmem_zalloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
@@ -4626,10 +5084,11 @@
arc_cksum_compute(ab->b_buf, B_TRUE);
mutex_exit(hash_lock);
write_sz += buf_sz;
+ num_bufs++;
}
mutex_exit(list_lock);
if (full == B_TRUE)
@@ -4642,17 +5101,20 @@
mutex_exit(&l2arc_buflist_mtx);
kmem_cache_free(hdr_cache, head);
return (0);
}
+ /* expand the pbuf to include a new list */
+ pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
+
/*
* Now start writing the buffers. We're starting at the write head
* and work backwards, retracing the course of the buffer selector
* loop above.
*/
- for (ab = list_prev(dev->l2ad_buflist, head); ab;
- ab = list_prev(dev->l2ad_buflist, ab)) {
+ for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
+ ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_sz;
/*
* We shouldn't need to lock the buffer here, since we flagged
@@ -4700,21 +5162,31 @@
*/
buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
write_psize += buf_p_sz;
dev->l2ad_hand += buf_p_sz;
}
- }
+ l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
+ }
+ ASSERT(buf_index == num_bufs);
mutex_exit(&l2arc_buflist_mtx);
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
ARCSTAT_INCR(arcstat_l2_asize, write_asize);
vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
+ /* Is it time to commit this pbuf? */
+ if (L2PBUF_IS_FULL(pb) &&
+ dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
+ l2arc_pbuf_commit(dev, pio, cb);
+ l2arc_pbuf_destroy(pb);
+ l2arc_pbuf_init(pb);
+ }
+
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
@@ -4992,14 +5464,15 @@
return (dev != NULL);
}
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
- * validated the vdev and opened it.
+ * validated the vdev and opened it. The `rebuild' flag indicates whether
+ * we should attempt an L2ARC persistency rebuild.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
{
l2arc_dev_t *adddev;
ASSERT(!l2arc_vdev_present(vd));
@@ -5007,16 +5480,17 @@
* Create a new l2arc device entry.
*/
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ l2arc_pbuf_init(&adddev->l2ad_pbuf);
/*
* This is a list of all ARC buffers that are still valid on the
* device.
*/
@@ -5030,10 +5504,15 @@
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
+ if (rebuild && l2arc_rebuild_enabled) {
+ adddev->l2ad_rebuilding = B_TRUE;
+ (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
+ 0, &p0, TS_RUN, minclsyspri);
+ }
mutex_exit(&l2arc_dev_mtx);
}
/*
* Remove a vdev from the L2ARC.
@@ -5065,10 +5544,11 @@
mutex_exit(&l2arc_dev_mtx);
/*
* Clear all buflists and ARC references. L2ARC device flush.
*/
+ l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
l2arc_evict(remdev, 0, B_TRUE);
list_destroy(remdev->l2ad_buflist);
kmem_free(remdev->l2ad_buflist, sizeof (list_t));
kmem_free(remdev, sizeof (l2arc_dev_t));
}
@@ -5136,6 +5616,1153 @@
cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
l2arc_thread_exit = 1;
while (l2arc_thread_exit != 0)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
+}
+
+/*
+ * Main entry point for L2ARC metadata rebuilding. This function must be
+ * called via thread_create so that the L2ARC metadata rebuild doesn't block
+ * pool import and may proceed in parallel on all available L2ARC devices.
+ */
+static void
+l2arc_rebuild_start(l2arc_dev_t *dev)
+{
+ vdev_t *vd = dev->l2ad_vdev;
+ spa_t *spa = dev->l2ad_spa;
+
+ /* Lock out device removal. */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ ASSERT(dev->l2ad_rebuilding == B_TRUE);
+ l2arc_rebuild(dev);
+ dev->l2ad_rebuilding = B_FALSE;
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ *
+ * 1) scans the device for valid l2uberblocks
+ * 2) if it finds a good uberblock, starts reading the pbuf chain
+ * 3) restores each pbuf's contents to memory
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the pbuf chain (the previous-buffer reference
+ * in the pbuf is zero).
+ * 2) We encounter *any* error condition (cksum errors, io errors, looped
+ * pbufs, etc.).
+ * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
+ * severely fragmented L2ARC pbufs or slow L2ARC devices from
+ * preventing a machine from importing the pool (and letting the
+ * administrator take corrective action, e.g. by kicking the misbehaving
+ * L2ARC device out of the pool, or by reimporting the pool with L2ARC
+ * rebuilding disabled).
+ */
+static void
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ int err;
+ l2uberblock_t ub;
+ l2pbuf_t pb;
+ zio_t *this_io = NULL, *next_io = NULL;
+ int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
+
+ if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
+ return;
+ L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
+
+ /* set up uberblock update info */
+ dev->l2ad_uberblock_birth = ub.ub_birth + 1;
+
+ /* initial sanity checks */
+ l2arc_pbuf_init(&pb);
+ if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
+ ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
+ /* root pbuf is bad, we can't do anything about that */
+ if (err == EINVAL) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
+ } else {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
+ }
+ l2arc_pbuf_destroy(&pb);
+ return;
+ }
+ L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
+
+ dev->l2ad_evict = ub.ub_evict_tail;
+
+ /* keep on chaining in new blocks */
+ dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
+ dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
+ dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
+ dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
+ ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
+ dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
+
+ /* start the rebuild process */
+ for (;;) {
+ l2pbuf_t pb_prev;
+
+ l2arc_pbuf_init(&pb_prev);
+ if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
+ pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
+ &next_io)) != 0) {
+ /*
+ * We are done reading, discard the last good buffer.
+ */
+ if (pb.pb_prev_daddr > dev->l2ad_hand &&
+ pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
+ /* this is an error, we stopped too early */
+ if (err == EINVAL) {
+ ARCSTAT_BUMP(
+ arcstat_l2_rebuild_cksum_errors);
+ } else {
+ ARCSTAT_BUMP(
+ arcstat_l2_rebuild_io_errors);
+ }
+ }
+ l2arc_pbuf_destroy(&pb_prev);
+ l2arc_pbuf_destroy(&pb);
+ break;
+ }
+
+ /*
+ * Protection against infinite loops of pbufs. This is also
+ * our primary termination mechanism - once the buffer list
+ * loops around our starting pbuf, we can stop.
+ */
+ if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
+ pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
+ l2arc_pbuf_destroy(&pb);
+ l2arc_pbuf_destroy(&pb_prev);
+ if (next_io)
+ l2arc_pbuf_prefetch_abort(next_io);
+ return;
+ }
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to reconstruct the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata pbufs, so the user may choose to re-add the
+ * L2ARC dev at a later time to reconstruct it (when there's
+ * less memory pressure).
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ l2arc_pbuf_destroy(&pb);
+ l2arc_pbuf_destroy(&pb_prev);
+ if (next_io)
+ l2arc_pbuf_prefetch_abort(next_io);
+ break;
+ }
+
+ /*
+ * Now that we know that the prev_pbuf checks out alright, we
+ * can start reconstruction from this pbuf - we can be sure
+ * that the L2ARC write hand has not yet reached any of our
+ * buffers.
+ */
+ l2arc_pbuf_restore(dev, &pb);
+
+ /* pbuf restored, continue with next one in the list */
+ l2arc_pbuf_destroy(&pb);
+ pb = pb_prev;
+ this_io = next_io;
+ next_io = NULL;
+
+ L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
+}
+
+/*
+ * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
+ * which only contain an l2arc hdr, essentially restoring the buffers to
+ * their L2ARC evicted state. This function also updates space usage on the
+ * L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
+{
+ spa_t *spa;
+ uint64_t guid;
+ list_t *buflists_list;
+ l2pbuf_buflist_t *buflist;
+
+ mutex_enter(&l2arc_buflist_mtx);
+ spa = dev->l2ad_vdev->vdev_spa;
+ guid = spa_load_guid(spa);
+ buflists_list = pb->pb_buflists_list;
+ for (buflist = list_head(buflists_list); buflist;
+ buflist = list_next(buflists_list, buflist)) {
+ int i;
+ uint64_t size, asize, psize;
+
+ size = asize = psize = 0;
+ for (i = 0; i < buflist->l2pbl_nbufs; i++) {
+ l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
+ guid);
+ size += buflist->l2pbl_bufs[i].b_size;
+ asize += buflist->l2pbl_bufs[i].b_l2asize;
+ psize += vdev_psize_to_asize(dev->l2ad_vdev,
+ buflist->l2pbl_bufs[i].b_l2asize);
+ }
+ ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
+ vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
+ vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
+ pb->pb_asize), 0, 0);
+}
+
+/*
+ * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
+ * a state indicating that it has been evicted to L2ARC.
+ * The `guid' here is the ARC-load-guid from spa_load_guid.
+ */
+static void
+l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
+
+ hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
+ if (hdr == NULL) {
+ /* not in cache, try to insert */
+ arc_buf_hdr_t *exists;
+ arc_buf_contents_t type = buf->b_contents_type;
+ l2arc_buf_hdr_t *l2hdr;
+
+ hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
+ hdr->b_dva = buf->b_dva;
+ hdr->b_birth = buf->b_birth;
+ hdr->b_cksum0 = buf->b_cksum0;
+ hdr->b_size = buf->b_size;
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(hdr);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ return;
+ }
+ hdr->b_flags = buf->b_flags;
+ mutex_enter(&hdr->b_freeze_lock);
+ ASSERT(hdr->b_freeze_cksum == NULL);
+ hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_SLEEP);
+ *hdr->b_freeze_cksum = buf->b_freeze_cksum;
+ mutex_exit(&hdr->b_freeze_lock);
+
+ /* now rebuild the l2arc entry */
+ ASSERT(hdr->b_l2hdr == NULL);
+ l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ l2hdr->b_dev = dev;
+ l2hdr->b_daddr = buf->b_l2daddr;
+ l2hdr->b_asize = buf->b_l2asize;
+ l2hdr->b_compress = buf->b_l2compress;
+ hdr->b_l2hdr = l2hdr;
+ list_insert_head(dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
+
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ }
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Attempts to locate and read the newest valid uberblock on the provided
+ * L2ARC device and writes it to `ub'. On success, this function returns 0,
+ * otherwise the appropriate error code is returned.
+ */
+static int
+l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
+{
+ int err = 0;
+ uint8_t *ub_buf;
+ uint64_t guid;
+
+ ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
+ ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
+ ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
+ goto cleanup;
+ }
+
+ /*
+ * Initial peek - does the device even have any usable uberblocks?
+ * If not, don't bother continuing.
+ */
+ l2arc_uberblock_decode(ub_buf, ub);
+ if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
+ ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
+ ub->ub_spa_guid != guid) {
+ err = ENOTSUP;
+ ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
+ goto cleanup;
+ }
+
+ /* now check to make sure that what we selected is okay */
+ if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
+ if (err == EINVAL) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
+ } else {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
+ }
+ goto cleanup;
+ }
+
+ /* this uberblock is valid */
+
+cleanup:
+ kmem_free(ub_buf, L2UBERBLOCK_SIZE);
+ return (err);
+}
+
+/*
+ * Reads a pbuf from storage, decodes it and validates its contents against
+ * the provided checksum. The result is placed in `pb'.
+ *
+ * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
+ * When issuing the first pbuf IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the pbuf and
+ * also issue an async IO to fetch the next pbuf in the pbuf chain. The
+ * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
+ * function, pass the value returned in `prefetch_io' from the previous
+ * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
+ * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
+ * pointer to be NULL. If no prefetch IO was issued, the pointer is left
+ * set at NULL.
+ *
+ * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
+ * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
+ * IO is used internally in this function to be able to `peek' at the next
+ * buffer's header before the main IO to read it in completely has finished.
+ * We can then begin to issue the IO for the next buffer in the chain before
+ * we are done reading, keeping the L2ARC device's pipeline saturated with
+ * reads (rather than issuing an IO, waiting for it to complete, validating
+ * the returned buffer and issuing the next one). This will make sure that
+ * the rebuild proceeds at maximum read throughput.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the prefetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of prefetch IOs.
+ */
+static int
+l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
+ zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
+{
+ int err = 0;
+ uint64_t prev_pb_start;
+ uint32_t prev_pb_asize;
+ zio_cksum_t calc_cksum, prev_pb_cksum;
+ l2arc_prefetch_info_t *pi = NULL;
+
+ ASSERT(dev != NULL);
+ ASSERT(pb != NULL);
+ ASSERT(*prefetch_io == NULL);
+
+ if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
+ /* We could not have issued a prefetch IO for this */
+ ASSERT(this_io == NULL);
+ return (EINVAL);
+ }
+
+ /*
+ * Check to see if we have issued the IO for this pbuf in a previous
+ * run. If not, issue it now.
+ */
+ if (this_io == NULL)
+ this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
+
+ /* Pick up the prefetch info buffer and read its contents */
+ pi = this_io->io_private;
+ ASSERT(pi != NULL);
+ ASSERT(asize <= pi->pi_buflen);
+
+ /* Wait for the IO to read this pbuf's header to complete */
+ if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
+ (void) zio_wait(this_io);
+ goto cleanup;
+ }
+
+ /*
+ * Peek to see if we can start issuing the next pbuf IO immediately.
+ * At this point, only the current pbuf's header has been read.
+ */
+ if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
+ &prev_pb_asize, &prev_pb_cksum) == 0) {
+ uint64_t this_pb_start, this_pb_end, prev_pb_end;
+ /* Detect malformed pbuf references and loops */
+ this_pb_start = daddr;
+ this_pb_end = daddr + asize;
+ prev_pb_end = prev_pb_start + prev_pb_asize;
+ if ((prev_pb_start >= this_pb_start && prev_pb_start <
+ this_pb_end) ||
+ (prev_pb_end >= this_pb_start && prev_pb_end <
+ this_pb_end)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
+ cmn_err(CE_WARN, "Looping L2ARC metadata reference "
+ "detected, aborting rebuild.");
+ err = EINVAL;
+ goto cleanup;
+ }
+ /*
+ * Start issuing IO for the next pbuf early - this should
+ * help keep the L2ARC device busy while we read, decode
+ * and restore this pbuf.
+ */
+ if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
+ *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
+ prev_pb_start, prev_pb_asize);
+ }
+
+ /* Wait for the main pbuf IO to complete */
+ if ((err = zio_wait(this_io)) != 0)
+ goto cleanup;
+
+ /* Make sure the buffer checks out ok */
+ fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
+ if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
+ err = EINVAL;
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
+ goto cleanup;
+
+ /* This will be used in l2arc_pbuf_restore for space accounting */
+ pb->pb_asize = asize;
+
+ ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
+ ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
+ ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
+ pb->pb_payload_asz / asize);
+
+cleanup:
+ kmem_free(pi->pi_buf, pi->pi_buflen);
+ pi->pi_buf = NULL;
+ kmem_free(pi, sizeof (l2arc_prefetch_info_t));
+ /* Abort an in-flight prefetch in case of error */
+ if (err != 0 && *prefetch_io != NULL) {
+ l2arc_pbuf_prefetch_abort(*prefetch_io);
+ *prefetch_io = NULL;
+ }
+ return (err);
+}
+
+/*
+ * Validates a pbuf device address to make sure that it can be read
+ * from the provided L2ARC device. Returns 1 if the address is within
+ * the device's bounds, or 0 if not.
+ */
+static int
+l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
+{
+ uint32_t psize;
+ uint64_t end;
+
+ psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
+ end = daddr + psize;
+
+ if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
+ asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
+ /* check that the buffer address is correctly aligned */
+ (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
+ SPA_MINBLOCKSIZE) - 1)) != 0)
+ return (0);
+ else
+ return (1);
+}
+
+/*
+ * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
+ * reconstruction to start reading the next pbuf before we are done
+ * decoding and reconstructing the current pbuf, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain a newly allocated memory buffer for the IO
+ * data, which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
+ * of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
+{
+ uint32_t i, psize;
+ zio_t *pio, *hdr_io;
+ uint64_t hdr_rsize;
+ uint8_t *buf;
+ l2arc_prefetch_info_t *pinfo;
+
+ psize = vdev_psize_to_asize(vd, asize);
+ buf = kmem_alloc(psize, KM_SLEEP);
+ pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
+ pinfo->pi_buf = buf;
+ pinfo->pi_buflen = psize;
+
+ /*
+ * We start issuing the IO for the pbuf header early. This
+ * allows l2arc_pbuf_read to start issuing IO for the next
+ * buffer before the current pbuf is read in completely.
+ */
+
+ hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
+ ASSERT(hdr_rsize <= psize);
+ pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+ hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
+ ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ (void) zio_nowait(hdr_io);
+
+ /*
+ * Read in the rest of the pbuf - this can take longer than just
+ * having a peek at the header.
+ */
+ pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ for (i = hdr_rsize; i < psize; ) {
+ uint64_t rsize = psize - i;
+ zio_t *rzio;
+
+ if (psize - i > SPA_MAXBLOCKSIZE)
+ rsize = SPA_MAXBLOCKSIZE;
+ ASSERT(rsize >= SPA_MINBLOCKSIZE);
+ rzio = zio_read_phys(pio, vd, daddr + i,
+ rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ (void) zio_nowait(rzio);
+ i += rsize;
+ }
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_pbuf_prefetch_abort(zio_t *zio)
+{
+ l2arc_prefetch_info_t *pi;
+
+ pi = zio->io_private;
+ ASSERT(pi != NULL);
+ if (pi->pi_hdr_io != NULL)
+ (void) zio_wait(pi->pi_hdr_io);
+ (void) zio_wait(zio);
+ kmem_free(pi->pi_buf, pi->pi_buflen);
+ pi->pi_buf = NULL;
+ kmem_free(pi, sizeof (l2arc_prefetch_info_t));
+}
+
+/*
+ * Encodes an l2uberblock_t structure into a destination buffer. This
+ * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
+ * uberblock is always of this constant size.
+ */
+static void
+l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
+{
+ zio_cksum_t cksum;
+
+ bzero(buf, L2UBERBLOCK_SIZE);
+
+#if defined(_BIG_ENDIAN)
+ *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
+ *(uint16_t *)(buf + 6) = L2UBLK_BIG_ENDIAN;
+#else /* !defined(_BIG_ENDIAN) */
+ *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
+ /* zero flags is ok */
+#endif /* !defined(_BIG_ENDIAN) */
+ buf[4] = L2UBERBLOCK_MAX_VERSION;
+
+ /* rest in native byte order */
+ *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
+ *(uint64_t *)(buf + 16) = ub->ub_birth;
+ *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
+ *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
+ *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
+ *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
+ bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
+
+ fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
+ bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
+}
+
+/*
+ * Decodes an l2uberblock_t from an on-disk representation. Please note
+ * that this function does not perform any uberblock validation and
+ * checksumming - call l2arc_uberblock_verify() for that.
+ */
+static void
+l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
+{
+ boolean_t bswap_needed;
+
+ /* these always come in big endian */
+#if defined(_BIG_ENDIAN)
+ ub->ub_magic = *(uint32_t *)buf;
+ ub->ub_flags = *(uint16_t *)(buf + 6);
+ bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 1);
+#else /* !defined(_BIG_ENDIAN) */
+ ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
+ ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
+ bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
+#endif /* !defined(_BIG_ENDIAN) */
+ ub->ub_version = buf[4];
+
+ ub->ub_spa_guid = *(uint64_t *)(buf + 8);
+ ub->ub_birth = *(uint64_t *)(buf + 16);
+ ub->ub_evict_tail = *(uint64_t *)(buf + 24);
+ ub->ub_alloc_space = *(uint64_t *)(buf + 32);
+ ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
+ ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
+ bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
+ bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
+
+ /* swap the rest if endianness doesn't match us */
+ if (bswap_needed) {
+ ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
+ ub->ub_birth = BSWAP_64(ub->ub_birth);
+ ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
+ ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
+ ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
+ ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
+ ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
+ ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
+ }
+}
+
+/*
+ * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
+ * valid and matches its checksum.
+ */
+static int
+l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
+ uint64_t guid)
+{
+ zio_cksum_t cksum;
+
+ if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
+ ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
+ /*
+ * bad magic or invalid version => persistent l2arc not
+ * supported
+ */
+ return (ENOTSUP);
+
+ if (ub->ub_spa_guid != guid)
+ /* this l2arc dev isn't ours */
+ return (EINVAL);
+
+ fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
+ /* bad checksum, corrupt uberblock */
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Schedules a zio to update the uberblock on an l2arc device. The zio is
+ * initiated as a child of `pio' and `cb' is filled with the information
+ * needed to free the uberblock data buffer after writing.
+ */
+static void
+l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ uint8_t *ub_buf;
+ l2uberblock_t ub;
+ zio_t *wzio;
+ vdev_stat_t st;
+
+ ASSERT(cb->l2wcb_ub_buf == NULL);
+ vdev_get_stats(dev->l2ad_vdev, &st);
+
+ bzero(&ub, sizeof (ub));
+ ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ ub.ub_birth = dev->l2ad_uberblock_birth++;
+ ub.ub_evict_tail = dev->l2ad_evict;
+ ub.ub_alloc_space = st.vs_alloc;
+ ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
+ ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
+ ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
+ if (dev->l2ad_first)
+ ub.ub_flags |= L2UBLK_EVICT_FIRST;
+
+ ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
+ cb->l2wcb_ub_buf = ub_buf;
+ l2arc_uberblock_encode(&ub, ub_buf);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
+ L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+}
+
+/*
+ * Encodes a l2pbuf_t structure into the portable on-disk format. The
+ * `buf' buffer must be suitably sized to hold the entire uncompressed
+ * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
+ * also compresses the buffer.
+ *
+ * The return value is the length of the resulting encoded pbuf structure.
+ * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
+ * was applied, or smaller if compression was applied. In either case,
+ * prior to writing to disk, the caller must suitably pad the output
+ * buffer so that it is aligned on a multiple of the underlying storage
+ * system's block size.
+ */
+static uint32_t
+l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
+{
+ uint16_t flags = 0;
+ uint8_t *dst_buf;
+ uint32_t enclen;
+ l2pbuf_buflist_t *buflist;
+
+ enclen = L2PBUF_ENCODED_SIZE(pb);
+ ASSERT(buflen >= enclen);
+ bzero(buf, enclen);
+
+ /* non-header portions of pbufs are in native byte order */
+ *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
+ *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
+ bcopy(&pb->pb_prev_cksum, buf + 20, 32);
+ *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
+
+ /* first we encode the buflists uncompressed */
+ dst_buf = buf + L2PBUF_HDR_SIZE;
+ for (buflist = list_head(pb->pb_buflists_list); buflist;
+ buflist = list_next(pb->pb_buflists_list, buflist)) {
+ int i;
+
+ ASSERT(buflist->l2pbl_nbufs != 0);
+ for (i = 0; i < buflist->l2pbl_nbufs; i++) {
+ l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
+
+ ASSERT(pbl_buf->b_size != 0);
+ *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
+ *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
+ *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
+ *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
+ bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
+ *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
+ *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
+ *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
+ dst_buf[80] = pbl_buf->b_l2compress;
+ dst_buf[81] = pbl_buf->b_contents_type;
+ *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
+ dst_buf += L2PBUF_BUF_SIZE;
+ }
+ }
+ ASSERT((uint32_t)(dst_buf - buf) == enclen);
+
+ /* and then compress them if necessary */
+ if (enclen >= l2arc_pbuf_compress_minsz) {
+ uint8_t *cbuf;
+ size_t slen, clen;
+
+ slen = l2arc_pbuf_items_encoded_size(pb);
+ cbuf = kmem_alloc(slen, KM_SLEEP);
+ clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
+ ASSERT(clen != 0);
+ if (clen < slen) {
+ bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
+ flags |= L2PBUF_COMPRESSED;
+ /* zero out the rest of the input buffer */
+ bzero(buf + L2PBUF_HDR_SIZE + clen,
+ buflen - (L2PBUF_HDR_SIZE + clen));
+ /* adjust our buffer length now that it's shortened */
+ enclen = L2PBUF_HDR_SIZE + clen;
+ }
+ kmem_free(cbuf, slen);
+ }
+
+ /* the header goes last since `flags' may change due to compression */
+#if defined(_BIG_ENDIAN)
+ *(uint32_t *)buf = L2PBUF_MAGIC;
+ flags |= L2PBUF_BIG_ENDIAN;
+ *(uint16_t *)(buf + 6) = flags;
+#else /* !defined(_BIG_ENDIAN) */
+ *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
+ *(uint16_t *)(buf + 6) = BSWAP_16(flags);
+#endif /* !defined(_BIG_ENDIAN) */
+ buf[4] = L2PBUF_MAX_VERSION;
+
+ return (enclen);
+}
+
+/*
+ * Decodes a stored l2pbuf_t structure previously encoded using
+ * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
+ * must be initialized by l2arc_pbuf_init by the caller beforehand, but
+ * must not have been used to store any buffers yet.
+ *
+ * Please note that we don't do checksum verification here, as we don't
+ * know our own checksum (that is known by the previous block in the linked
+ * list, or by the uberblock). This should be performed by the caller
+ * prior to calling l2arc_pbuf_decode.
+ */
+static int
+l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
+{
+ boolean_t bswap_needed;
+ uint32_t payload_sz, payload_asz;
+ uint8_t *src_bufs;
+ l2pbuf_buflist_t *buflist;
+ int i, nbufs;
+
+ ASSERT(input_buf != NULL);
+ ASSERT(pb != NULL);
+ ASSERT(pb->pb_version != 0);
+ ASSERT(pb->pb_nbuflists == 0);
+
+ /* no valid buffer can be this small */
+ if (buflen < L2PBUF_HDR_SIZE)
+ return (EINVAL);
+
+ /* these always come in big endian */
+#if defined(_BIG_ENDIAN)
+ pb->pb_magic = *(uint32_t *)input_buf;
+ pb->pb_flags = *(uint16_t *)(input_buf + 6);
+	bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
+#else /* !defined(_BIG_ENDIAN) */
+ pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
+ pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
+ bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
+#endif /* !defined(_BIG_ENDIAN) */
+ pb->pb_version = input_buf[4];
+
+ if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
+ return (EINVAL);
+ if (pb->pb_version > L2PBUF_MAX_VERSION)
+ return (ENOTSUP);
+
+ /* remainder of pbuf may need bswap'ping */
+ pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
+	pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
+ bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
+ payload_sz = *(uint32_t *)(input_buf + 52);
+ payload_asz = buflen - L2PBUF_HDR_SIZE;
+
+ if (bswap_needed) {
+ pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
+		pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
+ ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
+ payload_sz = BSWAP_32(payload_sz);
+ }
+
+ /* check for sensible buffer allocation limits */
+ if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
+ (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
+ (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
+ return (EINVAL);
+ nbufs = payload_sz / L2PBUF_BUF_SIZE;
+
+ /* decompression might be needed */
+ if (pb->pb_flags & L2PBUF_COMPRESSED) {
+ src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
+ if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
+ payload_asz, payload_sz, 0) != 0) {
+ kmem_free(src_bufs, payload_sz);
+ return (EINVAL);
+ }
+ } else {
+ src_bufs = input_buf + L2PBUF_HDR_SIZE;
+ }
+
+ /* Decode individual pbuf items from our source buffer. */
+ buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
+ for (i = 0; i < nbufs; i++) {
+ l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
+ const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
+
+ pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
+ pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
+ pbl_buf->b_birth = *(uint64_t *)(src + 16);
+ pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
+ bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
+ pbl_buf->b_size = *(uint32_t *)(src + 64);
+ pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
+ pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
+ pbl_buf->b_l2compress = src[80];
+ pbl_buf->b_contents_type = src[81];
+ pbl_buf->b_flags = *(uint32_t *)(src + 84);
+
+ if (bswap_needed) {
+ pbl_buf->b_dva.dva_word[0] =
+ BSWAP_64(pbl_buf->b_dva.dva_word[0]);
+ pbl_buf->b_dva.dva_word[1] =
+ BSWAP_64(pbl_buf->b_dva.dva_word[1]);
+ pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
+ pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
+ ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
+ pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
+ pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
+ pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
+ pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
+ }
+
+ pb->pb_payload_asz += pbl_buf->b_l2asize;
+ }
+
+ if (pb->pb_flags & L2PBUF_COMPRESSED)
+ kmem_free(src_bufs, payload_sz);
+
+ return (0);
+}
+
+/*
+ * Decodes the previous buffer pointer encoded in a pbuf. Since L2ARC
+ * reconstruction walks the pbuf chain backwards, this is used to "peek"
+ * at the next pbuf to be fetched and to start issuing IO for it early,
+ * before decoding of the current pbuf is done (which can take time due
+ * to decompression).
+ * Returns 0 on success (and fills in the return parameters `daddr',
+ * `asize' and `cksum' with the info of the previous pbuf), and an errno
+ * on error.
+ */
+static int
+l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
+ uint32_t *asize, zio_cksum_t *cksum)
+{
+ boolean_t bswap_needed;
+ uint16_t version, flags;
+ uint32_t magic;
+
+ ASSERT(buf != NULL);
+
+ /* no valid buffer can be this small */
+ if (buflen <= L2PBUF_HDR_SIZE)
+ return (EINVAL);
+
+ /* these always come in big endian */
+#if defined(_BIG_ENDIAN)
+ magic = *(uint32_t *)buf;
+ flags = *(uint16_t *)(buf + 6);
+	bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
+#else /* !defined(_BIG_ENDIAN) */
+ magic = BSWAP_32(*(uint32_t *)buf);
+ flags = BSWAP_16(*(uint16_t *)(buf + 6));
+ bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
+#endif /* !defined(_BIG_ENDIAN) */
+ version = buf[4];
+
+ if (magic != L2PBUF_MAGIC || version == 0)
+ return (EINVAL);
+ if (version > L2PBUF_MAX_VERSION)
+ return (ENOTSUP);
+
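+	/*
+	 * These offsets mirror the header layout written by
+	 * l2arc_pbuf_encode: daddr at byte 8, asize at byte 16 and the
+	 * previous checksum at byte 20.
+	 */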
+	*daddr = *(uint64_t *)(buf + 8);
+	*asize = *(uint32_t *)(buf + 16);
+	bcopy(buf + 20, cksum, 32);
+
+	if (bswap_needed) {
+		*daddr = BSWAP_64(*daddr);
+		*asize = BSWAP_32(*asize);
+		ZIO_CHECKSUM_BSWAP(cksum);
+	}
+
+ return (0);
+}
+
+/*
+ * Initializes a pbuf structure into a clean state. All version and flags
+ * fields are filled in as appropriate for this architecture.
+ * If the structure was used before, first call l2arc_pbuf_destroy on it,
+ * as this function assumes the structure is uninitialized.
+ */
+static void
+l2arc_pbuf_init(l2pbuf_t *pb)
+{
+ bzero(pb, sizeof (l2pbuf_t));
+ pb->pb_version = L2PBUF_MAX_VERSION;
+#if defined(_BIG_ENDIAN)
+	pb->pb_flags |= L2PBUF_BIG_ENDIAN;
+#endif
+ pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+ list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
+ offsetof(l2pbuf_buflist_t, l2pbl_node));
+}
+
+/*
+ * Destroys a pbuf structure and puts it into a clean state ready to be
+ * initialized by l2arc_pbuf_init. All buflists created by
+ * l2arc_pbuf_buflist_alloc are released as well.
+ */
+static void
+l2arc_pbuf_destroy(l2pbuf_t *pb)
+{
+ list_t *buflist_list = pb->pb_buflists_list;
+ l2pbuf_buflist_t *buflist;
+
+ while ((buflist = list_head(buflist_list)) != NULL) {
+ ASSERT(buflist->l2pbl_nbufs > 0);
+ kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
+ buflist->l2pbl_nbufs);
+ list_remove(buflist_list, buflist);
+ kmem_free(buflist, sizeof (l2pbuf_buflist_t));
+ }
+ pb->pb_nbuflists = 0;
+ list_destroy(pb->pb_buflists_list);
+ kmem_free(pb->pb_buflists_list, sizeof (list_t));
+ bzero(pb, sizeof (l2pbuf_t));
+}
+
+/*
+ * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
+ * buffers. This is used during the buffer write cycle - each cycle allocates
+ * a new buflist and fills it with buffers it writes. Then, when the pbuf
+ * reaches its buflist limit, it is committed to stable storage.
+ */
+static l2pbuf_buflist_t *
+l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
+{
+ l2pbuf_buflist_t *buflist;
+
+ ASSERT(pb->pb_buflists_list != NULL);
+ buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
+ buflist->l2pbl_nbufs = nbufs;
+ buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
+ KM_SLEEP);
+ list_insert_tail(pb->pb_buflists_list, buflist);
+ pb->pb_nbuflists++;
+
+ return (buflist);
+}
+
+/*
+ * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `index'.
+ * The buffer being inserted must be present in L2ARC.
+ */
+static void
+l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
+ const arc_buf_hdr_t *ab, int index)
+{
+ l2pbuf_buf_t *pb_buf;
+ const l2arc_buf_hdr_t *l2hdr;
+
+ l2hdr = ab->b_l2hdr;
+ ASSERT(l2hdr != NULL);
+ ASSERT(pbl->l2pbl_nbufs > index);
+
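+	/*
+	 * Record enough information to identify this buffer and locate its
+	 * data on the L2ARC device; presumably this is what the rebuild
+	 * code uses to reconstruct ARC headers after pool import.
+	 */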
+ pb_buf = &pbl->l2pbl_bufs[index];
+ pb_buf->b_dva = ab->b_dva;
+ pb_buf->b_birth = ab->b_birth;
+ pb_buf->b_cksum0 = ab->b_cksum0;
+ pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
+ pb_buf->b_size = ab->b_size;
+ pb_buf->b_l2daddr = l2hdr->b_daddr;
+ pb_buf->b_l2asize = l2hdr->b_asize;
+ pb_buf->b_l2compress = l2hdr->b_compress;
+ pb_buf->b_contents_type = ab->b_type;
+ pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
+ pb->pb_payload_asz += l2hdr->b_asize;
+}
+
+/*
+ * Commits a pbuf to stable storage. This routine is invoked when writing
+ * ARC buffers to an L2ARC device. When the pbuf associated with the device
+ * has reached its limits (either in size or in number of writes), it is
+ * scheduled here for writing.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+static void
+l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ l2pbuf_t *pb = &dev->l2ad_pbuf;
+ uint64_t i, est_encsize, bufsize, encsize, io_size;
+ uint8_t *pb_buf;
+
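+	/*
+	 * Chain this pbuf back to the one committed before it: the previous
+	 * pbuf's address, size and checksum are embedded in this pbuf's
+	 * header, and the head of the chain is anchored by the device
+	 * uberblock (updated below via l2arc_uberblock_update).
+	 */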
+ pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
+ pb->pb_prev_asize = dev->l2ad_pbuf_asize;
+ pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
+
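+	/*
+	 * Serialize the pbuf into a temporary buffer rounded up to the
+	 * device's allocation size; the buffer is attached to the write
+	 * callback and freed in l2arc_write_done.
+	 */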
+ est_encsize = L2PBUF_ENCODED_SIZE(pb);
+ bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
+ pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
+ encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
+ cb->l2wcb_pbuf = pb_buf;
+ cb->l2wcb_pbuf_size = bufsize;
+
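+	/*
+	 * Remember where this pbuf will be written, its encoded size and
+	 * its checksum, so that the next committed pbuf (and the uberblock)
+	 * can point back at it.
+	 */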
+ dev->l2ad_pbuf_daddr = dev->l2ad_hand;
+ dev->l2ad_pbuf_asize = encsize;
+ fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
+
+ io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
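+	/*
+	 * The padded pbuf may exceed SPA_MAXBLOCKSIZE, so issue it as a
+	 * series of physical writes of at most SPA_MAXBLOCKSIZE each.
+	 */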
+ for (i = 0; i < io_size; ) {
+ zio_t *wzio;
+ uint64_t wsize = io_size - i;
+
+ if (wsize > SPA_MAXBLOCKSIZE)
+ wsize = SPA_MAXBLOCKSIZE;
+ ASSERT(wsize >= SPA_MINBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
+ wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+ i += wsize;
+ }
+
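+	/* Advance the device write hand past the pbuf we just issued. */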
+ dev->l2ad_hand += io_size;
+ vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
+ l2arc_uberblock_update(dev, pio, cb);
+
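+	/*
+	 * The last kstat is a rough measure of how many bytes of cached
+	 * payload each byte of pbuf metadata describes.
+	 */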
+ ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
+ ARCSTAT_BUMP(arcstat_l2_meta_writes);
+ ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
+ ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
+ ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
+ pb->pb_payload_asz / encsize);
+}
+
+/*
+ * Returns the number of bytes occupied by the payload buffer items of
+ * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
+ * L2PBUF_HDR_SIZE.
+ */
+static uint32_t
+l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
+{
+ uint32_t size = 0;
+ l2pbuf_buflist_t *buflist;
+
+ for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
+ buflist = list_next(pb->pb_buflists_list, buflist))
+ size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
+
+ return (size);
}