3525 Persistent L2ARC
*** 136,145 ****
--- 136,147 ----
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
+ #include <sys/byteorder.h>
+ #include <sys/spa_impl.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
*** 314,323 ****
--- 316,340 ----
kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_l2_compress_successes;
kstat_named_t arcstat_l2_compress_zeros;
kstat_named_t arcstat_l2_compress_failures;
+ kstat_named_t arcstat_l2_log_blk_writes;
+ kstat_named_t arcstat_l2_log_blk_avg_size;
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ kstat_named_t arcstat_l2_rebuild_successes;
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ kstat_named_t arcstat_l2_rebuild_abort_timeout;
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ kstat_named_t arcstat_l2_rebuild_size;
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ kstat_named_t arcstat_l2_rebuild_psize;
+ kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_duplicate_buffers;
kstat_named_t arcstat_duplicate_buffers_size;
kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_meta_used;
*** 380,389 ****
--- 397,421 ----
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "l2_compress_successes", KSTAT_DATA_UINT64 },
{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
{ "l2_compress_failures", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "duplicate_buffers", KSTAT_DATA_UINT64 },
{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
{ "duplicate_reads", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
*** 427,436 ****
--- 459,487 ----
} else { \
ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
} \
}
+ /*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first scale it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+ #define ARCSTAT_F_AVG_FACTOR 3
+ #define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(NOTREACHED) \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
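For intuition, a tiny stand-alone user-space sketch (an editor's illustration, not part of the patch) of how a constant input converges under the same arithmetic, with the factor of 3 defined above:

#include <stdio.h>
#include <stdint.h>

/* Same update rule as ARCSTAT_F_AVG with ARCSTAT_F_AVG_FACTOR == 3. */
int
main(void)
{
	uint64_t avg = 0;

	for (int i = 1; i <= 5; i++) {
		uint64_t sample = 120;	/* constant input for illustration */
		avg = avg - avg / 3 + sample / 3;
		printf("after sample %d: avg = %llu\n", i,
		    (unsigned long long)avg);
	}
	/* prints 40, 67, 85, 97, 105 - slowly converging toward 120 */
	return (0);
}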
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
*** 635,657 ****
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
! typedef struct l2arc_dev {
! vdev_t *l2ad_vdev; /* vdev */
! spa_t *l2ad_spa; /* spa */
! uint64_t l2ad_hand; /* next write location */
! uint64_t l2ad_start; /* first addr on device */
! uint64_t l2ad_end; /* last addr on device */
! uint64_t l2ad_evict; /* last addr eviction reached */
! boolean_t l2ad_first; /* first sweep through */
! boolean_t l2ad_writing; /* currently writing */
! list_t *l2ad_buflist; /* buffer list */
! list_node_t l2ad_node; /* device list node */
! } l2arc_dev_t;
!
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
--- 686,696 ----
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
*/
! typedef struct l2arc_dev l2arc_dev_t;
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
*** 670,679 ****
--- 709,720 ----
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* list of in-flight l2arc_log_blk_buf_t's */
+ list_t l2wcb_log_blk_buf_list;
} l2arc_write_callback_t;
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
*** 697,715 ****
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static void l2arc_read_done(zio_t *zio);
! static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
! static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
--- 738,1025 ----
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
static void l2arc_read_done(zio_t *zio);
! static void l2arc_hdr_stat_add(boolean_t from_arc);
static void l2arc_hdr_stat_remove(void);
+ static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
! enum {
! L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
! };
!
! /*
! * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
! */
! typedef struct l2arc_log_blk_ptr {
! uint64_t l2lbp_daddr; /* device address of log */
! /*
! * l2lbp_prop is the same format as the blk_prop in blkptr_t:
! * * logical size (in sectors)
! * * physical (compressed) size (in sectors)
! * * compression algorithm (we always LZ4-compress l2arc logs)
! * * checksum algorithm (used for l2lbp_cksum)
! * * object type & level (unused for now)
! */
! uint64_t l2lbp_prop;
! zio_cksum_t l2lbp_cksum; /* fletcher4 of log */
! } l2arc_log_blk_ptr_t;
!
! /*
! * The persistent L2ARC device header.
! */
! typedef struct l2arc_dev_hdr_phys {
! uint64_t l2dh_magic;
! zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */
!
! /*
! * Global L2ARC device state and metadata.
! */
! uint64_t l2dh_spa_guid;
! uint64_t l2dh_evict_tail; /* current evict pointer */
! uint64_t l2dh_alloc_space; /* vdev space alloc status */
! uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */
!
! /*
! * Start of log block chain. [0] -> newest log, [1] -> one older (used
! * for initiating prefetch).
! */
! l2arc_log_blk_ptr_t l2dh_start_lbps[2];
!
! const uint64_t l2dh_pad[43]; /* pad to 512 bytes */
! } l2arc_dev_hdr_phys_t;
! CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
!
! /*
! * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
! */
! typedef struct l2arc_log_ent_phys {
! dva_t l2le_dva; /* dva of buffer */
! uint64_t l2le_birth; /* birth txg of buffer */
! uint64_t l2le_cksum0;
! zio_cksum_t l2le_freeze_cksum;
! /*
! * l2le_prop is the same format as the blk_prop in blkptr_t:
! * * logical size (in sectors)
! * * physical (compressed) size (in sectors)
! * * compression algorithm
! * * checksum algorithm (used for cksum0)
! * * object type & level (used to restore arc_buf_contents_t)
! */
! uint64_t l2le_prop;
! uint64_t l2le_daddr; /* buf location on l2dev */
! const uint64_t l2le_pad[6]; /* resv'd for future use */
! } l2arc_log_ent_phys_t;
!
! /*
! * These design limits give us the following overhead (before compression):
! * avg_blk_sz overhead
! * 1k 12.51 %
! * 2k 6.26 %
! * 4k 3.13 %
! * 8k 1.56 %
! * 16k 0.78 %
! * 32k 0.39 %
! * 64k 0.20 %
! * 128k 0.10 %
! * Compression should be able to squeeze these down by about a factor of 2.
! */
! #define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
! #define L2ARC_LOG_BLK_HEADER_LEN (128)
! #define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
! ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
! sizeof (l2arc_log_ent_phys_t))
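A small stand-alone sketch (an editor's illustration, not part of the patch) reproduces the overhead table above from these two constants:

#include <stdio.h>

int
main(void)
{
	const double lb_size = 128 * 1024;	/* L2ARC_LOG_BLK_SIZE */
	const double entries = 1023;		/* L2ARC_LOG_BLK_ENTRIES */

	/* overhead = log block bytes per bytes of buffer data described */
	for (int blk_kb = 1; blk_kb <= 128; blk_kb *= 2) {
		printf("%4dk: %5.2f %%\n", blk_kb,
		    100.0 * lb_size / (entries * blk_kb * 1024.0));
	}
	return (0);	/* prints 12.51, 6.26, 3.13, ... 0.10 */
}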
! /*
! * Maximum amount of data in an l2arc log block (used to terminate rebuilding
! * before we hit the write head and restore potentially corrupted blocks).
! */
! #define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
! (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
! /*
! * For the persistency and rebuild algorithms to operate reliably we need
! * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
! * excessive log block looping might confuse the log chain end detection).
! * Under normal circumstances this is not a problem, since this is somewhere
! * around only 400 MB.
! */
! #define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
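As a quick check of the "around 400 MB" figure (an editor's note, not part of the patch): with SPA_MAXBLOCKSIZE of 128 KiB, L2ARC_PERSIST_MIN_SIZE = 3 * 1023 entries * 128 KiB = 402,259,968 bytes, or roughly 384 MiB.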
!
! /*
! * A log block of up to 1023 ARC buffer log entries, chained into the
! * persistent L2ARC metadata linked list.
! */
! typedef struct l2arc_log_blk_phys {
! /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
! uint64_t l2lb_magic;
! l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */
! uint64_t l2lb_pad[9]; /* resv'd for future use */
! /* Payload */
! l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
! } l2arc_log_blk_phys_t;
!
! CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
! CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
! offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
!
! /*
! * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
! * written to the L2ARC device. They may be compressed, hence the uint8_t[].
! */
! typedef struct l2arc_log_blk_buf {
! uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
! list_node_t l2lbb_node;
! } l2arc_log_blk_buf_t;
!
! /* Macros for manipulating fields in the blk_prop format of blkptr_t */
! #define BLKPROP_GET_LSIZE(_obj, _field) \
! BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
! #define BLKPROP_SET_LSIZE(_obj, _field, x) \
! BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
! #define BLKPROP_GET_PSIZE(_obj, _field) \
! BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
! #define BLKPROP_SET_PSIZE(_obj, _field, x) \
! BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
! #define BLKPROP_GET_COMPRESS(_obj, _field) \
! BF64_GET((_obj)->_field, 32, 8)
! #define BLKPROP_SET_COMPRESS(_obj, _field, x) \
! BF64_SET((_obj)->_field, 32, 8, x)
! #define BLKPROP_GET_CHECKSUM(_obj, _field) \
! BF64_GET((_obj)->_field, 40, 8)
! #define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
! BF64_SET((_obj)->_field, 40, 8, x)
! #define BLKPROP_GET_TYPE(_obj, _field) \
! BF64_GET((_obj)->_field, 48, 8)
! #define BLKPROP_SET_TYPE(_obj, _field, x) \
! BF64_SET((_obj)->_field, 48, 8, x)
!
! /* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
! #define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop)
! #define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
! #define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop)
! #define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
! #define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
! #define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
! x)
! #define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
! #define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
! x)
! #define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop)
! #define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
!
! /* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
! #define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop)
! #define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x)
! #define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop)
! #define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x)
! #define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop)
! #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
! #define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop)
! #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
! #define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop)
! #define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x)
!
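As a usage illustration of the LE_SET_* accessors (a hedged sketch only; the function name below is hypothetical and the patch's actual l2arc_log_blk_insert() may fill entries differently), a log entry could be populated from an ARC buffer header roughly as follows:

/*
 * Sketch: populate one l2arc_log_ent_phys_t from an ARC buffer header.
 * Assumes ab->b_freeze_cksum has already been computed (as it is in
 * l2arc_write_buffers() via arc_cksum_compute()) and that ab->b_l2hdr
 * already carries the buffer's on-device address and compressed size.
 */
static void
l2arc_log_ent_fill_sketch(l2arc_log_ent_phys_t *le, const arc_buf_hdr_t *ab)
{
	const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;

	le->l2le_dva = ab->b_dva;
	le->l2le_birth = ab->b_birth;
	le->l2le_cksum0 = ab->b_cksum0;
	le->l2le_freeze_cksum = *ab->b_freeze_cksum;
	le->l2le_daddr = l2hdr->b_daddr;
	LE_SET_LSIZE(le, ab->b_size);		/* stored in 512-byte sectors */
	LE_SET_PSIZE(le, l2hdr->b_asize);
	LE_SET_COMPRESS(le, l2hdr->b_compress);
	LE_SET_TYPE(le, ab->b_type);
}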
! #define PTR_SWAP(x, y) \
! do { \
! void *tmp = (x);\
! x = y; \
! y = tmp; \
! _NOTE(CONSTCOND)\
! } while (0)
!
! #define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU
! #define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU
! #define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */
!
! struct l2arc_dev {
! vdev_t *l2ad_vdev; /* vdev */
! spa_t *l2ad_spa; /* spa */
! uint64_t l2ad_hand; /* next write location */
! uint64_t l2ad_start; /* first addr on device */
! uint64_t l2ad_end; /* last addr on device */
! uint64_t l2ad_evict; /* last addr eviction reached */
! boolean_t l2ad_first; /* first sweep through */
! boolean_t l2ad_writing; /* currently writing */
! list_t *l2ad_buflist; /* buffer list */
! list_node_t l2ad_node; /* device list node */
! l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */
! l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
! int l2ad_log_ent_idx; /* index into cur log blk */
! /* number of bytes in current log block's payload */
! uint64_t l2ad_log_blk_payload_asize;
! /* flag indicating whether a rebuild is scheduled or is going on */
! boolean_t l2ad_rebuild;
! };
!
! /*
! * Performance tuning of L2ARC persistency:
! *
! * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
! * pool import or when adding one manually later) will attempt
! * to rebuild L2ARC buffer contents. In special circumstances,
! * the administrator may want to set this to B_FALSE, if they
! * are having trouble importing a pool or attaching an L2ARC
! * device (e.g. the L2ARC device is slow to read in stored log
! * metadata, or the metadata has become somehow
! * fragmented/unusable).
! * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding, used to
! * keep a slow L2ARC device from stalling pool import indefinitely. If we
! * are not done rebuilding an L2ARC device by this time, we
! * stop the rebuild and return immediately.
! */
! boolean_t l2arc_rebuild_enabled = B_TRUE;
! uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
!
! /*
! * L2ARC persistency rebuild routines.
! */
! static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
! static int l2arc_rebuild(l2arc_dev_t *dev);
! static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
! l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
! static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
! l2arc_dev_t *dev, uint64_t guid);
!
! /*
! * L2ARC persistency read I/O routines.
! */
! static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
! static int l2arc_log_blk_read(l2arc_dev_t *dev,
! const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
! l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
! uint8_t *this_lb_buf, uint8_t *next_lb_buf,
! zio_t *this_io, zio_t **next_io);
! static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
! const l2arc_log_blk_ptr_t *lp);
! static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
! const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
! static void l2arc_log_blk_prefetch_abort(zio_t *zio);
!
! /*
! * L2ARC persistency write I/O routines.
! */
! static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
! static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
! l2arc_write_callback_t *cb);
!
! /*
! * L2ARC persistency auxiliary routines.
! */
! static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
! zio_cksum_t *cksum);
! static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
! const arc_buf_hdr_t *ab);
! static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
! uint64_t top, uint64_t check);
! static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
!
! static inline uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
*** 1245,1255 ****
}
ab->b_state = new_state;
/* adjust l2arc hdr stats */
if (new_state == arc_l2c_only)
! l2arc_hdr_stat_add();
else if (old_state == arc_l2c_only)
l2arc_hdr_stat_remove();
}
void
--- 1555,1565 ----
}
ab->b_state = new_state;
/* adjust l2arc hdr stats */
if (new_state == arc_l2c_only)
! l2arc_hdr_stat_add(old_state != arc_anon);
else if (old_state == arc_l2c_only)
l2arc_hdr_stat_remove();
}
void
*** 1349,1358 ****
--- 1659,1695 ----
(void) refcount_add(&hdr->b_refcnt, tag);
return (buf);
}
+ /*
+ * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc->(disk).
+ */
+ arc_buf_hdr_t *
+ arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
+ {
+ arc_buf_hdr_t *hdr;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = guid;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ hdr->b_buf = NULL;
+ hdr->b_datacnt = 0;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ return (hdr);
+ }
+
static char *arc_onloan_tag = "onloan";
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
*** 1586,1596 ****
if (l2hdr != NULL) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
! kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
if (hdr->b_state == arc_l2c_only)
l2arc_hdr_stat_remove();
hdr->b_l2hdr = NULL;
}
--- 1923,1933 ----
if (l2hdr != NULL) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
! kmem_free(l2hdr, sizeof (*l2hdr));
if (hdr->b_state == arc_l2c_only)
l2arc_hdr_stat_remove();
hdr->b_l2hdr = NULL;
}
*** 3043,3052 ****
--- 3380,3392 ----
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
if (hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ /*
+ * Need to stash these before letting go of hash_lock
+ */
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
addr = hdr->b_l2hdr->b_daddr;
b_compress = hdr->b_l2hdr->b_compress;
b_asize = hdr->b_l2hdr->b_asize;
/*
*** 3416,3426 ****
buf->b_efunc = NULL;
buf->b_private = NULL;
if (l2hdr) {
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
! kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
ARCSTAT_INCR(arcstat_l2_size, -buf_size);
mutex_exit(&l2arc_buflist_mtx);
}
}
--- 3756,3766 ----
buf->b_efunc = NULL;
buf->b_private = NULL;
if (l2hdr) {
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
! kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -buf_size);
mutex_exit(&l2arc_buflist_mtx);
}
}
*** 4031,4040 ****
--- 4371,4458 ----
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistency:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) Every now and then we mix in a piece of metadata (called a log block)
+ * into the L2ARC write. This allows us to understand what's been written,
+ * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
+ * buffers. The log block also includes a "back-reference" pointer to the
+ * previous block, forming a back-linked list of blocks on the L2ARC device.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header, which
+ * contains our top-level reference structures. We update it each time we
+ * write a new log block, so that we're able to locate it in the L2ARC
+ * device. If this write results in an inconsistent device header (e.g. due
+ * to power failure), we detect this by verifying the header's checksum
+ * and simply drop the entries from L2ARC.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | __________newest log block pointers_________ |
+ * | / \1 back \latest |
+ * | / V V |
+ * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * | ^ / ^ / ^ / |
+ * | `-prev-' `-prev-' `-prev-' |
+ * | lb lb lb |
+ * +======================================================================+
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time - once
+ * created, a block is immutable on disk. The worst that can happen is that
+ * we waste some time and memory during the L2ARC rebuild reconstructing
+ * outdated ARC entries, which will simply be dropped from the L2ARC as it
+ * is updated with new blocks.
*/
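The last point above is worth making concrete. A minimal sketch (an editor's illustration assuming the standard ARC hash-match criteria; the helper name is hypothetical and not part of the patch) of why a stale rebuilt header can never satisfy a read for newer data:

/*
 * Sketch: a header only matches a read if the spa guid, the DVA and the
 * birth TXG all match, mirroring the equality test used by the ARC hash
 * table. A header rebuilt from stale L2ARC metadata therefore cannot be
 * returned for a newer block that merely reuses the same DVA.
 */
static boolean_t
arc_hdr_matches_sketch(const arc_buf_hdr_t *hdr, uint64_t spa_guid,
    const dva_t *dva, uint64_t birth)
{
	return (hdr->b_spa == spa_guid &&
	    hdr->b_dva.dva_word[0] == dva->dva_word[0] &&
	    hdr->b_dva.dva_word[1] == dva->dva_word[1] &&
	    hdr->b_birth == birth);
}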
static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
{
*** 4097,4109 ****
return (next);
}
static void
! l2arc_hdr_stat_add(void)
{
ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
--- 4515,4528 ----
return (next);
}
static void
! l2arc_hdr_stat_add(boolean_t from_arc)
{
ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ if (from_arc)
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
*** 4134,4144 ****
goto out;
first = NULL;
next = l2arc_dev_last;
do {
! /* loop around the list looking for a non-faulted vdev */
if (next == NULL) {
next = list_head(l2arc_dev_list);
} else {
next = list_next(l2arc_dev_list, next);
if (next == NULL)
--- 4553,4566 ----
goto out;
first = NULL;
next = l2arc_dev_last;
do {
! /*
! * Loop around the list looking for a non-faulted vdev
! * and one that isn't currently doing an L2ARC rebuild.
! */
if (next == NULL) {
next = list_head(l2arc_dev_list);
} else {
next = list_next(l2arc_dev_list, next);
if (next == NULL)
*** 4149,4162 ****
if (first == NULL)
first = next;
else if (next == first)
break;
! } while (vdev_is_dead(next->l2ad_vdev));
/* if we were unable to find any usable vdevs, return NULL */
! if (vdev_is_dead(next->l2ad_vdev))
next = NULL;
l2arc_dev_last = next;
out:
--- 4571,4584 ----
if (first == NULL)
first = next;
else if (next == first)
break;
! } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
/* if we were unable to find any usable vdevs, return NULL */
! if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
next = NULL;
l2arc_dev_last = next;
out:
*** 4206,4217 ****
{
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *ab, *ab_prev;
! l2arc_buf_hdr_t *abl2;
kmutex_t *hash_lock;
cb = zio->io_private;
ASSERT(cb != NULL);
dev = cb->l2wcb_dev;
ASSERT(dev != NULL);
--- 4628,4640 ----
{
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *ab, *ab_prev;
! l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
+ l2arc_log_blk_buf_t *lb_buf;
cb = zio->io_private;
ASSERT(cb != NULL);
dev = cb->l2wcb_dev;
ASSERT(dev != NULL);
*** 4230,4240 ****
--- 4653,4670 ----
/*
* All writes completed, or an error was hit.
*/
for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
ab_prev = list_prev(buflist, ab);
+ l2hdr = ab->b_l2hdr;
+ /*
+ * Release the temporary compressed buffer as soon as possible.
+ */
+ if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(ab);
+
hash_lock = HDR_LOCK(ab);
if (!mutex_tryenter(hash_lock)) {
/*
* This buffer misses out. It may be in a stage
* of eviction. Its ARC_L2_WRITING flag will be
*** 4242,4267 ****
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
- abl2 = ab->b_l2hdr;
-
- /*
- * Release the temporary compressed buffer as soon as possible.
- */
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(ab);
-
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
! ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
ab->b_l2hdr = NULL;
! kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
/*
* Allow ARC to begin reads to this L2ARC entry.
--- 4672,4689 ----
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
! ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
! kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
/*
* Allow ARC to begin reads to this L2ARC entry.
*** 4276,4285 ****
--- 4698,4713 ----
kmem_cache_free(hdr_cache, head);
mutex_exit(&l2arc_buflist_mtx);
l2arc_do_free_on_write();
+ for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
+ lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
+ (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
+ kmem_free(lb_buf, sizeof (*lb_buf));
+ }
+ list_destroy(&cb->l2wcb_log_blk_buf_list);
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
/*
* A read to a cache device completed. Validate buffer contents before
*** 4399,4418 ****
mutex_enter(*lock);
return (list);
}
/*
* Evict buffers from the device write hand to the distance specified in
* bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
! l2arc_buf_hdr_t *abl2;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
uint64_t taddr;
buflist = dev->l2ad_buflist;
--- 4827,4859 ----
mutex_enter(*lock);
return (list);
}
/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+ static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz)
+ {
+ return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
+ L2ARC_LOG_BLK_SIZE;
+ }
+
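To make the worst case concrete (an editor's note, not part of the patch): an 8 MiB write consisting entirely of minimum-sized 512-byte buffers yields ((8 MiB / 512 / 1023) + 1) * 128 KiB = 17 * 128 KiB, i.e. about 2.1 MiB of additional headroom reserved for log blocks.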
+ /*
* Evict buffers from the device write hand to the distance specified in
* bytes. This distance may span populated buffers, it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
! l2arc_buf_hdr_t *l2hdr;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
uint64_t taddr;
buflist = dev->l2ad_buflist;
*** 4426,4435 ****
--- 4867,4880 ----
* nothing to evict.
*/
return;
}
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance);
if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
/*
* When nearing the end of the device, evict to the end
* before the device write hand jumps to the start.
*/
*** 4508,4521 ****
/*
* Tell ARC this no longer exists in L2ARC.
*/
if (ab->b_l2hdr != NULL) {
! abl2 = ab->b_l2hdr;
! ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
ab->b_l2hdr = NULL;
! kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
/*
--- 4953,4966 ----
/*
* Tell ARC this no longer exists in L2ARC.
*/
if (ab->b_l2hdr != NULL) {
! l2hdr = ab->b_l2hdr;
! ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
! kmem_free(l2hdr, sizeof (*l2hdr));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
/*
*** 4547,4573 ****
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
list_t *list;
! uint64_t write_asize, write_psize, write_sz, headroom,
buf_compress_minsz;
void *buf_data;
kmutex_t *list_lock;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
const boolean_t do_headroom_boost = *headroom_boost;
ASSERT(dev->l2ad_vdev != NULL);
/* Lower the flag now, we might want to raise it again later. */
*headroom_boost = B_FALSE;
pio = NULL;
! write_sz = write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
/*
--- 4992,5034 ----
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
list_t *list;
! /*
! * These variables mean:
! * - write_size: in-memory size of ARC buffers we've written (before
! * compression).
! * - write_asize: actual on-disk size of ARC buffers we've written
! * (after compression).
! * - write_aligned_asize: actual sum of space taken by ARC buffers
! * on the device (after compression and alignment, so that
! * every buffer starts on a multiple of the device block size).
! * - headroom: L2ARC scanning headroom (we won't scan beyond this
! * distance from the list tail).
! * - buf_compress_minsz: minimum in-memory ARC buffer size for us
! * to try compressing it.
! */
! uint64_t write_size, write_asize, write_aligned_asize, headroom,
buf_compress_minsz;
void *buf_data;
kmutex_t *list_lock;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
const boolean_t do_headroom_boost = *headroom_boost;
+ boolean_t dev_hdr_update = B_FALSE;
ASSERT(dev->l2ad_vdev != NULL);
/* Lower the flag now, we might want to raise it again later. */
*headroom_boost = B_FALSE;
pio = NULL;
! cb = NULL;
! write_size = write_asize = write_aligned_asize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
/*
*** 4601,4611 ****
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; ab; ab = ab_prev) {
l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
! uint64_t buf_sz;
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
--- 5062,5072 ----
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; ab; ab = ab_prev) {
l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
! uint64_t buf_aligned_size;
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
*** 4616,4626 ****
* Skip this buffer rather than waiting.
*/
continue;
}
! passed_sz += ab->b_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
--- 5077,5095 ----
* Skip this buffer rather than waiting.
*/
continue;
}
! /*
! * When examining whether we've met our write target,
! * we must always use the aligned size of the buffer,
! * since that's the maximum amount of space a buffer
! * can take up on the L2ARC device.
! */
! buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
! ab->b_size);
! passed_sz += buf_aligned_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
*** 4630,4640 ****
if (!l2arc_write_eligible(guid, ab)) {
mutex_exit(hash_lock);
continue;
}
! if ((write_sz + ab->b_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
}
--- 5099,5109 ----
if (!l2arc_write_eligible(guid, ab)) {
mutex_exit(hash_lock);
continue;
}
! if ((write_size + buf_aligned_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
}
*** 4644,4665 ****
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
list_insert_head(dev->l2ad_buflist, head);
! cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
/*
* Create and add a new L2ARC header.
*/
! l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
l2hdr->b_dev = dev;
ab->b_flags |= ARC_L2_WRITING;
/*
* Temporarily stash the data buffer in b_tmp_cdata.
--- 5113,5137 ----
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
list_insert_head(dev->l2ad_buflist, head);
! cb = kmem_zalloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
+ list_create(&cb->l2wcb_log_blk_buf_list,
+ sizeof (l2arc_log_blk_buf_t),
+ offsetof(l2arc_log_blk_buf_t, l2lbb_node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
/*
* Create and add a new L2ARC header.
*/
! l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
l2hdr->b_dev = dev;
ab->b_flags |= ARC_L2_WRITING;
/*
* Temporarily stash the data buffer in b_tmp_cdata.
*** 4671,4681 ****
*/
l2hdr->b_compress = ZIO_COMPRESS_OFF;
l2hdr->b_asize = ab->b_size;
l2hdr->b_tmp_cdata = ab->b_buf->b_data;
- buf_sz = ab->b_size;
ab->b_l2hdr = l2hdr;
list_insert_head(dev->l2ad_buflist, ab);
/*
--- 5143,5152 ----
*** 4685,4695 ****
arc_cksum_verify(ab->b_buf);
arc_cksum_compute(ab->b_buf, B_TRUE);
mutex_exit(hash_lock);
! write_sz += buf_sz;
}
mutex_exit(list_lock);
if (full == B_TRUE)
--- 5156,5166 ----
arc_cksum_verify(ab->b_buf);
arc_cksum_compute(ab->b_buf, B_TRUE);
mutex_exit(hash_lock);
! write_size += buf_aligned_size;
}
mutex_exit(list_lock);
if (full == B_TRUE)
*** 4696,4706 ****
break;
}
/* No buffers selected for writing? */
if (pio == NULL) {
! ASSERT0(write_sz);
mutex_exit(&l2arc_buflist_mtx);
kmem_cache_free(hdr_cache, head);
return (0);
}
--- 5167,5177 ----
break;
}
/* No buffers selected for writing? */
if (pio == NULL) {
! ASSERT0(write_size);
mutex_exit(&l2arc_buflist_mtx);
kmem_cache_free(hdr_cache, head);
return (0);
}
*** 4741,4751 ****
buf_data = l2hdr->b_tmp_cdata;
buf_sz = l2hdr->b_asize;
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
! uint64_t buf_p_sz;
wzio = zio_write_phys(pio, dev->l2ad_vdev,
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
--- 5212,5222 ----
buf_data = l2hdr->b_tmp_cdata;
buf_sz = l2hdr->b_asize;
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
! uint64_t buf_aligned_asize;
wzio = zio_write_phys(pio, dev->l2ad_vdev,
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
*** 4756,4785 ****
write_asize += buf_sz;
/*
* Keep the clock hand suitably device-aligned.
*/
! buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
! write_psize += buf_p_sz;
! dev->l2ad_hand += buf_p_sz;
}
- }
mutex_exit(&l2arc_buflist_mtx);
! ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
! ARCSTAT_INCR(arcstat_l2_size, write_sz);
! ARCSTAT_INCR(arcstat_l2_asize, write_asize);
! vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
! if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
vdev_space_update(dev->l2ad_vdev,
dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
--- 5227,5267 ----
write_asize += buf_sz;
/*
* Keep the clock hand suitably device-aligned.
*/
! buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
! buf_sz);
! write_aligned_asize += buf_aligned_asize;
! dev->l2ad_hand += buf_aligned_asize;
! ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
! dev->l2ad_first);
}
+ if (l2arc_log_blk_insert(dev, ab)) {
+ l2arc_log_blk_commit(dev, pio, cb);
+ dev_hdr_update = B_TRUE;
+ }
+ }
mutex_exit(&l2arc_buflist_mtx);
! if (dev_hdr_update)
! l2arc_dev_hdr_update(dev, pio);
!
! VERIFY3U(write_aligned_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
! ARCSTAT_INCR(arcstat_l2_size, write_size);
! ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
! vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
! if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
! dev->l2ad_end) {
vdev_space_update(dev->l2ad_vdev,
dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
*** 5037,5065 ****
}
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
l2arc_dev_t *dev;
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_vdev == vd)
break;
}
mutex_exit(&l2arc_dev_mtx);
! return (dev != NULL);
}
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
! * validated the vdev and opened it.
*/
void
! l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
l2arc_dev_t *adddev;
ASSERT(!l2arc_vdev_present(vd));
--- 5519,5557 ----
}
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
+ return (l2arc_vdev_get(vd) != NULL);
+ }
+
+ static l2arc_dev_t *
+ l2arc_vdev_get(vdev_t *vd)
+ {
l2arc_dev_t *dev;
+ boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
+ if (!held)
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_vdev == vd)
break;
}
+ if (!held)
mutex_exit(&l2arc_dev_mtx);
! return (dev);
}
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
! * validated the vdev and opened it. The `rebuild' flag indicates whether
! * we should attempt an L2ARC persistency rebuild.
*/
void
! l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
{
l2arc_dev_t *adddev;
ASSERT(!l2arc_vdev_present(vd));
*** 5067,5077 ****
* Create a new l2arc device entry.
*/
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
! adddev->l2ad_start = VDEV_LABEL_START_SIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
--- 5559,5570 ----
* Create a new l2arc device entry.
*/
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
! /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
! adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
*** 5090,5099 ****
--- 5583,5602 ----
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
+ if (rebuild && l2arc_rebuild_enabled &&
+ adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ adddev->l2ad_rebuild = B_TRUE;
+ }
mutex_exit(&l2arc_dev_mtx);
}
/*
* Remove a vdev from the L2ARC.
*** 5196,5201 ****
--- 5699,6429 ----
cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
l2arc_thread_exit = 1;
while (l2arc_thread_exit != 0)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
+ }
+
+ /*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called as one of the final steps of a pool import.
+ */
+ void
+ l2arc_spa_rebuild_start(spa_t *spa)
+ {
+ l2arc_dev_t *dev;
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ ASSERT(dev != NULL);
+ if (dev->l2ad_rebuild) {
+ (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ }
+ mutex_exit(&l2arc_dev_mtx);
+ }
+
+ /*
+ * Main entry point for L2ARC rebuilding.
+ */
+ static void
+ l2arc_dev_rebuild_start(l2arc_dev_t *dev)
+ {
+ spa_t *spa = dev->l2ad_spa;
+ vdev_t *vd = dev->l2ad_vdev;
+
+ /* Lock out device removal. */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ ASSERT(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ dev->l2ad_rebuild = B_FALSE;
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ thread_exit();
+ }
+
+ /*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ *
+ * 1) reads the device's header
+ * 2) if a good device header is found, starts reading the log block chain
+ * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log blk chain (the back-reference in the blk is
+ * invalid or loops over our starting point).
+ * 2) We encounter *any* error condition (cksum errors, io errors, looped
+ * blocks, etc.).
+ * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
+ * severely fragmented L2ARC log block chains or slow L2ARC devices from
+ * preventing a machine from finishing a pool import (thus letting the
+ * administrator take corrective action, e.g. by kicking the misbehaving
+ * L2ARC device out of the pool, or by reimporting the pool with L2ARC
+ * rebuilding disabled).
+ */
+ static int
+ l2arc_rebuild(l2arc_dev_t *dev)
+ {
+ int err;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ uint8_t *this_lb_buf, *next_lb_buf;
+ zio_t *this_io = NULL, *next_io = NULL;
+ int64_t deadline;
+ l2arc_log_blk_ptr_t lb_ptrs[2];
+ boolean_t first_pass;
+ uint64_t load_guid;
+
+ load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+ deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
+ /*
+ * Device header processing phase.
+ */
+ if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
+ /* device header corrupted, start a new one */
+ bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
+ return (err);
+ }
+ if (l2arc_check_rebuild_timeout_hit(deadline))
+ return (SET_ERROR(ETIMEDOUT));
+
+ /* Retrieve the persistent L2ARC device state */
+ dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
+ dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
+ LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
+ dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
+ L2ARC_DEV_HDR_EVICT_FIRST);
+
+ /* Prepare the rebuild processing state */
+ bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
+ this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+ this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
+ next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
+ first_pass = B_TRUE;
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
+ /* We hit an invalid block address, end the rebuild. */
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
+ this_lb, next_lb, this_lb_buf, next_lb_buf,
+ this_io, &next_io)) != 0)
+ break;
+
+ /* Protection against infinite loops of log blocks. */
+ if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
+ lb_ptrs[0].l2lbp_daddr,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
+ !first_pass) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
+ err = SET_ERROR(ELOOP);
+ break;
+ }
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blocks, so the user may choose to re-add the
+ * L2ARC dev at a later time to reconstruct it (when there's
+ * less memory pressure).
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ break;
+ }
+
+ /*
+ * Now that we know that the next_lb checks out alright, we
+ * can start reconstruction from this lb - we can be sure
+ * that the L2ARC write hand has not yet reached any of our
+ * buffers.
+ */
+ l2arc_log_blk_restore(dev, load_guid, this_lb,
+ LBP_GET_PSIZE(&lb_ptrs[0]));
+
+ /*
+ * End of list detection. We can look ahead two steps in the
+ * blk chain and if the 2nd blk from this_lb dips below the
+ * initial chain starting point, then we know two things:
+ * 1) it can't be valid, and
+ * 2) the next_lb's ARC entries might have already been
+ * partially overwritten and so we should stop before
+ * we restore it
+ */
+ if (l2arc_range_check_overlap(
+ this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
+ !first_pass)
+ break;
+
+ /* log blk restored, continue with next one in the list */
+ lb_ptrs[0] = lb_ptrs[1];
+ lb_ptrs[1] = this_lb->l2lb_back2_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ PTR_SWAP(this_lb_buf, next_lb_buf);
+ this_io = next_io;
+ next_io = NULL;
+ first_pass = B_FALSE;
+
+ if (l2arc_check_rebuild_timeout_hit(deadline)) {
+ err = SET_ERROR(ETIMEDOUT);
+ break;
+ }
+ }
+ if (next_io != NULL)
+ l2arc_log_blk_prefetch_abort(next_io);
+ kmem_free(this_lb, sizeof (*this_lb));
+ kmem_free(next_lb, sizeof (*next_lb));
+ kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
+ kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
+ if (err == 0)
+ ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
+
+ return (err);
+ }
+
+ /*
+ * Restores the payload of a log blk to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+ static void
+ l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
+ l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
+ {
+ uint64_t size = 0, psize = 0;
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse direction to preserve correct
+ * temporal ordering of buffers in the l2ad_buflist.
+ */
+ l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
+ size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
+ psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ /*
+ * Record rebuild stats:
+ * size In-memory size of restored buffer data in ARC
+ * psize Physical size of restored buffers in the L2ARC
+ * bufs # of ARC buffer headers restored
+ * log_blks # of L2ARC log entries processed during restore
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
+ vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
+ }
+
+ /*
+ * Restores a single ARC buf hdr from a log block. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+ static void
+ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
+ uint64_t load_guid)
+ {
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = LE_GET_TYPE(le);
+ l2arc_buf_hdr_t *l2hdr;
+
+ hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
+ hdr->b_dva = le->l2le_dva;
+ hdr->b_birth = le->l2le_birth;
+ hdr->b_cksum0 = le->l2le_cksum0;
+ hdr->b_size = LE_GET_LSIZE(le);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(hdr);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ return;
+ }
+ hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
+ if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
+ hdr->b_flags |= ARC_L2COMPRESS;
+ mutex_enter(&hdr->b_freeze_lock);
+ ASSERT(hdr->b_freeze_cksum == NULL);
+ hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
+ mutex_exit(&hdr->b_freeze_lock);
+
+ /* now rebuild the l2arc entry */
+ ASSERT(hdr->b_l2hdr == NULL);
+ l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
+ l2hdr->b_dev = dev;
+ l2hdr->b_daddr = le->l2le_daddr;
+ l2hdr->b_asize = LE_GET_PSIZE(le);
+ l2hdr->b_compress = LE_GET_COMPRESS(le);
+ hdr->b_l2hdr = l2hdr;
+ list_insert_tail(dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
+
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ mutex_exit(hash_lock);
+ }
+
+ /*
+ * Attempts to read the device header on the provided L2ARC device and writes
+ * it to `hdr'. On success, this function returns 0, otherwise the appropriate
+ * error code is returned.
+ */
+ static int
+ l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
+ {
+ int err;
+ uint64_t guid;
+ zio_cksum_t cksum;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
+ ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ return (err);
+ }
+
+ if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(hdr, sizeof (*hdr));
+
+ if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ hdr->l2dh_spa_guid != guid) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ l2arc_dev_hdr_checksum(hdr, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
+ return (SET_ERROR(EINVAL));
+ }
+ if (hdr->l2dh_evict_tail < dev->l2ad_start ||
+ hdr->l2dh_evict_tail >= dev->l2ad_end) {
+ /* Data in dev hdr is invalid for this device. */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+ }
+
+ /*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple prefetcher to make sure that while
+ * we're processing one buffer the L2ARC is already prefetching the next
+ * one in the chain.
+ *
+ * The arguments this_lp and next_lp point to the current and next log blk
+ * address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
+ * and next_lb_buf must be buffers of appropriate to hold a raw
+ * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
+ * to buffer decompression).
+ *
+ * The `this_io' and `next_io' arguments are used for block prefetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * prefetch IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the prefetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of prefetch IOs.
+ */
+ static int
+ l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ uint8_t *this_lb_buf, uint8_t *next_lb_buf,
+ zio_t *this_io, zio_t **next_io)
+ {
+ int err = 0;
+ zio_cksum_t cksum;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log blk in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
+ this_lb_buf);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log blk early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log blk.
+ */
+ *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
+ next_lb_buf);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ goto cleanup;
+ }
+
+ /* Make sure the buffer checks out */
+ fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (LBP_GET_COMPRESS(this_lbp)) {
+ case ZIO_COMPRESS_OFF:
+ bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
+ break;
+ case ZIO_COMPRESS_LZ4:
+ if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
+ this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
+ sizeof (*this_lb))) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ cleanup:
+ /* Abort an in-flight prefetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_prefetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ return (err);
+ }
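
A minimal sketch of how a caller might chain these calls, following the
`this_io'/`next_io' hand-off described above. It is illustrative only: the
local names are placeholders, buffer allocation, error handling and final
cleanup are omitted, and the real rebuild loop elsewhere in this patch
differs in detail.

	l2arc_log_blk_ptr_t lbps[2];	/* current and next log blk addrs */
	l2arc_log_blk_phys_t *this_lb, *next_lb;	/* kmem-allocated */
	uint8_t *this_lb_buf, *next_lb_buf;		/* kmem-allocated */
	zio_t *this_io = NULL, *next_io = NULL;
	int err = 0;

	/* lbps[] is primed from the device header's l2dh_start_lbps[] */
	while (l2arc_log_blk_ptr_valid(dev, &lbps[0])) {
		if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
		    this_lb, next_lb, this_lb_buf, next_lb_buf,
		    this_io, &next_io)) != 0)
			break;

		/* restore the ARC buffers described by this_lb here */

		/* step to the next (older) log block in the chain */
		lbps[0] = lbps[1];
		lbps[1] = this_lb->l2lb_back2_lbp;
		this_io = next_io;	/* the prefetch becomes the current IO */
		next_io = NULL;
		/*
		 * NB: this_lb/next_lb and this_lb_buf/next_lb_buf must also
		 * be swapped here so the prefetched data becomes "this".
		 */
	}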
+
+ /*
+ * Validates an L2ARC log blk address to make sure that it can be read
+ * from the provided L2ARC device. Returns B_TRUE if the address is
+ * within the device's bounds, or B_FALSE if not.
+ */
+ static boolean_t
+ l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
+ {
+ uint64_t psize = LBP_GET_PSIZE(lbp);
+ uint64_t end = lbp->l2lbp_daddr + psize;
+
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely between l2ad_start and l2ad_end
+ * - it has a valid size
+ * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
+ * doesn't sit in the evicted region)
+ */
+ return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
+ psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
+ lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
+ }
+
+ /*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio reads into the caller-supplied `lb_buf', which must
+ * remain valid until the zio has completed; the caller remains responsible
+ * for freeing it afterwards. If you wish to abort this zio, do so using
+ * l2arc_log_blk_prefetch_abort, which waits for the IO to complete so the
+ * buffer can be safely disposed of.
+ */
+ static zio_t *
+ l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
+ uint8_t *lb_buf)
+ {
+ uint32_t psize;
+ zio_t *pio;
+
+ psize = LBP_GET_PSIZE(lbp);
+ ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
+ pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
+ lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+ }
+
+ /*
+ * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
+ * buffers allocated for it.
+ */
+ static void
+ l2arc_log_blk_prefetch_abort(zio_t *zio)
+ {
+ (void) zio_wait(zio);
+ }
+
+ /*
+ * Creates a zio to update the device header on an l2arc device. The zio is
+ * initiated as a child of `pio'.
+ */
+ static void
+ l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
+ {
+ zio_t *wzio;
+ vdev_stat_t st;
+ l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
+
+ vdev_get_stats(dev->l2ad_vdev, &st);
+
+ hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
+ hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ hdr->l2dh_evict_tail = dev->l2ad_evict;
+ hdr->l2dh_alloc_space = st.vs_alloc;
+ hdr->l2dh_flags = 0;
+ if (dev->l2ad_first)
+ hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ /* checksum operation goes last */
+ l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
+
+ CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
+ sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
+ sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+ }
+
+ /*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+ static void
+ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb)
+ {
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ uint64_t psize, asize;
+ l2arc_log_blk_buf_t *lb_buf;
+ zio_t *wzio;
+
+ VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+
+ /* link the buffer into the block chain */
+ lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
+ lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /* try to compress the buffer */
+ lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
+ list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
+ VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
+ lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
+
+ /*
+ * Update the start log blk pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0];
+ dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
+ LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
+ LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
+ LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_CHECKSUM_FLETCHER_4);
+ LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
+ if (psize < sizeof (*lb)) {
+ /* compression succeeded */
+ LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
+ LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ ZIO_COMPRESS_OFF);
+ }
+ /* checksum what we're about to write */
+ fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
+ &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
+
+ /* perform the write itself */
+ CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
+ L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ /* realign the device hand */
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ dev->l2ad_hand += asize;
+ VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
+ }
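
A worked illustration of the pointer shuffle above, using hypothetical log
blocks A, B, C and D committed in that order (<none> marks a pointer that
is not yet valid):

	commit A:  A.l2lb_back2_lbp = <none>   l2dh_start_lbps = { A, <none> }
	commit B:  B.l2lb_back2_lbp = <none>   l2dh_start_lbps = { B, A }
	commit C:  C.l2lb_back2_lbp = A        l2dh_start_lbps = { C, B }
	commit D:  D.l2lb_back2_lbp = B        l2dh_start_lbps = { D, C }

The device header thus always names the two newest log blocks, and each
block read during rebuild supplies the pointer two steps further back (D
yields B, C yields A), which is what lets l2arc_log_blk_read() prefetch one
block ahead while the current one is being decoded.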
+
+ /*
+ * Computes the checksum of `hdr' and stores it in `cksum'.
+ */
+ static void
+ l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
+ {
+ fletcher_4_native((uint8_t *)hdr +
+ offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ cksum);
+ }
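
For symmetry, a sketch of what the consumer of this checksum looks like
(this is not the actual read path in the patch): once a device header has
been read into `hdr', it would be validated against its embedded checksum
roughly as follows.

	zio_cksum_t cksum;

	/* recompute the checksum over everything from l2dh_spa_guid onward */
	l2arc_dev_hdr_checksum(hdr, &cksum);
	if (!ZIO_CHECKSUM_EQUAL(cksum, hdr->l2dh_self_cksum)) {
		/* damaged or never-written header: nothing to rebuild from */
		return (SET_ERROR(EINVAL));
	}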
+
+ /*
+ * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
+ * The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+ static boolean_t
+ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
+ {
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+ const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT(l2hdr != NULL);
+ ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
+
+ le = &lb->l2lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->l2le_dva = ab->b_dva;
+ le->l2le_birth = ab->b_birth;
+ le->l2le_cksum0 = ab->b_cksum0;
+ le->l2le_daddr = l2hdr->b_daddr;
+ LE_SET_LSIZE(le, ab->b_size);
+ LE_SET_PSIZE(le, l2hdr->b_asize);
+ LE_SET_COMPRESS(le, l2hdr->b_compress);
+ le->l2le_freeze_cksum = *ab->b_freeze_cksum;
+ LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
+ LE_SET_TYPE(le, ab->b_type);
+ dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
+
+ return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+ }
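
The intended write-path usage is simply: insert each buffer as it is issued
to the L2ARC device and commit the log block once the insert reports it is
full. Roughly (a sketch; the real call site in l2arc_write_buffers carries
more surrounding state than shown):

	/* `ab' has just been issued as an L2ARC write under `pio' and `cb' */
	if (l2arc_log_blk_insert(dev, ab)) {
		/* the log block is full: persist it and start a new one */
		l2arc_log_blk_commit(dev, pio, cb);
	}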
+
+ /*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom Lower end of the range to check (written to earlier).
+ * top Upper end of the range to check (written to later).
+ * check The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+ static inline boolean_t
+ l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+ {
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+ }
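
A few worked cases against the conditional above (the addresses are made up
purely for illustration):

	/* sequentially ordered case, bottom < top */
	l2arc_range_check_overlap(100, 200, 150);	/* B_TRUE */
	l2arc_range_check_overlap(100, 200, 250);	/* B_FALSE */

	/* looped-around case, bottom > top */
	l2arc_range_check_overlap(900, 100, 950);	/* B_TRUE */
	l2arc_range_check_overlap(900, 100,  50);	/* B_TRUE */
	l2arc_range_check_overlap(900, 100, 500);	/* B_FALSE */

	/* single-address case, bottom == top */
	l2arc_range_check_overlap(300, 300, 300);	/* B_TRUE */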
+
+ /*
+ * Checks whether a rebuild timeout deadline has been hit and, if it has,
+ * increments the appropriate error counter.
+ */
+ static boolean_t
+ l2arc_check_rebuild_timeout_hit(int64_t deadline)
+ {
+ if (deadline != 0 && deadline < ddi_get_lbolt64()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
+ cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
+ "dropping remaining L2ARC metadata.");
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
}
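
The deadline argument is an absolute lbolt value, and passing 0 disables
the check entirely. A caller would typically compute the deadline once
before starting the rebuild, along these lines (l2arc_rebuild_timeout is a
hypothetical tunable expressed in seconds; hz is the kernel tick rate):

	int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;

	/* then, once per log block processed during the rebuild: */
	if (l2arc_check_rebuild_timeout_hit(deadline))
		return (SET_ERROR(ETIMEDOUT));	/* drop the remaining blocks */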