3525 Persistent L2ARC

*** 134,143 **** --- 134,144 ---- #include <sys/dnlc.h> #endif #include <sys/callb.h> #include <sys/kstat.h> #include <zfs_fletcher.h> + #include <sys/byteorder.h> #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd;
*** 305,314 **** --- 306,333 ---- kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_compress_successes; kstat_named_t arcstat_l2_compress_zeros; kstat_named_t arcstat_l2_compress_failures; + kstat_named_t arcstat_l2_meta_writes; + kstat_named_t arcstat_l2_meta_avg_size; + kstat_named_t arcstat_l2_meta_avg_asize; + kstat_named_t arcstat_l2_asize_to_meta_ratio; + kstat_named_t arcstat_l2_rebuild_attempts; + kstat_named_t arcstat_l2_rebuild_successes; + kstat_named_t arcstat_l2_rebuild_unsupported; + kstat_named_t arcstat_l2_rebuild_timeout; + kstat_named_t arcstat_l2_rebuild_arc_bytes; + kstat_named_t arcstat_l2_rebuild_l2arc_bytes; + kstat_named_t arcstat_l2_rebuild_bufs; + kstat_named_t arcstat_l2_rebuild_bufs_precached; + kstat_named_t arcstat_l2_rebuild_metabufs; + kstat_named_t arcstat_l2_rebuild_uberblk_errors; + kstat_named_t arcstat_l2_rebuild_io_errors; + kstat_named_t arcstat_l2_rebuild_cksum_errors; + kstat_named_t arcstat_l2_rebuild_loop_errors; + kstat_named_t arcstat_l2_rebuild_abort_lowmem; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used;
*** 371,380 **** --- 390,417 ---- { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_compress_successes", KSTAT_DATA_UINT64 }, { "l2_compress_zeros", KSTAT_DATA_UINT64 }, { "l2_compress_failures", KSTAT_DATA_UINT64 }, + { "l2_meta_writes", KSTAT_DATA_UINT64 }, + { "l2_meta_avg_size", KSTAT_DATA_UINT64 }, + { "l2_meta_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_attempts", KSTAT_DATA_UINT64 }, + { "l2_rebuild_successes", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_timeout", KSTAT_DATA_UINT64 }, + { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 }, + { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 },
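The new counters are exported through the existing zfs:0:arcstats kstat alongside the entries above. As a usage illustration only, here is a minimal libkstat reader for two of the rebuild counters; the program, the selected statistics and the error handling are mine, not part of this change, and it assumes an illumos system built with -lkstat.

#include <stdio.h>
#include <kstat.h>

/*
 * Illustrative only: print two of the new L2ARC rebuild counters from the
 * zfs:0:arcstats kstat.
 */
int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;
	const char *names[] = { "l2_rebuild_successes", "l2_rebuild_bufs" };

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}
	if ((ksp = kstat_lookup(kc, "zfs", 0, "arcstats")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		perror("arcstats");
		(void) kstat_close(kc);
		return (1);
	}
	for (int i = 0; i < 2; i++) {
		kn = kstat_data_lookup(ksp, (char *)names[i]);
		if (kn != NULL)
			(void) printf("%s = %llu\n", names[i],
			    (u_longlong_t)kn->value.ui64);
	}
	(void) kstat_close(kc);
	return (0);
}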
*** 418,427 **** --- 455,483 ---- } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } + /* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first scale both it and the update value by + * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). + */ + #define ARCSTAT_F_AVG_FACTOR 3 + #define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(NOTREACHED) \ + _NOTE(CONSTCOND) \ + } while (0) + kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu;
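As a side note on the averaging behaviour: with a factor of 3 the kstat keeps two thirds of its previous value and folds in one third of each new sample, i.e. an exponential moving average. A minimal standalone sketch of just that arithmetic (the function name and the sample value 9000 are illustrative only):

#include <stdio.h>
#include <stdint.h>

#define	F_AVG_FACTOR	3	/* mirrors ARCSTAT_F_AVG_FACTOR */

/* One ARCSTAT_F_AVG-style update: keep 2/3 of the old value, add 1/3 of the new. */
static uint64_t
f_avg_update(uint64_t avg, uint64_t value)
{
	return (avg - avg / F_AVG_FACTOR + value / F_AVG_FACTOR);
}

int
main(void)
{
	uint64_t avg = 0;

	/* Feeding a constant sample converges the average toward it. */
	for (int i = 0; i < 10; i++) {
		avg = f_avg_update(avg, 9000);
		(void) printf("after %d updates: %llu\n", i + 1,
		    (unsigned long long)avg);
	}
	return (0);
}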
*** 625,647 **** boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ ! typedef struct l2arc_dev { ! vdev_t *l2ad_vdev; /* vdev */ ! spa_t *l2ad_spa; /* spa */ ! uint64_t l2ad_hand; /* next write location */ ! uint64_t l2ad_start; /* first addr on device */ ! uint64_t l2ad_end; /* last addr on device */ ! uint64_t l2ad_evict; /* last addr eviction reached */ ! boolean_t l2ad_first; /* first sweep through */ ! boolean_t l2ad_writing; /* currently writing */ ! list_t *l2ad_buflist; /* buffer list */ ! list_node_t l2ad_node; /* device list node */ ! } l2arc_dev_t; ! static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ --- 681,691 ---- boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ ! typedef struct l2arc_dev l2arc_dev_t; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
*** 660,669 **** --- 704,716 ---- } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + uint8_t *l2wcb_pbuf; /* pbuf sent in this write */ + uint32_t l2wcb_pbuf_size; /* size of committed pbuf */ + uint8_t *l2wcb_ub_buf; /* uberblock in this write */ } l2arc_write_callback_t; struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */
*** 687,704 **** static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void l2arc_read_done(zio_t *zio); ! static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c); static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; --- 734,975 ---- static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void l2arc_read_done(zio_t *zio); ! static void l2arc_hdr_stat_add(boolean_t from_arc); static void l2arc_hdr_stat_remove(void); static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c); static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); + typedef enum { + L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */ + L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */ + } l2uberblock_flags_t; + + typedef struct l2uberblock { + uint32_t ub_magic; + uint8_t ub_version; + l2uberblock_flags_t ub_flags; + + uint64_t ub_spa_guid; + uint64_t ub_birth; + uint64_t ub_evict_tail; /* current evict pointer */ + uint64_t ub_alloc_space; /* vdev space alloc status */ + uint64_t ub_pbuf_daddr; /* address of newest pbuf */ + uint32_t ub_pbuf_asize; /* size of newest pbuf */ + zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */ + + zio_cksum_t ub_cksum; /* cksum of uberblock */ + } l2uberblock_t; + + typedef enum { + L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */ + L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */ + } l2pbuf_flags_t; + + typedef struct l2pbuf { + uint32_t pb_magic; + unsigned int pb_version; + l2pbuf_flags_t pb_flags; + + uint64_t pb_prev_daddr; /* address of previous pbuf */ + uint32_t pb_prev_asize; /* size of previous pbuf */ + zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */ + + /* + * This is a set of item lists that are contained in this pbuf. Each + * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's. + * This serves as a soft timeout feature - once the limit of the + * number of item lists that a pbuf can hold is reached, the pbuf is + * flushed to stable storage, regardless of its total size. + */ + list_t *pb_buflists_list; + + /* + * Number of compressed bytes referenced by items in this pbuf and + * the number of lists present. + * This is not actually written to storage, it is only used by + * internal algorithms which check for when a pbuf reaches a + * certain size limit, after which it is flushed in a write. + */ + uint64_t pb_payload_asz; + /* Same thing for number of buflists */ + int pb_nbuflists; + + /* + * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size. + * This is then used by l2arc_pbuf_restore to update used space + * on the L2ARC vdev. 
+ */ + size_t pb_asize; + } l2pbuf_t; + + typedef struct l2pbuf_buf l2pbuf_buf_t; + typedef struct l2pbuf_buflist { + uint32_t l2pbl_nbufs; + l2pbuf_buf_t *l2pbl_bufs; + list_node_t l2pbl_node; + } l2pbuf_buflist_t; + + struct l2pbuf_buf { + dva_t b_dva; /* dva of buffer */ + uint64_t b_birth; /* birth txg of buffer */ + uint64_t b_cksum0; + zio_cksum_t b_freeze_cksum; + uint32_t b_size; /* uncompressed buf size */ + uint64_t b_l2daddr; /* buf location on l2dev */ + uint32_t b_l2asize; /* actual buf data size */ + enum zio_compress b_l2compress; /* compression applied */ + uint16_t b_contents_type; + uint32_t b_flags; + }; + + struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ + l2pbuf_t l2ad_pbuf; /* currently open pbuf */ + uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */ + uint64_t l2ad_pbuf_asize; /* prev pbuf asize */ + zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */ + /* uberblock birth counter - incremented for each committed uberblk */ + uint64_t l2ad_uberblock_birth; + /* flag indicating whether a rebuild is currently going on */ + boolean_t l2ad_rebuilding; + }; + + /* Stores information about an L2ARC prefetch zio */ + typedef struct l2arc_prefetch_info { + uint8_t *pi_buf; /* where the zio writes to */ + uint64_t pi_buflen; /* length of `buf' */ + zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */ + } l2arc_prefetch_info_t; + + /* 256 x 4k of l2uberblocks */ + #define L2UBERBLOCK_SIZE 4096 + #define L2UBERBLOCK_MAGIC 0x12bab10c + #define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */ + #define L2PBUF_MAGIC 0xdb0faba6 + #define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */ + #define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */ + #define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */ + #define L2PBUF_ENCODED_SIZE(_pb) \ + (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb)) + /* + * Allocation limit for the payload of a pbuf. This also fundamentally + * limits the number of bufs we can reference in a pbuf. + */ + #define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024) + #define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE) + #define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */ + #define L2PBUF_MAXSZ 100 * 1024 * 1024 /* maximum pbuf size */ + #define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */ + #define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */ + #define L2PBUF_IS_FULL(_pb) \ + ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \ + (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists) + /* + * These are the flags we allow to persist in L2ARC pbufs. The other flags + * of an ARC buffer pertain to the buffer's runtime behavior. + */ + #define L2ARC_PERSIST_FLAGS \ + (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH) + + /* + * Used during L2ARC rebuild after each read operation to check whether we + * haven't exceeded the rebuild timeout value. + */ + #define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) 
\ + do { \ + if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \ + __VA_ARGS__; \ + ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \ + cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \ + "dropping remaining L2ARC metadata."); \ + return; \ + } \ + _NOTE(NOTREACHED) \ + _NOTE(CONSTCOND) \ + } while (0) + + /* + * Performance tuning of L2ARC persistency: + * + * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt + * compressing it. + * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers + * referenced from a pbuf. Once a pbuf reaches this size, it is + * committed to stable storage. Ideally, there should be approx. + * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device. + * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will + * be buffered in a pbuf before it is committed to L2ARC. This + * puts a soft temporal upper bound on pbuf commit intervals. + * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at + * pool import or when adding one manually later) will attempt + * to rebuild L2ARC buffer contents. In special circumstances, + * the administrator may want to set this to B_FALSE, if they + * are having trouble importing a pool or attaching an L2ARC + * device (e.g. the L2ARC device is slow to read in stored pbuf + * metadata, or the metadata has become somehow + * fragmented/unusable). + * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help + * avoid a slow L2ARC device from preventing pool import. If we + * are not done rebuilding an L2ARC device by this time, we + * stop the rebuild and return immediately. + */ + uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ; + uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ; + uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS; + boolean_t l2arc_rebuild_enabled = B_TRUE; + uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT; + + static void l2arc_rebuild_start(l2arc_dev_t *dev); + static void l2arc_rebuild(l2arc_dev_t *dev); + static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb); + static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, + uint64_t guid); + + static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub); + static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize, + zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io); + static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, + uint32_t asize); + static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize); + static void l2arc_pbuf_prefetch_abort(zio_t *zio); + + static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf); + static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub); + static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub, + uint64_t guid); + static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + + static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen); + static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen, + l2pbuf_t *pbuf); + static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, + uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum); + static void l2arc_pbuf_init(l2pbuf_t *pb); + static void l2arc_pbuf_destroy(l2pbuf_t *pb); + static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs); + static 
void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl, + const arc_buf_hdr_t *ab, int index); + static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb); + static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL;
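To make the sizing constants above concrete: an uncompressed pbuf encodes to a 56-byte header plus 88 bytes per referenced ARC buffer, and the open pbuf is committed once it references more than l2arc_pbuf_max_sz of payload or has accumulated l2arc_pbuf_max_buflists feed cycles. The sketch below only reworks that arithmetic with the default values; the helper names and the sample figures are illustrative, not part of the patch.

#include <stdio.h>
#include <stdint.h>

#define	PBUF_HDR_SIZE		56		/* L2PBUF_HDR_SIZE */
#define	PBUF_BUF_SIZE		88		/* L2PBUF_BUF_SIZE */
#define	PBUF_MAX_PAYLOAD	(100ULL << 20)	/* l2arc_pbuf_max_sz default */
#define	PBUF_MAX_BUFLISTS	128		/* l2arc_pbuf_max_buflists default */

/* Uncompressed on-disk size of a pbuf referencing `nbufs' ARC buffers. */
static uint64_t
pbuf_encoded_size(uint64_t nbufs)
{
	return (PBUF_HDR_SIZE + nbufs * PBUF_BUF_SIZE);
}

/* Simplified mirror of the L2PBUF_IS_FULL commit test. */
static int
pbuf_is_full(uint64_t payload_asize, int nbuflists)
{
	return (payload_asize > PBUF_MAX_PAYLOAD ||
	    nbuflists + 1 >= PBUF_MAX_BUFLISTS);
}

int
main(void)
{
	/* e.g. an open pbuf currently referencing 10000 cached buffers */
	uint64_t nbufs = 10000;

	(void) printf("pbuf metadata for %llu bufs: %llu bytes\n",
	    (unsigned long long)nbufs,
	    (unsigned long long)pbuf_encoded_size(nbufs));
	(void) printf("full at 64 MB payload, 10 buflists? %d\n",
	    pbuf_is_full(64ULL << 20, 10));
	(void) printf("full at 101 MB payload, 10 buflists? %d\n",
	    pbuf_is_full(101ULL << 20, 10));
	return (0);
}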
*** 1235,1245 **** } ab->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) ! l2arc_hdr_stat_add(); else if (old_state == arc_l2c_only) l2arc_hdr_stat_remove(); } void --- 1506,1516 ---- } ab->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) ! l2arc_hdr_stat_add(old_state != arc_anon); else if (old_state == arc_l2c_only) l2arc_hdr_stat_remove(); } void
*** 1339,1348 **** --- 1610,1646 ---- (void) refcount_add(&hdr->b_refcnt, tag); return (buf); } + /* + * Allocates an empty arc_buf_hdr structure (lacking any data buffer). + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc->(disk). + */ + arc_buf_hdr_t * + arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type) + { + arc_buf_hdr_t *hdr; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = guid; + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + hdr->b_buf = NULL; + hdr->b_datacnt = 0; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + + return (hdr); + } + static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned
*** 3971,3980 **** --- 4269,4416 ---- * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistency: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. + * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) Every now and then, at end of an L2ARC feed cycle, we append a piece + * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC + * write. This allows us to understand what what's been written, so that + * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers. + * The pbuf also includes a "back-reference" pointer to the previous + * pbuf, forming a linked list of pbufs on the L2ARC device. + * + * *) We reserve 4k of space at the start of each L2ARC device for our + * header bookkeeping purposes. This contains a single 4k uberblock, which + * contains our top-level reference structures. We update it on each pbuf + * write. If this write results in an inconsistent uberblock (e.g. due to + * power failure), we detect this by verifying the uberblock's checksum + * and simply drop the entries from L2ARC. Once an L2ARC pbuf update + * completes, we update the uberblock to point to it. + * + * Implementation diagram: + * + * +=== L2ARC device (not to scale) ======================================+ + * | ____________newest pbuf pointer_____________ | + * | / \ | + * | / V | + * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---| + * | ^ / ^ / ^ / | + * | `-prev-' `-prev-' `-prev-' | + * | pbuf pbuf pbuf | + * +======================================================================+ + * + * On-device data structures: + * + * (L2ARC persistent uberblock) + * struct l2uberblock { + * (these fields are in network byte order) + * uint32_t magic = 0x12bab10c; l2-ber-block + * uint8_t version = 0x1; + * uint8_t reserved = 0x0; + * uint16_t ublk_flags; see l2uberblock_flags_t + * + * (byte order of fields below determined by `ublk_flags') + * uint64_t spa_guid; what pool this l2arc dev belongs to + * uint64_t birth_txg; ublk with highest birth_txg is newest + * uint64_t evict_tail; current evict pointer on l2arc dev + * uint64_t alloc_space; how much space is alloc'd on the dev + * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t + * uint32_t pbuf_asize; size of newest pbuf + * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf + * + * uint8_t reserved[3996] = {0x0, 0x0, ... 
0x0}; + * + * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above); + * } l2dev_uberblock; + * + * (L2ARC persistent buffer list) + * typedef struct l2pbuf_t { + * (these fields are in network byte order) + * uint32_t magic = 0xdb0faba6; the-buffer-bag + * uint8_t version = 0x1; + * uint8_t reserved = 0x0; + * uint16_t pbuf_flags; see l2pbuf_flags_t + * + * (byte order of fields below determined by `pbuf_flags') + * uint64_t prev_pbuf_daddr; previous pbuf dev addr + * uint32_t prev_pbuf_asize; previous pbuf size + * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf) + * + * uint32_t items_size; uncompressed size of `items' below + * (if (pbuf_flags & compress) decompress `items' prior to decoding) + * struct l2pbuf_buf_item { + * (these fields mirror [l2]arc_buf_hdr fields) + * uint64_t dva[2]; buffer's DVA + * uint64_t birth; buffer's birth TXG in ARC + * uint64_t cksum0; lower 64-bits of buffer's cksum + * uint64_t freeze_cksum[4]; buffer's freeze cksum + * uint32_t size; uncompressed buffer data size + * uint64_t l2daddr; device address (offset) of buf + * uint32_t l2asize; actual space occupied by buf + * uint8_t compress; compress algo used on data + * uint8_t contents_type; buffer's contents type + * uint16_t reserved = 0x0; for alignment and future use + * uint32_t flags; buffer's persistent flags + * } items[]; continues for remainder of pbuf + * } l2pbuf_t; + * + * L2ARC reconstruction: + * + * When writing data, we simply write in the standard rotary fashion, + * evicting buffers as we go and simply writing new data over them (appending + * an updated l2pbuf_t every now and then). This obviously means that once we + * loop around the end of the device, we will start cutting into an already + * committed l2pbuf (and its referenced data buffers), like so: + * + * current write head__ __old tail + * \ / + * V V + * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|--> + * ^ ^^^^^^^^^_____________________________ + * | \ + * <<nextwrite>> - will overwrite this pbuf --/ + * + * When importing the pool, we detect this situation and use it to stop + * our scanning process: + * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the + * previous one. + * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum) + * then the pbuf is invalid and stop scanning (goto step 3 below). + * 3) if (this is the last valid pbuf) + * discard this pbuf as well (its ARC bufs may have been damaged by a + * partial overwrite). + * (We could potentially salvage the remaining good arc bufs above in step 3, + * buf the cost of doing so probably outweighs the value of the entire pbuf). + * + * There is one significant caveat to consider when rebuilding ARC contents + * from an L2ARC device: what about invalidated buffers? Given the above + * construction, we cannot update pbufs which we've already written to amend + * them to remove buffers which were invalidated. Thus, during reconstruction, + * we might be populating the cache with buffers for data that's not on the + * main pool anymore, or may have been overwritten! + * + * As it turns out, this isn't a problem. Every arc_read request includes + * both the DVA and, crucially, the birth TXG of the BP the caller is + * looking for. 
So even if the cache were populated by completely rotten + * blocks for data that had long since been deleted and/or overwritten, we'll + * never actually return bad data from the cache, since the DVA together + * with the birth TXG uniquely identifies a block in space and time - once + * created, a block is immutable on disk. The worst we will have done is + * waste some time and memory during l2arc rebuild reconstructing outdated ARC + * entries that will get dropped from the l2arc as it is being updated + * with new blocks. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) {
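The scan-termination rule in step 2 above rests entirely on the back-reference checksum: a previous pbuf is trusted only if the fletcher-4 of its raw bytes matches the prev_pbuf_cksum recorded in the pbuf that points to it. Below is a simplified, self-contained sketch of that check, assuming fletcher-4 here means the usual four-accumulator sum over native-order 32-bit words; the type and function names are illustrative, not the kernel's.

#include <stdint.h>
#include <string.h>

typedef struct cksum4 {
	uint64_t c[4];
} cksum4_t;

/*
 * Fletcher-4 over native-order 32-bit words, as used for pbuf and
 * uberblock checksums (simplified; assumes size is a multiple of 4).
 */
static void
fletcher4(const void *buf, size_t size, cksum4_t *ck)
{
	const uint32_t *ip = buf;
	const uint32_t *end = ip + size / sizeof (uint32_t);
	uint64_t a = 0, b = 0, c = 0, d = 0;

	for (; ip < end; ip++) {
		a += *ip;
		b += a;
		c += b;
		d += c;
	}
	ck->c[0] = a; ck->c[1] = b; ck->c[2] = c; ck->c[3] = d;
}

/*
 * Step 2 of the scan-termination rule: accept `prev_pbuf' (asize bytes,
 * as read back from the device) only if its fletcher-4 matches the
 * back-reference checksum stored in the pbuf that pointed at it.
 * Returns 1 to keep walking the chain, 0 to stop the rebuild here.
 */
static int
prev_pbuf_is_valid(const void *prev_pbuf, size_t asize,
    const cksum4_t *stored_prev_cksum)
{
	cksum4_t actual;

	fletcher4(prev_pbuf, asize, &actual);
	return (memcmp(&actual, stored_prev_cksum, sizeof (actual)) == 0);
}

int
main(void)
{
	uint32_t fake_pbuf[16] = { 0xdb0faba6, 1, 2, 3 };	/* dummy bytes */
	cksum4_t ck;

	fletcher4(fake_pbuf, sizeof (fake_pbuf), &ck);
	return (!prev_pbuf_is_valid(fake_pbuf, sizeof (fake_pbuf), &ck));
}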
*** 4037,4049 **** return (next); } static void ! l2arc_hdr_stat_add(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) --- 4473,4486 ---- return (next); } static void ! l2arc_hdr_stat_add(boolean_t from_arc) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + if (from_arc) ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void)
*** 4074,4084 **** goto out; first = NULL; next = l2arc_dev_last; do { ! /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) --- 4511,4524 ---- goto out; first = NULL; next = l2arc_dev_last; do { ! /* ! * Loop around the list looking for a non-faulted vdev ! * and one that isn't currently doing an L2ARC rebuild. ! */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL)
*** 4089,4102 **** if (first == NULL) first = next; else if (next == first) break; ! } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ ! if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: --- 4529,4542 ---- if (first == NULL) first = next; else if (next == first) break; ! } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding); /* if we were unable to find any usable vdevs, return NULL */ ! if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding) next = NULL; l2arc_dev_last = next; out:
*** 4170,4180 **** --- 4610,4627 ---- /* * All writes completed, or an error was hit. */ for (ab = list_prev(buflist, head); ab; ab = ab_prev) { ab_prev = list_prev(buflist, ab); + abl2 = ab->b_l2hdr; + /* + * Release the temporary compressed buffer as soon as possible. + */ + if (abl2->b_compress != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(ab); + hash_lock = HDR_LOCK(ab); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage * of eviction. Its ARC_L2_WRITING flag will be
*** 4182,4199 **** */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } - abl2 = ab->b_l2hdr; - - /* - * Release the temporary compressed buffer as soon as possible. - */ - if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(ab); - if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, ab); --- 4629,4638 ----
*** 4216,4225 **** --- 4655,4668 ---- kmem_cache_free(hdr_cache, head); mutex_exit(&l2arc_buflist_mtx); l2arc_do_free_on_write(); + if (cb->l2wcb_pbuf) + kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size); + if (cb->l2wcb_ub_buf) + kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before
*** 4497,4512 **** --- 4940,4961 ---- l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); const boolean_t do_headroom_boost = *headroom_boost; + /* persistency-related */ + l2pbuf_t *pb; + l2pbuf_buflist_t *pb_buflist; + int num_bufs, buf_index; + ASSERT(dev->l2ad_vdev != NULL); /* Lower the flag now, we might want to raise it again later. */ *headroom_boost = B_FALSE; pio = NULL; + cb = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD;
*** 4514,4524 **** --- 4963,4982 ---- * We will want to try to compress buffers that are at least 2x the * device sector size. */ buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + pb = &dev->l2ad_pbuf; + num_bufs = 0; + /* + * We will want to try to compress buffers that are at least 2x the + * device sector size. + */ + buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + + /* * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); for (int try = 0; try <= 3; try++) { uint64_t passed_sz = 0;
*** 4584,4594 **** * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(dev->l2ad_buflist, head); ! cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); --- 5042,5052 ---- * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(dev->l2ad_buflist, head); ! cb = kmem_zalloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL);
*** 4626,4635 **** --- 5084,5094 ---- arc_cksum_compute(ab->b_buf, B_TRUE); mutex_exit(hash_lock); write_sz += buf_sz; + num_bufs++; } mutex_exit(list_lock); if (full == B_TRUE)
*** 4642,4658 **** mutex_exit(&l2arc_buflist_mtx); kmem_cache_free(hdr_cache, head); return (0); } /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector * loop above. */ ! for (ab = list_prev(dev->l2ad_buflist, head); ab; ! ab = list_prev(dev->l2ad_buflist, ab)) { l2arc_buf_hdr_t *l2hdr; uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged --- 5101,5120 ---- mutex_exit(&l2arc_buflist_mtx); kmem_cache_free(hdr_cache, head); return (0); } + /* expand the pbuf to include a new list */ + pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs); + /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector * loop above. */ ! for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab; ! ab = list_prev(dev->l2ad_buflist, ab), buf_index++) { l2arc_buf_hdr_t *l2hdr; uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged
*** 4700,4720 **** */ buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); write_psize += buf_p_sz; dev->l2ad_hand += buf_p_sz; } - } mutex_exit(&l2arc_buflist_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, write_asize); vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { --- 5162,5192 ---- */ buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); write_psize += buf_p_sz; dev->l2ad_hand += buf_p_sz; } + l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index); + } + ASSERT(buf_index == num_bufs); mutex_exit(&l2arc_buflist_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, write_asize); vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); + /* Is it time to commit this pbuf? */ + if (L2PBUF_IS_FULL(pb) && + dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) { + l2arc_pbuf_commit(dev, pio, cb); + l2arc_pbuf_destroy(pb); + l2arc_pbuf_init(pb); + } + /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
*** 4992,5005 **** return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already ! * validated the vdev and opened it. */ void ! l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); --- 5464,5478 ---- return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already ! * validated the vdev and opened it. The `rebuild' flag indicates whether ! * we should attempt an L2ARC persistency rebuild. */ void ! l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd));
*** 5007,5022 **** * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; ! adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; /* * This is a list of all ARC buffers that are still valid on the * device. */ --- 5480,5496 ---- * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; ! adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + l2arc_pbuf_init(&adddev->l2ad_pbuf); /* * This is a list of all ARC buffers that are still valid on the * device. */
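Note that l2ad_start now skips both the vdev labels and the new 4k uberblock area, while l2ad_end is unchanged. The earlier tunables comment expects roughly l2arc_dev_size / l2arc_pbuf_max_sz pbufs per device; the sketch below just runs those numbers, assuming the conventional 4 MB VDEV_LABEL_START_SIZE and a hypothetical 200 GB cache device (both figures are assumptions for illustration only).

#include <stdio.h>
#include <stdint.h>

#define	VDEV_LABEL_START_SIZE	(4ULL << 20)	/* front labels + boot area */
#define	L2UBERBLOCK_SIZE	4096
#define	L2PBUF_MAXSZ		(100ULL << 20)	/* l2arc_pbuf_max_sz default */

int
main(void)
{
	/* hypothetical 200 GB cache device (treated as its min_asize) */
	uint64_t dev_asize = 200ULL << 30;
	uint64_t start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
	uint64_t end = VDEV_LABEL_START_SIZE + dev_asize;
	uint64_t usable = end - start;

	(void) printf("l2ad_start = %llu, l2ad_end = %llu\n",
	    (unsigned long long)start, (unsigned long long)end);
	(void) printf("usable = %llu bytes, ~%llu pbufs expected\n",
	    (unsigned long long)usable,
	    (unsigned long long)(usable / L2PBUF_MAXSZ));
	return (0);
}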
*** 5030,5039 **** --- 5504,5518 ---- * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); + if (rebuild && l2arc_rebuild_enabled) { + adddev->l2ad_rebuilding = B_TRUE; + (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev, + 0, &p0, TS_RUN, minclsyspri); + } mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC.
*** 5065,5074 **** --- 5544,5554 ---- mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ + l2arc_pbuf_destroy(&remdev->l2ad_pbuf); l2arc_evict(remdev, 0, B_TRUE); list_destroy(remdev->l2ad_buflist); kmem_free(remdev->l2ad_buflist, sizeof (list_t)); kmem_free(remdev, sizeof (l2arc_dev_t)); }
*** 5136,5141 **** --- 5616,6768 ---- cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); + } + + /* + * Main entry point for L2ARC metadata rebuilding. This function must be + * called via thread_create so that the L2ARC metadata rebuild doesn't block + * pool import and may proceed in parallel on all available L2ARC devices. + */ + static void + l2arc_rebuild_start(l2arc_dev_t *dev) + { + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = dev->l2ad_spa; + + /* Lock out device removal. */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + ASSERT(dev->l2ad_rebuilding == B_TRUE); + l2arc_rebuild(dev); + dev->l2ad_rebuilding = B_FALSE; + spa_config_exit(spa, SCL_L2ARC, vd); + thread_exit(); + } + + /* + * This function implements the actual L2ARC metadata rebuild. It: + * + * 1) scans the device for valid l2uberblocks + * 2) if it finds a good uberblock, starts reading the pbuf chain + * 3) restores each pbuf's contents to memory + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the pbuf chain (the previous-buffer reference + * in the pbuf is zero). + * 2) We encounter *any* error condition (cksum errors, io errors, looped + * pbufs, etc.). + * 3) The l2arc_rebuild_timeout is hit - this is a final resort to protect + * from making severely fragmented L2ARC pbufs or slow L2ARC devices + * prevent a machine from importing the pool (and letting the + * administrator take corrective action, e.g. by kicking the misbehaving + * L2ARC device out of the pool, or by reimporting the pool with L2ARC + * rebuilding disabled). + */ + static void + l2arc_rebuild(l2arc_dev_t *dev) + { + int err; + l2uberblock_t ub; + l2pbuf_t pb; + zio_t *this_io = NULL, *next_io = NULL; + int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout; + + if ((err = l2arc_uberblock_find(dev, &ub)) != 0) + return; + L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */); + + /* set up uberblock update info */ + dev->l2ad_uberblock_birth = ub.ub_birth + 1; + + /* initial sanity checks */ + l2arc_pbuf_init(&pb); + if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize, + ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) { + /* root pbuf is bad, we can't do anything about that */ + if (err == EINVAL) { + ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors); + } else { + ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors); + } + l2arc_pbuf_destroy(&pb); + return; + } + L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb)); + + dev->l2ad_evict = ub.ub_evict_tail; + + /* keep on chaining in new blocks */ + dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr; + dev->l2ad_pbuf_asize = ub.ub_pbuf_asize; + dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum; + dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev, + ub.ub_pbuf_daddr + ub.ub_pbuf_asize); + dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0); + + /* start the rebuild process */ + for (;;) { + l2pbuf_t pb_prev; + + l2arc_pbuf_init(&pb_prev); + if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr, + pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io, + &next_io)) != 0) { + /* + * We are done reading, discard the last good buffer. 
+ */ + if (pb.pb_prev_daddr > dev->l2ad_hand && + pb.pb_prev_asize > L2PBUF_HDR_SIZE) { + /* this is an error, we stopped too early */ + if (err == EINVAL) { + ARCSTAT_BUMP( + arcstat_l2_rebuild_cksum_errors); + } else { + ARCSTAT_BUMP( + arcstat_l2_rebuild_io_errors); + } + } + l2arc_pbuf_destroy(&pb_prev); + l2arc_pbuf_destroy(&pb); + break; + } + + /* + * Protection against infinite loops of pbufs. This is also + * our primary termination mechanism - once the buffer list + * loops around our starting pbuf, we can stop. + */ + if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr && + pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) { + ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors); + l2arc_pbuf_destroy(&pb); + l2arc_pbuf_destroy(&pb_prev); + if (next_io) + l2arc_pbuf_prefetch_abort(next_io); + return; + } + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to reconstruct the L2ARC. At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata pbufs, so the user may choose to re-add the + * L2ARC dev at a later time to reconstruct it (when there's + * less memory pressure). + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + l2arc_pbuf_destroy(&pb); + l2arc_pbuf_destroy(&pb_prev); + if (next_io) + l2arc_pbuf_prefetch_abort(next_io); + break; + } + + /* + * Now that we know that the prev_pbuf checks out alright, we + * can start reconstruction from this pbuf - we can be sure + * that the L2ARC write hand has not yet reached any of our + * buffers. + */ + l2arc_pbuf_restore(dev, &pb); + + /* pbuf restored, continue with next one in the list */ + l2arc_pbuf_destroy(&pb); + pb = pb_prev; + this_io = next_io; + next_io = NULL; + + L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb)); + } + + ARCSTAT_BUMP(arcstat_l2_rebuild_successes); + } + + /* + * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries + * which only contain an l2arc hdr, essentially restoring the buffers to + * their L2ARC evicted state. This function also updates space usage on the + * L2ARC vdev to make sure it tracks restored buffers. + */ + static void + l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb) + { + spa_t *spa; + uint64_t guid; + list_t *buflists_list; + l2pbuf_buflist_t *buflist; + + mutex_enter(&l2arc_buflist_mtx); + spa = dev->l2ad_vdev->vdev_spa; + guid = spa_load_guid(spa); + buflists_list = pb->pb_buflists_list; + for (buflist = list_head(buflists_list); buflist; + buflist = list_next(buflists_list, buflist)) { + int i; + uint64_t size, asize, psize; + + size = asize = psize = 0; + for (i = 0; i < buflist->l2pbl_nbufs; i++) { + l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev, + guid); + size += buflist->l2pbl_bufs[i].b_size; + asize += buflist->l2pbl_bufs[i].b_l2asize; + psize += vdev_psize_to_asize(dev->l2ad_vdev, + buflist->l2pbl_bufs[i].b_l2asize); + } + ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size); + ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs); + vdev_space_update(dev->l2ad_vdev, psize, 0, 0); + } + mutex_exit(&l2arc_buflist_mtx); + ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs); + vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev, + pb->pb_asize), 0, 0); + } + + /* + * Restores a single ARC buf hdr from a pbuf. 
The ARC buffer is put into + * a state indicating that it has been evicted to L2ARC. + * The `guid' here is the ARC-load-guid from spa_load_guid. + */ + static void + l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid) + { + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]}; + + hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock); + if (hdr == NULL) { + /* not in cache, try to insert */ + arc_buf_hdr_t *exists; + arc_buf_contents_t type = buf->b_contents_type; + l2arc_buf_hdr_t *l2hdr; + + hdr = arc_buf_hdr_alloc(guid, buf->b_size, type); + hdr->b_dva = buf->b_dva; + hdr->b_birth = buf->b_birth; + hdr->b_cksum0 = buf->b_cksum0; + hdr->b_size = buf->b_size; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + arc_hdr_destroy(hdr); + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + return; + } + hdr->b_flags = buf->b_flags; + mutex_enter(&hdr->b_freeze_lock); + ASSERT(hdr->b_freeze_cksum == NULL); + hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + *hdr->b_freeze_cksum = buf->b_freeze_cksum; + mutex_exit(&hdr->b_freeze_lock); + + /* now rebuild the l2arc entry */ + ASSERT(hdr->b_l2hdr == NULL); + l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + l2hdr->b_dev = dev; + l2hdr->b_daddr = buf->b_l2daddr; + l2hdr->b_asize = buf->b_l2asize; + l2hdr->b_compress = buf->b_l2compress; + hdr->b_l2hdr = l2hdr; + list_insert_head(dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize); + + arc_change_state(arc_l2c_only, hdr, hash_lock); + } + mutex_exit(hash_lock); + } + + /* + * Attempts to locate and read the newest valid uberblock on the provided + * L2ARC device and writes it to `ub'. On success, this function returns 0, + * otherwise the appropriate error code is returned. + */ + static int + l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub) + { + int err = 0; + uint8_t *ub_buf; + uint64_t guid; + + ARCSTAT_BUMP(arcstat_l2_rebuild_attempts); + ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP); + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf, + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors); + goto cleanup; + } + + /* + * Initial peek - does the device even have any usable uberblocks? + * If not, don't bother continuing. + */ + l2arc_uberblock_decode(ub_buf, ub); + if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 || + ub->ub_version > L2UBERBLOCK_MAX_VERSION || + ub->ub_spa_guid != guid) { + err = ENOTSUP; + ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported); + goto cleanup; + } + + /* now check to make sure that what we selected is okay */ + if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) { + if (err == EINVAL) { + ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors); + } else { + ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors); + } + goto cleanup; + } + + /* this uberblock is valid */ + + cleanup: + kmem_free(ub_buf, L2UBERBLOCK_SIZE); + return (err); + } + + /* + * Reads a pbuf from storage, decodes it and validates its contents against + * the provided checksum. The result is placed in `pb'. 
+ * + * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching. + * When issuing the first pbuf IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the pbuf and + * also issue an async IO to fetch the next pbuf in the pbuf chain. The + * prefetch IO is returned in `prefetch_io. On subsequent calls to this + * function, pass the value returned in `prefetch_io' from the previous + * call as `this_io' and a fresh `prefetch_io' pointer to hold the next + * prefetch IO. Prior to the call, you should initialize your `prefetch_io' + * pointer to be NULL. If no prefetch IO was issued, the pointer is left + * set at NULL. + * + * Actual prefetching takes place in two steps: a header IO (pi_hdr_io) + * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io + * IO is used internally in this function to be able to `peek' at the next + * buffer's header before the main IO to read it in completely has finished. + * We can then begin to issue the IO for the next buffer in the chain before + * we are done reading, keeping the L2ARC device's pipeline saturated with + * reads (rather than issuing an IO, waiting for it to complete, validating + * the returned buffer and issuing the next one). This will make sure that + * the rebuild proceeds at maximum read throughput. + * + * On success, this function returns 0, otherwise it returns an appropriate + * error code. On error the prefetching IO is aborted and cleared before + * returning from this function. Therefore, if we return `success', the + * caller can assume that we have taken care of cleanup of prefetch IOs. + */ + static int + l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize, + zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io) + { + int err = 0; + uint64_t prev_pb_start; + uint32_t prev_pb_asize; + zio_cksum_t calc_cksum, prev_pb_cksum; + l2arc_prefetch_info_t *pi = NULL; + + ASSERT(dev != NULL); + ASSERT(pb != NULL); + ASSERT(*prefetch_io == NULL); + + if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) { + /* We could not have issued a prefetch IO for this */ + ASSERT(this_io == NULL); + return (EINVAL); + } + + /* + * Check to see if we have issued the IO for this pbuf in a previous + * run. If not, issue it now. + */ + if (this_io == NULL) + this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize); + + /* Pick up the prefetch info buffer and read its contents */ + pi = this_io->io_private; + ASSERT(pi != NULL); + ASSERT(asize <= pi->pi_buflen); + + /* Wait for the IO to read this pbuf's header to complete */ + if ((err = zio_wait(pi->pi_hdr_io)) != 0) { + (void) zio_wait(this_io); + goto cleanup; + } + + /* + * Peek to see if we can start issuing the next pbuf IO immediately. + * At this point, only the current pbuf's header has been read. 
+ */ + if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start, + &prev_pb_asize, &prev_pb_cksum) == 0) { + uint64_t this_pb_start, this_pb_end, prev_pb_end; + /* Detect malformed pbuf references and loops */ + this_pb_start = daddr; + this_pb_end = daddr + asize; + prev_pb_end = prev_pb_start + prev_pb_asize; + if ((prev_pb_start >= this_pb_start && prev_pb_start < + this_pb_end) || + (prev_pb_end >= this_pb_start && prev_pb_end < + this_pb_end)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors); + cmn_err(CE_WARN, "Looping L2ARC metadata reference " + "detected, aborting rebuild."); + err = EINVAL; + goto cleanup; + } + /* + * Start issuing IO for the next pbuf early - this should + * help keep the L2ARC device busy while we read, decode + * and restore this pbuf. + */ + if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize)) + *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, + prev_pb_start, prev_pb_asize); + } + + /* Wait for the main pbuf IO to complete */ + if ((err = zio_wait(this_io)) != 0) + goto cleanup; + + /* Make sure the buffer checks out ok */ + fletcher_4_native(pi->pi_buf, asize, &calc_cksum); + if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) { + err = EINVAL; + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0) + goto cleanup; + + /* This will be used in l2arc_pbuf_restore for space accounting */ + pb->pb_asize = asize; + + ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb)); + ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize); + ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio, + pb->pb_payload_asz / asize); + + cleanup: + kmem_free(pi->pi_buf, pi->pi_buflen); + pi->pi_buf = NULL; + kmem_free(pi, sizeof (l2arc_prefetch_info_t)); + /* Abort an in-flight prefetch in case of error */ + if (err != 0 && *prefetch_io != NULL) { + l2arc_pbuf_prefetch_abort(*prefetch_io); + *prefetch_io = NULL; + } + return (err); + } + + /* + * Validates a pbuf device address to make sure that it can be read + * from the provided L2ARC device. Returns 1 if the address is within + * the device's bounds, or 0 if not. + */ + static int + l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize) + { + uint32_t psize; + uint64_t end; + + psize = vdev_psize_to_asize(dev->l2ad_vdev, asize); + end = daddr + psize; + + if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE || + asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start || + /* check that the buffer address is correctly aligned */ + (daddr & (vdev_psize_to_asize(dev->l2ad_vdev, + SPA_MINBLOCKSIZE) - 1)) != 0) + return (0); + else + return (1); + } + + /* + * Starts an asynchronous read IO to read a pbuf. This is used in pbuf + * reconstruction to start reading the next pbuf before we are done + * decoding and reconstructing the current pbuf, to keep the l2arc device + * nice and hot with read IO to process. + * The returned zio will contain a newly allocated memory buffers for the IO + * data which should then be freed by the caller once the zio is no longer + * needed (i.e. due to it having completed). If you wish to abort this + * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care + * of disposing of the allocated buffers correctly. 
+ */ + static zio_t * + l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize) + { + uint32_t i, psize; + zio_t *pio, *hdr_io; + uint64_t hdr_rsize; + uint8_t *buf; + l2arc_prefetch_info_t *pinfo; + + psize = vdev_psize_to_asize(vd, asize); + buf = kmem_alloc(psize, KM_SLEEP); + pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP); + pinfo->pi_buf = buf; + pinfo->pi_buflen = psize; + + /* + * We start issuing the IO for the pbuf header early. This + * allows l2arc_pbuf_read to start issuing IO for the next + * buffer before the current pbuf is read in completely. + */ + + hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE); + ASSERT(hdr_rsize <= psize); + pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); + hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf, + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + (void) zio_nowait(hdr_io); + + /* + * Read in the rest of the pbuf - this can take longer than just + * having a peek at the header. + */ + pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + for (i = hdr_rsize; i < psize; ) { + uint64_t rsize = psize - i; + zio_t *rzio; + + if (psize - i > SPA_MAXBLOCKSIZE) + rsize = SPA_MAXBLOCKSIZE; + ASSERT(rsize >= SPA_MINBLOCKSIZE); + rzio = zio_read_phys(pio, vd, daddr + i, + rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + (void) zio_nowait(rzio); + i += rsize; + } + + return (pio); + } + + /* + * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data + * buffers allocated for it. + */ + static void + l2arc_pbuf_prefetch_abort(zio_t *zio) + { + l2arc_prefetch_info_t *pi; + + pi = zio->io_private; + ASSERT(pi != NULL); + if (pi->pi_hdr_io != NULL) + (void) zio_wait(pi->pi_hdr_io); + (void) zio_wait(zio); + kmem_free(pi->pi_buf, pi->pi_buflen); + pi->pi_buf = NULL; + kmem_free(pi, sizeof (l2arc_prefetch_info_t)); + } + + /* + * Encodes an l2uberblock_t structure into a destination buffer. This + * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting + * uberblock is always of this constant size. + */ + static void + l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf) + { + zio_cksum_t cksum; + + bzero(buf, L2UBERBLOCK_SIZE); + + #if defined(_BIG_ENDIAN) + *(uint32_t *)buf = L2UBERBLOCK_MAGIC; + *(uint16_t *)(buf + 6) = L2UB_BIG_ENDIAN; + #else /* !defined(_BIG_ENDIAN) */ + *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC); + /* zero flags is ok */ + #endif /* !defined(_BIG_ENDIAN) */ + buf[4] = L2UBERBLOCK_MAX_VERSION; + + /* rest in native byte order */ + *(uint64_t *)(buf + 8) = ub->ub_spa_guid; + *(uint64_t *)(buf + 16) = ub->ub_birth; + *(uint64_t *)(buf + 24) = ub->ub_evict_tail; + *(uint64_t *)(buf + 32) = ub->ub_alloc_space; + *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr; + *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize; + bcopy(&ub->ub_pbuf_cksum, buf + 52, 32); + + fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum); + bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32); + } + + /* + * Decodes an l2uberblock_t from an on-disk representation. 
Please note + * that this function does not perform any uberblock validation and + * checksumming - call l2arc_uberblock_verify() for that. + */ + static void + l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub) + { + boolean_t bswap_needed; + + /* these always come in big endian */ + #if defined(_BIG_ENDIAN) + ub->ub_magic = *(uint32_t *)buf; + ub->ub_flags = *(uint16_t *)(buf + 6); + bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 1); + #else /* !defined(_BIG_ENDIAN) */ + ub->ub_magic = BSWAP_32(*(uint32_t *)buf); + ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6)); + bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0); + #endif /* !defined(_BIG_ENDIAN) */ + ub->ub_version = buf[4]; + + ub->ub_spa_guid = *(uint64_t *)(buf + 8); + ub->ub_birth = *(uint64_t *)(buf + 16); + ub->ub_evict_tail = *(uint64_t *)(buf + 24); + ub->ub_alloc_space = *(uint64_t *)(buf + 32); + ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40); + ub->ub_pbuf_asize = *(uint32_t *)(buf + 48); + bcopy(buf + 52, &ub->ub_pbuf_cksum, 36); + bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32); + + /* swap the rest if endianness doesn't match us */ + if (bswap_needed) { + ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid); + ub->ub_birth = BSWAP_64(ub->ub_birth); + ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail); + ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space); + ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr); + ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize); + ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum); + ZIO_CHECKSUM_BSWAP(&ub->ub_cksum); + } + } + + /* + * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is + * valid and matches its checksum. + */ + static int + l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub, + uint64_t guid) + { + zio_cksum_t cksum; + + if (ub->ub_magic != L2UBERBLOCK_MAGIC || + ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION) + /* + * bad magic or invalid version => persistent l2arc not + * supported + */ + return (ENOTSUP); + + if (ub->ub_spa_guid != guid) + /* this l2arc dev isn't ours */ + return (EINVAL); + + fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum)) + /* bad checksum, corrupt uberblock */ + return (EINVAL); + + return (0); + } + + /* + * Schedules a zio to update the uberblock on an l2arc device. The zio is + * initiated as a child of `pio' and `cb' is filled with the information + * needed to free the uberblock data buffer after writing. 
+ */ + static void + l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) + { + uint8_t *ub_buf; + l2uberblock_t ub; + zio_t *wzio; + vdev_stat_t st; + + ASSERT(cb->l2wcb_ub_buf == NULL); + vdev_get_stats(dev->l2ad_vdev, &st); + + bzero(&ub, sizeof (ub)); + ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + ub.ub_birth = dev->l2ad_uberblock_birth++; + ub.ub_evict_tail = dev->l2ad_evict; + ub.ub_alloc_space = st.vs_alloc; + ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr; + ub.ub_pbuf_asize = dev->l2ad_pbuf_asize; + ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum; + if (dev->l2ad_first) + ub.ub_flags |= L2UBLK_EVICT_FIRST; + + ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP); + cb->l2wcb_ub_buf = ub_buf; + l2arc_uberblock_encode(&ub, ub_buf); + wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, + L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + } + + /* + * Encodes a l2pbuf_t structure into the portable on-disk format. The + * `buf' buffer must be suitably sized to hold the entire uncompressed + * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function + * also compresses the buffer. + * + * The return value is the length of the resulting encoded pbuf structure. + * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression + * was applied, or smaller if compression was applied. In either case, + * prior to writing to disk, the caller must suitably pad the output + * buffer so that it is aligned on a multiple of the underlying storage + * system's block size. + */ + static uint32_t + l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen) + { + uint16_t flags = 0; + uint8_t *dst_buf; + uint32_t enclen; + l2pbuf_buflist_t *buflist; + + enclen = L2PBUF_ENCODED_SIZE(pb); + ASSERT(buflen >= enclen); + bzero(buf, enclen); + + /* non-header portions of pbufs are in native byte order */ + *(uint64_t *)(buf + 8) = pb->pb_prev_daddr; + *(uint32_t *)(buf + 16) = pb->pb_prev_asize; + bcopy(&pb->pb_prev_cksum, buf + 20, 32); + *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE; + + /* first we encode the buflists uncompressed */ + dst_buf = buf + L2PBUF_HDR_SIZE; + for (buflist = list_head(pb->pb_buflists_list); buflist; + buflist = list_next(pb->pb_buflists_list, buflist)) { + int i; + + ASSERT(buflist->l2pbl_nbufs != 0); + for (i = 0; i < buflist->l2pbl_nbufs; i++) { + l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i]; + + ASSERT(pbl_buf->b_size != 0); + *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0]; + *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1]; + *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth; + *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0; + bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32); + *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size; + *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr; + *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize; + dst_buf[80] = pbl_buf->b_l2compress; + dst_buf[81] = pbl_buf->b_contents_type; + *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags; + dst_buf += L2PBUF_BUF_SIZE; + } + } + ASSERT((uint32_t)(dst_buf - buf) == enclen); + + /* and then compress them if necessary */ + if (enclen >= l2arc_pbuf_compress_minsz) { + uint8_t *cbuf; + size_t slen, clen; + + slen = l2arc_pbuf_items_encoded_size(pb); + cbuf = kmem_alloc(slen, KM_SLEEP); + clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0); + ASSERT(clen != 0); + if (clen < 
+ bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
+ flags |= L2PBUF_COMPRESSED;
+ /* zero out the rest of the input buffer */
+ bzero(buf + L2PBUF_HDR_SIZE + clen,
+ buflen - (L2PBUF_HDR_SIZE + clen));
+ /* adjust our buffer length now that it's shortened */
+ enclen = L2PBUF_HDR_SIZE + clen;
+ }
+ kmem_free(cbuf, slen);
+ }
+
+ /* the header goes last since `flags' may change due to compression */
+ #if defined(_BIG_ENDIAN)
+ *(uint32_t *)buf = L2PBUF_MAGIC;
+ flags |= L2PBUF_BIG_ENDIAN;
+ *(uint16_t *)(buf + 6) = flags;
+ #else /* !defined(_BIG_ENDIAN) */
+ *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
+ *(uint16_t *)(buf + 6) = BSWAP_16(flags);
+ #endif /* !defined(_BIG_ENDIAN) */
+ buf[4] = L2PBUF_MAX_VERSION;
+
+ return (enclen);
+ }
+
+ /*
+ * Decodes a stored l2pbuf_t structure previously encoded using
+ * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
+ * must be initialized by l2arc_pbuf_init by the caller beforehand, but
+ * must not have been used to store any buffers yet.
+ *
+ * Please note that we don't do checksum verification here, as we don't
+ * know our own checksum (that's known by the previous block in the linked
+ * list, or by the uberblock). This should be performed by the caller
+ * prior to calling l2arc_pbuf_decode.
+ */
+ static int
+ l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
+ {
+ boolean_t bswap_needed;
+ uint32_t payload_sz, payload_asz;
+ uint8_t *src_bufs;
+ l2pbuf_buflist_t *buflist;
+ int i, nbufs;
+
+ ASSERT(input_buf != NULL);
+ ASSERT(pb != NULL);
+ ASSERT(pb->pb_version != 0);
+ ASSERT(pb->pb_nbuflists == 0);
+
+ /* no valid buffer can be this small */
+ if (buflen < L2PBUF_HDR_SIZE)
+ return (EINVAL);
+
+ /* these always come in big endian */
+ #if defined(_BIG_ENDIAN)
+ pb->pb_magic = *(uint32_t *)input_buf;
+ pb->pb_flags = *(uint16_t *)(input_buf + 6);
+ bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
+ #else /* !defined(_BIG_ENDIAN) */
+ pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
+ pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
+ bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
+ #endif /* !defined(_BIG_ENDIAN) */
+ pb->pb_version = input_buf[4];
+
+ if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
+ return (EINVAL);
+ if (pb->pb_version > L2PBUF_MAX_VERSION)
+ return (ENOTSUP);
+
+ /* remainder of pbuf may need bswap'ping */
+ pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
+ pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
+ bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
+ payload_sz = *(uint32_t *)(input_buf + 52);
+ payload_asz = buflen - L2PBUF_HDR_SIZE;
+
+ if (bswap_needed) {
+ pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
+ pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
+ ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
+ payload_sz = BSWAP_32(payload_sz);
+ }
+
+ /* check for sensible buffer allocation limits */
+ if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
+ (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
+ (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
+ return (EINVAL);
+ nbufs = payload_sz / L2PBUF_BUF_SIZE;
+
+ /* decompression might be needed */
+ if (pb->pb_flags & L2PBUF_COMPRESSED) {
+ src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
+ if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
+ payload_asz, payload_sz, 0) != 0) {
+ kmem_free(src_bufs, payload_sz);
+ return (EINVAL);
+ }
+ } else {
+ src_bufs = input_buf + L2PBUF_HDR_SIZE;
+ }
+
+ /* Decode individual pbuf items from our source buffer. */
+ buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
+ for (i = 0; i < nbufs; i++) {
+ l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
+ const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
+
+ pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
+ pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
+ pbl_buf->b_birth = *(uint64_t *)(src + 16);
+ pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
+ bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
+ pbl_buf->b_size = *(uint32_t *)(src + 64);
+ pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
+ pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
+ pbl_buf->b_l2compress = src[80];
+ pbl_buf->b_contents_type = src[81];
+ pbl_buf->b_flags = *(uint32_t *)(src + 84);
+
+ if (bswap_needed) {
+ pbl_buf->b_dva.dva_word[0] =
+ BSWAP_64(pbl_buf->b_dva.dva_word[0]);
+ pbl_buf->b_dva.dva_word[1] =
+ BSWAP_64(pbl_buf->b_dva.dva_word[1]);
+ pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
+ pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
+ ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
+ pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
+ pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
+ pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
+ pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
+ }
+
+ pb->pb_payload_asz += pbl_buf->b_l2asize;
+ }
+
+ if (pb->pb_flags & L2PBUF_COMPRESSED)
+ kmem_free(src_bufs, payload_sz);
+
+ return (0);
+ }
+
+ /*
+ * Decodes the previous buffer pointer encoded in a pbuf. This is used
+ * during L2ARC reconstruction to "peek" at the next buffer and start
+ * issuing IO to fetch it early, before decoding of the current buffer
+ * is done (which can take time due to decompression).
+ * Returns 0 on success (and fills in the return parameters `daddr',
+ * `asize' and `cksum' with the info of the previous pbuf), and an errno
+ * on error.
+ */
+ static int
+ l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
+ uint32_t *asize, zio_cksum_t *cksum)
+ {
+ boolean_t bswap_needed;
+ uint16_t version, flags;
+ uint32_t magic;
+
+ ASSERT(buf != NULL);
+
+ /* no valid buffer can be this small */
+ if (buflen <= L2PBUF_HDR_SIZE)
+ return (EINVAL);
+
+ /* these always come in big endian */
+ #if defined(_BIG_ENDIAN)
+ magic = *(uint32_t *)buf;
+ flags = *(uint16_t *)(buf + 6);
+ bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
+ #else /* !defined(_BIG_ENDIAN) */
+ magic = BSWAP_32(*(uint32_t *)buf);
+ flags = BSWAP_16(*(uint16_t *)(buf + 6));
+ bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
+ #endif /* !defined(_BIG_ENDIAN) */
+ version = buf[4];
+
+ if (magic != L2PBUF_MAGIC || version == 0)
+ return (EINVAL);
+ if (version > L2PBUF_MAX_VERSION)
+ return (ENOTSUP);
+
+ /* these offsets must match the header layout in l2arc_pbuf_encode */
+ *daddr = *(uint64_t *)(buf + 8);
+ *asize = *(uint32_t *)(buf + 16);
+ bcopy(buf + 20, cksum, 32);
+
+ if (bswap_needed) {
+ *daddr = BSWAP_64(*daddr);
+ *asize = BSWAP_32(*asize);
+ ZIO_CHECKSUM_BSWAP(cksum);
+ }
+
+ return (0);
+ }
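
The encode/decode pair above implies a fixed pbuf header layout: magic at offset 0, version at 4, flags at 6, the previous pbuf's device address at 8, its allocated size (a 32-bit field) at 16, its checksum at 20 and the payload size at 52, which suggests L2PBUF_HDR_SIZE is 56 bytes. The stand-alone sketch below is illustrative only and not part of the patch: all names and the magic value are hypothetical, it keeps every field in host byte order (the kernel code stores the magic and flags big-endian on disk), and it uses memcpy rather than pointer casts to sidestep alignment concerns. It simply round-trips the header offsets so the layout is easy to eyeball.

#include <stdint.h>
#include <string.h>
#include <assert.h>

#define HDR_SIZE  56          /* assumed value of L2PBUF_HDR_SIZE */
#define HDR_MAGIC 0x2badcafe  /* placeholder; not the real L2PBUF_MAGIC */

struct hdr_fields {
        uint8_t  version;
        uint16_t flags;
        uint64_t prev_daddr;
        uint32_t prev_asize;
        uint8_t  prev_cksum[32];
        uint32_t payload_sz;
};

static void
hdr_encode(const struct hdr_fields *h, uint8_t *buf)
{
        uint32_t magic = HDR_MAGIC;

        memset(buf, 0, HDR_SIZE);
        memcpy(buf, &magic, 4);               /* offset  0: magic */
        buf[4] = h->version;                  /* offset  4: version */
        memcpy(buf + 6, &h->flags, 2);        /* offset  6: flags */
        memcpy(buf + 8, &h->prev_daddr, 8);   /* offset  8: prev pbuf daddr */
        memcpy(buf + 16, &h->prev_asize, 4);  /* offset 16: prev pbuf asize */
        memcpy(buf + 20, h->prev_cksum, 32);  /* offset 20: prev pbuf cksum */
        memcpy(buf + 52, &h->payload_sz, 4);  /* offset 52: payload size */
}

static void
hdr_decode(const uint8_t *buf, struct hdr_fields *h)
{
        memset(h, 0, sizeof (*h));
        h->version = buf[4];
        memcpy(&h->flags, buf + 6, 2);
        memcpy(&h->prev_daddr, buf + 8, 8);
        memcpy(&h->prev_asize, buf + 16, 4);
        memcpy(h->prev_cksum, buf + 20, 32);
        memcpy(&h->payload_sz, buf + 52, 4);
}

int
main(void)
{
        struct hdr_fields in, out;
        uint8_t buf[HDR_SIZE];

        memset(&in, 0, sizeof (in));
        in.version = 1;
        in.prev_daddr = 0x400000;
        in.prev_asize = 4096;
        in.prev_cksum[0] = 0xab;
        in.payload_sz = 88;      /* arbitrary example payload size */

        hdr_encode(&in, buf);
        hdr_decode(buf, &out);
        assert(memcmp(&in, &out, sizeof (in)) == 0);
        return (0);
}

Compiled and run, the program exits silently when the round trip preserves every field; it exists only to document the offset table in an executable form.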
+
+ /*
+ * Initializes a pbuf structure into a clean state. All version and flags
+ * fields are filled in as appropriate for this architecture.
+ * If the structure was used before, first call l2arc_pbuf_destroy on it,
+ * as this function assumes the structure is uninitialized.
+ */
+ static void
+ l2arc_pbuf_init(l2pbuf_t *pb)
+ {
+ bzero(pb, sizeof (l2pbuf_t));
+ pb->pb_version = L2PBUF_MAX_VERSION;
+ #if defined(_BIG_ENDIAN)
+ pb->pb_flags |= L2PBUF_BIG_ENDIAN;
+ #endif
+ pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+ list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
+ offsetof(l2pbuf_buflist_t, l2pbl_node));
+ }
+
+ /*
+ * Destroys a pbuf structure and puts it into a clean state ready to be
+ * initialized by l2arc_pbuf_init. All buflists created by
+ * l2arc_pbuf_buflist_alloc are released as well.
+ */
+ static void
+ l2arc_pbuf_destroy(l2pbuf_t *pb)
+ {
+ list_t *buflist_list = pb->pb_buflists_list;
+ l2pbuf_buflist_t *buflist;
+
+ while ((buflist = list_head(buflist_list)) != NULL) {
+ ASSERT(buflist->l2pbl_nbufs > 0);
+ kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
+ buflist->l2pbl_nbufs);
+ list_remove(buflist_list, buflist);
+ kmem_free(buflist, sizeof (l2pbuf_buflist_t));
+ }
+ pb->pb_nbuflists = 0;
+ list_destroy(pb->pb_buflists_list);
+ kmem_free(pb->pb_buflists_list, sizeof (list_t));
+ bzero(pb, sizeof (l2pbuf_t));
+ }
+
+ /*
+ * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
+ * buffers. This is used during the buffer write cycle - each cycle allocates
+ * a new buflist and fills it with buffers it writes. Then, when the pbuf
+ * reaches its buflist limit, it is committed to stable storage.
+ */
+ static l2pbuf_buflist_t *
+ l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
+ {
+ l2pbuf_buflist_t *buflist;
+
+ ASSERT(pb->pb_buflists_list != NULL);
+ buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
+ buflist->l2pbl_nbufs = nbufs;
+ buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
+ KM_SLEEP);
+ list_insert_tail(pb->pb_buflists_list, buflist);
+ pb->pb_nbuflists++;
+
+ return (buflist);
+ }
+
+ /*
+ * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `index'.
+ * The buffer being inserted must be present in L2ARC.
+ */
+ static void
+ l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
+ const arc_buf_hdr_t *ab, int index)
+ {
+ l2pbuf_buf_t *pb_buf;
+ const l2arc_buf_hdr_t *l2hdr;
+
+ l2hdr = ab->b_l2hdr;
+ ASSERT(l2hdr != NULL);
+ ASSERT(pbl->l2pbl_nbufs > index);
+
+ pb_buf = &pbl->l2pbl_bufs[index];
+ pb_buf->b_dva = ab->b_dva;
+ pb_buf->b_birth = ab->b_birth;
+ pb_buf->b_cksum0 = ab->b_cksum0;
+ pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
+ pb_buf->b_size = ab->b_size;
+ pb_buf->b_l2daddr = l2hdr->b_daddr;
+ pb_buf->b_l2asize = l2hdr->b_asize;
+ pb_buf->b_l2compress = l2hdr->b_compress;
+ pb_buf->b_contents_type = ab->b_type;
+ pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
+ pb->pb_payload_asz += l2hdr->b_asize;
+ }
+
+ /*
+ * Commits a pbuf to stable storage. This routine is invoked when writing
+ * ARC buffers to an L2ARC device. When the pbuf associated with the device
+ * has reached its limits (either in size or in number of writes), it is
+ * scheduled here for writing.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */ + static void + l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) + { + l2pbuf_t *pb = &dev->l2ad_pbuf; + uint64_t i, est_encsize, bufsize, encsize, io_size; + uint8_t *pb_buf; + + pb->pb_prev_daddr = dev->l2ad_pbuf_daddr; + pb->pb_prev_asize = dev->l2ad_pbuf_asize; + pb->pb_prev_cksum = dev->l2ad_pbuf_cksum; + + est_encsize = L2PBUF_ENCODED_SIZE(pb); + bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize); + pb_buf = kmem_zalloc(bufsize, KM_SLEEP); + encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize); + cb->l2wcb_pbuf = pb_buf; + cb->l2wcb_pbuf_size = bufsize; + + dev->l2ad_pbuf_daddr = dev->l2ad_hand; + dev->l2ad_pbuf_asize = encsize; + fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum); + + io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize); + for (i = 0; i < io_size; ) { + zio_t *wzio; + uint64_t wsize = io_size - i; + + if (wsize > SPA_MAXBLOCKSIZE) + wsize = SPA_MAXBLOCKSIZE; + ASSERT(wsize >= SPA_MINBLOCKSIZE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i, + wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + i += wsize; + } + + dev->l2ad_hand += io_size; + vdev_space_update(dev->l2ad_vdev, io_size, 0, 0); + l2arc_uberblock_update(dev, pio, cb); + + ARCSTAT_INCR(arcstat_l2_write_bytes, io_size); + ARCSTAT_BUMP(arcstat_l2_meta_writes); + ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize); + ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize); + ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio, + pb->pb_payload_asz / encsize); + } + + /* + * Returns the number of bytes occupied by the payload buffer items of + * a pbuf in portable (on-disk) encoded form, i.e. the bytes following + * L2PBUF_HDR_SIZE. + */ + static uint32_t + l2arc_pbuf_items_encoded_size(l2pbuf_t *pb) + { + uint32_t size = 0; + l2pbuf_buflist_t *buflist; + + for (buflist = list_head(pb->pb_buflists_list); buflist != NULL; + buflist = list_next(pb->pb_buflists_list, buflist)) + size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs; + + return (size); }
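
Since l2arc_pbuf_decode() deliberately leaves checksum verification to its caller, and l2arc_pbuf_commit() records each pbuf's checksum with fletcher_4_native(), a rebuild-side caller is expected to verify the raw bytes before decoding them. The sketch below is illustrative only and not part of the patch: the function name and the error handling are hypothetical, and it merely combines routines that already appear above.

static int
l2arc_pbuf_read_verify(uint8_t *pbuf_raw, uint32_t asize,
    const zio_cksum_t *expected, l2pbuf_t *pb)
{
        zio_cksum_t actual;

        /* the recorded checksum covers the encoded (possibly compressed) bytes */
        fletcher_4_native(pbuf_raw, asize, &actual);
        if (bcmp(&actual, expected, sizeof (actual)) != 0)
                return (ECKSUM);

        /* only verified bytes are handed to the decoder */
        return (l2arc_pbuf_decode(pbuf_raw, asize, pb));
}

Where the expected checksum comes from follows the linkage described in the comments above: the uberblock supplies it for the newest pbuf, and each pbuf's header supplies it for the previous pbuf in the chain.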