3525 Persistent L2ARC

*** 136,145 **** --- 136,147 ---- #include <sys/dnlc.h> #endif #include <sys/callb.h> #include <sys/kstat.h> #include <zfs_fletcher.h> + #include <sys/byteorder.h> + #include <sys/spa_impl.h> #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd;
*** 314,323 **** --- 316,340 ---- kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_compress_successes; kstat_named_t arcstat_l2_compress_zeros; kstat_named_t arcstat_l2_compress_failures; + kstat_named_t arcstat_l2_log_blk_writes; + kstat_named_t arcstat_l2_log_blk_avg_size; + kstat_named_t arcstat_l2_data_to_meta_ratio; + kstat_named_t arcstat_l2_rebuild_successes; + kstat_named_t arcstat_l2_rebuild_abort_unsupported; + kstat_named_t arcstat_l2_rebuild_abort_timeout; + kstat_named_t arcstat_l2_rebuild_abort_io_errors; + kstat_named_t arcstat_l2_rebuild_abort_cksum_errors; + kstat_named_t arcstat_l2_rebuild_abort_loop_errors; + kstat_named_t arcstat_l2_rebuild_abort_lowmem; + kstat_named_t arcstat_l2_rebuild_size; + kstat_named_t arcstat_l2_rebuild_bufs; + kstat_named_t arcstat_l2_rebuild_bufs_precached; + kstat_named_t arcstat_l2_rebuild_psize; + kstat_named_t arcstat_l2_rebuild_log_blks; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used;
*** 380,389 **** --- 397,421 ---- { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_compress_successes", KSTAT_DATA_UINT64 }, { "l2_compress_zeros", KSTAT_DATA_UINT64 }, { "l2_compress_failures", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_successes", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_timeout", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_psize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 },
*** 427,436 **** --- 459,487 ---- } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } + /* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). + */ + #define ARCSTAT_F_AVG_FACTOR 3 + #define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(NOTREACHED) \ + _NOTE(CONSTCOND) \ + } while (0) + kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu;
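The decaying-average behaviour of ARCSTAT_F_AVG is easiest to see with a few iterations. The standalone program below is illustrative only (it is not part of the patch and uses plain stdio rather than kstats); with a factor of 3 and a constant input of 900 the value walks 300, 500, 634, 723, ... toward the steady state of 900:

#include <stdint.h>
#include <stdio.h>

#define	F_AVG_FACTOR	3	/* mirrors ARCSTAT_F_AVG_FACTOR */

/* Same update rule as ARCSTAT_F_AVG, minus the kstat plumbing. */
static uint64_t
f_avg(uint64_t cur, uint64_t value)
{
	return (cur - cur / F_AVG_FACTOR + value / F_AVG_FACTOR);
}

int
main(void)
{
	uint64_t x = 0;

	for (int i = 0; i < 6; i++) {
		x = f_avg(x, 900);
		(void) printf("%llu\n", (unsigned long long)x);
	}
	return (0);
}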
*** 635,657 **** boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ ! typedef struct l2arc_dev { ! vdev_t *l2ad_vdev; /* vdev */ ! spa_t *l2ad_spa; /* spa */ ! uint64_t l2ad_hand; /* next write location */ ! uint64_t l2ad_start; /* first addr on device */ ! uint64_t l2ad_end; /* last addr on device */ ! uint64_t l2ad_evict; /* last addr eviction reached */ ! boolean_t l2ad_first; /* first sweep through */ ! boolean_t l2ad_writing; /* currently writing */ ! list_t *l2ad_buflist; /* buffer list */ ! list_node_t l2ad_node; /* device list node */ ! } l2arc_dev_t; ! static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ --- 686,696 ---- boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ ! typedef struct l2arc_dev l2arc_dev_t; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
*** 670,679 **** --- 709,720 ---- } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + /* list of in-flight l2arc_log_blk_buf_t's */ + list_t l2wcb_log_blk_buf_list; } l2arc_write_callback_t; struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */
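The new l2wcb_log_blk_buf_list carries the serialized log blocks for exactly one L2ARC write: l2arc_write_buffers() creates it, l2arc_log_blk_commit() appends to it, and l2arc_write_done() drains and destroys it once the child zios complete, as seen further down in this diff:

	list_create(&cb->l2wcb_log_blk_buf_list,
	    sizeof (l2arc_log_blk_buf_t),
	    offsetof(l2arc_log_blk_buf_t, l2lbb_node));
	...
	list_destroy(&cb->l2wcb_log_blk_buf_list);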
*** 697,715 **** static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void l2arc_read_done(zio_t *zio); ! static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c); static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); ! static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; --- 738,1025 ---- static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void l2arc_read_done(zio_t *zio); ! static void l2arc_hdr_stat_add(boolean_t from_arc); static void l2arc_hdr_stat_remove(void); + static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c); static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); ! enum { ! L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ ! }; ! ! /* ! * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers). ! */ ! typedef struct l2arc_log_blk_ptr { ! uint64_t l2lbp_daddr; /* device address of log */ ! /* ! * l2lbp_prop is the same format as the blk_prop in blkptr_t: ! * * logical size (in sectors) ! * * physical (compressed) size (in sectors) ! * * compression algorithm (we always LZ4-compress l2arc logs) ! * * checksum algorithm (used for l2lbp_cksum) ! * * object type & level (unused for now) ! */ ! uint64_t l2lbp_prop; ! zio_cksum_t l2lbp_cksum; /* fletcher4 of log */ ! } l2arc_log_blk_ptr_t; ! ! /* ! * The persistent L2ARC device header. ! */ ! typedef struct l2arc_dev_hdr_phys { ! uint64_t l2dh_magic; ! zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */ ! ! /* ! * Global L2ARC device state and metadata. ! */ ! uint64_t l2dh_spa_guid; ! uint64_t l2dh_evict_tail; /* current evict pointer */ ! uint64_t l2dh_alloc_space; /* vdev space alloc status */ ! uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */ ! ! /* ! * Start of log block chain. [0] -> newest log, [1] -> one older (used ! * for initiating prefetch). ! */ ! l2arc_log_blk_ptr_t l2dh_start_lbps[2]; ! ! const uint64_t l2dh_pad[43]; /* pad to 512 bytes */ ! } l2arc_dev_hdr_phys_t; ! CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); ! ! /* ! * A single ARC buffer header entry in a l2arc_log_blk_phys_t. ! */ ! typedef struct l2arc_log_ent_phys { ! dva_t l2le_dva; /* dva of buffer */ ! uint64_t l2le_birth; /* birth txg of buffer */ ! uint64_t l2le_cksum0; ! zio_cksum_t l2le_freeze_cksum; ! /* ! * l2le_prop is the same format as the blk_prop in blkptr_t: ! * * logical size (in sectors) ! * * physical (compressed) size (in sectors) ! * * compression algorithm ! * * checksum algorithm (used for cksum0) ! * * object type & level (used to restore arc_buf_contents_t) ! */ ! uint64_t l2le_prop; ! uint64_t l2le_daddr; /* buf location on l2dev */ ! const uint64_t l2le_pad[6]; /* resv'd for future use */ ! } l2arc_log_ent_phys_t; ! ! /* ! * These design limits give us the following overhead (before compression): ! * avg_blk_sz overhead ! * 1k 12.51 % ! * 2k 6.26 % ! * 4k 3.13 % ! * 8k 1.56 % ! * 16k 0.78 % ! * 32k 0.39 % ! * 64k 0.20 % ! * 128k 0.10 % ! * Compression should be able to squeeze these down by about a factor of 2x. ! */ ! 
#define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */ ! #define L2ARC_LOG_BLK_HEADER_LEN (128) ! #define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \ ! ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \ ! sizeof (l2arc_log_ent_phys_t)) ! /* ! * Maximum amount of data in an l2arc log block (used to terminate rebuilding ! * before we hit the write head and restore potentially corrupted blocks). ! */ ! #define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \ ! (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES) ! /* ! * For the persistency and rebuild algorithms to operate reliably we need ! * the L2ARC device to at least be able to hold 3 full log blocks (otherwise ! * excessive log block looping might confuse the log chain end detection). ! * Under normal circumstances this is not a problem, since this is somewhere ! * around only 400 MB. ! */ ! #define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE) ! ! /* ! * A log block of up to 1023 ARC buffer log entries, chained into the ! * persistent L2ARC metadata linked list. ! */ ! typedef struct l2arc_log_blk_phys { ! /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */ ! uint64_t l2lb_magic; ! l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */ ! uint64_t l2lb_pad[9]; /* resv'd for future use */ ! /* Payload */ ! l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES]; ! } l2arc_log_blk_phys_t; ! ! CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE); ! CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) - ! offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN); ! ! /* ! * These structures hold in-flight l2arc_log_blk_phys_t's as they're being ! * written to the L2ARC device. They may be compressed, hence the uint8_t[]. ! */ ! typedef struct l2arc_log_blk_buf { ! uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)]; ! list_node_t l2lbb_node; ! } l2arc_log_blk_buf_t; ! ! /* Macros for the manipulation fields in the blk_prop format of blkptr_t */ ! #define BLKPROP_GET_LSIZE(_obj, _field) \ ! BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1) ! #define BLKPROP_SET_LSIZE(_obj, _field, x) \ ! BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x) ! #define BLKPROP_GET_PSIZE(_obj, _field) \ ! BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1) ! #define BLKPROP_SET_PSIZE(_obj, _field, x) \ ! BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x) ! #define BLKPROP_GET_COMPRESS(_obj, _field) \ ! BF64_GET((_obj)->_field, 32, 8) ! #define BLKPROP_SET_COMPRESS(_obj, _field, x) \ ! BF64_SET((_obj)->_field, 32, 8, x) ! #define BLKPROP_GET_CHECKSUM(_obj, _field) \ ! BF64_GET((_obj)->_field, 40, 8) ! #define BLKPROP_SET_CHECKSUM(_obj, _field, x) \ ! BF64_SET((_obj)->_field, 40, 8, x) ! #define BLKPROP_GET_TYPE(_obj, _field) \ ! BF64_GET((_obj)->_field, 48, 8) ! #define BLKPROP_SET_TYPE(_obj, _field, x) \ ! BF64_SET((_obj)->_field, 48, 8, x) ! ! /* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */ ! #define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop) ! #define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x) ! #define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop) ! #define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x) ! #define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop) ! #define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \ ! x) ! #define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop) ! #define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \ ! x) ! 
#define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop) ! #define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x) ! ! /* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */ ! #define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop) ! #define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x) ! #define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop) ! #define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x) ! #define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop) ! #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x) ! #define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop) ! #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x) ! #define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop) ! #define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x) ! ! #define PTR_SWAP(x, y) \ ! do { \ ! void *tmp = (x);\ ! x = y; \ ! y = tmp; \ ! _NOTE(CONSTCOND)\ ! } while (0) ! ! #define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU ! #define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU ! #define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */ ! ! struct l2arc_dev { ! vdev_t *l2ad_vdev; /* vdev */ ! spa_t *l2ad_spa; /* spa */ ! uint64_t l2ad_hand; /* next write location */ ! uint64_t l2ad_start; /* first addr on device */ ! uint64_t l2ad_end; /* last addr on device */ ! uint64_t l2ad_evict; /* last addr eviction reached */ ! boolean_t l2ad_first; /* first sweep through */ ! boolean_t l2ad_writing; /* currently writing */ ! list_t *l2ad_buflist; /* buffer list */ ! list_node_t l2ad_node; /* device list node */ ! l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */ ! l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ ! int l2ad_log_ent_idx; /* index into cur log blk */ ! /* number of bytes in current log block's payload */ ! uint64_t l2ad_log_blk_payload_asize; ! /* flag indicating whether a rebuild is scheduled or is going on */ ! boolean_t l2ad_rebuild; ! }; ! ! /* ! * Performance tuning of L2ARC persistency: ! * ! * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at ! * pool import or when adding one manually later) will attempt ! * to rebuild L2ARC buffer contents. In special circumstances, ! * the administrator may want to set this to B_FALSE, if they ! * are having trouble importing a pool or attaching an L2ARC ! * device (e.g. the L2ARC device is slow to read in stored log ! * metadata, or the metadata has become somehow ! * fragmented/unusable). ! * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help ! * avoid a slow L2ARC device from preventing pool import. If we ! * are not done rebuilding an L2ARC device by this time, we ! * stop the rebuild and return immediately. ! */ ! boolean_t l2arc_rebuild_enabled = B_TRUE; ! uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT; ! ! /* ! * L2ARC persistency rebuild routines. ! */ ! static void l2arc_dev_rebuild_start(l2arc_dev_t *dev); ! static int l2arc_rebuild(l2arc_dev_t *dev); ! static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid, ! l2arc_log_blk_phys_t *lb, uint64_t lb_psize); ! static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, ! l2arc_dev_t *dev, uint64_t guid); ! ! /* ! * L2ARC persistency read I/O routines. ! */ ! static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr); ! static int l2arc_log_blk_read(l2arc_dev_t *dev, ! const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp, ! 
l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, ! uint8_t *this_lb_buf, uint8_t *next_lb_buf, ! zio_t *this_io, zio_t **next_io); ! static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, ! const l2arc_log_blk_ptr_t *lp); ! static zio_t *l2arc_log_blk_prefetch(vdev_t *vd, ! const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf); ! static void l2arc_log_blk_prefetch_abort(zio_t *zio); ! ! /* ! * L2ARC persistency write I/O routines. ! */ ! static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio); ! static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, ! l2arc_write_callback_t *cb); ! ! /* ! * L2ARC persistency auxilliary routines. ! */ ! static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, ! zio_cksum_t *cksum); ! static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, ! const arc_buf_hdr_t *ab); ! static inline boolean_t l2arc_range_check_overlap(uint64_t bottom, ! uint64_t top, uint64_t check); ! static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline); ! ! static inline uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i;
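For reference, the geometry behind the overhead table above: l2arc_log_ent_phys_t is 16 (dva) + 8 (birth) + 8 (cksum0) + 32 (freeze cksum) + 8 (prop) + 8 (daddr) + 48 (pad) = 128 bytes, so a 128 KiB log block holds (131072 - 128) / 128 = 1023 entries, and every cached buffer costs 128 bytes of log metadata (128 / 8192 = 1.56 % for an average 8 KiB buffer, matching the table). Were one to spell this out as compile-time checks (not in the patch), they would hold:

	CTASSERT(sizeof (l2arc_log_ent_phys_t) == 128);
	CTASSERT(L2ARC_LOG_BLK_ENTRIES == 1023);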
*** 1245,1255 **** } ab->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) ! l2arc_hdr_stat_add(); else if (old_state == arc_l2c_only) l2arc_hdr_stat_remove(); } void --- 1555,1565 ---- } ab->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) ! l2arc_hdr_stat_add(old_state != arc_anon); else if (old_state == arc_l2c_only) l2arc_hdr_stat_remove(); } void
*** 1349,1358 **** --- 1659,1695 ---- (void) refcount_add(&hdr->b_refcnt, tag); return (buf); } + /* + * Allocates an empty arc_buf_hdr structure (lacking any data buffer). + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc->(disk). + */ + arc_buf_hdr_t * + arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type) + { + arc_buf_hdr_t *hdr; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = guid; + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + hdr->b_buf = NULL; + hdr->b_datacnt = 0; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + + return (hdr); + } + static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned
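The consumer of arc_buf_hdr_alloc() appears later in this diff: l2arc_hdr_restore() uses it to materialize a data-less header straight from a log entry before hanging an l2arc_buf_hdr_t off it:

	hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
	hdr->b_dva = le->l2le_dva;
	hdr->b_birth = le->l2le_birth;
	hdr->b_cksum0 = le->l2le_cksum0;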
*** 1586,1596 **** if (l2hdr != NULL) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ! kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); hdr->b_l2hdr = NULL; } --- 1923,1933 ---- if (l2hdr != NULL) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ! kmem_free(l2hdr, sizeof (*l2hdr)); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); hdr->b_l2hdr = NULL; }
*** 3043,3052 **** --- 3380,3392 ---- hdr->b_acb = acb; hdr->b_flags |= ARC_IO_IN_PROGRESS; if (hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + /* + * Need to stash these before letting go of hash_lock + */ devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; b_compress = hdr->b_l2hdr->b_compress; b_asize = hdr->b_l2hdr->b_asize; /*
*** 3416,3426 **** buf->b_efunc = NULL; buf->b_private = NULL; if (l2hdr) { ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ! kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); } } --- 3756,3766 ---- buf->b_efunc = NULL; buf->b_private = NULL; if (l2hdr) { ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ! kmem_free(l2hdr, sizeof (*l2hdr)); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); } }
*** 4031,4040 **** --- 4371,4458 ---- * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistency: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. + * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) Every now and then we mix in a piece of metadata (called a log block) + * into the L2ARC write. This allows us to understand what's been written, + * so that we can rebuild the arc_buf_hdr_t structures of the main ARC + * buffers. The log block also includes a "back-reference" pointer to the + * previous block, forming a back-linked list of blocks on the L2ARC device. + * + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device + * for our header bookkeeping purposes. This contains a device header, which + * contains our top-level reference structures. We update it each time we + * write a new log block, so that we're able to locate it in the L2ARC + * device. If this write results in an inconsistent device header (e.g. due + * to power failure), we detect this by verifying the header's checksum + * and simply drop the entries from L2ARC. + * + * Implementation diagram: + * + * +=== L2ARC device (not to scale) ======================================+ + * | __________newest log block pointers_________ | + * | / \1 back \latest | + * | / V V | + * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| + * | ^ / ^ / ^ / | + * | `-prev-' `-prev-' `-prev-' | + * | lb lb lb | + * +======================================================================+ + * + * On-device data structures: + * + * L2ARC device header: l2arc_dev_hdr_phys_t + * L2ARC log block: l2arc_log_blk_phys_t + * + * L2ARC reconstruction: + * + * When writing data, we simply write in the standard rotary fashion, + * evicting buffers as we go and simply writing new data over them (writing + * a new log block every now and then). This obviously means that once we + * loop around the end of the device, we will start cutting into an already + * committed log block (and its referenced data buffers), like so: + * + * current write head__ __old tail + * \ / + * V V + * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> + * ^ ^^^^^^^^^___________________________________ + * | \ + * <<nextwrite>> may overwrite this blk and/or its bufs --' + * + * When importing the pool, we detect this situation and use it to stop + * our scanning process (see l2arc_rebuild). + * + * There is one significant caveat to consider when rebuilding ARC contents + * from an L2ARC device: what about invalidated buffers? Given the above + * construction, we cannot update blocks which we've already written to amend + * them to remove buffers which were invalidated. Thus, during reconstruction, + * we might be populating the cache with buffers for data that's not on the + * main pool anymore, or may have been overwritten! + * + * As it turns out, this isn't a problem. Every arc_read request includes + * both the DVA and, crucially, the birth TXG of the BP the caller is + * looking for. 
So even if the cache were populated by completely rotten + * blocks for data that had been long deleted and/or overwritten, we'll + * never actually return bad data from the cache, since the DVA with the + * birth TXG uniquely identify a block in space and time - once created, + * a block is immutable on disk. The worst thing we have done is wasted + * some time and memory at l2arc rebuild to reconstruct outdated ARC + * entries that will get dropped from the l2arc as it is being updated + * with new blocks. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) {
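To make the "we never return bad data" argument concrete: ARC lookups identify a buffer by the same triple that buf_hash() above hashes on, so a header rebuilt for a long-freed or overwritten block can only be found by a caller asking for exactly that DVA and birth TXG, which no live BP refers to. A minimal sketch of the identity test, illustrative only and mirroring the comparison the existing hash table already performs (it adds nothing to the patch):

	static boolean_t
	hdr_identity_matches(const arc_buf_hdr_t *hdr, uint64_t spa,
	    const dva_t *dva, uint64_t birth)
	{
		return (hdr->b_dva.dva_word[0] == dva->dva_word[0] &&
		    hdr->b_dva.dva_word[1] == dva->dva_word[1] &&
		    hdr->b_birth == birth && hdr->b_spa == spa);
	}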
*** 4097,4109 **** return (next); } static void ! l2arc_hdr_stat_add(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) --- 4515,4528 ---- return (next); } static void ! l2arc_hdr_stat_add(boolean_t from_arc) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + if (from_arc) ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void)
*** 4134,4144 **** goto out; first = NULL; next = l2arc_dev_last; do { ! /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) --- 4553,4566 ---- goto out; first = NULL; next = l2arc_dev_last; do { ! /* ! * Loop around the list looking for a non-faulted vdev ! * and one that isn't currently doing an L2ARC rebuild. ! */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL)
*** 4149,4162 **** if (first == NULL) first = next; else if (next == first) break; ! } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ ! if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: --- 4571,4584 ---- if (first == NULL) first = next; else if (next == first) break; ! } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); /* if we were unable to find any usable vdevs, return NULL */ ! if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) next = NULL; l2arc_dev_last = next; out:
*** 4206,4217 **** { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *ab, *ab_prev; ! l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; cb = zio->io_private; ASSERT(cb != NULL); dev = cb->l2wcb_dev; ASSERT(dev != NULL); --- 4628,4640 ---- { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *ab, *ab_prev; ! l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; + l2arc_log_blk_buf_t *lb_buf; cb = zio->io_private; ASSERT(cb != NULL); dev = cb->l2wcb_dev; ASSERT(dev != NULL);
*** 4230,4240 **** --- 4653,4670 ---- /* * All writes completed, or an error was hit. */ for (ab = list_prev(buflist, head); ab; ab = ab_prev) { ab_prev = list_prev(buflist, ab); + l2hdr = ab->b_l2hdr; + /* + * Release the temporary compressed buffer as soon as possible. + */ + if (l2hdr->b_compress != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(ab); + hash_lock = HDR_LOCK(ab); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage * of eviction. Its ARC_L2_WRITING flag will be
*** 4242,4267 **** */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } - abl2 = ab->b_l2hdr; - - /* - * Release the temporary compressed buffer as soon as possible. - */ - if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(ab); - if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, ab); ! ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); ab->b_l2hdr = NULL; ! kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } /* * Allow ARC to begin reads to this L2ARC entry. --- 4672,4689 ---- */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, ab); ! ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ab->b_l2hdr = NULL; ! kmem_free(l2hdr, sizeof (*l2hdr)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } /* * Allow ARC to begin reads to this L2ARC entry.
*** 4276,4285 **** --- 4698,4713 ---- kmem_cache_free(hdr_cache, head); mutex_exit(&l2arc_buflist_mtx); l2arc_do_free_on_write(); + for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL; + lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) { + (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list); + kmem_free(lb_buf, sizeof (*lb_buf)); + } + list_destroy(&cb->l2wcb_log_blk_buf_list); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before
*** 4399,4418 **** mutex_enter(*lock); return (list); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; ! l2arc_buf_hdr_t *abl2; arc_buf_hdr_t *ab, *ab_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = dev->l2ad_buflist; --- 4827,4859 ---- mutex_enter(*lock); return (list); } /* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. + */ + static inline uint64_t + l2arc_log_blk_overhead(uint64_t write_sz) + { + return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) * + L2ARC_LOG_BLK_SIZE; + } + + /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; ! l2arc_buf_hdr_t *l2hdr; arc_buf_hdr_t *ab, *ab_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = dev->l2ad_buflist;
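A worked example of the bound computed by l2arc_log_blk_overhead(): for an 8 MiB write, 8 MiB / SPA_MINBLOCKSIZE (512) = 16384 worst-case minimum-sized buffers, 16384 / 1023 entries per log block = 16 by integer division, plus 1 (presumably to cover a partially filled block in flight), giving 17 * 128 KiB = 2.125 MiB of metadata headroom that l2arc_evict() and l2arc_write_buffers() must reserve on top of the data itself.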
*** 4426,4435 **** --- 4867,4880 ---- * nothing to evict. */ return; } + /* + * We need to add in the worst case scenario of log block overhead. + */ + distance += l2arc_log_blk_overhead(distance); if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */
*** 4508,4521 **** /* * Tell ARC this no longer exists in L2ARC. */ if (ab->b_l2hdr != NULL) { ! abl2 = ab->b_l2hdr; ! ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); ab->b_l2hdr = NULL; ! kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } list_remove(buflist, ab); /* --- 4953,4966 ---- /* * Tell ARC this no longer exists in L2ARC. */ if (ab->b_l2hdr != NULL) { ! l2hdr = ab->b_l2hdr; ! ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ab->b_l2hdr = NULL; ! kmem_free(l2hdr, sizeof (*l2hdr)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } list_remove(buflist, ab); /*
*** 4547,4573 **** l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *ab, *ab_prev, *head; list_t *list; ! uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; void *buf_data; kmutex_t *list_lock; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); const boolean_t do_headroom_boost = *headroom_boost; ASSERT(dev->l2ad_vdev != NULL); /* Lower the flag now, we might want to raise it again later. */ *headroom_boost = B_FALSE; pio = NULL; ! write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD; /* --- 4992,5034 ---- l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *ab, *ab_prev, *head; list_t *list; ! /* ! * These variables mean: ! * - write_size: in-memory size of ARC buffers we've written (before ! * compression). ! * - write_asize: actual on-disk size of ARC buffers we've written ! * (after compression). ! * - write_aligned_asize: actual sum of space taken by ARC buffers ! * on the device (after compression and alignment, so that ! * every buffer starts on a multiple of the device block size). ! * - headroom: L2ARC scanning headroom (we won't scan beyond this ! * distance from the list tail). ! * - buf_compress_minsz: minimum in-memory ARC buffer size for us ! * to try compressing it. ! */ ! uint64_t write_size, write_asize, write_aligned_asize, headroom, buf_compress_minsz; void *buf_data; kmutex_t *list_lock; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); const boolean_t do_headroom_boost = *headroom_boost; + boolean_t dev_hdr_update = B_FALSE; ASSERT(dev->l2ad_vdev != NULL); /* Lower the flag now, we might want to raise it again later. */ *headroom_boost = B_FALSE; pio = NULL; ! cb = NULL; ! write_size = write_asize = write_aligned_asize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD; /*
*** 4601,4611 **** headroom = (headroom * l2arc_headroom_boost) / 100; for (; ab; ab = ab_prev) { l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; ! uint64_t buf_sz; if (arc_warm == B_FALSE) ab_prev = list_next(list, ab); else ab_prev = list_prev(list, ab); --- 5062,5072 ---- headroom = (headroom * l2arc_headroom_boost) / 100; for (; ab; ab = ab_prev) { l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; ! uint64_t buf_aligned_size; if (arc_warm == B_FALSE) ab_prev = list_next(list, ab); else ab_prev = list_prev(list, ab);
*** 4616,4626 **** * Skip this buffer rather than waiting. */ continue; } ! passed_sz += ab->b_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); --- 5077,5095 ---- * Skip this buffer rather than waiting. */ continue; } ! /* ! * When examining whether we've met our write target, ! * we must always use the aligned size of the buffer, ! * since that's the maximum amount of space a buffer ! * can take up on the L2ARC device. ! */ ! buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev, ! ab->b_size); ! passed_sz += buf_aligned_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock);
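For example, on a cache device with 4 KiB sectors (ashift=12), a 1 KiB ARC buffer still occupies vdev_psize_to_asize() = 4 KiB on the device, so counting only b_size would understate actual device usage and let the loop overshoot target_sz; charging the aligned size against passed_sz and write_size avoids that.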
*** 4630,4640 **** if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } ! if ((write_sz + ab->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } --- 5099,5109 ---- if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } ! if ((write_size + buf_aligned_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; }
*** 4644,4665 **** * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(dev->l2ad_buflist, head); ! cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } /* * Create and add a new L2ARC header. */ ! l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); l2hdr->b_dev = dev; ab->b_flags |= ARC_L2_WRITING; /* * Temporarily stash the data buffer in b_tmp_cdata. --- 5113,5137 ---- * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(dev->l2ad_buflist, head); ! cb = kmem_zalloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + list_create(&cb->l2wcb_log_blk_buf_list, + sizeof (l2arc_log_blk_buf_t), + offsetof(l2arc_log_blk_buf_t, l2lbb_node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } /* * Create and add a new L2ARC header. */ ! l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP); l2hdr->b_dev = dev; ab->b_flags |= ARC_L2_WRITING; /* * Temporarily stash the data buffer in b_tmp_cdata.
*** 4671,4681 **** */ l2hdr->b_compress = ZIO_COMPRESS_OFF; l2hdr->b_asize = ab->b_size; l2hdr->b_tmp_cdata = ab->b_buf->b_data; - buf_sz = ab->b_size; ab->b_l2hdr = l2hdr; list_insert_head(dev->l2ad_buflist, ab); /* --- 5143,5152 ----
*** 4685,4695 **** arc_cksum_verify(ab->b_buf); arc_cksum_compute(ab->b_buf, B_TRUE); mutex_exit(hash_lock); ! write_sz += buf_sz; } mutex_exit(list_lock); if (full == B_TRUE) --- 5156,5166 ---- arc_cksum_verify(ab->b_buf); arc_cksum_compute(ab->b_buf, B_TRUE); mutex_exit(hash_lock); ! write_size += buf_aligned_size; } mutex_exit(list_lock); if (full == B_TRUE)
*** 4696,4706 **** break; } /* No buffers selected for writing? */ if (pio == NULL) { ! ASSERT0(write_sz); mutex_exit(&l2arc_buflist_mtx); kmem_cache_free(hdr_cache, head); return (0); } --- 5167,5177 ---- break; } /* No buffers selected for writing? */ if (pio == NULL) { ! ASSERT0(write_size); mutex_exit(&l2arc_buflist_mtx); kmem_cache_free(hdr_cache, head); return (0); }
*** 4741,4751 **** buf_data = l2hdr->b_tmp_cdata; buf_sz = l2hdr->b_asize; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { ! uint64_t buf_p_sz; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); --- 5212,5222 ---- buf_data = l2hdr->b_tmp_cdata; buf_sz = l2hdr->b_asize; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { ! uint64_t buf_aligned_asize; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
*** 4756,4785 **** write_asize += buf_sz; /* * Keep the clock hand suitably device-aligned. */ ! buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); ! write_psize += buf_p_sz; ! dev->l2ad_hand += buf_p_sz; } - } mutex_exit(&l2arc_buflist_mtx); ! ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ! ARCSTAT_INCR(arcstat_l2_size, write_sz); ! ARCSTAT_INCR(arcstat_l2_asize, write_asize); ! vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ ! if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { vdev_space_update(dev->l2ad_vdev, dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; --- 5227,5267 ---- write_asize += buf_sz; /* * Keep the clock hand suitably device-aligned. */ ! buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev, ! buf_sz); ! write_aligned_asize += buf_aligned_asize; ! dev->l2ad_hand += buf_aligned_asize; ! ASSERT(dev->l2ad_hand <= dev->l2ad_evict || ! dev->l2ad_first); } + if (l2arc_log_blk_insert(dev, ab)) { + l2arc_log_blk_commit(dev, pio, cb); + dev_hdr_update = B_TRUE; + } + } mutex_exit(&l2arc_buflist_mtx); ! if (dev_hdr_update) ! l2arc_dev_hdr_update(dev, pio); ! ! VERIFY3U(write_aligned_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ! ARCSTAT_INCR(arcstat_l2_size, write_size); ! ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize); ! vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ ! if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >= ! dev->l2ad_end) { vdev_space_update(dev->l2ad_vdev, dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE;
*** 5037,5065 **** } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); ! return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already ! * validated the vdev and opened it. */ void ! l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); --- 5519,5557 ---- } boolean_t l2arc_vdev_present(vdev_t *vd) { + return (l2arc_vdev_get(vd) != NULL); + } + + static l2arc_dev_t * + l2arc_vdev_get(vdev_t *vd) + { l2arc_dev_t *dev; + boolean_t held = MUTEX_HELD(&l2arc_dev_mtx); + if (!held) mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } + if (!held) mutex_exit(&l2arc_dev_mtx); ! return (dev); } /* * Add a vdev for use by the L2ARC. By this point the spa has already ! * validated the vdev and opened it. The `rebuild' flag indicates whether ! * we should attempt an L2ARC persistency rebuild. */ void ! l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd));
*** 5067,5077 **** * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; ! adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; --- 5559,5570 ---- * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; ! /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */ ! adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE;
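With this reservation the on-device layout is: the usual vdev labels and boot block occupy [0, VDEV_LABEL_START_SIZE), the 512-byte l2arc_dev_hdr_phys_t sits at offset VDEV_LABEL_START_SIZE (the same offset l2arc_dev_hdr_read() and l2arc_dev_hdr_update() below use), and cached buffers plus log blocks start at l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE.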
*** 5090,5099 **** --- 5583,5602 ---- * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); + if (rebuild && l2arc_rebuild_enabled && + adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) { + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + adddev->l2ad_rebuild = B_TRUE; + } mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC.
*** 5196,5201 **** --- 5699,6429 ---- cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); + } + + /* + * Punches out rebuild threads for the L2ARC devices in a spa. This should + * be called as one of the final steps of a pool import. + */ + void + l2arc_spa_rebuild_start(spa_t *spa) + { + l2arc_dev_t *dev; + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + mutex_enter(&l2arc_dev_mtx); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + ASSERT(dev != NULL); + if (dev->l2ad_rebuild) { + (void) thread_create(NULL, 0, l2arc_dev_rebuild_start, + dev, 0, &p0, TS_RUN, minclsyspri); + } + } + mutex_exit(&l2arc_dev_mtx); + } + + /* + * Main entry point for L2ARC rebuilding. + */ + static void + l2arc_dev_rebuild_start(l2arc_dev_t *dev) + { + spa_t *spa = dev->l2ad_spa; + vdev_t *vd = dev->l2ad_vdev; + + /* Lock out device removal. */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + ASSERT(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + dev->l2ad_rebuild = B_FALSE; + spa_config_exit(spa, SCL_L2ARC, vd); + thread_exit(); + } + + /* + * This function implements the actual L2ARC metadata rebuild. It: + * + * 1) reads the device's header + * 2) if a good device header is found, starts reading the log block chain + * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's) + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log blk chain (the back-reference in the blk is + * invalid or loops over our starting point). + * 2) We encounter *any* error condition (cksum errors, io errors, looped + * blocks, etc.). + * 3) The l2arc_rebuild_timeout is hit - this is a final resort to protect + * from making severely fragmented L2ARC log blocks or slow L2ARC devices + * prevent a machine from finishing a pool import (and thus letting the + * administrator take corrective action, e.g. by kicking the misbehaving + * L2ARC device out of the pool, or by reimporting the pool with L2ARC + * rebuilding disabled). + */ + static int + l2arc_rebuild(l2arc_dev_t *dev) + { + int err; + l2arc_log_blk_phys_t *this_lb, *next_lb; + uint8_t *this_lb_buf, *next_lb_buf; + zio_t *this_io = NULL, *next_io = NULL; + int64_t deadline; + l2arc_log_blk_ptr_t lb_ptrs[2]; + boolean_t first_pass; + uint64_t load_guid; + + load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa); + deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout; + /* + * Device header processing phase. 
+ */ + if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) { + /* device header corrupted, start a new one */ + bzero(&dev->l2ad_dev_hdr, sizeof (&dev->l2ad_dev_hdr)); + return (err); + } + if (l2arc_check_rebuild_timeout_hit(deadline)) + return (SET_ERROR(ETIMEDOUT)); + + /* Retrieve the persistent L2ARC device state */ + dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail; + dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev, + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr + + LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0])); + dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags & + L2ARC_DEV_HDR_EVICT_FIRST); + + /* Prepare the rebuild processing state */ + bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs)); + this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP); + this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP); + next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP); + first_pass = B_TRUE; + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0])) + /* We hit an invalid block address, end the rebuild. */ + break; + + if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1], + this_lb, next_lb, this_lb_buf, next_lb_buf, + this_io, &next_io)) != 0) + break; + + /* Protection against infinite loops of log blocks. */ + if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr, + lb_ptrs[0].l2lbp_daddr, + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) && + !first_pass) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors); + err = SET_ERROR(ELOOP); + break; + } + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blk, so the user may choose to re-add the + * L2ARC dev at a later time to reconstruct it (when there's + * less memory pressure). + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + err = SET_ERROR(ENOMEM); + break; + } + + /* + * Now that we know that the next_lb checks out alright, we + * can start reconstruction from this lb - we can be sure + * that the L2ARC write hand has not yet reached any of our + * buffers. + */ + l2arc_log_blk_restore(dev, load_guid, this_lb, + LBP_GET_PSIZE(&lb_ptrs[0])); + + /* + * End of list detection. 
We can look ahead two steps in the + * blk chain and if the 2nd blk from this_lb dips below the + * initial chain starting point, then we know two things: + * 1) it can't be valid, and + * 2) the next_lb's ARC entries might have already been + * partially overwritten and so we should stop before + * we restore it + */ + if (l2arc_range_check_overlap( + this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr, + dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) && + !first_pass) + break; + + /* log blk restored, continue with next one in the list */ + lb_ptrs[0] = lb_ptrs[1]; + lb_ptrs[1] = this_lb->l2lb_back2_lbp; + PTR_SWAP(this_lb, next_lb); + PTR_SWAP(this_lb_buf, next_lb_buf); + this_io = next_io; + next_io = NULL; + first_pass = B_FALSE; + + if (l2arc_check_rebuild_timeout_hit(deadline)) { + err = SET_ERROR(ETIMEDOUT); + break; + } + } + if (next_io != NULL) + l2arc_log_blk_prefetch_abort(next_io); + kmem_free(this_lb, sizeof (*this_lb)); + kmem_free(next_lb, sizeof (*next_lb)); + kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t)); + kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t)); + if (err == 0) + ARCSTAT_BUMP(arcstat_l2_rebuild_successes); + + return (err); + } + + /* + * Restores the payload of a log blk to ARC. This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. + */ + static void + l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid, + l2arc_log_blk_phys_t *lb, uint64_t lb_psize) + { + uint64_t size = 0, psize = 0; + + mutex_enter(&l2arc_buflist_mtx); + + for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) { + /* + * Restore goes in the reverse direction to preserve correct + * temporal ordering of buffers in the l2ad_buflist. + */ + l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid); + size += LE_GET_LSIZE(&lb->l2lb_entries[i]); + psize += LE_GET_PSIZE(&lb->l2lb_entries[i]); + } + mutex_exit(&l2arc_buflist_mtx); + + /* + * Record rebuild stats: + * size In-memory size of restored buffer data in ARC + * psize Physical size of restored buffers in the L2ARC + * bufs # of ARC buffer headers restored + * log_blks # of L2ARC log entries processed during restore + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize); + vdev_space_update(dev->l2ad_vdev, psize, 0, 0); + } + + /* + * Restores a single ARC buf hdr from a log block. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. + */ + static void + l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev, + uint64_t load_guid) + { + arc_buf_hdr_t *hdr, *exists; + kmutex_t *hash_lock; + arc_buf_contents_t type = LE_GET_TYPE(le); + l2arc_buf_hdr_t *l2hdr; + + hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type); + hdr->b_dva = le->l2le_dva; + hdr->b_birth = le->l2le_birth; + hdr->b_cksum0 = le->l2le_cksum0; + hdr->b_size = LE_GET_LSIZE(le); + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* Buffer was already cached, no need to restore it. 
*/ + mutex_exit(hash_lock); + arc_hdr_destroy(hdr); + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + return; + } + hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE; + if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF) + hdr->b_flags |= ARC_L2COMPRESS; + mutex_enter(&hdr->b_freeze_lock); + ASSERT(hdr->b_freeze_cksum == NULL); + hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + *hdr->b_freeze_cksum = le->l2le_freeze_cksum; + mutex_exit(&hdr->b_freeze_lock); + + /* now rebuild the l2arc entry */ + ASSERT(hdr->b_l2hdr == NULL); + l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP); + l2hdr->b_dev = dev; + l2hdr->b_daddr = le->l2le_daddr; + l2hdr->b_asize = LE_GET_PSIZE(le); + l2hdr->b_compress = LE_GET_COMPRESS(le); + hdr->b_l2hdr = l2hdr; + list_insert_tail(dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize); + + arc_change_state(arc_l2c_only, hdr, hash_lock); + mutex_exit(hash_lock); + } + + /* + * Attempts to read the device header on the provided L2ARC device and writes + * it to `ub'. On success, this function returns 0, otherwise the appropriate + * error code is returned. + */ + static int + l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr) + { + int err; + uint64_t guid; + zio_cksum_t cksum; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr, + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + return (err); + } + + if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(hdr, sizeof (*hdr)); + + if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC || + hdr->l2dh_spa_guid != guid) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + l2arc_dev_hdr_checksum(hdr, &cksum); + if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors); + return (SET_ERROR(EINVAL)); + } + if (hdr->l2dh_evict_tail < dev->l2ad_start || + hdr->l2dh_evict_tail >= dev->l2ad_end) { + /* Data in dev hdr is invalid for this device. */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(EINVAL)); + } + + return (0); + } + + /* + * Reads L2ARC log blocks from storage and validates their contents. + * + * This function implements a simple prefetcher to make sure that while + * we're processing one buffer the L2ARC is already prefetching the next + * one in the chain. + * + * The arguments this_lp and next_lp point to the current and next log blk + * address in the block chain. Similarly, this_lb and next_lb hold the + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf + * and next_lb_buf must be buffers of appropriate to hold a raw + * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior + * to buffer decompression). + * + * The `this_io' and `next_io' arguments are used for block prefetching. + * When issuing the first blk IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the block and + * also issue an async IO to fetch the next block in the block chain. The + * prefetch IO is returned in `next_io'. 
On subsequent calls to this + * function, pass the value returned in `next_io' from the previous call + * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO. + * Prior to the call, you should initialize your `next_io' pointer to be + * NULL. If no prefetch IO was issued, the pointer is left set at NULL. + * + * On success, this function returns 0, otherwise it returns an appropriate + * error code. On error the prefetching IO is aborted and cleared before + * returning from this function. Therefore, if we return `success', the + * caller can assume that we have taken care of cleanup of prefetch IOs. + */ + static int + l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + uint8_t *this_lb_buf, uint8_t *next_lb_buf, + zio_t *this_io, zio_t **next_io) + { + int err = 0; + zio_cksum_t cksum; + + ASSERT(this_lbp != NULL && next_lbp != NULL); + ASSERT(this_lb != NULL && next_lb != NULL); + ASSERT(this_lb_buf != NULL && next_lb_buf != NULL); + ASSERT(next_io != NULL && *next_io == NULL); + ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp)); + + /* + * Check to see if we have issued the IO for this log blk in a + * previous run. If not, this is the first call, so issue it now. + */ + if (this_io == NULL) { + this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp, + this_lb_buf); + } + + /* + * Peek to see if we can start issuing the next IO immediately. + */ + if (l2arc_log_blk_ptr_valid(dev, next_lbp)) { + /* + * Start issuing IO for the next log blk early - this + * should help keep the L2ARC device busy while we + * decompress and restore this log blk. + */ + *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp, + next_lb_buf); + } + + /* Wait for the IO to read this log block to complete */ + if ((err = zio_wait(this_io)) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + goto cleanup; + } + + /* Make sure the buffer checks out */ + fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors); + err = SET_ERROR(EINVAL); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (LBP_GET_COMPRESS(this_lbp)) { + case ZIO_COMPRESS_OFF: + bcopy(this_lb_buf, this_lb, sizeof (*this_lb)); + break; + case ZIO_COMPRESS_LZ4: + if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp), + this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp), + sizeof (*this_lb))) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + cleanup: + /* Abort an in-flight prefetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + l2arc_log_blk_prefetch_abort(*next_io); + *next_io = NULL; + } + return (err); + } + + /* + * Validates an L2ARC log blk address to make sure that it can be read + * from the provided L2ARC device. Returns B_TRUE if the address is + * within the device's bounds, or B_FALSE if not. 
+ /*
+  * Validates an L2ARC log blk address to make sure that it can be read
+  * from the provided L2ARC device. Returns B_TRUE if the address is
+  * within the device's bounds, or B_FALSE if not.
+  */
+ static boolean_t
+ l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
+ {
+ 	uint64_t psize = LBP_GET_PSIZE(lbp);
+ 	uint64_t end = lbp->l2lbp_daddr + psize;
+
+ 	/*
+ 	 * A log block is valid if all of the following conditions are true:
+ 	 * - it fits entirely between l2ad_start and l2ad_end
+ 	 * - it has a valid size
+ 	 * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
+ 	 *   doesn't sit in the evicted region)
+ 	 */
+ 	return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
+ 	    psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
+ 	    lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
+ }
+
+ /*
+  * Starts an asynchronous read IO to read a log block. This is used in log
+  * block reconstruction to start reading the next block before we are done
+  * decoding and reconstructing the current block, to keep the l2arc device
+  * nice and hot with read IO to process.
+  * The block is read into the caller-supplied `lb_buf', which must remain
+  * valid (and is freed by the caller) until the zio is no longer needed
+  * (i.e. due to it having completed). If you wish to abort this zio, you
+  * should do so using l2arc_log_blk_prefetch_abort, which waits for the
+  * in-flight IO to complete so that the zio can be safely discarded.
+  */
+ static zio_t *
+ l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
+     uint8_t *lb_buf)
+ {
+ 	uint32_t psize;
+ 	zio_t *pio;
+
+ 	psize = LBP_GET_PSIZE(lbp);
+ 	ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
+ 	pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
+ 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ 	    ZIO_FLAG_DONT_RETRY);
+ 	(void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
+ 	    lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ 	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ 	return (pio);
+ }
+
+ /*
+  * Aborts a zio returned from l2arc_log_blk_prefetch by simply waiting for
+  * it to complete; the zio is then reclaimed by the zio framework.
+  */
+ static void
+ l2arc_log_blk_prefetch_abort(zio_t *zio)
+ {
+ 	(void) zio_wait(zio);
+ }
+
+ /*
+  * Creates a zio to update the device header on an l2arc device. The zio is
+  * initiated as a child of `pio'.
+  */
+ static void
+ l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
+ {
+ 	zio_t *wzio;
+ 	vdev_stat_t st;
+ 	l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
+
+ 	vdev_get_stats(dev->l2ad_vdev, &st);
+
+ 	hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
+ 	hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ 	hdr->l2dh_evict_tail = dev->l2ad_evict;
+ 	hdr->l2dh_alloc_space = st.vs_alloc;
+ 	hdr->l2dh_flags = 0;
+ 	if (dev->l2ad_first)
+ 		hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ 	/* checksum operation goes last */
+ 	l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
+
+ 	CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
+ 	    sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
+ 	wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
+ 	    sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
+ 	    NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ 	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ 	    zio_t *, wzio);
+ 	(void) zio_nowait(wzio);
+ }
+
+ /*
+  * Commits a log block to the L2ARC device. This routine is invoked from
+  * l2arc_write_buffers when the log block fills up.
+  * This function allocates some memory to temporarily hold the serialized
+  * buffer to be written. This is then released in l2arc_write_done.
+  */
+ static void
+ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+     l2arc_write_callback_t *cb)
+ {
+ 	l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ 	uint64_t psize, asize;
+ 	l2arc_log_blk_buf_t *lb_buf;
+ 	zio_t *wzio;
+
+ 	VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+
+ 	/* link the buffer into the block chain */
+ 	lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
+ 	lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ 	/* try to compress the buffer */
+ 	lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
+ 	list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
+ 	VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
+ 	    lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
+
+ 	/*
+ 	 * Update the start log blk pointer in the device header to point
+ 	 * to the log block we're about to write.
+ 	 */
+ 	dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
+ 	    dev->l2ad_dev_hdr.l2dh_start_lbps[0];
+ 	dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
+ 	LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
+ 	LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
+ 	LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ 	    ZIO_CHECKSUM_FLETCHER_4);
+ 	LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
+ 	if (psize < sizeof (*lb)) {
+ 		/* compression succeeded */
+ 		LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ 		    ZIO_COMPRESS_LZ4);
+ 	} else {
+ 		/* compression failed */
+ 		bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
+ 		LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
+ 		    ZIO_COMPRESS_OFF);
+ 	}
+ 	/* checksum what we're about to write */
+ 	fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
+ 	    &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
+
+ 	/* perform the write itself */
+ 	CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
+ 	    L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
+ 	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ 	    psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
+ 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ 	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ 	(void) zio_nowait(wzio);
+
+ 	/* realign the device hand */
+ 	asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ 	dev->l2ad_hand += asize;
+ 	VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
+ 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ 	/* bump the kstats */
+ 	ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
+ 	ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
+ 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ 	    dev->l2ad_log_blk_payload_asize / asize);
+
+ 	dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
+ }
+
+ /*
+  * Computes the checksum of `hdr' and stores it in `cksum'.
+  */
+ static void
+ l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
+ {
+ 	fletcher_4_native((uint8_t *)hdr +
+ 	    offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ 	    sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
+ 	    cksum);
+ }
+
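The pointer shuffle in l2arc_log_blk_commit above (l2dh_start_lbps[1] takes the old [0], [0] is pointed at the block being written, and the block itself records the block two commits back in l2lb_back2_lbp) is what builds the chain that the rebuild code walks newest-to-oldest. A tiny self-contained sketch, using plain integers as hypothetical stand-ins for block pointers rather than the patch's structures, shows the invariant that results:

#include <assert.h>
#include <stdint.h>

/* Stand-in: a "block pointer" is just the commit number that wrote it. */
typedef int64_t lbp_t;

int
main(void)
{
	lbp_t start_lbps[2] = { -1, -1 };	/* header: two newest blocks */
	lbp_t back2[16];			/* per-block back-2 pointer */

	for (int64_t n = 0; n < 16; n++) {
		back2[n] = start_lbps[1];	/* link two commits back */
		start_lbps[1] = start_lbps[0];	/* rotate the header slots */
		start_lbps[0] = n;		/* newest block is this one */
	}

	/* Walking: the header names blocks 15 and 14, then back2 chains on. */
	assert(start_lbps[0] == 15 && start_lbps[1] == 14);
	for (int64_t n = 15; n >= 2; n--)
		assert(back2[n] == n - 2);
	return (0);
}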
+ /*
+  * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
+  * The buffer being inserted must be present in L2ARC.
+  * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
+  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+  */
+ static boolean_t
+ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
+ {
+ 	l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ 	l2arc_log_ent_phys_t *le;
+ 	const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+ 	int index = dev->l2ad_log_ent_idx++;
+
+ 	ASSERT(l2hdr != NULL);
+ 	ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
+
+ 	le = &lb->l2lb_entries[index];
+ 	bzero(le, sizeof (*le));
+ 	le->l2le_dva = ab->b_dva;
+ 	le->l2le_birth = ab->b_birth;
+ 	le->l2le_cksum0 = ab->b_cksum0;
+ 	le->l2le_daddr = l2hdr->b_daddr;
+ 	LE_SET_LSIZE(le, ab->b_size);
+ 	LE_SET_PSIZE(le, l2hdr->b_asize);
+ 	LE_SET_COMPRESS(le, l2hdr->b_compress);
+ 	le->l2le_freeze_cksum = *ab->b_freeze_cksum;
+ 	LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
+ 	LE_SET_TYPE(le, ab->b_type);
+ 	dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
+
+ 	return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
+ }
+
+ /*
+  * Checks whether a given L2ARC device address sits in a time-sequential
+  * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+  * just do a range comparison, we need to handle the situation in which the
+  * range wraps around the end of the L2ARC device. Arguments:
+  *   bottom    Lower end of the range to check (written to earlier).
+  *   top       Upper end of the range to check (written to later).
+  *   check     The address for which we want to determine if it sits in
+  *             between the top and bottom.
+  *
+  * The 3-way conditional below represents the following cases:
+  *
+  *   bottom < top : Sequentially ordered case:
+  *     <check>--------+-------------------+
+  *                    | (overlap here?)   |
+  *    L2ARC dev       V                   V
+  *    |---------------<bottom>============<top>--------------|
+  *
+  *   bottom > top: Looped-around case:
+  *     <check>--------+-------------------+
+  *                    | (overlap here?)   |
+  *    L2ARC dev       V                   V
+  *    |===============<top>---------------<bottom>===========|
+  *    ^               ^
+  *    | (or here?)    |
+  *    +---------------+---------<check>
+  *
+  *   top == bottom : Just a single address comparison.
+  */
+ static inline boolean_t
+ l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+ {
+ 	if (bottom < top)
+ 		return (bottom <= check && check <= top);
+ 	else if (bottom > top)
+ 		return (check <= top || bottom <= check);
+ 	else
+ 		return (check == top);
+ }
+
+ /*
+  * Checks whether a rebuild timeout deadline has been hit and if it has,
+  * increments the appropriate error counters.
+  */
+ static boolean_t
+ l2arc_check_rebuild_timeout_hit(int64_t deadline)
+ {
+ 	if (deadline != 0 && deadline < ddi_get_lbolt64()) {
+ 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
+ 		cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
+ 		    "dropping remaining L2ARC metadata.");
+ 		return (B_TRUE);
+ 	} else {
+ 		return (B_FALSE);
+ 	}
  }
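Because the looped-around case is easy to get backwards, it is worth checking the three-way logic above against concrete offsets. The following standalone program mirrors that logic (simplified names, small integers standing in for device addresses) and can be compiled and run as-is:

#include <assert.h>
#include <stdint.h>

typedef int boolean_t;
#define	B_TRUE	1
#define	B_FALSE	0

/* Same three-way logic as l2arc_range_check_overlap above. */
static boolean_t
range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
{
	if (bottom < top)
		return (bottom <= check && check <= top);
	else if (bottom > top)
		return (check <= top || bottom <= check);
	else
		return (check == top);
}

int
main(void)
{
	/* Sequentially ordered: the range covers [100, 200] on the device. */
	assert(range_check_overlap(100, 200, 150) == B_TRUE);
	assert(range_check_overlap(100, 200, 50) == B_FALSE);
	assert(range_check_overlap(100, 200, 250) == B_FALSE);

	/* Looped around: the range runs from 900 to the end, then wraps to 100. */
	assert(range_check_overlap(900, 100, 950) == B_TRUE);
	assert(range_check_overlap(900, 100, 50) == B_TRUE);
	assert(range_check_overlap(900, 100, 500) == B_FALSE);

	/* Degenerate range: a single address. */
	assert(range_check_overlap(300, 300, 300) == B_TRUE);
	assert(range_check_overlap(300, 300, 301) == B_FALSE);
	return (0);
}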