121 */
122
123 #include <sys/spa.h>
124 #include <sys/zio.h>
125 #include <sys/zio_compress.h>
126 #include <sys/zfs_context.h>
127 #include <sys/arc.h>
128 #include <sys/refcount.h>
129 #include <sys/vdev.h>
130 #include <sys/vdev_impl.h>
131 #include <sys/dsl_pool.h>
132 #ifdef _KERNEL
133 #include <sys/vmsystm.h>
134 #include <vm/anon.h>
135 #include <sys/fs/swapnode.h>
136 #include <sys/dnlc.h>
137 #endif
138 #include <sys/callb.h>
139 #include <sys/kstat.h>
140 #include <zfs_fletcher.h>
141
142 #ifndef _KERNEL
143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144 boolean_t arc_watch = B_FALSE;
145 int arc_procfd;
146 #endif
147
148 static kmutex_t arc_reclaim_thr_lock;
149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 static uint8_t arc_thread_exit;
151
152 #define ARC_REDUCE_DNLC_PERCENT 3
153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
154
155 typedef enum arc_reclaim_strategy {
156 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 } arc_reclaim_strategy_t;
159
160 /*
299 kstat_named_t arcstat_l2_feeds;
300 kstat_named_t arcstat_l2_rw_clash;
301 kstat_named_t arcstat_l2_read_bytes;
302 kstat_named_t arcstat_l2_write_bytes;
303 kstat_named_t arcstat_l2_writes_sent;
304 kstat_named_t arcstat_l2_writes_done;
305 kstat_named_t arcstat_l2_writes_error;
306 kstat_named_t arcstat_l2_writes_hdr_miss;
307 kstat_named_t arcstat_l2_evict_lock_retry;
308 kstat_named_t arcstat_l2_evict_reading;
309 kstat_named_t arcstat_l2_free_on_write;
310 kstat_named_t arcstat_l2_abort_lowmem;
311 kstat_named_t arcstat_l2_cksum_bad;
312 kstat_named_t arcstat_l2_io_error;
313 kstat_named_t arcstat_l2_size;
314 kstat_named_t arcstat_l2_asize;
315 kstat_named_t arcstat_l2_hdr_size;
316 kstat_named_t arcstat_l2_compress_successes;
317 kstat_named_t arcstat_l2_compress_zeros;
318 kstat_named_t arcstat_l2_compress_failures;
319 kstat_named_t arcstat_memory_throttle_count;
320 kstat_named_t arcstat_duplicate_buffers;
321 kstat_named_t arcstat_duplicate_buffers_size;
322 kstat_named_t arcstat_duplicate_reads;
323 kstat_named_t arcstat_meta_used;
324 kstat_named_t arcstat_meta_limit;
325 kstat_named_t arcstat_meta_max;
326 } arc_stats_t;
327
328 static arc_stats_t arc_stats = {
329 { "hits", KSTAT_DATA_UINT64 },
330 { "misses", KSTAT_DATA_UINT64 },
331 { "demand_data_hits", KSTAT_DATA_UINT64 },
332 { "demand_data_misses", KSTAT_DATA_UINT64 },
333 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
334 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
335 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
336 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
337 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
338 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
365 { "l2_feeds", KSTAT_DATA_UINT64 },
366 { "l2_rw_clash", KSTAT_DATA_UINT64 },
367 { "l2_read_bytes", KSTAT_DATA_UINT64 },
368 { "l2_write_bytes", KSTAT_DATA_UINT64 },
369 { "l2_writes_sent", KSTAT_DATA_UINT64 },
370 { "l2_writes_done", KSTAT_DATA_UINT64 },
371 { "l2_writes_error", KSTAT_DATA_UINT64 },
372 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
373 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
374 { "l2_evict_reading", KSTAT_DATA_UINT64 },
375 { "l2_free_on_write", KSTAT_DATA_UINT64 },
376 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
377 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
378 { "l2_io_error", KSTAT_DATA_UINT64 },
379 { "l2_size", KSTAT_DATA_UINT64 },
380 { "l2_asize", KSTAT_DATA_UINT64 },
381 { "l2_hdr_size", KSTAT_DATA_UINT64 },
382 { "l2_compress_successes", KSTAT_DATA_UINT64 },
383 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
384 { "l2_compress_failures", KSTAT_DATA_UINT64 },
385 { "memory_throttle_count", KSTAT_DATA_UINT64 },
386 { "duplicate_buffers", KSTAT_DATA_UINT64 },
387 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
388 { "duplicate_reads", KSTAT_DATA_UINT64 },
389 { "arc_meta_used", KSTAT_DATA_UINT64 },
390 { "arc_meta_limit", KSTAT_DATA_UINT64 },
391 { "arc_meta_max", KSTAT_DATA_UINT64 }
392 };
393
394 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
395
396 #define ARCSTAT_INCR(stat, val) \
397 atomic_add_64(&arc_stats.stat.value.ui64, (val))
398
399 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
400 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
401
402 #define ARCSTAT_MAX(stat, val) { \
403 uint64_t m; \
404 while ((val) > (m = arc_stats.stat.value.ui64) && \
412 /*
413 * We define a macro to allow ARC hits/misses to be easily broken down by
414 * two separate conditions, giving a total of four different subtypes for
415 * each of hits and misses (so eight statistics total).
416 */
417 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
418 if (cond1) { \
419 if (cond2) { \
420 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
421 } else { \
422 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
423 } \
424 } else { \
425 if (cond2) { \
426 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
427 } else { \
428 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
429 } \
430 }
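/*
 * For example, classifying a cache hit by demand-vs-prefetch and
 * data-vs-metadata bumps exactly one of the four *_hits counters above.
 * An illustrative call (the argument expressions are representative, not
 * taken verbatim from the elided portions of this file):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which would increment, e.g., arcstat_demand_metadata_hits for a
 * non-prefetch metadata buffer.
 */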
431
432 kstat_t *arc_ksp;
433 static arc_state_t *arc_anon;
434 static arc_state_t *arc_mru;
435 static arc_state_t *arc_mru_ghost;
436 static arc_state_t *arc_mfu;
437 static arc_state_t *arc_mfu_ghost;
438 static arc_state_t *arc_l2c_only;
439
440 /*
441 * There are several ARC variables that are critical to export as kstats --
442 * but we don't want to have to grovel around in the kstat whenever we wish to
443 * manipulate them. For these variables, we therefore define them to be in
444 * terms of the statistic variable. This assures that we are not introducing
445 * the possibility of inconsistency by having shadow copies of the variables,
446 * while still allowing the code to be readable.
447 */
448 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
449 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
450 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
451 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
620 #define L2ARC_FEED_SECS 1 /* caching interval secs */
621 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
622
623 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
624 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
625
626 /* L2ARC Performance Tunables */
627 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
628 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
629 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
630 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
631 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
632 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
633 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
634 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
635 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
636
637 /*
638 * L2ARC Internals
639 */
640 typedef struct l2arc_dev {
641 vdev_t *l2ad_vdev; /* vdev */
642 spa_t *l2ad_spa; /* spa */
643 uint64_t l2ad_hand; /* next write location */
644 uint64_t l2ad_start; /* first addr on device */
645 uint64_t l2ad_end; /* last addr on device */
646 uint64_t l2ad_evict; /* last addr eviction reached */
647 boolean_t l2ad_first; /* first sweep through */
648 boolean_t l2ad_writing; /* currently writing */
649 list_t *l2ad_buflist; /* buffer list */
650 list_node_t l2ad_node; /* device list node */
651 } l2arc_dev_t;
652
653 static list_t L2ARC_dev_list; /* device list */
654 static list_t *l2arc_dev_list; /* device list pointer */
655 static kmutex_t l2arc_dev_mtx; /* device list mutex */
656 static l2arc_dev_t *l2arc_dev_last; /* last device used */
657 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
658 static list_t L2ARC_free_on_write; /* free after write buf list */
659 static list_t *l2arc_free_on_write; /* free after write list ptr */
660 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
661 static uint64_t l2arc_ndev; /* number of devices */
662
663 typedef struct l2arc_read_callback {
664 arc_buf_t *l2rcb_buf; /* read buffer */
665 spa_t *l2rcb_spa; /* spa */
666 blkptr_t l2rcb_bp; /* original blkptr */
667 zbookmark_t l2rcb_zb; /* original bookmark */
668 int l2rcb_flags; /* original flags */
669 enum zio_compress l2rcb_compress; /* applied compress */
670 } l2arc_read_callback_t;
671
672 typedef struct l2arc_write_callback {
673 l2arc_dev_t *l2wcb_dev; /* device info */
674 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
675 } l2arc_write_callback_t;
676
677 struct l2arc_buf_hdr {
678 /* protected by arc_buf_hdr mutex */
679 l2arc_dev_t *b_dev; /* L2ARC device */
680 uint64_t b_daddr; /* disk address, offset byte */
681 /* compression applied to buffer data */
682 enum zio_compress b_compress;
683 /* real alloc'd buffer size depending on b_compress applied */
684 int b_asize;
685 /* temporary buffer holder for in-flight compressed data */
686 void *b_tmp_cdata;
687 };
688
689 typedef struct l2arc_data_free {
690 /* protected by l2arc_free_on_write_mtx */
691 void *l2df_data;
692 size_t l2df_size;
693 void (*l2df_func)(void *, size_t);
694 list_node_t l2df_list_node;
695 } l2arc_data_free_t;
696
697 static kmutex_t l2arc_feed_thr_lock;
698 static kcondvar_t l2arc_feed_thr_cv;
699 static uint8_t l2arc_thread_exit;
700
701 static void l2arc_read_done(zio_t *zio);
702 static void l2arc_hdr_stat_add(void);
703 static void l2arc_hdr_stat_remove(void);
704
705 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
706 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
707 enum zio_compress c);
708 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
709
710 static uint64_t
711 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
712 {
713 uint8_t *vdva = (uint8_t *)dva;
714 uint64_t crc = -1ULL;
715 int i;
716
717 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
718
719 for (i = 0; i < sizeof (dva_t); i++)
720 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
721
722 crc ^= (spa>>8) ^ birth;
723
724 return (crc);
725 }
726
727 #define BUF_EMPTY(buf) \
728 ((buf)->b_dva.dva_word[0] == 0 && \
729 (buf)->b_dva.dva_word[1] == 0 && \
730 (buf)->b_birth == 0)
1230 if (use_mutex)
1231 mutex_exit(&new_state->arcs_mtx);
1232 }
1233 }
1234
1235 ASSERT(!BUF_EMPTY(ab));
1236 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1237 buf_hash_remove(ab);
1238
1239 /* adjust state sizes */
1240 if (to_delta)
1241 atomic_add_64(&new_state->arcs_size, to_delta);
1242 if (from_delta) {
1243 ASSERT3U(old_state->arcs_size, >=, from_delta);
1244 atomic_add_64(&old_state->arcs_size, -from_delta);
1245 }
1246 ab->b_state = new_state;
1247
1248 /* adjust l2arc hdr stats */
1249 if (new_state == arc_l2c_only)
1250 l2arc_hdr_stat_add();
1251 else if (old_state == arc_l2c_only)
1252 l2arc_hdr_stat_remove();
1253 }
1254
1255 void
1256 arc_space_consume(uint64_t space, arc_space_type_t type)
1257 {
1258 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259
1260 switch (type) {
1261 case ARC_SPACE_DATA:
1262 ARCSTAT_INCR(arcstat_data_size, space);
1263 break;
1264 case ARC_SPACE_OTHER:
1265 ARCSTAT_INCR(arcstat_other_size, space);
1266 break;
1267 case ARC_SPACE_HDRS:
1268 ARCSTAT_INCR(arcstat_hdr_size, space);
1269 break;
1270 case ARC_SPACE_L2HDRS:
1334 hdr->b_type = type;
1335 hdr->b_spa = spa_load_guid(spa);
1336 hdr->b_state = arc_anon;
1337 hdr->b_arc_access = 0;
1338 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1339 buf->b_hdr = hdr;
1340 buf->b_data = NULL;
1341 buf->b_efunc = NULL;
1342 buf->b_private = NULL;
1343 buf->b_next = NULL;
1344 hdr->b_buf = buf;
1345 arc_get_data_buf(buf);
1346 hdr->b_datacnt = 1;
1347 hdr->b_flags = 0;
1348 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349 (void) refcount_add(&hdr->b_refcnt, tag);
1350
1351 return (buf);
1352 }
1353
1354 static char *arc_onloan_tag = "onloan";
1355
1356 /*
1357 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1358 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1359 * buffers must be returned to the arc before they can be used by the DMU or
1360 * freed.
1361 */
1362 arc_buf_t *
1363 arc_loan_buf(spa_t *spa, int size)
1364 {
1365 arc_buf_t *buf;
1366
1367 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1368
1369 atomic_add_64(&arc_loaned_bytes, size);
1370 return (buf);
1371 }
1372
1373 /*
1571 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1572 /*
1573 * To prevent arc_free() and l2arc_evict() from
1574 * attempting to free the same buffer at the same time,
1575 * a FREE_IN_PROGRESS flag is given to arc_free() to
1576 * give it priority. l2arc_evict() can't destroy this
1577 * header while we are waiting on l2arc_buflist_mtx.
1578 *
1579 * The hdr may be removed from l2ad_buflist before we
1580 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1581 */
1582 if (!buflist_held) {
1583 mutex_enter(&l2arc_buflist_mtx);
1584 l2hdr = hdr->b_l2hdr;
1585 }
1586
1587 if (l2hdr != NULL) {
1588 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1589 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1590 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1591 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1592 if (hdr->b_state == arc_l2c_only)
1593 l2arc_hdr_stat_remove();
1594 hdr->b_l2hdr = NULL;
1595 }
1596
1597 if (!buflist_held)
1598 mutex_exit(&l2arc_buflist_mtx);
1599 }
1600
1601 if (!BUF_EMPTY(hdr)) {
1602 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1603 buf_discard_identity(hdr);
1604 }
1605 while (hdr->b_buf) {
1606 arc_buf_t *buf = hdr->b_buf;
1607
1608 if (buf->b_efunc) {
1609 mutex_enter(&arc_eviction_mtx);
1610 mutex_enter(&buf->b_evict_lock);
1611 ASSERT(buf->b_hdr != NULL);
3028 buf->b_next = NULL;
3029 hdr->b_buf = buf;
3030 ASSERT(hdr->b_datacnt == 0);
3031 hdr->b_datacnt = 1;
3032 arc_get_data_buf(buf);
3033 arc_access(hdr, hash_lock);
3034 }
3035
3036 ASSERT(!GHOST_STATE(hdr->b_state));
3037
3038 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3039 acb->acb_done = done;
3040 acb->acb_private = private;
3041
3042 ASSERT(hdr->b_acb == NULL);
3043 hdr->b_acb = acb;
3044 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3045
3046 if (hdr->b_l2hdr != NULL &&
3047 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3048 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3049 addr = hdr->b_l2hdr->b_daddr;
3050 b_compress = hdr->b_l2hdr->b_compress;
3051 b_asize = hdr->b_l2hdr->b_asize;
3052 /*
3053 * Lock out device removal.
3054 */
3055 if (vdev_is_dead(vd) ||
3056 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3057 vd = NULL;
3058 }
3059
3060 mutex_exit(hash_lock);
3061
3062 /*
3063 * At this point, we have a level 1 cache miss. Try again in
3064 * L2ARC if possible.
3065 */
3066 ASSERT3U(hdr->b_size, ==, size);
3067 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3401 atomic_add_64(&arc_anon->arcs_size, blksz);
3402 } else {
3403 mutex_exit(&buf->b_evict_lock);
3404 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3405 ASSERT(!list_link_active(&hdr->b_arc_node));
3406 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3407 if (hdr->b_state != arc_anon)
3408 arc_change_state(arc_anon, hdr, hash_lock);
3409 hdr->b_arc_access = 0;
3410 if (hash_lock)
3411 mutex_exit(hash_lock);
3412
3413 buf_discard_identity(hdr);
3414 arc_buf_thaw(buf);
3415 }
3416 buf->b_efunc = NULL;
3417 buf->b_private = NULL;
3418
3419 if (l2hdr) {
3420 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3421 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3422 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3423 mutex_exit(&l2arc_buflist_mtx);
3424 }
3425 }
3426
3427 int
3428 arc_released(arc_buf_t *buf)
3429 {
3430 int released;
3431
3432 mutex_enter(&buf->b_evict_lock);
3433 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3434 mutex_exit(&buf->b_evict_lock);
3435 return (released);
3436 }
3437
3438 int
3439 arc_has_callback(arc_buf_t *buf)
3440 {
3441 int callback;
4016 * l2arc_noprefetch skip caching prefetched buffers
4017 * l2arc_headroom number of max device writes to precache
4018 * l2arc_headroom_boost when we find compressed buffers during ARC
4019 * scanning, we multiply headroom by this
4020 * percentage factor for the next scan cycle,
4021 * since more compressed buffers are likely to
4022 * be present
4023 * l2arc_feed_secs seconds between L2ARC writing
4024 *
4025 * Tunables may be removed or added as future performance improvements are
4026 * integrated, and also may become zpool properties.
4027 *
4028 * There are three key functions that control how the L2ARC warms up:
4029 *
4030 * l2arc_write_eligible() check if a buffer is eligible to cache
4031 * l2arc_write_size() calculate how much to write
4032 * l2arc_write_interval() calculate sleep delay between writes
4033 *
4034 * These three functions determine what to write, how much, and how quickly
4035 * to send writes.
4036 */
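/*
 * A minimal sketch of how the first two tunables interact (an assumption
 * about l2arc_write_size(), whose body is elided here, based on the
 * descriptions above):
 *
 *	size = l2arc_write_max;
 *	if (arc_warm == B_FALSE)
 *		size += l2arc_write_boost;
 *
 * i.e. while the ARC is still warming up, each feed cycle may write
 * l2arc_write_boost bytes on top of the usual l2arc_write_max.
 */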
4037
4038 static boolean_t
4039 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4040 {
4041 /*
4042 * A buffer is *not* eligible for the L2ARC if it:
4043 * 1. belongs to a different spa.
4044 * 2. is already cached on the L2ARC.
4045 * 3. has an I/O in progress (it may be an incomplete read).
4046 * 4. is flagged not eligible (zfs property).
4047 */
4048 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4049 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4050 return (B_FALSE);
4051
4052 return (B_TRUE);
4053 }
4054
4055 static uint64_t
4082 clock_t interval, next, now;
4083
4084 /*
4085 * If the ARC lists are busy, increase our write rate; if the
4086 * lists are stale, idle back. This is achieved by checking
4087 * how much we previously wrote - if it was more than half of
4088 * what we wanted, schedule the next write much sooner.
4089 */
4090 if (l2arc_feed_again && wrote > (wanted / 2))
4091 interval = (hz * l2arc_feed_min_ms) / 1000;
4092 else
4093 interval = hz * l2arc_feed_secs;
4094
4095 now = ddi_get_lbolt();
4096 next = MAX(now, MIN(now + interval, began + interval));
4097
4098 return (next);
4099 }
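/*
 * Worked example: with hz = 1000, l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200, a busy cycle (wrote > wanted / 2) schedules the
 * next feed (1000 * 200) / 1000 = 200 ticks (200 ms) after the previous
 * cycle began, while an idle cycle waits the full 1000 ticks (one second);
 * in both cases 'next' is clamped so it is never earlier than 'now'.
 */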
4100
4101 static void
4102 l2arc_hdr_stat_add(void)
4103 {
4104 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4105 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4106 }
4107
4108 static void
4109 l2arc_hdr_stat_remove(void)
4110 {
4111 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4112 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4113 }
4114
4115 /*
4116 * Cycle through L2ARC devices. This is how L2ARC load balances.
4117 * If a device is returned, this also returns holding the spa config lock.
4118 */
4119 static l2arc_dev_t *
4120 l2arc_dev_get_next(void)
4121 {
4122 l2arc_dev_t *first, *next = NULL;
4123
4124 /*
4125 * Lock out the removal of spas (spa_namespace_lock), then removal
4126 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4127 * both locks will be dropped and a spa config lock held instead.
4128 */
4129 mutex_enter(&spa_namespace_lock);
4130 mutex_enter(&l2arc_dev_mtx);
4131
4132 /* if there are no vdevs, there is nothing to do */
4133 if (l2arc_ndev == 0)
4134 goto out;
4135
4136 first = NULL;
4137 next = l2arc_dev_last;
4138 do {
4139 /* loop around the list looking for a non-faulted vdev */
4140 if (next == NULL) {
4141 next = list_head(l2arc_dev_list);
4142 } else {
4143 next = list_next(l2arc_dev_list, next);
4144 if (next == NULL)
4145 next = list_head(l2arc_dev_list);
4146 }
4147
4148 /* if we have come back to the start, bail out */
4149 if (first == NULL)
4150 first = next;
4151 else if (next == first)
4152 break;
4153
4154 } while (vdev_is_dead(next->l2ad_vdev));
4155
4156 /* if we were unable to find any usable vdevs, return NULL */
4157 if (vdev_is_dead(next->l2ad_vdev))
4158 next = NULL;
4159
4160 l2arc_dev_last = next;
4161
4162 out:
4163 mutex_exit(&l2arc_dev_mtx);
4164
4165 /*
4166 * Grab the config lock to prevent the 'next' device from being
4167 * removed while we are writing to it.
4168 */
4169 if (next != NULL)
4170 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4171 mutex_exit(&spa_namespace_lock);
4172
4173 return (next);
4174 }
4175
4176 /*
4177 * Free buffers that were tagged for destruction.
4191 ASSERT(df->l2df_func != NULL);
4192 df->l2df_func(df->l2df_data, df->l2df_size);
4193 list_remove(buflist, df);
4194 kmem_free(df, sizeof (l2arc_data_free_t));
4195 }
4196
4197 mutex_exit(&l2arc_free_on_write_mtx);
4198 }
4199
4200 /*
4201 * A write to a cache device has completed. Update all headers to allow
4202 * reads from these buffers to begin.
4203 */
4204 static void
4205 l2arc_write_done(zio_t *zio)
4206 {
4207 l2arc_write_callback_t *cb;
4208 l2arc_dev_t *dev;
4209 list_t *buflist;
4210 arc_buf_hdr_t *head, *ab, *ab_prev;
4211 l2arc_buf_hdr_t *abl2;
4212 kmutex_t *hash_lock;
4213
4214 cb = zio->io_private;
4215 ASSERT(cb != NULL);
4216 dev = cb->l2wcb_dev;
4217 ASSERT(dev != NULL);
4218 head = cb->l2wcb_head;
4219 ASSERT(head != NULL);
4220 buflist = dev->l2ad_buflist;
4221 ASSERT(buflist != NULL);
4222 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4223 l2arc_write_callback_t *, cb);
4224
4225 if (zio->io_error != 0)
4226 ARCSTAT_BUMP(arcstat_l2_writes_error);
4227
4228 mutex_enter(&l2arc_buflist_mtx);
4229
4230 /*
4231 * All writes completed, or an error was hit.
4232 */
4233 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4234 ab_prev = list_prev(buflist, ab);
4235
4236 hash_lock = HDR_LOCK(ab);
4237 if (!mutex_tryenter(hash_lock)) {
4238 /*
4239 * This buffer misses out. It may be in a stage
4240 * of eviction. Its ARC_L2_WRITING flag will be
4241 * left set, denying reads to this buffer.
4242 */
4243 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4244 continue;
4245 }
4246
4247 abl2 = ab->b_l2hdr;
4248
4249 /*
4250 * Release the temporary compressed buffer as soon as possible.
4251 */
4252 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4253 l2arc_release_cdata_buf(ab);
4254
4255 if (zio->io_error != 0) {
4256 /*
4257 * Error - drop L2ARC entry.
4258 */
4259 list_remove(buflist, ab);
4260 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4261 ab->b_l2hdr = NULL;
4262 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4263 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4264 }
4265
4266 /*
4267 * Allow ARC to begin reads to this L2ARC entry.
4268 */
4269 ab->b_flags &= ~ARC_L2_WRITING;
4270
4271 mutex_exit(hash_lock);
4272 }
4273
4274 atomic_inc_64(&l2arc_writes_done);
4275 list_remove(buflist, head);
4276 kmem_cache_free(hdr_cache, head);
4277 mutex_exit(&l2arc_buflist_mtx);
4278
4279 l2arc_do_free_on_write();
4280
4281 kmem_free(cb, sizeof (l2arc_write_callback_t));
4282 }
4283
4284 /*
4285 * A read to a cache device completed. Validate buffer contents before
4286 * handing over to the regular ARC routines.
4287 */
4288 static void
4289 l2arc_read_done(zio_t *zio)
4290 {
4291 l2arc_read_callback_t *cb;
4292 arc_buf_hdr_t *hdr;
4293 arc_buf_t *buf;
4294 kmutex_t *hash_lock;
4295 int equal;
4296
4297 ASSERT(zio->io_vd != NULL);
4298 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4299
4300 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4384 case 1:
4385 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4386 *lock = &arc_mru->arcs_mtx;
4387 break;
4388 case 2:
4389 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4390 *lock = &arc_mfu->arcs_mtx;
4391 break;
4392 case 3:
4393 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4394 *lock = &arc_mru->arcs_mtx;
4395 break;
4396 }
4397
4398 ASSERT(!(MUTEX_HELD(*lock)));
4399 mutex_enter(*lock);
4400 return (list);
4401 }
4402
4403 /*
4404 * Evict buffers from the device write hand to the distance specified in
4405  * bytes. This distance may span populated buffers, or it may span nothing.
4406  * This clears a region on the L2ARC device, making it ready for writing.
4407 * If the 'all' boolean is set, every buffer is evicted.
4408 */
4409 static void
4410 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4411 {
4412 list_t *buflist;
4413 l2arc_buf_hdr_t *abl2;
4414 arc_buf_hdr_t *ab, *ab_prev;
4415 kmutex_t *hash_lock;
4416 uint64_t taddr;
4417
4418 buflist = dev->l2ad_buflist;
4419
4420 if (buflist == NULL)
4421 return;
4422
4423 if (!all && dev->l2ad_first) {
4424 /*
4425 * This is the first sweep through the device. There is
4426 * nothing to evict.
4427 */
4428 return;
4429 }
4430
4431 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4432 /*
4433 * When nearing the end of the device, evict to the end
4434 * before the device write hand jumps to the start.
4435 */
4436 taddr = dev->l2ad_end;
4437 } else {
4438 taddr = dev->l2ad_hand + distance;
4439 }
4440 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4441 uint64_t, taddr, boolean_t, all);
4442
4443 top:
4444 mutex_enter(&l2arc_buflist_mtx);
4445 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4446 ab_prev = list_prev(buflist, ab);
4447
4448 hash_lock = HDR_LOCK(ab);
4449 if (!mutex_tryenter(hash_lock)) {
4450 /*
4493 * arc_hdr_destroy() will call list_remove()
4494 * and decrement arcstat_l2_size.
4495 */
4496 arc_change_state(arc_anon, ab, hash_lock);
4497 arc_hdr_destroy(ab);
4498 } else {
4499 /*
4500 * Invalidate issued or about to be issued
4501 * reads, since we may be about to write
4502 * over this location.
4503 */
4504 if (HDR_L2_READING(ab)) {
4505 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4506 ab->b_flags |= ARC_L2_EVICTED;
4507 }
4508
4509 /*
4510 * Tell ARC this no longer exists in L2ARC.
4511 */
4512 if (ab->b_l2hdr != NULL) {
4513 abl2 = ab->b_l2hdr;
4514 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4515 ab->b_l2hdr = NULL;
4516 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4517 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4518 }
4519 list_remove(buflist, ab);
4520
4521 /*
4522 * This may have been leftover after a
4523 * failed write.
4524 */
4525 ab->b_flags &= ~ARC_L2_WRITING;
4526 }
4527 mutex_exit(hash_lock);
4528 }
4529 mutex_exit(&l2arc_buflist_mtx);
4530
4531 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4532 dev->l2ad_evict = taddr;
4533 }
4534
4535 /*
4536 * Find and write ARC buffers to the L2ARC device.
4537 *
4538 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4539 * for reading until they have completed writing.
4540 * The headroom_boost is an in-out parameter used to maintain headroom boost
4541 * state between calls to this function.
4542 *
4543 * Returns the number of bytes actually written (which may be smaller than
4544 * the delta by which the device hand has changed due to alignment).
4545 */
4546 static uint64_t
4547 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4548 boolean_t *headroom_boost)
4549 {
4550 arc_buf_hdr_t *ab, *ab_prev, *head;
4551 list_t *list;
4552 uint64_t write_asize, write_psize, write_sz, headroom,
4553 buf_compress_minsz;
4554 void *buf_data;
4555 kmutex_t *list_lock;
4556 boolean_t full;
4557 l2arc_write_callback_t *cb;
4558 zio_t *pio, *wzio;
4559 uint64_t guid = spa_load_guid(spa);
4560 const boolean_t do_headroom_boost = *headroom_boost;
4561
4562 ASSERT(dev->l2ad_vdev != NULL);
4563
4564 /* Lower the flag now, we might want to raise it again later. */
4565 *headroom_boost = B_FALSE;
4566
4567 pio = NULL;
4568 write_sz = write_asize = write_psize = 0;
4569 full = B_FALSE;
4570 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4571 head->b_flags |= ARC_L2_WRITE_HEAD;
4572
4573 /*
4574 * We will want to try to compress buffers that are at least 2x the
4575 * device sector size.
4576 */
4577 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4578
4579 /*
4580 * Copy buffers for L2ARC writing.
4581 */
4582 mutex_enter(&l2arc_buflist_mtx);
4583 for (int try = 0; try <= 3; try++) {
4584 uint64_t passed_sz = 0;
4585
4586 list = l2arc_list_locked(try, &list_lock);
4587
4588 /*
4589 * L2ARC fast warmup.
4590 *
4591 * Until the ARC is warm and starts to evict, read from the
4592 * head of the ARC lists rather than the tail.
4593 */
4594 if (arc_warm == B_FALSE)
4595 ab = list_head(list);
4596 else
4597 ab = list_tail(list);
4598
4599 headroom = target_sz * l2arc_headroom;
4600 if (do_headroom_boost)
4601 headroom = (headroom * l2arc_headroom_boost) / 100;
4602
4603 for (; ab; ab = ab_prev) {
4604 l2arc_buf_hdr_t *l2hdr;
4605 kmutex_t *hash_lock;
4606 uint64_t buf_sz;
4607
4608 if (arc_warm == B_FALSE)
4609 ab_prev = list_next(list, ab);
4610 else
4611 ab_prev = list_prev(list, ab);
4612
4613 hash_lock = HDR_LOCK(ab);
4614 if (!mutex_tryenter(hash_lock)) {
4615 /*
4616 * Skip this buffer rather than waiting.
4617 */
4618 continue;
4619 }
4620
4621 passed_sz += ab->b_size;
4622 if (passed_sz > headroom) {
4623 /*
4624 * Searched too far.
4625 */
4626 mutex_exit(hash_lock);
4627 break;
4628 }
4629
4630 if (!l2arc_write_eligible(guid, ab)) {
4631 mutex_exit(hash_lock);
4632 continue;
4633 }
4634
4635 if ((write_sz + ab->b_size) > target_sz) {
4636 full = B_TRUE;
4637 mutex_exit(hash_lock);
4638 break;
4639 }
4640
4641 if (pio == NULL) {
4642 /*
4643 * Insert a dummy header on the buflist so
4644 * l2arc_write_done() can find where the
4645 * write buffers begin without searching.
4646 */
4647 list_insert_head(dev->l2ad_buflist, head);
4648
4649 cb = kmem_alloc(
4650 sizeof (l2arc_write_callback_t), KM_SLEEP);
4651 cb->l2wcb_dev = dev;
4652 cb->l2wcb_head = head;
4653 pio = zio_root(spa, l2arc_write_done, cb,
4654 ZIO_FLAG_CANFAIL);
4655 }
4656
4657 /*
4658 * Create and add a new L2ARC header.
4659 */
4660 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4661 l2hdr->b_dev = dev;
4662 ab->b_flags |= ARC_L2_WRITING;
4663
4664 /*
4665 * Temporarily stash the data buffer in b_tmp_cdata.
4666 * The subsequent write step will pick it up from
4667  * there. This is because we can't access ab->b_buf
4668  * without holding the hash_lock, which in turn we
4669  * can't take without holding the ARC list locks
4670 * (which we want to avoid during compression/writing).
4671 */
4672 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4673 l2hdr->b_asize = ab->b_size;
4674 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4675
4676 buf_sz = ab->b_size;
4677 ab->b_l2hdr = l2hdr;
4678
4679 list_insert_head(dev->l2ad_buflist, ab);
4680
4681 /*
4682  * Compute and store the buffer cksum before
4683  * writing. In debug builds the cksum is verified first.
4684 */
4685 arc_cksum_verify(ab->b_buf);
4686 arc_cksum_compute(ab->b_buf, B_TRUE);
4687
4688 mutex_exit(hash_lock);
4689
4690 write_sz += buf_sz;
4691 }
4692
4693 mutex_exit(list_lock);
4694
4695 if (full == B_TRUE)
4696 break;
4697 }
4698
4699 /* No buffers selected for writing? */
4700 if (pio == NULL) {
4701 ASSERT0(write_sz);
4702 mutex_exit(&l2arc_buflist_mtx);
4703 kmem_cache_free(hdr_cache, head);
4704 return (0);
4705 }
4706
4707 /*
4708  * Now start writing the buffers. We start at the write head
4709 * and work backwards, retracing the course of the buffer selector
4710 * loop above.
4711 */
4712 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4713 ab = list_prev(dev->l2ad_buflist, ab)) {
4714 l2arc_buf_hdr_t *l2hdr;
4715 uint64_t buf_sz;
4716
4717 /*
4718 * We shouldn't need to lock the buffer here, since we flagged
4719 * it as ARC_L2_WRITING in the previous step, but we must take
4720 * care to only access its L2 cache parameters. In particular,
4721 * ab->b_buf may be invalid by now due to ARC eviction.
4726 if ((ab->b_flags & ARC_L2COMPRESS) &&
4727 l2hdr->b_asize >= buf_compress_minsz) {
4728 if (l2arc_compress_buf(l2hdr)) {
4729 /*
4730 * If compression succeeded, enable headroom
4731 * boost on the next scan cycle.
4732 */
4733 *headroom_boost = B_TRUE;
4734 }
4735 }
4736
4737 /*
4738 * Pick up the buffer data we had previously stashed away
4739 * (and now potentially also compressed).
4740 */
4741 buf_data = l2hdr->b_tmp_cdata;
4742 buf_sz = l2hdr->b_asize;
4743
4744 /* Compression may have squashed the buffer to zero length. */
4745 if (buf_sz != 0) {
4746 uint64_t buf_p_sz;
4747
4748 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4749 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4750 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4751 ZIO_FLAG_CANFAIL, B_FALSE);
4752
4753 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4754 zio_t *, wzio);
4755 (void) zio_nowait(wzio);
4756
4757 write_asize += buf_sz;
4758 /*
4759 * Keep the clock hand suitably device-aligned.
4760 */
4761 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4762 write_psize += buf_p_sz;
4763 dev->l2ad_hand += buf_p_sz;
4764 }
4765 }
4766
4767 mutex_exit(&l2arc_buflist_mtx);
4768
4769 ASSERT3U(write_asize, <=, target_sz);
4770 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4771 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4772 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4773 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4774 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4775
4776 /*
4777 * Bump device hand to the device start if it is approaching the end.
4778 * l2arc_evict() will already have evicted ahead for this case.
4779 */
4780 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4781 vdev_space_update(dev->l2ad_vdev,
4782 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4783 dev->l2ad_hand = dev->l2ad_start;
4784 dev->l2ad_evict = dev->l2ad_start;
4785 dev->l2ad_first = B_FALSE;
4786 }
4787
4788 dev->l2ad_writing = B_TRUE;
4789 (void) zio_wait(pio);
4790 dev->l2ad_writing = B_FALSE;
4791
4792 return (write_asize);
4793 }
4794
4795 /*
4796 * Compresses an L2ARC buffer.
4797 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4798 * size in l2hdr->b_asize. This routine tries to compress the data and
4799 * depending on the compression result there are three possible outcomes:
4800 * *) The buffer was incompressible. The original l2hdr contents were left
5022 * Write ARC buffers.
5023 */
5024 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5025
5026 /*
5027 * Calculate interval between writes.
5028 */
5029 next = l2arc_write_interval(begin, size, wrote);
5030 spa_config_exit(spa, SCL_L2ARC, dev);
5031 }
5032
5033 l2arc_thread_exit = 0;
5034 cv_broadcast(&l2arc_feed_thr_cv);
5035 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5036 thread_exit();
5037 }
5038
5039 boolean_t
5040 l2arc_vdev_present(vdev_t *vd)
5041 {
5042 l2arc_dev_t *dev;
5043
5044 mutex_enter(&l2arc_dev_mtx);
5045 for (dev = list_head(l2arc_dev_list); dev != NULL;
5046 dev = list_next(l2arc_dev_list, dev)) {
5047 if (dev->l2ad_vdev == vd)
5048 break;
5049 }
5050 mutex_exit(&l2arc_dev_mtx);
5051
5052 return (dev != NULL);
5053 }
5054
5055 /*
5056 * Add a vdev for use by the L2ARC. By this point the spa has already
5057 * validated the vdev and opened it.
5058 */
5059 void
5060 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5061 {
5062 l2arc_dev_t *adddev;
5063
5064 ASSERT(!l2arc_vdev_present(vd));
5065
5066 /*
5067 * Create a new l2arc device entry.
5068 */
5069 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5070 adddev->l2ad_spa = spa;
5071 adddev->l2ad_vdev = vd;
5072 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5073 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5074 adddev->l2ad_hand = adddev->l2ad_start;
5075 adddev->l2ad_evict = adddev->l2ad_start;
5076 adddev->l2ad_first = B_TRUE;
5077 adddev->l2ad_writing = B_FALSE;
5078
5079 /*
5080 * This is a list of all ARC buffers that are still valid on the
5081 * device.
5082 */
5083 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5084 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5085 offsetof(arc_buf_hdr_t, b_l2node));
5086
5087 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5088
5089 /*
5090 * Add device to global list
5091 */
5092 mutex_enter(&l2arc_dev_mtx);
5093 list_insert_head(l2arc_dev_list, adddev);
5094 atomic_inc_64(&l2arc_ndev);
5095 mutex_exit(&l2arc_dev_mtx);
5096 }
5097
5098 /*
5099 * Remove a vdev from the L2ARC.
5100 */
5101 void
5102 l2arc_remove_vdev(vdev_t *vd)
5103 {
5104 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5105
5106 /*
5107 * Find the device by vdev
5108 */
5109 mutex_enter(&l2arc_dev_mtx);
5110 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5111 nextdev = list_next(l2arc_dev_list, dev);
5112 if (vd == dev->l2ad_vdev) {
5113 remdev = dev;
5114 break;
5181 {
5182 if (!(spa_mode_global & FWRITE))
5183 return;
5184
5185 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5186 TS_RUN, minclsyspri);
5187 }
5188
5189 void
5190 l2arc_stop(void)
5191 {
5192 if (!(spa_mode_global & FWRITE))
5193 return;
5194
5195 mutex_enter(&l2arc_feed_thr_lock);
5196 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5197 l2arc_thread_exit = 1;
5198 while (l2arc_thread_exit != 0)
5199 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5200 mutex_exit(&l2arc_feed_thr_lock);
5201 }
|
121 */
122
123 #include <sys/spa.h>
124 #include <sys/zio.h>
125 #include <sys/zio_compress.h>
126 #include <sys/zfs_context.h>
127 #include <sys/arc.h>
128 #include <sys/refcount.h>
129 #include <sys/vdev.h>
130 #include <sys/vdev_impl.h>
131 #include <sys/dsl_pool.h>
132 #ifdef _KERNEL
133 #include <sys/vmsystm.h>
134 #include <vm/anon.h>
135 #include <sys/fs/swapnode.h>
136 #include <sys/dnlc.h>
137 #endif
138 #include <sys/callb.h>
139 #include <sys/kstat.h>
140 #include <zfs_fletcher.h>
141 #include <sys/byteorder.h>
142 #include <sys/spa_impl.h>
143
144 #ifndef _KERNEL
145 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
146 boolean_t arc_watch = B_FALSE;
147 int arc_procfd;
148 #endif
149
150 static kmutex_t arc_reclaim_thr_lock;
151 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
152 static uint8_t arc_thread_exit;
153
154 #define ARC_REDUCE_DNLC_PERCENT 3
155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157 typedef enum arc_reclaim_strategy {
158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160 } arc_reclaim_strategy_t;
161
162 /*
301 kstat_named_t arcstat_l2_feeds;
302 kstat_named_t arcstat_l2_rw_clash;
303 kstat_named_t arcstat_l2_read_bytes;
304 kstat_named_t arcstat_l2_write_bytes;
305 kstat_named_t arcstat_l2_writes_sent;
306 kstat_named_t arcstat_l2_writes_done;
307 kstat_named_t arcstat_l2_writes_error;
308 kstat_named_t arcstat_l2_writes_hdr_miss;
309 kstat_named_t arcstat_l2_evict_lock_retry;
310 kstat_named_t arcstat_l2_evict_reading;
311 kstat_named_t arcstat_l2_free_on_write;
312 kstat_named_t arcstat_l2_abort_lowmem;
313 kstat_named_t arcstat_l2_cksum_bad;
314 kstat_named_t arcstat_l2_io_error;
315 kstat_named_t arcstat_l2_size;
316 kstat_named_t arcstat_l2_asize;
317 kstat_named_t arcstat_l2_hdr_size;
318 kstat_named_t arcstat_l2_compress_successes;
319 kstat_named_t arcstat_l2_compress_zeros;
320 kstat_named_t arcstat_l2_compress_failures;
321 kstat_named_t arcstat_l2_log_blk_writes;
322 kstat_named_t arcstat_l2_log_blk_avg_size;
323 kstat_named_t arcstat_l2_data_to_meta_ratio;
324 kstat_named_t arcstat_l2_rebuild_successes;
325 kstat_named_t arcstat_l2_rebuild_abort_unsupported;
326 kstat_named_t arcstat_l2_rebuild_abort_timeout;
327 kstat_named_t arcstat_l2_rebuild_abort_io_errors;
328 kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
329 kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
330 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
331 kstat_named_t arcstat_l2_rebuild_size;
332 kstat_named_t arcstat_l2_rebuild_bufs;
333 kstat_named_t arcstat_l2_rebuild_bufs_precached;
334 kstat_named_t arcstat_l2_rebuild_psize;
335 kstat_named_t arcstat_l2_rebuild_log_blks;
336 kstat_named_t arcstat_memory_throttle_count;
337 kstat_named_t arcstat_duplicate_buffers;
338 kstat_named_t arcstat_duplicate_buffers_size;
339 kstat_named_t arcstat_duplicate_reads;
340 kstat_named_t arcstat_meta_used;
341 kstat_named_t arcstat_meta_limit;
342 kstat_named_t arcstat_meta_max;
343 } arc_stats_t;
344
345 static arc_stats_t arc_stats = {
346 { "hits", KSTAT_DATA_UINT64 },
347 { "misses", KSTAT_DATA_UINT64 },
348 { "demand_data_hits", KSTAT_DATA_UINT64 },
349 { "demand_data_misses", KSTAT_DATA_UINT64 },
350 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
351 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
352 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
353 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
354 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
355 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
382 { "l2_feeds", KSTAT_DATA_UINT64 },
383 { "l2_rw_clash", KSTAT_DATA_UINT64 },
384 { "l2_read_bytes", KSTAT_DATA_UINT64 },
385 { "l2_write_bytes", KSTAT_DATA_UINT64 },
386 { "l2_writes_sent", KSTAT_DATA_UINT64 },
387 { "l2_writes_done", KSTAT_DATA_UINT64 },
388 { "l2_writes_error", KSTAT_DATA_UINT64 },
389 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
390 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
391 { "l2_evict_reading", KSTAT_DATA_UINT64 },
392 { "l2_free_on_write", KSTAT_DATA_UINT64 },
393 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
394 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
395 { "l2_io_error", KSTAT_DATA_UINT64 },
396 { "l2_size", KSTAT_DATA_UINT64 },
397 { "l2_asize", KSTAT_DATA_UINT64 },
398 { "l2_hdr_size", KSTAT_DATA_UINT64 },
399 { "l2_compress_successes", KSTAT_DATA_UINT64 },
400 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
401 { "l2_compress_failures", KSTAT_DATA_UINT64 },
402 { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
403 { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
404 { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
405 { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
406 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
407 { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
408 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
409 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
410 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
411 { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
412 { "l2_rebuild_size", KSTAT_DATA_UINT64 },
413 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
414 { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
415 { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
416 { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
417 { "memory_throttle_count", KSTAT_DATA_UINT64 },
418 { "duplicate_buffers", KSTAT_DATA_UINT64 },
419 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
420 { "duplicate_reads", KSTAT_DATA_UINT64 },
421 { "arc_meta_used", KSTAT_DATA_UINT64 },
422 { "arc_meta_limit", KSTAT_DATA_UINT64 },
423 { "arc_meta_max", KSTAT_DATA_UINT64 }
424 };
425
426 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
427
428 #define ARCSTAT_INCR(stat, val) \
429 atomic_add_64(&arc_stats.stat.value.ui64, (val))
430
431 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
432 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
433
434 #define ARCSTAT_MAX(stat, val) { \
435 uint64_t m; \
436 while ((val) > (m = arc_stats.stat.value.ui64) && \
444 /*
445 * We define a macro to allow ARC hits/misses to be easily broken down by
446 * two separate conditions, giving a total of four different subtypes for
447 * each of hits and misses (so eight statistics total).
448 */
449 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
450 if (cond1) { \
451 if (cond2) { \
452 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
453 } else { \
454 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
455 } \
456 } else { \
457 if (cond2) { \
458 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
459 } else { \
460 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
461 } \
462 }
463
464 /*
465 * This macro allows us to use kstats as floating averages. Each time we
466 * update this kstat, we first factor it and the update value by
467 * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
468 * average. This macro assumes that integer loads and stores are atomic, but
469 * is not safe for multiple writers updating the kstat in parallel (only the
470 * last writer's update will remain).
471 */
472 #define ARCSTAT_F_AVG_FACTOR 3
473 #define ARCSTAT_F_AVG(stat, value) \
474 do { \
475 uint64_t x = ARCSTAT(stat); \
476 x = x - x / ARCSTAT_F_AVG_FACTOR + \
477 (value) / ARCSTAT_F_AVG_FACTOR; \
478 ARCSTAT(stat) = x; \
479 _NOTE(NOTREACHED) \
480 _NOTE(CONSTCOND) \
481 } while (0)
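/*
 * Worked example: with ARCSTAT_F_AVG_FACTOR == 3, a stored average of 900
 * and a new sample of 300 update to 900 - 900/3 + 300/3 = 700, i.e. the
 * kstat tracks roughly 2/3 of the old value plus 1/3 of each new sample.
 */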
482
483 kstat_t *arc_ksp;
484 static arc_state_t *arc_anon;
485 static arc_state_t *arc_mru;
486 static arc_state_t *arc_mru_ghost;
487 static arc_state_t *arc_mfu;
488 static arc_state_t *arc_mfu_ghost;
489 static arc_state_t *arc_l2c_only;
490
491 /*
492 * There are several ARC variables that are critical to export as kstats --
493 * but we don't want to have to grovel around in the kstat whenever we wish to
494 * manipulate them. For these variables, we therefore define them to be in
495 * terms of the statistic variable. This assures that we are not introducing
496 * the possibility of inconsistency by having shadow copies of the variables,
497 * while still allowing the code to be readable.
498 */
499 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
500 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
501 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
502 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
671 #define L2ARC_FEED_SECS 1 /* caching interval secs */
672 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
673
674 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
675 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
676
677 /* L2ARC Performance Tunables */
678 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
679 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
680 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
681 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
682 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
683 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
684 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
685 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
686 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
687
688 /*
689 * L2ARC Internals
690 */
691 typedef struct l2arc_dev l2arc_dev_t;
692 static list_t L2ARC_dev_list; /* device list */
693 static list_t *l2arc_dev_list; /* device list pointer */
694 static kmutex_t l2arc_dev_mtx; /* device list mutex */
695 static l2arc_dev_t *l2arc_dev_last; /* last device used */
696 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
697 static list_t L2ARC_free_on_write; /* free after write buf list */
698 static list_t *l2arc_free_on_write; /* free after write list ptr */
699 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
700 static uint64_t l2arc_ndev; /* number of devices */
701
702 typedef struct l2arc_read_callback {
703 arc_buf_t *l2rcb_buf; /* read buffer */
704 spa_t *l2rcb_spa; /* spa */
705 blkptr_t l2rcb_bp; /* original blkptr */
706 zbookmark_t l2rcb_zb; /* original bookmark */
707 int l2rcb_flags; /* original flags */
708 enum zio_compress l2rcb_compress; /* applied compress */
709 } l2arc_read_callback_t;
710
711 typedef struct l2arc_write_callback {
712 l2arc_dev_t *l2wcb_dev; /* device info */
713 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
714 /* list of in-flight l2arc_log_blk_buf_t's */
715 list_t l2wcb_log_blk_buf_list;
716 } l2arc_write_callback_t;
717
718 struct l2arc_buf_hdr {
719 /* protected by arc_buf_hdr mutex */
720 l2arc_dev_t *b_dev; /* L2ARC device */
721 uint64_t b_daddr; /* disk address, offset byte */
722 /* compression applied to buffer data */
723 enum zio_compress b_compress;
724 /* real alloc'd buffer size depending on b_compress applied */
725 int b_asize;
726 /* temporary buffer holder for in-flight compressed data */
727 void *b_tmp_cdata;
728 };
729
730 typedef struct l2arc_data_free {
731 /* protected by l2arc_free_on_write_mtx */
732 void *l2df_data;
733 size_t l2df_size;
734 void (*l2df_func)(void *, size_t);
735 list_node_t l2df_list_node;
736 } l2arc_data_free_t;
737
738 static kmutex_t l2arc_feed_thr_lock;
739 static kcondvar_t l2arc_feed_thr_cv;
740 static uint8_t l2arc_thread_exit;
741
742 static void l2arc_read_done(zio_t *zio);
743 static void l2arc_hdr_stat_add(boolean_t from_arc);
744 static void l2arc_hdr_stat_remove(void);
745 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
746
747 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
748 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
749 enum zio_compress c);
750 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
751
752 enum {
753 L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
754 };
755
756 /*
757 * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
758 */
759 typedef struct l2arc_log_blk_ptr {
760 uint64_t l2lbp_daddr; /* device address of log */
761 /*
762 * l2lbp_prop is the same format as the blk_prop in blkptr_t:
763 * * logical size (in sectors)
764 * * physical (compressed) size (in sectors)
765 * * compression algorithm (we always LZ4-compress l2arc logs)
766 * * checksum algorithm (used for l2lbp_cksum)
767 * * object type & level (unused for now)
768 */
769 uint64_t l2lbp_prop;
770 zio_cksum_t l2lbp_cksum; /* fletcher4 of log */
771 } l2arc_log_blk_ptr_t;
772
773 /*
774 * The persistent L2ARC device header.
775 */
776 typedef struct l2arc_dev_hdr_phys {
777 uint64_t l2dh_magic;
778 zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */
779
780 /*
781 * Global L2ARC device state and metadata.
782 */
783 uint64_t l2dh_spa_guid;
784 uint64_t l2dh_evict_tail; /* current evict pointer */
785 uint64_t l2dh_alloc_space; /* vdev space alloc status */
786 uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */
787
788 /*
789 * Start of log block chain. [0] -> newest log, [1] -> one older (used
790 * for initiating prefetch).
791 */
792 l2arc_log_blk_ptr_t l2dh_start_lbps[2];
793
794 const uint64_t l2dh_pad[43]; /* pad to 512 bytes */
795 } l2arc_dev_hdr_phys_t;
796 CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
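/*
 * Size check for the header above: 8 (magic) + 32 (self_cksum) +
 * 4 * 8 (guid, evict_tail, alloc_space, flags) + 2 * 48 (start_lbps) +
 * 43 * 8 (pad) = 512 bytes, which is what the CTASSERT verifies against
 * SPA_MINBLOCKSIZE.
 */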
797
798 /*
799 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
800 */
801 typedef struct l2arc_log_ent_phys {
802 dva_t l2le_dva; /* dva of buffer */
803 uint64_t l2le_birth; /* birth txg of buffer */
804 uint64_t l2le_cksum0;
805 zio_cksum_t l2le_freeze_cksum;
806 /*
807 * l2le_prop is the same format as the blk_prop in blkptr_t:
808 * * logical size (in sectors)
809 * * physical (compressed) size (in sectors)
810 * * compression algorithm
811 * * checksum algorithm (used for cksum0)
812 * * object type & level (used to restore arc_buf_contents_t)
813 */
814 uint64_t l2le_prop;
815 uint64_t l2le_daddr; /* buf location on l2dev */
816 const uint64_t l2le_pad[6]; /* resv'd for future use */
817 } l2arc_log_ent_phys_t;
818
819 /*
820 * These design limits give us the following overhead (before compression):
821 * avg_blk_sz overhead
822 * 1k 12.51 %
823 * 2k 6.26 %
824 * 4k 3.13 %
825 * 8k 1.56 %
826 * 16k 0.78 %
827 * 32k 0.39 %
828 * 64k 0.20 %
829 * 128k 0.10 %
830 * Compression should be able to sequeeze these down by about a factor of 2x.
831 */
832 #define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
833 #define L2ARC_LOG_BLK_HEADER_LEN (128)
834 #define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
835 ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
836 sizeof (l2arc_log_ent_phys_t))
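/*
 * The overhead figures above follow from sizeof (l2arc_log_ent_phys_t),
 * which is 128 bytes (16 + 8 + 8 + 32 + 8 + 8 + 48 of padding), divided by
 * the average cached block size: e.g. 128 / 1024 = 12.5% for 1k buffers.
 * With a 128 byte header, a 128k log block therefore holds
 * (131072 - 128) / 128 = 1023 entries.
 */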
837 /*
838 * Maximum amount of data in an l2arc log block (used to terminate rebuilding
839 * before we hit the write head and restore potentially corrupted blocks).
840 */
841 #define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
842 (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
843 /*
844 * For the persistency and rebuild algorithms to operate reliably we need
845 * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
846 * excessive log block looping might confuse the log chain end detection).
847 * Under normal circumstances this is not a problem, since this is somewhere
848 * around only 400 MB.
849 */
850 #define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
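/*
 * In numbers: L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE is 128k * 1023 entries, about
 * 134 MB, so L2ARC_PERSIST_MIN_SIZE works out to roughly 400 MB, matching
 * the figure quoted in the comment above.
 */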
851
852 /*
853 * A log block of up to 1023 ARC buffer log entries, chained into the
854 * persistent L2ARC metadata linked list.
855 */
856 typedef struct l2arc_log_blk_phys {
857 /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
858 uint64_t l2lb_magic;
859 l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */
860 uint64_t l2lb_pad[9]; /* resv'd for future use */
861 /* Payload */
862 l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
863 } l2arc_log_blk_phys_t;
864
865 CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
866 CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
867 offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
868
869 /*
870 * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
871 * written to the L2ARC device. They may be compressed, hence the uint8_t[].
872 */
873 typedef struct l2arc_log_blk_buf {
874 uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
875 list_node_t l2lbb_node;
876 } l2arc_log_blk_buf_t;
877
878 /* Macros for manipulating fields in the blk_prop format of blkptr_t */
879 #define BLKPROP_GET_LSIZE(_obj, _field) \
880 BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
881 #define BLKPROP_SET_LSIZE(_obj, _field, x) \
882 BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
883 #define BLKPROP_GET_PSIZE(_obj, _field) \
884 BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
885 #define BLKPROP_SET_PSIZE(_obj, _field, x) \
886 BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
887 #define BLKPROP_GET_COMPRESS(_obj, _field) \
888 BF64_GET((_obj)->_field, 32, 8)
889 #define BLKPROP_SET_COMPRESS(_obj, _field, x) \
890 BF64_SET((_obj)->_field, 32, 8, x)
891 #define BLKPROP_GET_CHECKSUM(_obj, _field) \
892 BF64_GET((_obj)->_field, 40, 8)
893 #define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
894 BF64_SET((_obj)->_field, 40, 8, x)
895 #define BLKPROP_GET_TYPE(_obj, _field) \
896 BF64_GET((_obj)->_field, 48, 8)
897 #define BLKPROP_SET_TYPE(_obj, _field, x) \
898 BF64_SET((_obj)->_field, 48, 8, x)
899
900 /* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
901 #define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop)
902 #define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
903 #define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop)
904 #define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
905 #define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
906 #define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
907 x)
908 #define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
909 #define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
910 x)
911 #define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop)
912 #define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
913
914 /* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
915 #define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop)
916 #define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x)
917 #define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop)
918 #define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x)
919 #define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop)
920 #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
921 #define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop)
922 #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
923 #define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop)
924 #define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x)
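/*
 * Illustrative use of the LE_* setters, roughly what l2arc_log_blk_insert()
 * is expected to do when recording a cached buffer (a sketch; the
 * surrounding code and index handling are assumed, and other fields such as
 * l2le_birth and the checksums are filled in as well):
 *
 *	l2arc_log_ent_phys_t *le = &dev->l2ad_log_blk.l2lb_entries[idx];
 *	le->l2le_dva = ab->b_dva;
 *	le->l2le_daddr = ab->b_l2hdr->b_daddr;
 *	LE_SET_LSIZE(le, ab->b_size);
 *	LE_SET_PSIZE(le, ab->b_l2hdr->b_asize);
 *	LE_SET_COMPRESS(le, ab->b_l2hdr->b_compress);
 *	LE_SET_TYPE(le, ab->b_type);
 */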
925
926 #define PTR_SWAP(x, y) \
927 do { \
928 void *tmp = (x);\
929 x = y; \
930 y = tmp; \
931 _NOTE(CONSTCOND)\
932 } while (0)
933
934 #define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU
935 #define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU
936 #define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */
937
938 struct l2arc_dev {
939 vdev_t *l2ad_vdev; /* vdev */
940 spa_t *l2ad_spa; /* spa */
941 uint64_t l2ad_hand; /* next write location */
942 uint64_t l2ad_start; /* first addr on device */
943 uint64_t l2ad_end; /* last addr on device */
944 uint64_t l2ad_evict; /* last addr eviction reached */
945 boolean_t l2ad_first; /* first sweep through */
946 boolean_t l2ad_writing; /* currently writing */
947 list_t *l2ad_buflist; /* buffer list */
948 list_node_t l2ad_node; /* device list node */
949 l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */
950 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
951 int l2ad_log_ent_idx; /* index into cur log blk */
952 /* number of bytes in current log block's payload */
953 uint64_t l2ad_log_blk_payload_asize;
954 /* flag indicating whether a rebuild is scheduled or is going on */
955 boolean_t l2ad_rebuild;
956 };
957
958 /*
959 * Performance tuning of L2ARC persistency:
960 *
961 * l2arc_rebuild_enabled : Controls whether adding an L2ARC device (either
962 * at pool import or manually later on) will attempt to rebuild the
963 * L2ARC buffer contents. In special circumstances,
964 * the administrator may want to set this to B_FALSE, if they
965 * are having trouble importing a pool or attaching an L2ARC
966 * device (e.g. the L2ARC device is slow to read in stored log
967 * metadata, or the metadata has become somehow
968 * fragmented/unusable).
969 * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to keep
970 * a slow L2ARC device from holding up pool import. If we
971 * are not done rebuilding an L2ARC device by this time, we
972 * stop the rebuild and return immediately.
973 */
974 boolean_t l2arc_rebuild_enabled = B_TRUE;
975 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
976
977 /*
978 * L2ARC persistency rebuild routines.
979 */
980 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
981 static int l2arc_rebuild(l2arc_dev_t *dev);
982 static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
983 l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
984 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
985 l2arc_dev_t *dev, uint64_t guid);
986
987 /*
988 * L2ARC persistency read I/O routines.
989 */
990 static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
991 static int l2arc_log_blk_read(l2arc_dev_t *dev,
992 const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
993 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
994 uint8_t *this_lb_buf, uint8_t *next_lb_buf,
995 zio_t *this_io, zio_t **next_io);
996 static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
997 const l2arc_log_blk_ptr_t *lp);
998 static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
999 const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
1000 static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1001
1002 /*
1003 * L2ARC persistency write I/O routines.
1004 */
1005 static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1006 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1007 l2arc_write_callback_t *cb);
1008
1009 /*
1010 * L2ARC persistency auxiliary routines.
1011 */
1012 static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1013 zio_cksum_t *cksum);
1014 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1015 const arc_buf_hdr_t *ab);
1016 static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1017 uint64_t top, uint64_t check);
1018 static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
1019
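/*
 * Hashes a buffer's identity (spa load guid, DVA and birth TXG) into a
 * value used to index the buf hash table. The DVA bytes are folded through
 * the ZFS CRC64 polynomial and then mixed with the spa guid and birth TXG.
 */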
1020 static inline uint64_t
1021 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1022 {
1023 uint8_t *vdva = (uint8_t *)dva;
1024 uint64_t crc = -1ULL;
1025 int i;
1026
1027 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1028
1029 for (i = 0; i < sizeof (dva_t); i++)
1030 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1031
1032 crc ^= (spa >> 8) ^ birth;
1033
1034 return (crc);
1035 }
1036
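/* True if the header carries no on-disk identity (zero DVA and birth TXG). */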
1037 #define BUF_EMPTY(buf) \
1038 ((buf)->b_dva.dva_word[0] == 0 && \
1039 (buf)->b_dva.dva_word[1] == 0 && \
1040 (buf)->b_birth == 0)
1540 if (use_mutex)
1541 mutex_exit(&new_state->arcs_mtx);
1542 }
1543 }
1544
1545 ASSERT(!BUF_EMPTY(ab));
1546 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1547 buf_hash_remove(ab);
1548
1549 /* adjust state sizes */
1550 if (to_delta)
1551 atomic_add_64(&new_state->arcs_size, to_delta);
1552 if (from_delta) {
1553 ASSERT3U(old_state->arcs_size, >=, from_delta);
1554 atomic_add_64(&old_state->arcs_size, -from_delta);
1555 }
1556 ab->b_state = new_state;
1557
1558 /* adjust l2arc hdr stats */
1559 if (new_state == arc_l2c_only)
1560 l2arc_hdr_stat_add(old_state != arc_anon);
1561 else if (old_state == arc_l2c_only)
1562 l2arc_hdr_stat_remove();
1563 }
1564
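/*
 * Accounts for `space' bytes of newly consumed ARC memory by bumping the
 * kstat counter that corresponds to the given allocation type.
 */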
1565 void
1566 arc_space_consume(uint64_t space, arc_space_type_t type)
1567 {
1568 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1569
1570 switch (type) {
1571 case ARC_SPACE_DATA:
1572 ARCSTAT_INCR(arcstat_data_size, space);
1573 break;
1574 case ARC_SPACE_OTHER:
1575 ARCSTAT_INCR(arcstat_other_size, space);
1576 break;
1577 case ARC_SPACE_HDRS:
1578 ARCSTAT_INCR(arcstat_hdr_size, space);
1579 break;
1580 case ARC_SPACE_L2HDRS:
1644 hdr->b_type = type;
1645 hdr->b_spa = spa_load_guid(spa);
1646 hdr->b_state = arc_anon;
1647 hdr->b_arc_access = 0;
1648 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1649 buf->b_hdr = hdr;
1650 buf->b_data = NULL;
1651 buf->b_efunc = NULL;
1652 buf->b_private = NULL;
1653 buf->b_next = NULL;
1654 hdr->b_buf = buf;
1655 arc_get_data_buf(buf);
1656 hdr->b_datacnt = 1;
1657 hdr->b_flags = 0;
1658 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1659 (void) refcount_add(&hdr->b_refcnt, tag);
1660
1661 return (buf);
1662 }
1663
1664 /*
1665 * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1666 * This is used during l2arc reconstruction to make empty ARC buffers
1667 * which circumvent the regular disk->arc->l2arc path and instead come
1668 * into being in the reverse order, i.e. l2arc->arc->(disk).
1669 */
1670 arc_buf_hdr_t *
1671 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1672 {
1673 arc_buf_hdr_t *hdr;
1674
1675 ASSERT3U(size, >, 0);
1676 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1677 ASSERT(BUF_EMPTY(hdr));
1678 hdr->b_size = size;
1679 hdr->b_type = type;
1680 hdr->b_spa = guid;
1681 hdr->b_state = arc_anon;
1682 hdr->b_arc_access = 0;
1683 hdr->b_buf = NULL;
1684 hdr->b_datacnt = 0;
1685 hdr->b_flags = 0;
1686 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1687
1688 return (hdr);
1689 }
1690
1691 static char *arc_onloan_tag = "onloan";
1692
1693 /*
1694 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1695 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1696 * buffers must be returned to the arc before they can be used by the DMU or
1697 * freed.
1698 */
1699 arc_buf_t *
1700 arc_loan_buf(spa_t *spa, int size)
1701 {
1702 arc_buf_t *buf;
1703
1704 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1705
1706 atomic_add_64(&arc_loaned_bytes, size);
1707 return (buf);
1708 }
1709
1710 /*
1908 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1909 /*
1910 * To prevent arc_free() and l2arc_evict() from
1911 * attempting to free the same buffer at the same time,
1912 * a FREE_IN_PROGRESS flag is given to arc_free() to
1913 * give it priority. l2arc_evict() can't destroy this
1914 * header while we are waiting on l2arc_buflist_mtx.
1915 *
1916 * The hdr may be removed from l2ad_buflist before we
1917 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1918 */
1919 if (!buflist_held) {
1920 mutex_enter(&l2arc_buflist_mtx);
1921 l2hdr = hdr->b_l2hdr;
1922 }
1923
1924 if (l2hdr != NULL) {
1925 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1926 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1927 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1928 kmem_free(l2hdr, sizeof (*l2hdr));
1929 if (hdr->b_state == arc_l2c_only)
1930 l2arc_hdr_stat_remove();
1931 hdr->b_l2hdr = NULL;
1932 }
1933
1934 if (!buflist_held)
1935 mutex_exit(&l2arc_buflist_mtx);
1936 }
1937
1938 if (!BUF_EMPTY(hdr)) {
1939 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1940 buf_discard_identity(hdr);
1941 }
1942 while (hdr->b_buf) {
1943 arc_buf_t *buf = hdr->b_buf;
1944
1945 if (buf->b_efunc) {
1946 mutex_enter(&arc_eviction_mtx);
1947 mutex_enter(&buf->b_evict_lock);
1948 ASSERT(buf->b_hdr != NULL);
3365 buf->b_next = NULL;
3366 hdr->b_buf = buf;
3367 ASSERT(hdr->b_datacnt == 0);
3368 hdr->b_datacnt = 1;
3369 arc_get_data_buf(buf);
3370 arc_access(hdr, hash_lock);
3371 }
3372
3373 ASSERT(!GHOST_STATE(hdr->b_state));
3374
3375 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3376 acb->acb_done = done;
3377 acb->acb_private = private;
3378
3379 ASSERT(hdr->b_acb == NULL);
3380 hdr->b_acb = acb;
3381 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3382
3383 if (hdr->b_l2hdr != NULL &&
3384 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3385 /*
3386 * Need to stash these before letting go of hash_lock
3387 */
3388 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3389 addr = hdr->b_l2hdr->b_daddr;
3390 b_compress = hdr->b_l2hdr->b_compress;
3391 b_asize = hdr->b_l2hdr->b_asize;
3392 /*
3393 * Lock out device removal.
3394 */
3395 if (vdev_is_dead(vd) ||
3396 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3397 vd = NULL;
3398 }
3399
3400 mutex_exit(hash_lock);
3401
3402 /*
3403 * At this point, we have a level 1 cache miss. Try again in
3404 * L2ARC if possible.
3405 */
3406 ASSERT3U(hdr->b_size, ==, size);
3407 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3741 atomic_add_64(&arc_anon->arcs_size, blksz);
3742 } else {
3743 mutex_exit(&buf->b_evict_lock);
3744 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3745 ASSERT(!list_link_active(&hdr->b_arc_node));
3746 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3747 if (hdr->b_state != arc_anon)
3748 arc_change_state(arc_anon, hdr, hash_lock);
3749 hdr->b_arc_access = 0;
3750 if (hash_lock)
3751 mutex_exit(hash_lock);
3752
3753 buf_discard_identity(hdr);
3754 arc_buf_thaw(buf);
3755 }
3756 buf->b_efunc = NULL;
3757 buf->b_private = NULL;
3758
3759 if (l2hdr) {
3760 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3761 kmem_free(l2hdr, sizeof (*l2hdr));
3762 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3763 mutex_exit(&l2arc_buflist_mtx);
3764 }
3765 }
3766
3767 int
3768 arc_released(arc_buf_t *buf)
3769 {
3770 int released;
3771
3772 mutex_enter(&buf->b_evict_lock);
3773 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3774 mutex_exit(&buf->b_evict_lock);
3775 return (released);
3776 }
3777
3778 int
3779 arc_has_callback(arc_buf_t *buf)
3780 {
3781 int callback;
4356 * l2arc_noprefetch skip caching prefetched buffers
4357 * l2arc_headroom number of max device writes to precache
4358 * l2arc_headroom_boost when we find compressed buffers during ARC
4359 * scanning, we multiply headroom by this
4360 * percentage factor for the next scan cycle,
4361 * since more compressed buffers are likely to
4362 * be present
4363 * l2arc_feed_secs seconds between L2ARC writing
4364 *
4365 * Tunables may be removed or added as future performance improvements are
4366 * integrated, and also may become zpool properties.
4367 *
4368 * There are three key functions that control how the L2ARC warms up:
4369 *
4370 * l2arc_write_eligible() check if a buffer is eligible to cache
4371 * l2arc_write_size() calculate how much to write
4372 * l2arc_write_interval() calculate sleep delay between writes
4373 *
4374 * These three functions determine what to write, how much, and how quickly
4375 * to send writes.
4376 *
4377 * L2ARC persistency:
4378 *
4379 * When writing buffers to L2ARC, we periodically add some metadata to
4380 * make sure we can pick them up after reboot, thus dramatically reducing
4381 * the impact that any downtime has on the performance of storage systems
4382 * with large caches.
4383 *
4384 * The implementation works fairly simply by integrating the following two
4385 * modifications:
4386 *
4387 * *) Every now and then we mix in a piece of metadata (called a log block)
4388 * into the L2ARC write. This allows us to understand what's been written,
4389 * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
4390 * buffers. The log block also includes a "back-reference" pointer to the
4391 * previous block, forming a back-linked list of blocks on the L2ARC device.
4392 *
4393 * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
4394 * for our header bookkeeping purposes. This contains a device header, which
4395 * contains our top-level reference structures. We update it each time we
4396 * write a new log block, so that we're able to locate it in the L2ARC
4397 * device. If this write results in an inconsistent device header (e.g. due
4398 * to power failure), we detect this by verifying the header's checksum
4399 * and simply drop the entries from L2ARC.
4400 *
4401 * Implementation diagram:
4402 *
4403 * +=== L2ARC device (not to scale) ======================================+
4404 * | __________newest log block pointers_________ |
4405 * | / \1 back \latest |
4406 * | / V V |
4407 * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
4408 * | ^ / ^ / ^ / |
4409 * | `-prev-' `-prev-' `-prev-' |
4410 * | lb lb lb |
4411 * +======================================================================+
4412 *
4413 * On-device data structures:
4414 *
4415 * L2ARC device header: l2arc_dev_hdr_phys_t
4416 * L2ARC log block: l2arc_log_blk_phys_t
4417 *
4418 * L2ARC reconstruction:
4419 *
4420 * When writing data, we simply write in the standard rotary fashion,
4421 * evicting buffers as we go and simply writing new data over them (writing
4422 * a new log block every now and then). This obviously means that once we
4423 * loop around the end of the device, we will start cutting into an already
4424 * committed log block (and its referenced data buffers), like so:
4425 *
4426 * current write head__ __old tail
4427 * \ /
4428 * V V
4429 * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
4430 * ^ ^^^^^^^^^___________________________________
4431 * | \
4432 * <<nextwrite>> may overwrite this blk and/or its bufs --'
4433 *
4434 * When importing the pool, we detect this situation and use it to stop
4435 * our scanning process (see l2arc_rebuild).
4436 *
4437 * There is one significant caveat to consider when rebuilding ARC contents
4438 * from an L2ARC device: what about invalidated buffers? Given the above
4439 * construction, we cannot go back and amend already-written log blocks to
4440 * remove entries for buffers that were later invalidated. Thus, during reconstruction,
4441 * we might be populating the cache with buffers for data that's not on the
4442 * main pool anymore, or may have been overwritten!
4443 *
4444 * As it turns out, this isn't a problem. Every arc_read request includes
4445 * both the DVA and, crucially, the birth TXG of the BP the caller is
4446 * looking for. So even if the cache were populated by completely rotten
4447 * blocks for data that had been long deleted and/or overwritten, we'll
4448 * never actually return bad data from the cache, since the DVA together
4449 * with the birth TXG uniquely identifies a block in space and time; once
4450 * created, a block is immutable on disk. The worst we will have done is
4451 * waste some time and memory during l2arc rebuild to reconstruct outdated ARC
4452 * entries that will get dropped from the l2arc as it is being updated
4453 * with new blocks.
4454 */
4455
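/*
 * Illustrative sketch (kept out of the build): a simplified walk of the
 * back-linked log block chain described above. The real implementation,
 * l2arc_rebuild() below, additionally prefetches the next block, verifies
 * checksums, decompresses, enforces l2arc_rebuild_timeout and bails out
 * under memory pressure; the read/verify step is elided here.
 */
#if 0
static void
l2arc_chain_walk_sketch(l2arc_dev_t *dev)
{
	l2arc_log_blk_ptr_t lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[0];
	l2arc_log_blk_phys_t *lb = kmem_zalloc(sizeof (*lb), KM_SLEEP);

	while (l2arc_log_blk_ptr_valid(dev, &lbp)) {
		/* read the block at lbp into *lb and verify it (elided) */
		l2arc_log_blk_restore(dev, spa_load_guid(dev->l2ad_spa),
		    lb, LBP_GET_PSIZE(&lbp));
		/* follow the back-reference to the previous log block */
		lbp = lb->l2lb_back2_lbp;
	}
	kmem_free(lb, sizeof (*lb));
}
#endif
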
4456 static boolean_t
4457 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4458 {
4459 /*
4460 * A buffer is *not* eligible for the L2ARC if it:
4461 * 1. belongs to a different spa.
4462 * 2. is already cached on the L2ARC.
4463 * 3. has an I/O in progress (it may be an incomplete read).
4464 * 4. is flagged not eligible (zfs property).
4465 */
4466 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4467 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4468 return (B_FALSE);
4469
4470 return (B_TRUE);
4471 }
4472
4473 static uint64_t
4500 clock_t interval, next, now;
4501
4502 /*
4503 * If the ARC lists are busy, increase our write rate; if the
4504 * lists are stale, idle back. This is achieved by checking
4505 * how much we previously wrote - if it was more than half of
4506 * what we wanted, schedule the next write much sooner.
4507 */
4508 if (l2arc_feed_again && wrote > (wanted / 2))
4509 interval = (hz * l2arc_feed_min_ms) / 1000;
4510 else
4511 interval = hz * l2arc_feed_secs;
4512
4513 now = ddi_get_lbolt();
4514 next = MAX(now, MIN(now + interval, began + interval));
4515
4516 return (next);
4517 }
4518
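/*
 * Shifts header size accounting between the ARC and L2ARC header kstats
 * when a buffer header transitions into (l2arc_hdr_stat_add) or out of
 * (l2arc_hdr_stat_remove) the L2ARC-only state.
 */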
4519 static void
4520 l2arc_hdr_stat_add(boolean_t from_arc)
4521 {
4522 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4523 if (from_arc)
4524 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4525 }
4526
4527 static void
4528 l2arc_hdr_stat_remove(void)
4529 {
4530 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4531 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4532 }
4533
4534 /*
4535 * Cycle through L2ARC devices. This is how L2ARC load balances.
4536 * If a device is returned, this also returns holding the spa config lock.
4537 */
4538 static l2arc_dev_t *
4539 l2arc_dev_get_next(void)
4540 {
4541 l2arc_dev_t *first, *next = NULL;
4542
4543 /*
4544 * Lock out the removal of spas (spa_namespace_lock), then removal
4545 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4546 * both locks will be dropped and a spa config lock held instead.
4547 */
4548 mutex_enter(&spa_namespace_lock);
4549 mutex_enter(&l2arc_dev_mtx);
4550
4551 /* if there are no vdevs, there is nothing to do */
4552 if (l2arc_ndev == 0)
4553 goto out;
4554
4555 first = NULL;
4556 next = l2arc_dev_last;
4557 do {
4558 /*
4559 * Loop around the list looking for a non-faulted vdev
4560 * and one that isn't currently doing an L2ARC rebuild.
4561 */
4562 if (next == NULL) {
4563 next = list_head(l2arc_dev_list);
4564 } else {
4565 next = list_next(l2arc_dev_list, next);
4566 if (next == NULL)
4567 next = list_head(l2arc_dev_list);
4568 }
4569
4570 /* if we have come back to the start, bail out */
4571 if (first == NULL)
4572 first = next;
4573 else if (next == first)
4574 break;
4575
4576 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
4577
4578 /* if we were unable to find any usable vdevs, return NULL */
4579 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
4580 next = NULL;
4581
4582 l2arc_dev_last = next;
4583
4584 out:
4585 mutex_exit(&l2arc_dev_mtx);
4586
4587 /*
4588 * Grab the config lock to prevent the 'next' device from being
4589 * removed while we are writing to it.
4590 */
4591 if (next != NULL)
4592 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4593 mutex_exit(&spa_namespace_lock);
4594
4595 return (next);
4596 }
4597
4598 /*
4599 * Free buffers that were tagged for destruction.
4613 ASSERT(df->l2df_func != NULL);
4614 df->l2df_func(df->l2df_data, df->l2df_size);
4615 list_remove(buflist, df);
4616 kmem_free(df, sizeof (l2arc_data_free_t));
4617 }
4618
4619 mutex_exit(&l2arc_free_on_write_mtx);
4620 }
4621
4622 /*
4623 * A write to a cache device has completed. Update all headers to allow
4624 * reads from these buffers to begin.
4625 */
4626 static void
4627 l2arc_write_done(zio_t *zio)
4628 {
4629 l2arc_write_callback_t *cb;
4630 l2arc_dev_t *dev;
4631 list_t *buflist;
4632 arc_buf_hdr_t *head, *ab, *ab_prev;
4633 l2arc_buf_hdr_t *l2hdr;
4634 kmutex_t *hash_lock;
4635 l2arc_log_blk_buf_t *lb_buf;
4636
4637 cb = zio->io_private;
4638 ASSERT(cb != NULL);
4639 dev = cb->l2wcb_dev;
4640 ASSERT(dev != NULL);
4641 head = cb->l2wcb_head;
4642 ASSERT(head != NULL);
4643 buflist = dev->l2ad_buflist;
4644 ASSERT(buflist != NULL);
4645 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4646 l2arc_write_callback_t *, cb);
4647
4648 if (zio->io_error != 0)
4649 ARCSTAT_BUMP(arcstat_l2_writes_error);
4650
4651 mutex_enter(&l2arc_buflist_mtx);
4652
4653 /*
4654 * All writes completed, or an error was hit.
4655 */
4656 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4657 ab_prev = list_prev(buflist, ab);
4658 l2hdr = ab->b_l2hdr;
4659
4660 /*
4661 * Release the temporary compressed buffer as soon as possible.
4662 */
4663 if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
4664 l2arc_release_cdata_buf(ab);
4665
4666 hash_lock = HDR_LOCK(ab);
4667 if (!mutex_tryenter(hash_lock)) {
4668 /*
4669 * This buffer misses out. It may be in a stage
4670 * of eviction. Its ARC_L2_WRITING flag will be
4671 * left set, denying reads to this buffer.
4672 */
4673 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4674 continue;
4675 }
4676
4677 if (zio->io_error != 0) {
4678 /*
4679 * Error - drop L2ARC entry.
4680 */
4681 list_remove(buflist, ab);
4682 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4683 ab->b_l2hdr = NULL;
4684 kmem_free(l2hdr, sizeof (*l2hdr));
4685 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4686 }
4687
4688 /*
4689 * Allow ARC to begin reads to this L2ARC entry.
4690 */
4691 ab->b_flags &= ~ARC_L2_WRITING;
4692
4693 mutex_exit(hash_lock);
4694 }
4695
4696 atomic_inc_64(&l2arc_writes_done);
4697 list_remove(buflist, head);
4698 kmem_cache_free(hdr_cache, head);
4699 mutex_exit(&l2arc_buflist_mtx);
4700
4701 l2arc_do_free_on_write();
4702
4703 for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
4704 lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
4705 (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
4706 kmem_free(lb_buf, sizeof (*lb_buf));
4707 }
4708 list_destroy(&cb->l2wcb_log_blk_buf_list);
4709 kmem_free(cb, sizeof (l2arc_write_callback_t));
4710 }
4711
4712 /*
4713 * A read to a cache device completed. Validate buffer contents before
4714 * handing over to the regular ARC routines.
4715 */
4716 static void
4717 l2arc_read_done(zio_t *zio)
4718 {
4719 l2arc_read_callback_t *cb;
4720 arc_buf_hdr_t *hdr;
4721 arc_buf_t *buf;
4722 kmutex_t *hash_lock;
4723 int equal;
4724
4725 ASSERT(zio->io_vd != NULL);
4726 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4727
4728 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4812 case 1:
4813 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4814 *lock = &arc_mru->arcs_mtx;
4815 break;
4816 case 2:
4817 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4818 *lock = &arc_mfu->arcs_mtx;
4819 break;
4820 case 3:
4821 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4822 *lock = &arc_mru->arcs_mtx;
4823 break;
4824 }
4825
4826 ASSERT(!(MUTEX_HELD(*lock)));
4827 mutex_enter(*lock);
4828 return (list);
4829 }
4830
4831 /*
4832 * Calculates the maximum overhead of L2ARC metadata log blocks for a given
4833 * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
4834 * overhead in processing to make sure there is enough headroom available
4835 * when writing buffers.
4836 */
4837 static inline uint64_t
4838 l2arc_log_blk_overhead(uint64_t write_sz)
4839 {
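/*
 * A write of write_sz bytes references at most write_sz / SPA_MINBLOCKSIZE
 * buffers, i.e. at most that many log entries. Every L2ARC_LOG_BLK_ENTRIES
 * of those fill one log block of L2ARC_LOG_BLK_SIZE bytes on the device,
 * plus one more block for a partially filled remainder.
 */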
4840 return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
4841 L2ARC_LOG_BLK_SIZE;
4842 }
4843
4844 /*
4845 * Evict buffers from the device write hand to the distance specified in
4846 * bytes. This distance may span populated buffers, or it may span nothing.
4847 * This is clearing a region on the L2ARC device ready for writing.
4848 * If the 'all' boolean is set, every buffer is evicted.
4849 */
4850 static void
4851 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4852 {
4853 list_t *buflist;
4854 l2arc_buf_hdr_t *l2hdr;
4855 arc_buf_hdr_t *ab, *ab_prev;
4856 kmutex_t *hash_lock;
4857 uint64_t taddr;
4858
4859 buflist = dev->l2ad_buflist;
4860
4861 if (buflist == NULL)
4862 return;
4863
4864 if (!all && dev->l2ad_first) {
4865 /*
4866 * This is the first sweep through the device. There is
4867 * nothing to evict.
4868 */
4869 return;
4870 }
4871
4872 /*
4873 * We need to add in the worst case scenario of log block overhead.
4874 */
4875 distance += l2arc_log_blk_overhead(distance);
4876 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4877 /*
4878 * When nearing the end of the device, evict to the end
4879 * before the device write hand jumps to the start.
4880 */
4881 taddr = dev->l2ad_end;
4882 } else {
4883 taddr = dev->l2ad_hand + distance;
4884 }
4885 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4886 uint64_t, taddr, boolean_t, all);
4887
4888 top:
4889 mutex_enter(&l2arc_buflist_mtx);
4890 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4891 ab_prev = list_prev(buflist, ab);
4892
4893 hash_lock = HDR_LOCK(ab);
4894 if (!mutex_tryenter(hash_lock)) {
4895 /*
4938 * arc_hdr_destroy() will call list_remove()
4939 * and decrement arcstat_l2_size.
4940 */
4941 arc_change_state(arc_anon, ab, hash_lock);
4942 arc_hdr_destroy(ab);
4943 } else {
4944 /*
4945 * Invalidate issued or about to be issued
4946 * reads, since we may be about to write
4947 * over this location.
4948 */
4949 if (HDR_L2_READING(ab)) {
4950 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4951 ab->b_flags |= ARC_L2_EVICTED;
4952 }
4953
4954 /*
4955 * Tell ARC this no longer exists in L2ARC.
4956 */
4957 if (ab->b_l2hdr != NULL) {
4958 l2hdr = ab->b_l2hdr;
4959 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4960 ab->b_l2hdr = NULL;
4961 kmem_free(l2hdr, sizeof (*l2hdr));
4962 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4963 }
4964 list_remove(buflist, ab);
4965
4966 /*
4967 * This may have been leftover after a
4968 * failed write.
4969 */
4970 ab->b_flags &= ~ARC_L2_WRITING;
4971 }
4972 mutex_exit(hash_lock);
4973 }
4974 mutex_exit(&l2arc_buflist_mtx);
4975
4976 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4977 dev->l2ad_evict = taddr;
4978 }
4979
4980 /*
4981 * Find and write ARC buffers to the L2ARC device.
4982 *
4983 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4984 * for reading until they have completed writing.
4985 * The headroom_boost is an in-out parameter used to maintain headroom boost
4986 * state between calls to this function.
4987 *
4988 * Returns the number of bytes actually written (which may be smaller than
4989 * the delta by which the device hand has changed due to alignment).
4990 */
4991 static uint64_t
4992 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4993 boolean_t *headroom_boost)
4994 {
4995 arc_buf_hdr_t *ab, *ab_prev, *head;
4996 list_t *list;
4997 /*
4998 * These variables mean:
4999 * - write_size: in-memory size of ARC buffers we've written (before
5000 * compression).
5001 * - write_asize: actual on-disk size of ARC buffers we've written
5002 * (after compression).
5003 * - write_aligned_asize: actual sum of space taken by ARC buffers
5004 * on the device (after compression and alignment, so that
5005 * every buffer starts on a multiple of the device block size).
5006 * - headroom: L2ARC scanning headroom (we won't scan beyond this
5007 * distance from the list tail).
5008 * - buf_compress_minsz: minimum in-memory ARC buffer size for us
5009 * to try compressing it.
5010 */
5011 uint64_t write_size, write_asize, write_aligned_asize, headroom,
5012 buf_compress_minsz;
5013 void *buf_data;
5014 kmutex_t *list_lock;
5015 boolean_t full;
5016 l2arc_write_callback_t *cb;
5017 zio_t *pio, *wzio;
5018 uint64_t guid = spa_load_guid(spa);
5019 const boolean_t do_headroom_boost = *headroom_boost;
5020 boolean_t dev_hdr_update = B_FALSE;
5021
5022 ASSERT(dev->l2ad_vdev != NULL);
5023
5024 /* Lower the flag now, we might want to raise it again later. */
5025 *headroom_boost = B_FALSE;
5026
5027 pio = NULL;
5028 cb = NULL;
5029 write_size = write_asize = write_aligned_asize = 0;
5030 full = B_FALSE;
5031 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5032 head->b_flags |= ARC_L2_WRITE_HEAD;
5033
5034 /*
5035 * We will want to try to compress buffers that are at least 2x the
5036 * device sector size.
5037 */
5038 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5039
5040 /*
5041 * Copy buffers for L2ARC writing.
5042 */
5043 mutex_enter(&l2arc_buflist_mtx);
5044 for (int try = 0; try <= 3; try++) {
5045 uint64_t passed_sz = 0;
5046
5047 list = l2arc_list_locked(try, &list_lock);
5048
5049 /*
5050 * L2ARC fast warmup.
5051 *
5052 * Until the ARC is warm and starts to evict, read from the
5053 * head of the ARC lists rather than the tail.
5054 */
5055 if (arc_warm == B_FALSE)
5056 ab = list_head(list);
5057 else
5058 ab = list_tail(list);
5059
5060 headroom = target_sz * l2arc_headroom;
5061 if (do_headroom_boost)
5062 headroom = (headroom * l2arc_headroom_boost) / 100;
5063
5064 for (; ab; ab = ab_prev) {
5065 l2arc_buf_hdr_t *l2hdr;
5066 kmutex_t *hash_lock;
5067 uint64_t buf_aligned_size;
5068
5069 if (arc_warm == B_FALSE)
5070 ab_prev = list_next(list, ab);
5071 else
5072 ab_prev = list_prev(list, ab);
5073
5074 hash_lock = HDR_LOCK(ab);
5075 if (!mutex_tryenter(hash_lock)) {
5076 /*
5077 * Skip this buffer rather than waiting.
5078 */
5079 continue;
5080 }
5081
5082 /*
5083 * When examining whether we've met our write target,
5084 * we must always use the aligned size of the buffer,
5085 * since that's the maximum amount of space a buffer
5086 * can take up on the L2ARC device.
5087 */
5088 buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
5089 ab->b_size);
5090 passed_sz += buf_aligned_size;
5091 if (passed_sz > headroom) {
5092 /*
5093 * Searched too far.
5094 */
5095 mutex_exit(hash_lock);
5096 break;
5097 }
5098
5099 if (!l2arc_write_eligible(guid, ab)) {
5100 mutex_exit(hash_lock);
5101 continue;
5102 }
5103
5104 if ((write_size + buf_aligned_size) > target_sz) {
5105 full = B_TRUE;
5106 mutex_exit(hash_lock);
5107 break;
5108 }
5109
5110 if (pio == NULL) {
5111 /*
5112 * Insert a dummy header on the buflist so
5113 * l2arc_write_done() can find where the
5114 * write buffers begin without searching.
5115 */
5116 list_insert_head(dev->l2ad_buflist, head);
5117
5118 cb = kmem_zalloc(
5119 sizeof (l2arc_write_callback_t), KM_SLEEP);
5120 cb->l2wcb_dev = dev;
5121 cb->l2wcb_head = head;
5122 list_create(&cb->l2wcb_log_blk_buf_list,
5123 sizeof (l2arc_log_blk_buf_t),
5124 offsetof(l2arc_log_blk_buf_t, l2lbb_node));
5125 pio = zio_root(spa, l2arc_write_done, cb,
5126 ZIO_FLAG_CANFAIL);
5127 }
5128
5129 /*
5130 * Create and add a new L2ARC header.
5131 */
5132 l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5133 l2hdr->b_dev = dev;
5134 ab->b_flags |= ARC_L2_WRITING;
5135
5136 /*
5137 * Temporarily stash the data buffer in b_tmp_cdata.
5138 * The subsequent write step will pick it up from
5139 * there. This is because we can't access ab->b_buf
5140 * without holding the hash_lock, which we in turn
5141 * can't access without holding the ARC list locks
5142 * (which we want to avoid during compression/writing).
5143 */
5144 l2hdr->b_compress = ZIO_COMPRESS_OFF;
5145 l2hdr->b_asize = ab->b_size;
5146 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5147
5148 ab->b_l2hdr = l2hdr;
5149
5150 list_insert_head(dev->l2ad_buflist, ab);
5151
5152 /*
5153 * Compute and store the buffer cksum before
5154 * writing. On debug builds the cksum is verified first.
5155 */
5156 arc_cksum_verify(ab->b_buf);
5157 arc_cksum_compute(ab->b_buf, B_TRUE);
5158
5159 mutex_exit(hash_lock);
5160
5161 write_size += buf_aligned_size;
5162 }
5163
5164 mutex_exit(list_lock);
5165
5166 if (full == B_TRUE)
5167 break;
5168 }
5169
5170 /* No buffers selected for writing? */
5171 if (pio == NULL) {
5172 ASSERT0(write_size);
5173 mutex_exit(&l2arc_buflist_mtx);
5174 kmem_cache_free(hdr_cache, head);
5175 return (0);
5176 }
5177
5178 /*
5179 * Now start writing the buffers. We're starting at the write head
5180 * and work backwards, retracing the course of the buffer selector
5181 * loop above.
5182 */
5183 for (ab = list_prev(dev->l2ad_buflist, head); ab;
5184 ab = list_prev(dev->l2ad_buflist, ab)) {
5185 l2arc_buf_hdr_t *l2hdr;
5186 uint64_t buf_sz;
5187
5188 /*
5189 * We shouldn't need to lock the buffer here, since we flagged
5190 * it as ARC_L2_WRITING in the previous step, but we must take
5191 * care to only access its L2 cache parameters. In particular,
5192 * ab->b_buf may be invalid by now due to ARC eviction.
5197 if ((ab->b_flags & ARC_L2COMPRESS) &&
5198 l2hdr->b_asize >= buf_compress_minsz) {
5199 if (l2arc_compress_buf(l2hdr)) {
5200 /*
5201 * If compression succeeded, enable headroom
5202 * boost on the next scan cycle.
5203 */
5204 *headroom_boost = B_TRUE;
5205 }
5206 }
5207
5208 /*
5209 * Pick up the buffer data we had previously stashed away
5210 * (and now potentially also compressed).
5211 */
5212 buf_data = l2hdr->b_tmp_cdata;
5213 buf_sz = l2hdr->b_asize;
5214
5215 /* Compression may have squashed the buffer to zero length. */
5216 if (buf_sz != 0) {
5217 uint64_t buf_aligned_asize;
5218
5219 wzio = zio_write_phys(pio, dev->l2ad_vdev,
5220 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5221 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5222 ZIO_FLAG_CANFAIL, B_FALSE);
5223
5224 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5225 zio_t *, wzio);
5226 (void) zio_nowait(wzio);
5227
5228 write_asize += buf_sz;
5229 /*
5230 * Keep the clock hand suitably device-aligned.
5231 */
5232 buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
5233 buf_sz);
5234 write_aligned_asize += buf_aligned_asize;
5235 dev->l2ad_hand += buf_aligned_asize;
5236 ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
5237 dev->l2ad_first);
5238 }
5239
5240 if (l2arc_log_blk_insert(dev, ab)) {
5241 l2arc_log_blk_commit(dev, pio, cb);
5242 dev_hdr_update = B_TRUE;
5243 }
5244 }
5245 mutex_exit(&l2arc_buflist_mtx);
5246
5247 if (dev_hdr_update)
5248 l2arc_dev_hdr_update(dev, pio);
5249
5250 VERIFY3U(write_aligned_asize, <=, target_sz);
5251 ARCSTAT_BUMP(arcstat_l2_writes_sent);
5252 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5253 ARCSTAT_INCR(arcstat_l2_size, write_size);
5254 ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
5255 vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
5256
5257 /*
5258 * Bump device hand to the device start if it is approaching the end.
5259 * l2arc_evict() will already have evicted ahead for this case.
5260 */
5261 if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
5262 dev->l2ad_end) {
5263 vdev_space_update(dev->l2ad_vdev,
5264 dev->l2ad_end - dev->l2ad_hand, 0, 0);
5265 dev->l2ad_hand = dev->l2ad_start;
5266 dev->l2ad_evict = dev->l2ad_start;
5267 dev->l2ad_first = B_FALSE;
5268 }
5269
5270 dev->l2ad_writing = B_TRUE;
5271 (void) zio_wait(pio);
5272 dev->l2ad_writing = B_FALSE;
5273
5274 return (write_asize);
5275 }
5276
5277 /*
5278 * Compresses an L2ARC buffer.
5279 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5280 * size in l2hdr->b_asize. This routine tries to compress the data and
5281 * depending on the compression result there are three possible outcomes:
5282 * *) The buffer was incompressible. The original l2hdr contents were left
5504 * Write ARC buffers.
5505 */
5506 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5507
5508 /*
5509 * Calculate interval between writes.
5510 */
5511 next = l2arc_write_interval(begin, size, wrote);
5512 spa_config_exit(spa, SCL_L2ARC, dev);
5513 }
5514
5515 l2arc_thread_exit = 0;
5516 cv_broadcast(&l2arc_feed_thr_cv);
5517 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5518 thread_exit();
5519 }
5520
5521 boolean_t
5522 l2arc_vdev_present(vdev_t *vd)
5523 {
5524 return (l2arc_vdev_get(vd) != NULL);
5525 }
5526
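/*
 * Returns the l2arc_dev_t associated with a particular vdev, or NULL if the
 * vdev is not an L2ARC cache device. Grabs l2arc_dev_mtx unless the caller
 * already holds it.
 */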
5527 static l2arc_dev_t *
5528 l2arc_vdev_get(vdev_t *vd)
5529 {
5530 l2arc_dev_t *dev;
5531 boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
5532
5533 if (!held)
5534 mutex_enter(&l2arc_dev_mtx);
5535 for (dev = list_head(l2arc_dev_list); dev != NULL;
5536 dev = list_next(l2arc_dev_list, dev)) {
5537 if (dev->l2ad_vdev == vd)
5538 break;
5539 }
5540 if (!held)
5541 mutex_exit(&l2arc_dev_mtx);
5542
5543 return (dev);
5544 }
5545
5546 /*
5547 * Add a vdev for use by the L2ARC. By this point the spa has already
5548 * validated the vdev and opened it. The `rebuild' flag indicates whether
5549 * we should attempt an L2ARC persistency rebuild.
5550 */
5551 void
5552 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5553 {
5554 l2arc_dev_t *adddev;
5555
5556 ASSERT(!l2arc_vdev_present(vd));
5557
5558 /*
5559 * Create a new l2arc device entry.
5560 */
5561 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5562 adddev->l2ad_spa = spa;
5563 adddev->l2ad_vdev = vd;
5564 /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
5565 adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
5566 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5567 adddev->l2ad_hand = adddev->l2ad_start;
5568 adddev->l2ad_evict = adddev->l2ad_start;
5569 adddev->l2ad_first = B_TRUE;
5570 adddev->l2ad_writing = B_FALSE;
5571
5572 /*
5573 * This is a list of all ARC buffers that are still valid on the
5574 * device.
5575 */
5576 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5577 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5578 offsetof(arc_buf_hdr_t, b_l2node));
5579
5580 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5581
5582 /*
5583 * Add device to global list
5584 */
5585 mutex_enter(&l2arc_dev_mtx);
5586 list_insert_head(l2arc_dev_list, adddev);
5587 atomic_inc_64(&l2arc_ndev);
5588 if (rebuild && l2arc_rebuild_enabled &&
5589 adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
5590 /*
5591 * Just mark the device as pending for a rebuild. We won't
5592 * be starting a rebuild in line here as it would block pool
5593 * import. Instead spa_load_impl will hand that off to an
5594 * async task which will call l2arc_spa_rebuild_start.
5595 */
5596 adddev->l2ad_rebuild = B_TRUE;
5597 }
5598 mutex_exit(&l2arc_dev_mtx);
5599 }
5600
5601 /*
5602 * Remove a vdev from the L2ARC.
5603 */
5604 void
5605 l2arc_remove_vdev(vdev_t *vd)
5606 {
5607 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5608
5609 /*
5610 * Find the device by vdev
5611 */
5612 mutex_enter(&l2arc_dev_mtx);
5613 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5614 nextdev = list_next(l2arc_dev_list, dev);
5615 if (vd == dev->l2ad_vdev) {
5616 remdev = dev;
5617 break;
5684 {
5685 if (!(spa_mode_global & FWRITE))
5686 return;
5687
5688 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5689 TS_RUN, minclsyspri);
5690 }
5691
5692 void
5693 l2arc_stop(void)
5694 {
5695 if (!(spa_mode_global & FWRITE))
5696 return;
5697
5698 mutex_enter(&l2arc_feed_thr_lock);
5699 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5700 l2arc_thread_exit = 1;
5701 while (l2arc_thread_exit != 0)
5702 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5703 mutex_exit(&l2arc_feed_thr_lock);
5704 }
5705
5706 /*
5707 * Punches out rebuild threads for the L2ARC devices in a spa. This should
5708 * be called as one of the final steps of a pool import.
5709 */
5710 void
5711 l2arc_spa_rebuild_start(spa_t *spa)
5712 {
5713 l2arc_dev_t *dev;
5714 /*
5715 * Locate the spa's l2arc devices and kick off rebuild threads.
5716 */
5717 mutex_enter(&l2arc_dev_mtx);
5718 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
5719 dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
5720 ASSERT(dev != NULL);
5721 if (dev->l2ad_rebuild) {
5722 (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
5723 dev, 0, &p0, TS_RUN, minclsyspri);
5724 }
5725 }
5726 mutex_exit(&l2arc_dev_mtx);
5727 }
5728
5729 /*
5730 * Main entry point for L2ARC rebuilding.
5731 */
5732 static void
5733 l2arc_dev_rebuild_start(l2arc_dev_t *dev)
5734 {
5735 spa_t *spa = dev->l2ad_spa;
5736 vdev_t *vd = dev->l2ad_vdev;
5737
5738 /* Lock out device removal. */
5739 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5740 ASSERT(dev->l2ad_rebuild);
5741 (void) l2arc_rebuild(dev);
5742 dev->l2ad_rebuild = B_FALSE;
5743 spa_config_exit(spa, SCL_L2ARC, vd);
5744 thread_exit();
5745 }
5746
5747 /*
5748 * This function implements the actual L2ARC metadata rebuild. It:
5749 *
5750 * 1) reads the device's header
5751 * 2) if a good device header is found, starts reading the log block chain
5752 * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
5753 *
5754 * Operation stops under any of the following conditions:
5755 *
5756 * 1) We reach the end of the log blk chain (the back-reference in the blk is
5757 * invalid or loops over our starting point).
5758 * 2) We encounter *any* error condition (cksum errors, io errors, looped
5759 * blocks, etc.).
5760 * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection to
5761 * keep severely fragmented L2ARC log blocks or slow L2ARC devices from
5762 * preventing a machine from finishing a pool import (and thus letting the
5763 * administrator take corrective action, e.g. by kicking the misbehaving
5764 * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5765 * rebuilding disabled).
5766 */
5767 static int
5768 l2arc_rebuild(l2arc_dev_t *dev)
5769 {
5770 int err;
5771 l2arc_log_blk_phys_t *this_lb, *next_lb;
5772 uint8_t *this_lb_buf, *next_lb_buf;
5773 zio_t *this_io = NULL, *next_io = NULL;
5774 int64_t deadline;
5775 l2arc_log_blk_ptr_t lb_ptrs[2];
5776 boolean_t first_pass;
5777 uint64_t load_guid;
5778
5779 load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
5780 deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5781 /*
5782 * Device header processing phase.
5783 */
5784 if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
5785 /* device header corrupted, start a new one */
5786 bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
5787 return (err);
5788 }
5789 if (l2arc_check_rebuild_timeout_hit(deadline))
5790 return (SET_ERROR(ETIMEDOUT));
5791
5792 /* Retrieve the persistent L2ARC device state */
5793 dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
5794 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5795 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
5796 LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
5797 dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
5798 L2ARC_DEV_HDR_EVICT_FIRST);
5799
5800 /* Prepare the rebuild processing state */
5801 bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
5802 this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
5803 next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
5804 this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5805 next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5806 first_pass = B_TRUE;
5807
5808 /* Start the rebuild process */
5809 for (;;) {
5810 if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
5811 /* We hit an invalid block address, end the rebuild. */
5812 break;
5813
5814 if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
5815 this_lb, next_lb, this_lb_buf, next_lb_buf,
5816 this_io, &next_io)) != 0)
5817 break;
5818
5819 /* Protection against infinite loops of log blocks. */
5820 if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
5821 lb_ptrs[0].l2lbp_daddr,
5822 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5823 !first_pass) {
5824 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
5825 err = SET_ERROR(ELOOP);
5826 break;
5827 }
5828
5829 /*
5830 * Our memory pressure valve. If the system is running low
5831 * on memory, rather than swamping memory with new ARC buf
5832 * hdrs, we opt not to rebuild the L2ARC. At this point,
5833 * however, we have already set up our L2ARC dev to chain in
5834 * new metadata log blk, so the user may choose to re-add the
5835 * L2ARC dev at a later time to reconstruct it (when there's
5836 * less memory pressure).
5837 */
5838 if (arc_reclaim_needed()) {
5839 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5840 cmn_err(CE_NOTE, "System running low on memory, "
5841 "aborting L2ARC rebuild.");
5842 err = SET_ERROR(ENOMEM);
5843 break;
5844 }
5845
5846 /*
5847 * Now that we know that the next_lb checks out alright, we
5848 * can start reconstruction from this lb - we can be sure
5849 * that the L2ARC write hand has not yet reached any of our
5850 * buffers.
5851 */
5852 l2arc_log_blk_restore(dev, load_guid, this_lb,
5853 LBP_GET_PSIZE(&lb_ptrs[0]));
5854
5855 /*
5856 * End of list detection. We can look ahead two steps in the
5857 * blk chain and if the 2nd blk from this_lb dips below the
5858 * initial chain starting point, then we know two things:
5859 * 1) it can't be valid, and
5860 * 2) the next_lb's ARC entries might have already been
5861 * partially overwritten and so we should stop before
5862 * we restore it
5863 */
5864 if (l2arc_range_check_overlap(
5865 this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
5866 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5867 !first_pass)
5868 break;
5869
5870 /* log blk restored, continue with next one in the list */
5871 lb_ptrs[0] = lb_ptrs[1];
5872 lb_ptrs[1] = this_lb->l2lb_back2_lbp;
5873 PTR_SWAP(this_lb, next_lb);
5874 PTR_SWAP(this_lb_buf, next_lb_buf);
5875 this_io = next_io;
5876 next_io = NULL;
5877 first_pass = B_FALSE;
5878
5879 if (l2arc_check_rebuild_timeout_hit(deadline)) {
5880 err = SET_ERROR(ETIMEDOUT);
5881 break;
5882 }
5883 }
5884 if (next_io != NULL)
5885 l2arc_log_blk_prefetch_abort(next_io);
5886 kmem_free(this_lb, sizeof (*this_lb));
5887 kmem_free(next_lb, sizeof (*next_lb));
5888 kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
5889 kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
5890 if (err == 0)
5891 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5892
5893 return (err);
5894 }
5895
5896 /*
5897 * Restores the payload of a log blk to ARC. This creates empty ARC hdr
5898 * entries which only contain an l2arc hdr, essentially restoring the
5899 * buffers to their L2ARC evicted state. This function also updates space
5900 * usage on the L2ARC vdev to make sure it tracks restored buffers.
5901 */
5902 static void
5903 l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
5904 l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
5905 {
5906 uint64_t size = 0, psize = 0;
5907
5908 mutex_enter(&l2arc_buflist_mtx);
5909
5910 for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
5911 /*
5912 * Restore goes in the reverse direction to preserve correct
5913 * temporal ordering of buffers in the l2ad_buflist.
5914 */
5915 l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
5916 size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
5917 psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
5918 }
5919 mutex_exit(&l2arc_buflist_mtx);
5920
5921 /*
5922 * Record rebuild stats:
5923 * size In-memory size of restored buffer data in ARC
5924 * psize Physical size of restored buffers in the L2ARC
5925 * bufs # of ARC buffer headers restored
5926 * log_blks # of L2ARC log entries processed during restore
5927 */
5928 ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
5929 ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
5930 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
5931 ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
5932 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
5933 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
5934 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5935 }
5936
5937 /*
5938 * Restores a single ARC buf hdr from a log block. The ARC buffer is put
5939 * into a state indicating that it has been evicted to L2ARC.
5940 */
5941 static void
5942 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
5943 uint64_t load_guid)
5944 {
5945 arc_buf_hdr_t *hdr, *exists;
5946 kmutex_t *hash_lock;
5947 arc_buf_contents_t type = LE_GET_TYPE(le);
5948 l2arc_buf_hdr_t *l2hdr;
5949
5950 hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
5951 hdr->b_dva = le->l2le_dva;
5952 hdr->b_birth = le->l2le_birth;
5953 hdr->b_cksum0 = le->l2le_cksum0;
5954 hdr->b_size = LE_GET_LSIZE(le);
5955 exists = buf_hash_insert(hdr, &hash_lock);
5956 if (exists) {
5957 /* Buffer was already cached, no need to restore it. */
5958 mutex_exit(hash_lock);
5959 arc_hdr_destroy(hdr);
5960 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5961 return;
5962 }
5963 hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
5964 if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
5965 hdr->b_flags |= ARC_L2COMPRESS;
5966 mutex_enter(&hdr->b_freeze_lock);
5967 ASSERT(hdr->b_freeze_cksum == NULL);
5968 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
5969 *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
5970 mutex_exit(&hdr->b_freeze_lock);
5971
5972 /* now rebuild the l2arc entry */
5973 ASSERT(hdr->b_l2hdr == NULL);
5974 l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5975 l2hdr->b_dev = dev;
5976 l2hdr->b_daddr = le->l2le_daddr;
5977 l2hdr->b_asize = LE_GET_PSIZE(le);
5978 l2hdr->b_compress = LE_GET_COMPRESS(le);
5979 hdr->b_l2hdr = l2hdr;
5980 list_insert_tail(dev->l2ad_buflist, hdr);
5981 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5982 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5983
5984 arc_change_state(arc_l2c_only, hdr, hash_lock);
5985 mutex_exit(hash_lock);
5986 }
5987
5988 /*
5989 * Attempts to read the device header on the provided L2ARC device and writes
5990 * it to `hdr'. On success, this function returns 0, otherwise the appropriate
5991 * error code is returned.
5992 */
5993 static int
5994 l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
5995 {
5996 int err;
5997 uint64_t guid;
5998 zio_cksum_t cksum;
5999
6000 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6001
6002 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
6003 VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
6004 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6005 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6006 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
6007 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6008 return (err);
6009 }
6010
6011 if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
6012 byteswap_uint64_array(hdr, sizeof (*hdr));
6013
6014 if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
6015 hdr->l2dh_spa_guid != guid) {
6016 /*
6017 * Attempt to rebuild a device containing no actual dev hdr
6018 * or containing a header from some other pool.
6019 */
6020 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6021 return (SET_ERROR(ENOTSUP));
6022 }
6023
6024 l2arc_dev_hdr_checksum(hdr, &cksum);
6025 if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
6026 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6027 return (SET_ERROR(EINVAL));
6028 }
6029 if (hdr->l2dh_evict_tail < dev->l2ad_start ||
6030 hdr->l2dh_evict_tail >= dev->l2ad_end) {
6031 /* Data in dev hdr is invalid for this device. */
6032 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6033 return (SET_ERROR(EINVAL));
6034 }
6035
6036 return (0);
6037 }
6038
6039 /*
6040 * Reads L2ARC log blocks from storage and validates their contents.
6041 *
6042 * This function implements a simple prefetcher to make sure that while
6043 * we're processing one buffer the L2ARC is already prefetching the next
6044 * one in the chain.
6045 *
6046 * The arguments this_lp and next_lp point to the current and next log blk
6047 * address in the block chain. Similarly, this_lb and next_lb hold the
6048 * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
6049 * and next_lb_buf must be buffers of appropriate size to hold a raw
6050 * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
6051 * to buffer decompression).
6052 *
6053 * The `this_io' and `next_io' arguments are used for block prefetching.
6054 * When issuing the first blk IO during rebuild, you should pass NULL for
6055 * `this_io'. This function will then issue a sync IO to read the block and
6056 * also issue an async IO to fetch the next block in the block chain. The
6057 * prefetch IO is returned in `next_io'. On subsequent calls to this
6058 * function, pass the value returned in `next_io' from the previous call
6059 * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
6060 * Prior to the call, you should initialize your `next_io' pointer to be
6061 * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
6062 *
6063 * On success, this function returns 0, otherwise it returns an appropriate
6064 * error code. On error the prefetching IO is aborted and cleared before
6065 * returning from this function. Therefore, if we return `success', the
6066 * caller can assume that we have taken care of cleanup of prefetch IOs.
6067 */
6068 static int
6069 l2arc_log_blk_read(l2arc_dev_t *dev,
6070 const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
6071 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
6072 uint8_t *this_lb_buf, uint8_t *next_lb_buf,
6073 zio_t *this_io, zio_t **next_io)
6074 {
6075 int err = 0;
6076 zio_cksum_t cksum;
6077
6078 ASSERT(this_lbp != NULL && next_lbp != NULL);
6079 ASSERT(this_lb != NULL && next_lb != NULL);
6080 ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
6081 ASSERT(next_io != NULL && *next_io == NULL);
6082 ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
6083
6084 /*
6085 * Check to see if we have issued the IO for this log blk in a
6086 * previous run. If not, this is the first call, so issue it now.
6087 */
6088 if (this_io == NULL) {
6089 this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
6090 this_lb_buf);
6091 }
6092
6093 /*
6094 * Peek to see if we can start issuing the next IO immediately.
6095 */
6096 if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
6097 /*
6098 * Start issuing IO for the next log blk early - this
6099 * should help keep the L2ARC device busy while we
6100 * decompress and restore this log blk.
6101 */
6102 *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
6103 next_lb_buf);
6104 }
6105
6106 /* Wait for the IO to read this log block to complete */
6107 if ((err = zio_wait(this_io)) != 0) {
6108 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6109 goto cleanup;
6110 }
6111
6112 /* Make sure the buffer checks out */
6113 fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
6114 if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
6115 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6116 err = SET_ERROR(EINVAL);
6117 goto cleanup;
6118 }
6119
6120 /* Now we can take our time decoding this buffer */
6121 switch (LBP_GET_COMPRESS(this_lbp)) {
6122 case ZIO_COMPRESS_OFF:
6123 bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
6124 break;
6125 case ZIO_COMPRESS_LZ4:
6126 if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
6127 this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
6128 sizeof (*this_lb))) != 0) {
6129 err = SET_ERROR(EINVAL);
6130 goto cleanup;
6131 }
6132 break;
6133 default:
6134 err = SET_ERROR(EINVAL);
6135 goto cleanup;
6136 }
6137 if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
6138 byteswap_uint64_array(this_lb, sizeof (*this_lb));
6139 if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
6140 err = SET_ERROR(EINVAL);
6141 goto cleanup;
6142 }
6143 cleanup:
6144 /* Abort an in-flight prefetch I/O in case of error */
6145 if (err != 0 && *next_io != NULL) {
6146 l2arc_log_blk_prefetch_abort(*next_io);
6147 *next_io = NULL;
6148 }
6149 return (err);
6150 }
6151
6152 /*
6153 * Validates an L2ARC log blk address to make sure that it can be read
6154 * from the provided L2ARC device. Returns B_TRUE if the address is
6155 * within the device's bounds, or B_FALSE if not.
6156 */
6157 static boolean_t
6158 l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
6159 {
6160 uint64_t psize = LBP_GET_PSIZE(lbp);
6161 uint64_t end = lbp->l2lbp_daddr + psize;
6162
6163 /*
6164 * A log block is valid if all of the following conditions are true:
6165 * - it fits entirely between l2ad_start and l2ad_end
6166 * - it has a valid size
6167 * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
6168 * doesn't sit in the evicted region)
6169 */
6170 return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
6171 psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
6172 lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
6173 }
6174
6175 /*
6176 * Starts an asynchronous read IO to read a log block. This is used in log
6177 * block reconstruction to start reading the next block before we are done
6178 * decoding and reconstructing the current block, to keep the l2arc device
6179 * nice and hot with read IO to process.
6180 * The caller supplies the memory buffer (lb_buf) into which the log block
6181 * is read and remains responsible for freeing it once the zio is no longer
6182 * needed (i.e. once it has completed). If you wish to abort this
6183 * zio, you should do so using l2arc_log_blk_prefetch_abort, which waits
6184 * for the outstanding I/O so that the buffer can safely be reused or freed.
6185 */
6186 static zio_t *
6187 l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
6188 uint8_t *lb_buf)
6189 {
6190 uint32_t psize;
6191 zio_t *pio;
6192
6193 psize = LBP_GET_PSIZE(lbp);
6194 ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
6195 pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
6196 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6197 ZIO_FLAG_DONT_RETRY);
6198 (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
6199 lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6200 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6201 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
6202
6203 return (pio);
6204 }
6205
6206 /*
6207 * Aborts a zio returned from l2arc_log_blk_prefetch by waiting for it to
6208 * complete; its caller-supplied data buffer can then be reused or freed.
6209 */
6210 static void
6211 l2arc_log_blk_prefetch_abort(zio_t *zio)
6212 {
6213 (void) zio_wait(zio);
6214 }
6215
6216 /*
6217 * Creates a zio to update the device header on an l2arc device. The zio is
6218 * initiated as a child of `pio'.
6219 */
6220 static void
6221 l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
6222 {
6223 zio_t *wzio;
6224 vdev_stat_t st;
6225 l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
6226
6227 vdev_get_stats(dev->l2ad_vdev, &st);
6228
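	/*
	 * Record the device's current allocation (vs_alloc) and eviction
	 * tail, presumably so that a later import can tell how much of the
	 * device held valid data when the header was written.
	 */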
6229 hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
6230 hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6231 hdr->l2dh_evict_tail = dev->l2ad_evict;
6232 hdr->l2dh_alloc_space = st.vs_alloc;
6233 hdr->l2dh_flags = 0;
6234 if (dev->l2ad_first)
6235 hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
6236
	/* the self-checksum is computed last, over the fully populated header */
6238 l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
6239
6240 CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
6241 sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
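	/*
	 * The header lives at the very start of the device's usable space
	 * (offset VDEV_LABEL_START_SIZE, just past the front vdev labels);
	 * the assertion above ensures its size falls within the range of
	 * valid ZFS block sizes.
	 */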
6242 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6243 sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
6244 NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6245 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6246 zio_t *, wzio);
6247 (void) zio_nowait(wzio);
6248 }
6249
6250 /*
6251 * Commits a log block to the L2ARC device. This routine is invoked from
6252 * l2arc_write_buffers when the log block fills up.
6253 * This function allocates some memory to temporarily hold the serialized
6254 * buffer to be written. This is then released in l2arc_write_done.
6255 */
6256 static void
6257 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
6258 l2arc_write_callback_t *cb)
6259 {
6260 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6261 uint64_t psize, asize;
6262 l2arc_log_blk_buf_t *lb_buf;
6263 zio_t *wzio;
6264
6265 VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6266
	/*
	 * Link the new block into the chain: each log block records a
	 * pointer two blocks back, while the device header holds the two
	 * most recent pointers; this is what lets the rebuild code prefetch
	 * the next log block while the current one is still being decoded.
	 */
6268 lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
6269 lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
6270
6271 /* try to compress the buffer */
6272 lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
6273 list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
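	/*
	 * zio_compress_data() is expected to return the source size when
	 * compression does not achieve sufficient savings; the
	 * psize < sizeof (*lb) test below relies on that behavior.
	 */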
6274 VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
6275 lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
6276
6277 /*
6278 * Update the start log blk pointer in the device header to point
6279 * to the log block we're about to write.
6280 */
6281 dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
6282 dev->l2ad_dev_hdr.l2dh_start_lbps[0];
6283 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
6284 LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
6285 LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
6286 LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6287 ZIO_CHECKSUM_FLETCHER_4);
6288 LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
6289 if (psize < sizeof (*lb)) {
6290 /* compression succeeded */
6291 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6292 ZIO_COMPRESS_LZ4);
6293 } else {
		/* compression failed or did not shrink the block; store it raw */
6295 bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
6296 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6297 ZIO_COMPRESS_OFF);
6298 }
	/*
	 * Checksum what we're about to write; as with regular ZFS block
	 * pointers, the checksum is stored in the log block pointer (in the
	 * device header) rather than in the block itself.
	 */
6300 fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
6301 &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
6302
6303 /* perform the write itself */
6304 CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
6305 L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
6306 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
6307 psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
6308 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6309 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
6310 (void) zio_nowait(wzio);
6311
	/* advance the device hand by the allocated (ashift-aligned) size */
6313 asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
6314 dev->l2ad_hand += asize;
6315 VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
6316 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
6317
6318 /* bump the kstats */
6319 ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
6320 ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
6321 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
6322 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
6323 dev->l2ad_log_blk_payload_asize / asize);
6324
6325 dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
6326 }
6327
6328 /*
6329 * Computes the checksum of `hdr' and stores it in `cksum'.
6330 */
6331 static void
6332 l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
6333 {
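	/*
	 * The checksum deliberately starts at l2dh_spa_guid so that it does
	 * not cover the fields laid out before it (assumed here to be the
	 * magic and the stored l2dh_self_cksum itself).
	 */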
6334 fletcher_4_native((uint8_t *)hdr +
6335 offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6336 sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6337 cksum);
6338 }
6339
6340 /*
6341 * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
6342 * The buffer being inserted must be present in L2ARC.
6343 * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
6344 * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
6345 */
6346 static boolean_t
6347 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
6348 {
6349 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6350 l2arc_log_ent_phys_t *le;
6351 const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
6352 int index = dev->l2ad_log_ent_idx++;
6353
6354 ASSERT(l2hdr != NULL);
6355 ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
6356
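	/*
	 * Record everything needed to recreate this buffer's ARC header at
	 * rebuild time: its DVA and birth txg, its location and sizes on
	 * the L2ARC device, its compression setting and its checksums.
	 */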
6357 le = &lb->l2lb_entries[index];
6358 bzero(le, sizeof (*le));
6359 le->l2le_dva = ab->b_dva;
6360 le->l2le_birth = ab->b_birth;
6361 le->l2le_cksum0 = ab->b_cksum0;
6362 le->l2le_daddr = l2hdr->b_daddr;
6363 LE_SET_LSIZE(le, ab->b_size);
6364 LE_SET_PSIZE(le, l2hdr->b_asize);
6365 LE_SET_COMPRESS(le, l2hdr->b_compress);
6366 le->l2le_freeze_cksum = *ab->b_freeze_cksum;
6367 LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
6368 LE_SET_TYPE(le, ab->b_type);
6369 dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
6370
6371 return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6372 }
6373
6374 /*
6375 * Checks whether a given L2ARC device address sits in a time-sequential
6376 * range. The trick here is that the L2ARC is a rotary buffer, so we can't
6377 * just do a range comparison, we need to handle the situation in which the
6378 * range wraps around the end of the L2ARC device. Arguments:
6379 * bottom Lower end of the range to check (written to earlier).
6380 * top Upper end of the range to check (written to later).
6381 * check The address for which we want to determine if it sits in
6382 * between the top and bottom.
6383 *
6384 * The 3-way conditional below represents the following cases:
6385 *
6386 * bottom < top : Sequentially ordered case:
6387 * <check>--------+-------------------+
6388 * | (overlap here?) |
6389 * L2ARC dev V V
6390 * |---------------<bottom>============<top>--------------|
6391 *
6392 * bottom > top: Looped-around case:
6393 * <check>--------+------------------+
6394 * | (overlap here?) |
6395 * L2ARC dev V V
6396 * |===============<top>---------------<bottom>===========|
6397 * ^ ^
6398 * | (or here?) |
6399 * +---------------+---------<check>
6400 *
6401 * top == bottom : Just a single address comparison.
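 *
 * For example (illustrative offsets on a device spanning 0-100): with
 * bottom = 30 and top = 70, check = 50 overlaps and check = 80 does not;
 * with bottom = 70 and top = 30 (the wrapped case), the opposite holds.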
6402 */
6403 static inline boolean_t
6404 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
6405 {
6406 if (bottom < top)
6407 return (bottom <= check && check <= top);
6408 else if (bottom > top)
6409 return (check <= top || bottom <= check);
6410 else
6411 return (check == top);
6412 }
6413
6414 /*
 * Checks whether a rebuild timeout deadline has been hit and, if it has,
 * bumps the timeout-abort kstat and logs a warning.
6417 */
6418 static boolean_t
6419 l2arc_check_rebuild_timeout_hit(int64_t deadline)
6420 {
6421 if (deadline != 0 && deadline < ddi_get_lbolt64()) {
6422 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
6423 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
6424 "dropping remaining L2ARC metadata.");
6425 return (B_TRUE);
6426 } else {
6427 return (B_FALSE);
6428 }
6429 }
|