119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139
140 #ifndef _KERNEL
141 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
142 boolean_t arc_watch = B_FALSE;
143 int arc_procfd;
144 #endif
145
146 static kmutex_t arc_reclaim_thr_lock;
147 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
148 static uint8_t arc_thread_exit;
149
150 extern int zfs_write_limit_shift;
151 extern uint64_t zfs_write_limit_max;
152 extern kmutex_t zfs_write_limit_lock;
153
154 #define ARC_REDUCE_DNLC_PERCENT 3
155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157 typedef enum arc_reclaim_strategy {
158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
290 kstat_named_t arcstat_l2_feeds;
291 kstat_named_t arcstat_l2_rw_clash;
292 kstat_named_t arcstat_l2_read_bytes;
293 kstat_named_t arcstat_l2_write_bytes;
294 kstat_named_t arcstat_l2_writes_sent;
295 kstat_named_t arcstat_l2_writes_done;
296 kstat_named_t arcstat_l2_writes_error;
297 kstat_named_t arcstat_l2_writes_hdr_miss;
298 kstat_named_t arcstat_l2_evict_lock_retry;
299 kstat_named_t arcstat_l2_evict_reading;
300 kstat_named_t arcstat_l2_free_on_write;
301 kstat_named_t arcstat_l2_abort_lowmem;
302 kstat_named_t arcstat_l2_cksum_bad;
303 kstat_named_t arcstat_l2_io_error;
304 kstat_named_t arcstat_l2_size;
305 kstat_named_t arcstat_l2_asize;
306 kstat_named_t arcstat_l2_hdr_size;
307 kstat_named_t arcstat_l2_compress_successes;
308 kstat_named_t arcstat_l2_compress_zeros;
309 kstat_named_t arcstat_l2_compress_failures;
310 kstat_named_t arcstat_memory_throttle_count;
311 kstat_named_t arcstat_duplicate_buffers;
312 kstat_named_t arcstat_duplicate_buffers_size;
313 kstat_named_t arcstat_duplicate_reads;
314 kstat_named_t arcstat_meta_used;
315 kstat_named_t arcstat_meta_limit;
316 kstat_named_t arcstat_meta_max;
317 } arc_stats_t;
318
319 static arc_stats_t arc_stats = {
320 { "hits", KSTAT_DATA_UINT64 },
321 { "misses", KSTAT_DATA_UINT64 },
322 { "demand_data_hits", KSTAT_DATA_UINT64 },
323 { "demand_data_misses", KSTAT_DATA_UINT64 },
324 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
325 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
326 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
327 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
328 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
329 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
356 { "l2_feeds", KSTAT_DATA_UINT64 },
357 { "l2_rw_clash", KSTAT_DATA_UINT64 },
358 { "l2_read_bytes", KSTAT_DATA_UINT64 },
359 { "l2_write_bytes", KSTAT_DATA_UINT64 },
360 { "l2_writes_sent", KSTAT_DATA_UINT64 },
361 { "l2_writes_done", KSTAT_DATA_UINT64 },
362 { "l2_writes_error", KSTAT_DATA_UINT64 },
363 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
364 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
365 { "l2_evict_reading", KSTAT_DATA_UINT64 },
366 { "l2_free_on_write", KSTAT_DATA_UINT64 },
367 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
368 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
369 { "l2_io_error", KSTAT_DATA_UINT64 },
370 { "l2_size", KSTAT_DATA_UINT64 },
371 { "l2_asize", KSTAT_DATA_UINT64 },
372 { "l2_hdr_size", KSTAT_DATA_UINT64 },
373 { "l2_compress_successes", KSTAT_DATA_UINT64 },
374 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
375 { "l2_compress_failures", KSTAT_DATA_UINT64 },
376 { "memory_throttle_count", KSTAT_DATA_UINT64 },
377 { "duplicate_buffers", KSTAT_DATA_UINT64 },
378 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
379 { "duplicate_reads", KSTAT_DATA_UINT64 },
380 { "arc_meta_used", KSTAT_DATA_UINT64 },
381 { "arc_meta_limit", KSTAT_DATA_UINT64 },
382 { "arc_meta_max", KSTAT_DATA_UINT64 }
383 };
384
385 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
386
387 #define ARCSTAT_INCR(stat, val) \
388 atomic_add_64(&arc_stats.stat.value.ui64, (val))
389
390 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
391 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
392
393 #define ARCSTAT_MAX(stat, val) { \
394 uint64_t m; \
395 while ((val) > (m = arc_stats.stat.value.ui64) && \
403 /*
404 * We define a macro to allow ARC hits/misses to be easily broken down by
405 * two separate conditions, giving a total of four different subtypes for
406 * each of hits and misses (so eight statistics total).
407 */
408 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
409 if (cond1) { \
410 if (cond2) { \
411 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
412 } else { \
413 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
414 } \
415 } else { \
416 if (cond2) { \
417 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
418 } else { \
419 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
420 } \
421 }
422
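/*
 * Illustrative sketch (not a call site from this excerpt) of how a hit is
 * classified with ARCSTAT_CONDSTAT; the two predicates are assumptions
 * chosen for the example -- a demand (non-prefetch) access to a data buffer:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
 *	    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);
 *
 * The token pasting expands to exactly one ARCSTAT_BUMP() of
 * arcstat_demand_data_hits, arcstat_demand_metadata_hits,
 * arcstat_prefetch_data_hits or arcstat_prefetch_metadata_hits.
 */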
423 kstat_t *arc_ksp;
424 static arc_state_t *arc_anon;
425 static arc_state_t *arc_mru;
426 static arc_state_t *arc_mru_ghost;
427 static arc_state_t *arc_mfu;
428 static arc_state_t *arc_mfu_ghost;
429 static arc_state_t *arc_l2c_only;
430
431 /*
432 * There are several ARC variables that are critical to export as kstats --
433 * but we don't want to have to grovel around in the kstat whenever we wish to
434 * manipulate them. For these variables, we therefore define them to be in
435 * terms of the statistic variable. This assures that we are not introducing
436 * the possibility of inconsistency by having shadow copies of the variables,
437 * while still allowing the code to be readable.
438 */
439 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
440 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
441 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
442 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
610 #define L2ARC_FEED_SECS 1 /* caching interval secs */
611 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
612
613 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
614 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
615
616 /* L2ARC Performance Tunables */
617 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
618 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
619 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
620 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
621 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
622 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
623 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
624 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
625 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
626
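/*
 * Tuning sketch (illumos conventions; values are examples, not
 * recommendations): since these are plain global variables they can be
 * overridden at boot from /etc/system, e.g.
 *
 *	set zfs:l2arc_write_max = 0x4000000
 *	set zfs:l2arc_noprefetch = 0
 *
 * or patched on a live system with mdb -kw, assuming the zfs module is
 * loaded, e.g. "l2arc_write_max/Z 0x4000000".
 */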
627 /*
628 * L2ARC Internals
629 */
630 typedef struct l2arc_dev {
631 vdev_t *l2ad_vdev; /* vdev */
632 spa_t *l2ad_spa; /* spa */
633 uint64_t l2ad_hand; /* next write location */
634 uint64_t l2ad_start; /* first addr on device */
635 uint64_t l2ad_end; /* last addr on device */
636 uint64_t l2ad_evict; /* last addr eviction reached */
637 boolean_t l2ad_first; /* first sweep through */
638 boolean_t l2ad_writing; /* currently writing */
639 list_t *l2ad_buflist; /* buffer list */
640 list_node_t l2ad_node; /* device list node */
641 } l2arc_dev_t;
642
643 static list_t L2ARC_dev_list; /* device list */
644 static list_t *l2arc_dev_list; /* device list pointer */
645 static kmutex_t l2arc_dev_mtx; /* device list mutex */
646 static l2arc_dev_t *l2arc_dev_last; /* last device used */
647 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
648 static list_t L2ARC_free_on_write; /* free after write buf list */
649 static list_t *l2arc_free_on_write; /* free after write list ptr */
650 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
651 static uint64_t l2arc_ndev; /* number of devices */
652
653 typedef struct l2arc_read_callback {
654 arc_buf_t *l2rcb_buf; /* read buffer */
655 spa_t *l2rcb_spa; /* spa */
656 blkptr_t l2rcb_bp; /* original blkptr */
657 zbookmark_t l2rcb_zb; /* original bookmark */
658 int l2rcb_flags; /* original flags */
659 enum zio_compress l2rcb_compress; /* applied compress */
660 } l2arc_read_callback_t;
661
662 typedef struct l2arc_write_callback {
663 l2arc_dev_t *l2wcb_dev; /* device info */
664 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
665 } l2arc_write_callback_t;
666
667 struct l2arc_buf_hdr {
668 /* protected by arc_buf_hdr mutex */
669 l2arc_dev_t *b_dev; /* L2ARC device */
670 uint64_t b_daddr; /* disk address, offset byte */
671 /* compression applied to buffer data */
672 enum zio_compress b_compress;
673 /* real alloc'd buffer size depending on b_compress applied */
674 int b_asize;
675 /* temporary buffer holder for in-flight compressed data */
676 void *b_tmp_cdata;
677 };
678
679 typedef struct l2arc_data_free {
680 /* protected by l2arc_free_on_write_mtx */
681 void *l2df_data;
682 size_t l2df_size;
683 void (*l2df_func)(void *, size_t);
684 list_node_t l2df_list_node;
685 } l2arc_data_free_t;
686
687 static kmutex_t l2arc_feed_thr_lock;
688 static kcondvar_t l2arc_feed_thr_cv;
689 static uint8_t l2arc_thread_exit;
690
691 static void l2arc_read_done(zio_t *zio);
692 static void l2arc_hdr_stat_add(void);
693 static void l2arc_hdr_stat_remove(void);
694
695 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
696 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
697 enum zio_compress c);
698 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
699
700 static uint64_t
701 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
702 {
703 uint8_t *vdva = (uint8_t *)dva;
704 uint64_t crc = -1ULL;
705 int i;
706
707 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
708
709 for (i = 0; i < sizeof (dva_t); i++)
710 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
711
712 crc ^= (spa>>8) ^ birth;
713
714 return (crc);
715 }
716
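/*
 * Consumption sketch for buf_hash() (assuming, as in the full source, a
 * power-of-two sized hash table whose mask and per-bucket locks are
 * defined elsewhere in this file):
 *
 *	uint64_t idx = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) &
 *	    buf_hash_table.ht_mask;
 *
 * The low-order bits of the CRC pick the bucket; the bucket's mutex is
 * then taken before the collision chain is walked.
 */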
717 #define BUF_EMPTY(buf) \
718 ((buf)->b_dva.dva_word[0] == 0 && \
719 (buf)->b_dva.dva_word[1] == 0 && \
1220 if (use_mutex)
1221 mutex_exit(&new_state->arcs_mtx);
1222 }
1223 }
1224
1225 ASSERT(!BUF_EMPTY(ab));
1226 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1227 buf_hash_remove(ab);
1228
1229 /* adjust state sizes */
1230 if (to_delta)
1231 atomic_add_64(&new_state->arcs_size, to_delta);
1232 if (from_delta) {
1233 ASSERT3U(old_state->arcs_size, >=, from_delta);
1234 atomic_add_64(&old_state->arcs_size, -from_delta);
1235 }
1236 ab->b_state = new_state;
1237
1238 /* adjust l2arc hdr stats */
1239 if (new_state == arc_l2c_only)
1240 l2arc_hdr_stat_add();
1241 else if (old_state == arc_l2c_only)
1242 l2arc_hdr_stat_remove();
1243 }
1244
1245 void
1246 arc_space_consume(uint64_t space, arc_space_type_t type)
1247 {
1248 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1249
1250 switch (type) {
1251 case ARC_SPACE_DATA:
1252 ARCSTAT_INCR(arcstat_data_size, space);
1253 break;
1254 case ARC_SPACE_OTHER:
1255 ARCSTAT_INCR(arcstat_other_size, space);
1256 break;
1257 case ARC_SPACE_HDRS:
1258 ARCSTAT_INCR(arcstat_hdr_size, space);
1259 break;
1260 case ARC_SPACE_L2HDRS:
1324 hdr->b_type = type;
1325 hdr->b_spa = spa_load_guid(spa);
1326 hdr->b_state = arc_anon;
1327 hdr->b_arc_access = 0;
1328 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1329 buf->b_hdr = hdr;
1330 buf->b_data = NULL;
1331 buf->b_efunc = NULL;
1332 buf->b_private = NULL;
1333 buf->b_next = NULL;
1334 hdr->b_buf = buf;
1335 arc_get_data_buf(buf);
1336 hdr->b_datacnt = 1;
1337 hdr->b_flags = 0;
1338 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1339 (void) refcount_add(&hdr->b_refcnt, tag);
1340
1341 return (buf);
1342 }
1343
1344 static char *arc_onloan_tag = "onloan";
1345
1346 /*
1347 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1348 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1349 * buffers must be returned to the arc before they can be used by the DMU or
1350 * freed.
1351 */
1352 arc_buf_t *
1353 arc_loan_buf(spa_t *spa, int size)
1354 {
1355 arc_buf_t *buf;
1356
1357 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1358
1359 atomic_add_64(&arc_loaned_bytes, size);
1360 return (buf);
1361 }
1362
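/*
 * Minimal usage sketch (assuming the arc_return_buf() counterpart present
 * in the full source): a caller borrows an anonymous buffer, fills it, and
 * returns it to the ARC before the DMU may use or free it:
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	bcopy(src, abuf->b_data, size);
 *	...
 *	arc_return_buf(abuf, FTAG);	(undoes the arc_loaned_bytes add)
 */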
1363 /*
3956 * l2arc_noprefetch skip caching prefetched buffers
3957 * l2arc_headroom number of max device writes to precache
3958 * l2arc_headroom_boost when we find compressed buffers during ARC
3959 * scanning, we multiply headroom by this
3960 * percentage factor for the next scan cycle,
3961 * since more compressed buffers are likely to
3962 * be present
3963 * l2arc_feed_secs seconds between L2ARC writing
3964 *
3965 * Tunables may be removed or added as future performance improvements are
3966 * integrated, and also may become zpool properties.
3967 *
3968 * There are three key functions that control how the L2ARC warms up:
3969 *
3970 * l2arc_write_eligible() check if a buffer is eligible to cache
3971 * l2arc_write_size() calculate how much to write
3972 * l2arc_write_interval() calculate sleep delay between writes
3973 *
3974 * These three functions determine what to write, how much, and how quickly
3975 * to send writes.
3976 */
3977
3978 static boolean_t
3979 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3980 {
3981 /*
3982 * A buffer is *not* eligible for the L2ARC if it:
3983 * 1. belongs to a different spa.
3984 * 2. is already cached on the L2ARC.
3985 * 3. has an I/O in progress (it may be an incomplete read).
3986 * 4. is flagged not eligible (zfs property).
3987 */
3988 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3989 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3990 return (B_FALSE);
3991
3992 return (B_TRUE);
3993 }
3994
3995 static uint64_t
4022 clock_t interval, next, now;
4023
4024 /*
4025 * If the ARC lists are busy, increase our write rate; if the
4026 * lists are stale, idle back. This is achieved by checking
4027 * how much we previously wrote - if it was more than half of
4028 * what we wanted, schedule the next write much sooner.
4029 */
4030 if (l2arc_feed_again && wrote > (wanted / 2))
4031 interval = (hz * l2arc_feed_min_ms) / 1000;
4032 else
4033 interval = hz * l2arc_feed_secs;
4034
4035 now = ddi_get_lbolt();
4036 next = MAX(now, MIN(now + interval, began + interval));
4037
4038 return (next);
4039 }
4040
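/*
 * Worked example of the interval math above, assuming hz = 1000: when
 * l2arc_feed_again is set and we wrote more than half of what we wanted,
 * interval = (1000 * 200) / 1000 = 200 ticks (~200ms); otherwise
 * interval = 1000 * 1 = 1000 ticks (~1s). The MAX/MIN clamp then keeps
 * `next' within [now, began + interval], so a slow write cycle never
 * pushes the following wakeup more than one interval past its start.
 */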
4041 static void
4042 l2arc_hdr_stat_add(void)
4043 {
4044 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4045 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4046 }
4047
4048 static void
4049 l2arc_hdr_stat_remove(void)
4050 {
4051 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4052 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4053 }
4054
4055 /*
4056 * Cycle through L2ARC devices. This is how L2ARC load balances.
4057 * If a device is returned, this also returns holding the spa config lock.
4058 */
4059 static l2arc_dev_t *
4060 l2arc_dev_get_next(void)
4061 {
4062 l2arc_dev_t *first, *next = NULL;
4063
4064 /*
4065 * Lock out the removal of spas (spa_namespace_lock), then removal
4066 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4067 * both locks will be dropped and a spa config lock held instead.
4068 */
4069 mutex_enter(&spa_namespace_lock);
4070 mutex_enter(&l2arc_dev_mtx);
4071
4072 /* if there are no vdevs, there is nothing to do */
4073 if (l2arc_ndev == 0)
4074 goto out;
4075
4076 first = NULL;
4077 next = l2arc_dev_last;
4078 do {
4079 /* loop around the list looking for a non-faulted vdev */
4080 if (next == NULL) {
4081 next = list_head(l2arc_dev_list);
4082 } else {
4083 next = list_next(l2arc_dev_list, next);
4084 if (next == NULL)
4085 next = list_head(l2arc_dev_list);
4086 }
4087
4088 /* if we have come back to the start, bail out */
4089 if (first == NULL)
4090 first = next;
4091 else if (next == first)
4092 break;
4093
4094 } while (vdev_is_dead(next->l2ad_vdev));
4095
4096 /* if we were unable to find any usable vdevs, return NULL */
4097 if (vdev_is_dead(next->l2ad_vdev))
4098 next = NULL;
4099
4100 l2arc_dev_last = next;
4101
4102 out:
4103 mutex_exit(&l2arc_dev_mtx);
4104
4105 /*
4106 * Grab the config lock to prevent the 'next' device from being
4107 * removed while we are writing to it.
4108 */
4109 if (next != NULL)
4110 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111 mutex_exit(&spa_namespace_lock);
4112
4113 return (next);
4114 }
4115
4116 /*
4117 * Free buffers that were tagged for destruction.
4155 ASSERT(cb != NULL);
4156 dev = cb->l2wcb_dev;
4157 ASSERT(dev != NULL);
4158 head = cb->l2wcb_head;
4159 ASSERT(head != NULL);
4160 buflist = dev->l2ad_buflist;
4161 ASSERT(buflist != NULL);
4162 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163 l2arc_write_callback_t *, cb);
4164
4165 if (zio->io_error != 0)
4166 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167
4168 mutex_enter(&l2arc_buflist_mtx);
4169
4170 /*
4171 * All writes completed, or an error was hit.
4172 */
4173 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174 ab_prev = list_prev(buflist, ab);
4175
4176 hash_lock = HDR_LOCK(ab);
4177 if (!mutex_tryenter(hash_lock)) {
4178 /*
4179 * This buffer misses out. It may be in a stage
4180 * of eviction. Its ARC_L2_WRITING flag will be
4181 * left set, denying reads to this buffer.
4182 */
4183 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184 continue;
4185 }
4186
4187 abl2 = ab->b_l2hdr;
4188
4189 /*
4190 * Release the temporary compressed buffer as soon as possible.
4191 */
4192 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193 l2arc_release_cdata_buf(ab);
4194
4195 if (zio->io_error != 0) {
4196 /*
4197 * Error - drop L2ARC entry.
4198 */
4199 list_remove(buflist, ab);
4200 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201 ab->b_l2hdr = NULL;
4202 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204 }
4205
4206 /*
4207 * Allow ARC to begin reads to this L2ARC entry.
4208 */
4209 ab->b_flags &= ~ARC_L2_WRITING;
4210
4211 mutex_exit(hash_lock);
4212 }
4213
4214 atomic_inc_64(&l2arc_writes_done);
4215 list_remove(buflist, head);
4216 kmem_cache_free(hdr_cache, head);
4217 mutex_exit(&l2arc_buflist_mtx);
4218
4219 l2arc_do_free_on_write();
4220
4221 kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 }
4223
4224 /*
4225 * A read to a cache device completed. Validate buffer contents before
4226 * handing over to the regular ARC routines.
4227 */
4228 static void
4229 l2arc_read_done(zio_t *zio)
4230 {
4231 l2arc_read_callback_t *cb;
4232 arc_buf_hdr_t *hdr;
4233 arc_buf_t *buf;
4234 kmutex_t *hash_lock;
4235 int equal;
4236
4237 ASSERT(zio->io_vd != NULL);
4238 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4239
4240 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4482 *
4483 * Returns the number of bytes actually written (which may be smaller than
4484 * the delta by which the device hand has changed due to alignment).
4485 */
4486 static uint64_t
4487 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488 boolean_t *headroom_boost)
4489 {
4490 arc_buf_hdr_t *ab, *ab_prev, *head;
4491 list_t *list;
4492 uint64_t write_asize, write_psize, write_sz, headroom,
4493 buf_compress_minsz;
4494 void *buf_data;
4495 kmutex_t *list_lock;
4496 boolean_t full;
4497 l2arc_write_callback_t *cb;
4498 zio_t *pio, *wzio;
4499 uint64_t guid = spa_load_guid(spa);
4500 const boolean_t do_headroom_boost = *headroom_boost;
4501
4502 ASSERT(dev->l2ad_vdev != NULL);
4503
4504 /* Lower the flag now, we might want to raise it again later. */
4505 *headroom_boost = B_FALSE;
4506
4507 pio = NULL;
4508 write_sz = write_asize = write_psize = 0;
4509 full = B_FALSE;
4510 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511 head->b_flags |= ARC_L2_WRITE_HEAD;
4512
4513 /*
4514 * We will want to try to compress buffers that are at least 2x the
4515 * device sector size.
4516 */
4517 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518
4519 /*
4520 * Copy buffers for L2ARC writing.
4521 */
4522 mutex_enter(&l2arc_buflist_mtx);
4523 for (int try = 0; try <= 3; try++) {
4524 uint64_t passed_sz = 0;
4525
4526 list = l2arc_list_locked(try, &list_lock);
4527
4528 /*
4529 * L2ARC fast warmup.
4530 *
4531 * Until the ARC is warm and starts to evict, read from the
4532 * head of the ARC lists rather than the tail.
4533 */
4534 if (arc_warm == B_FALSE)
4535 ab = list_head(list);
4536 else
4537 ab = list_tail(list);
4538
4539 headroom = target_sz * l2arc_headroom;
4569
4570 if (!l2arc_write_eligible(guid, ab)) {
4571 mutex_exit(hash_lock);
4572 continue;
4573 }
4574
4575 if ((write_sz + ab->b_size) > target_sz) {
4576 full = B_TRUE;
4577 mutex_exit(hash_lock);
4578 break;
4579 }
4580
4581 if (pio == NULL) {
4582 /*
4583 * Insert a dummy header on the buflist so
4584 * l2arc_write_done() can find where the
4585 * write buffers begin without searching.
4586 */
4587 list_insert_head(dev->l2ad_buflist, head);
4588
4589 cb = kmem_alloc(
4590 sizeof (l2arc_write_callback_t), KM_SLEEP);
4591 cb->l2wcb_dev = dev;
4592 cb->l2wcb_head = head;
4593 pio = zio_root(spa, l2arc_write_done, cb,
4594 ZIO_FLAG_CANFAIL);
4595 }
4596
4597 /*
4598 * Create and add a new L2ARC header.
4599 */
4600 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601 l2hdr->b_dev = dev;
4602 ab->b_flags |= ARC_L2_WRITING;
4603
4604 /*
4605 * Temporarily stash the data buffer in b_tmp_cdata.
4606 * The subsequent write step will pick it up from
4607 * there. This is because we can't access ab->b_buf
4608 * without holding the hash_lock, which we in turn
4609 * can't access without holding the ARC list locks
4610 * (which we've already left behind once we get here).
4611 */
4612 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613 l2hdr->b_asize = ab->b_size;
4614 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4615
4616 buf_sz = ab->b_size;
4617 ab->b_l2hdr = l2hdr;
4618
4619 list_insert_head(dev->l2ad_buflist, ab);
4620
4621 /*
4622 * Compute and store the buffer cksum before
4623 * writing. On debug the cksum is verified first.
4624 */
4625 arc_cksum_verify(ab->b_buf);
4626 arc_cksum_compute(ab->b_buf, B_TRUE);
4627
4628 mutex_exit(hash_lock);
4629
4630 write_sz += buf_sz;
4631 }
4632
4633 mutex_exit(list_lock);
4634
4635 if (full == B_TRUE)
4636 break;
4637 }
4638
4639 /* No buffers selected for writing? */
4640 if (pio == NULL) {
4641 ASSERT0(write_sz);
4642 mutex_exit(&l2arc_buflist_mtx);
4643 kmem_cache_free(hdr_cache, head);
4644 return (0);
4645 }
4646
4647 /*
4648 * Now start writing the buffers. We're starting at the write head
4649 * and work backwards, retracing the course of the buffer selector
4650 * loop above.
4651 */
4652 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653 ab = list_prev(dev->l2ad_buflist, ab)) {
4654 l2arc_buf_hdr_t *l2hdr;
4655 uint64_t buf_sz;
4656
4657 /*
4658 * We shouldn't need to lock the buffer here, since we flagged
4659 * it as ARC_L2_WRITING in the previous step, but we must take
4660 * care to only access its L2 cache parameters. In particular,
4661 * ab->b_buf may be invalid by now due to ARC eviction.
4662 */
4663 l2hdr = ab->b_l2hdr;
4664 l2hdr->b_daddr = dev->l2ad_hand;
4665
4666 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667 l2hdr->b_asize >= buf_compress_minsz) {
4668 if (l2arc_compress_buf(l2hdr)) {
4669 /*
4670 * If compression succeeded, enable headroom
4671 * boost on the next scan cycle.
4672 */
4673 *headroom_boost = B_TRUE;
4685 if (buf_sz != 0) {
4686 uint64_t buf_p_sz;
4687
4688 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691 ZIO_FLAG_CANFAIL, B_FALSE);
4692
4693 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694 zio_t *, wzio);
4695 (void) zio_nowait(wzio);
4696
4697 write_asize += buf_sz;
4698 /*
4699 * Keep the clock hand suitably device-aligned.
4700 */
4701 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702 write_psize += buf_p_sz;
4703 dev->l2ad_hand += buf_p_sz;
4704 }
4705 }
4706
4707 mutex_exit(&l2arc_buflist_mtx);
4708
4709 ASSERT3U(write_asize, <=, target_sz);
4710 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715
4716 /*
4717 * Bump device hand to the device start if it is approaching the end.
4718 * l2arc_evict() will already have evicted ahead for this case.
4719 */
4720 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721 vdev_space_update(dev->l2ad_vdev,
4722 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723 dev->l2ad_hand = dev->l2ad_start;
4724 dev->l2ad_evict = dev->l2ad_start;
4725 dev->l2ad_first = B_FALSE;
4726 }
4727
4728 dev->l2ad_writing = B_TRUE;
4729 (void) zio_wait(pio);
4730 dev->l2ad_writing = B_FALSE;
4731
4732 return (write_asize);
4733 }
4734
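/*
 * Worked example of the hand alignment above (a sketch, assuming a cache
 * vdev with 4KB sectors, i.e. vdev_ashift = 12): a buffer whose compressed
 * payload is 5000 bytes is issued with buf_sz = 5000, but
 * vdev_psize_to_asize() rounds the space charged on the device up to
 * 8192 bytes, so l2ad_hand and write_psize advance by 8192 while
 * write_asize only grows by 5000. This is why the function may return
 * fewer bytes than the distance the device hand moved.
 */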
4735 /*
4977 }
4978
4979 boolean_t
4980 l2arc_vdev_present(vdev_t *vd)
4981 {
4982 l2arc_dev_t *dev;
4983
4984 mutex_enter(&l2arc_dev_mtx);
4985 for (dev = list_head(l2arc_dev_list); dev != NULL;
4986 dev = list_next(l2arc_dev_list, dev)) {
4987 if (dev->l2ad_vdev == vd)
4988 break;
4989 }
4990 mutex_exit(&l2arc_dev_mtx);
4991
4992 return (dev != NULL);
4993 }
4994
4995 /*
4996 * Add a vdev for use by the L2ARC. By this point the spa has already
4997 * validated the vdev and opened it.
4998 */
4999 void
5000 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5001 {
5002 l2arc_dev_t *adddev;
5003
5004 ASSERT(!l2arc_vdev_present(vd));
5005
5006 /*
5007 * Create a new l2arc device entry.
5008 */
5009 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5010 adddev->l2ad_spa = spa;
5011 adddev->l2ad_vdev = vd;
5012 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5013 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5014 adddev->l2ad_hand = adddev->l2ad_start;
5015 adddev->l2ad_evict = adddev->l2ad_start;
5016 adddev->l2ad_first = B_TRUE;
5017 adddev->l2ad_writing = B_FALSE;
5018
5019 /*
5020 * This is a list of all ARC buffers that are still valid on the
5021 * device.
5022 */
5023 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5024 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5025 offsetof(arc_buf_hdr_t, b_l2node));
5026
5027 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5028
5029 /*
5030 * Add device to global list
5031 */
5032 mutex_enter(&l2arc_dev_mtx);
5033 list_insert_head(l2arc_dev_list, adddev);
5034 atomic_inc_64(&l2arc_ndev);
5035 mutex_exit(&l2arc_dev_mtx);
5036 }
5037
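/*
 * Call-path sketch (the exact route through spa.c is not shown in this
 * excerpt): l2arc_add_vdev() is reached when a cache device is added from
 * userland, e.g.
 *
 *	# zpool add tank cache c0t5d0
 *
 * and again for each existing cache vdev when its pool is imported or
 * opened.
 */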
5038 /*
5039 * Remove a vdev from the L2ARC.
5040 */
5041 void
5042 l2arc_remove_vdev(vdev_t *vd)
5043 {
5044 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5045
5046 /*
5047 * Find the device by vdev
5048 */
5049 mutex_enter(&l2arc_dev_mtx);
5050 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5051 nextdev = list_next(l2arc_dev_list, dev);
5052 if (vd == dev->l2ad_vdev) {
5053 remdev = dev;
5054 break;
5055 }
5056 }
5057 ASSERT(remdev != NULL);
5058
5059 /*
5060 * Remove device from global list
5061 */
5062 list_remove(l2arc_dev_list, remdev);
5063 l2arc_dev_last = NULL; /* may have been invalidated */
5064 atomic_dec_64(&l2arc_ndev);
5065 mutex_exit(&l2arc_dev_mtx);
5066
5067 /*
5068 * Clear all buflists and ARC references. L2ARC device flush.
5069 */
5070 l2arc_evict(remdev, 0, B_TRUE);
5071 list_destroy(remdev->l2ad_buflist);
5072 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5073 kmem_free(remdev, sizeof (l2arc_dev_t));
5074 }
5075
5076 void
5077 l2arc_init(void)
5078 {
5079 l2arc_thread_exit = 0;
5080 l2arc_ndev = 0;
5081 l2arc_writes_sent = 0;
5082 l2arc_writes_done = 0;
5083
5084 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5085 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5086 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5087 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5088 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5089
5121 {
5122 if (!(spa_mode_global & FWRITE))
5123 return;
5124
5125 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5126 TS_RUN, minclsyspri);
5127 }
5128
5129 void
5130 l2arc_stop(void)
5131 {
5132 if (!(spa_mode_global & FWRITE))
5133 return;
5134
5135 mutex_enter(&l2arc_feed_thr_lock);
5136 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5137 l2arc_thread_exit = 1;
5138 while (l2arc_thread_exit != 0)
5139 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5140 mutex_exit(&l2arc_feed_thr_lock);
5141 }
|
119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139 #include <sys/byteorder.h>
140
141 #ifndef _KERNEL
142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
143 boolean_t arc_watch = B_FALSE;
144 int arc_procfd;
145 #endif
146
147 static kmutex_t arc_reclaim_thr_lock;
148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
149 static uint8_t arc_thread_exit;
150
151 extern int zfs_write_limit_shift;
152 extern uint64_t zfs_write_limit_max;
153 extern kmutex_t zfs_write_limit_lock;
154
155 #define ARC_REDUCE_DNLC_PERCENT 3
156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157
158 typedef enum arc_reclaim_strategy {
159 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
291 kstat_named_t arcstat_l2_feeds;
292 kstat_named_t arcstat_l2_rw_clash;
293 kstat_named_t arcstat_l2_read_bytes;
294 kstat_named_t arcstat_l2_write_bytes;
295 kstat_named_t arcstat_l2_writes_sent;
296 kstat_named_t arcstat_l2_writes_done;
297 kstat_named_t arcstat_l2_writes_error;
298 kstat_named_t arcstat_l2_writes_hdr_miss;
299 kstat_named_t arcstat_l2_evict_lock_retry;
300 kstat_named_t arcstat_l2_evict_reading;
301 kstat_named_t arcstat_l2_free_on_write;
302 kstat_named_t arcstat_l2_abort_lowmem;
303 kstat_named_t arcstat_l2_cksum_bad;
304 kstat_named_t arcstat_l2_io_error;
305 kstat_named_t arcstat_l2_size;
306 kstat_named_t arcstat_l2_asize;
307 kstat_named_t arcstat_l2_hdr_size;
308 kstat_named_t arcstat_l2_compress_successes;
309 kstat_named_t arcstat_l2_compress_zeros;
310 kstat_named_t arcstat_l2_compress_failures;
311 kstat_named_t arcstat_l2_meta_writes;
312 kstat_named_t arcstat_l2_meta_avg_size;
313 kstat_named_t arcstat_l2_meta_avg_asize;
314 kstat_named_t arcstat_l2_asize_to_meta_ratio;
315 kstat_named_t arcstat_l2_rebuild_attempts;
316 kstat_named_t arcstat_l2_rebuild_successes;
317 kstat_named_t arcstat_l2_rebuild_unsupported;
318 kstat_named_t arcstat_l2_rebuild_timeout;
319 kstat_named_t arcstat_l2_rebuild_arc_bytes;
320 kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
321 kstat_named_t arcstat_l2_rebuild_bufs;
322 kstat_named_t arcstat_l2_rebuild_bufs_precached;
323 kstat_named_t arcstat_l2_rebuild_metabufs;
324 kstat_named_t arcstat_l2_rebuild_uberblk_errors;
325 kstat_named_t arcstat_l2_rebuild_io_errors;
326 kstat_named_t arcstat_l2_rebuild_cksum_errors;
327 kstat_named_t arcstat_l2_rebuild_loop_errors;
328 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
329 kstat_named_t arcstat_memory_throttle_count;
330 kstat_named_t arcstat_duplicate_buffers;
331 kstat_named_t arcstat_duplicate_buffers_size;
332 kstat_named_t arcstat_duplicate_reads;
333 kstat_named_t arcstat_meta_used;
334 kstat_named_t arcstat_meta_limit;
335 kstat_named_t arcstat_meta_max;
336 } arc_stats_t;
337
338 static arc_stats_t arc_stats = {
339 { "hits", KSTAT_DATA_UINT64 },
340 { "misses", KSTAT_DATA_UINT64 },
341 { "demand_data_hits", KSTAT_DATA_UINT64 },
342 { "demand_data_misses", KSTAT_DATA_UINT64 },
343 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
344 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
345 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
346 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
347 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
348 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
375 { "l2_feeds", KSTAT_DATA_UINT64 },
376 { "l2_rw_clash", KSTAT_DATA_UINT64 },
377 { "l2_read_bytes", KSTAT_DATA_UINT64 },
378 { "l2_write_bytes", KSTAT_DATA_UINT64 },
379 { "l2_writes_sent", KSTAT_DATA_UINT64 },
380 { "l2_writes_done", KSTAT_DATA_UINT64 },
381 { "l2_writes_error", KSTAT_DATA_UINT64 },
382 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
383 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
384 { "l2_evict_reading", KSTAT_DATA_UINT64 },
385 { "l2_free_on_write", KSTAT_DATA_UINT64 },
386 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
387 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
388 { "l2_io_error", KSTAT_DATA_UINT64 },
389 { "l2_size", KSTAT_DATA_UINT64 },
390 { "l2_asize", KSTAT_DATA_UINT64 },
391 { "l2_hdr_size", KSTAT_DATA_UINT64 },
392 { "l2_compress_successes", KSTAT_DATA_UINT64 },
393 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
394 { "l2_compress_failures", KSTAT_DATA_UINT64 },
395 { "l2_meta_writes", KSTAT_DATA_UINT64 },
396 { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
397 { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
398 { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
399 { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
400 { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
401 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
402 { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
403 { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
404 { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
405 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
406 { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
407 { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
408 { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
409 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
410 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
411 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
412 { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
413 { "memory_throttle_count", KSTAT_DATA_UINT64 },
414 { "duplicate_buffers", KSTAT_DATA_UINT64 },
415 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
416 { "duplicate_reads", KSTAT_DATA_UINT64 },
417 { "arc_meta_used", KSTAT_DATA_UINT64 },
418 { "arc_meta_limit", KSTAT_DATA_UINT64 },
419 { "arc_meta_max", KSTAT_DATA_UINT64 }
420 };
421
422 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
423
424 #define ARCSTAT_INCR(stat, val) \
425 atomic_add_64(&arc_stats.stat.value.ui64, (val))
426
427 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
428 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
429
430 #define ARCSTAT_MAX(stat, val) { \
431 uint64_t m; \
432 while ((val) > (m = arc_stats.stat.value.ui64) && \
440 /*
441 * We define a macro to allow ARC hits/misses to be easily broken down by
442 * two separate conditions, giving a total of four different subtypes for
443 * each of hits and misses (so eight statistics total).
444 */
445 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
446 if (cond1) { \
447 if (cond2) { \
448 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
449 } else { \
450 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
451 } \
452 } else { \
453 if (cond2) { \
454 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
455 } else { \
456 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
457 } \
458 }
459
460 /*
461 * This macro allows us to use kstats as floating averages. Each time we
462 * update this kstat, we first factor it and the update value by
463 * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
464 * average. This macro assumes that integer loads and stores are atomic, but
465 * is not safe for multiple writers updating the kstat in parallel (only the
466 * last writer's update will remain).
467 */
468 #define ARCSTAT_F_AVG_FACTOR 3
469 #define ARCSTAT_F_AVG(stat, value) \
470 do { \
471 uint64_t x = ARCSTAT(stat); \
472 x = x - x / ARCSTAT_F_AVG_FACTOR + \
473 (value) / ARCSTAT_F_AVG_FACTOR; \
474 ARCSTAT(stat) = x; \
475 _NOTE(NOTREACHED) \
476 _NOTE(CONSTCOND) \
477 } while (0)
478
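/*
 * Worked example of the average above: with ARCSTAT_F_AVG_FACTOR == 3, a
 * stored average of 90 and a new sample of 30,
 *
 *	x = 90 - 90/3 + 30/3 = 90 - 30 + 10 = 70
 *
 * i.e. each update keeps 2/3 of the old average and mixes in 1/3 of the
 * new sample (an exponential moving average, subject to the truncation of
 * the two integer divisions).
 */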
479 kstat_t *arc_ksp;
480 static arc_state_t *arc_anon;
481 static arc_state_t *arc_mru;
482 static arc_state_t *arc_mru_ghost;
483 static arc_state_t *arc_mfu;
484 static arc_state_t *arc_mfu_ghost;
485 static arc_state_t *arc_l2c_only;
486
487 /*
488 * There are several ARC variables that are critical to export as kstats --
489 * but we don't want to have to grovel around in the kstat whenever we wish to
490 * manipulate them. For these variables, we therefore define them to be in
491 * terms of the statistic variable. This assures that we are not introducing
492 * the possibility of inconsistency by having shadow copies of the variables,
493 * while still allowing the code to be readable.
494 */
495 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
496 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
497 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
498 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
666 #define L2ARC_FEED_SECS 1 /* caching interval secs */
667 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
668
669 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
670 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
671
672 /* L2ARC Performance Tunables */
673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
675 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
679 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
680 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
681 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
682
683 /*
684 * L2ARC Internals
685 */
686 typedef struct l2arc_dev l2arc_dev_t;
687 static list_t L2ARC_dev_list; /* device list */
688 static list_t *l2arc_dev_list; /* device list pointer */
689 static kmutex_t l2arc_dev_mtx; /* device list mutex */
690 static l2arc_dev_t *l2arc_dev_last; /* last device used */
691 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
692 static list_t L2ARC_free_on_write; /* free after write buf list */
693 static list_t *l2arc_free_on_write; /* free after write list ptr */
694 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
695 static uint64_t l2arc_ndev; /* number of devices */
696
697 typedef struct l2arc_read_callback {
698 arc_buf_t *l2rcb_buf; /* read buffer */
699 spa_t *l2rcb_spa; /* spa */
700 blkptr_t l2rcb_bp; /* original blkptr */
701 zbookmark_t l2rcb_zb; /* original bookmark */
702 int l2rcb_flags; /* original flags */
703 enum zio_compress l2rcb_compress; /* applied compress */
704 } l2arc_read_callback_t;
705
706 typedef struct l2arc_write_callback {
707 l2arc_dev_t *l2wcb_dev; /* device info */
708 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
709 uint8_t *l2wcb_pbuf; /* pbuf sent in this write */
710 uint32_t l2wcb_pbuf_size; /* size of committed pbuf */
711 uint8_t *l2wcb_ub_buf; /* uberblock in this write */
712 } l2arc_write_callback_t;
713
714 struct l2arc_buf_hdr {
715 /* protected by arc_buf_hdr mutex */
716 l2arc_dev_t *b_dev; /* L2ARC device */
717 uint64_t b_daddr; /* disk address, offset byte */
718 /* compression applied to buffer data */
719 enum zio_compress b_compress;
720 /* real alloc'd buffer size depending on b_compress applied */
721 int b_asize;
722 /* temporary buffer holder for in-flight compressed data */
723 void *b_tmp_cdata;
724 };
725
726 typedef struct l2arc_data_free {
727 /* protected by l2arc_free_on_write_mtx */
728 void *l2df_data;
729 size_t l2df_size;
730 void (*l2df_func)(void *, size_t);
731 list_node_t l2df_list_node;
732 } l2arc_data_free_t;
733
734 static kmutex_t l2arc_feed_thr_lock;
735 static kcondvar_t l2arc_feed_thr_cv;
736 static uint8_t l2arc_thread_exit;
737
738 static void l2arc_read_done(zio_t *zio);
739 static void l2arc_hdr_stat_add(boolean_t from_arc);
740 static void l2arc_hdr_stat_remove(void);
741
742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
744 enum zio_compress c);
745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
746
747 typedef enum {
748 L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
749 L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */
750 } l2uberblock_flags_t;
751
752 typedef struct l2uberblock {
753 uint32_t ub_magic;
754 uint8_t ub_version;
755 l2uberblock_flags_t ub_flags;
756
757 uint64_t ub_spa_guid;
758 uint64_t ub_birth;
759 uint64_t ub_evict_tail; /* current evict pointer */
760 uint64_t ub_alloc_space; /* vdev space alloc status */
761 uint64_t ub_pbuf_daddr; /* address of newest pbuf */
762 uint32_t ub_pbuf_asize; /* size of newest pbuf */
763 zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */
764
765 zio_cksum_t ub_cksum; /* cksum of uberblock */
766 } l2uberblock_t;
767
768 typedef enum {
769 L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
770 L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */
771 } l2pbuf_flags_t;
772
773 typedef struct l2pbuf {
774 uint32_t pb_magic;
775 unsigned int pb_version;
776 l2pbuf_flags_t pb_flags;
777
778 uint64_t pb_prev_daddr; /* address of previous pbuf */
779 uint32_t pb_prev_asize; /* size of previous pbuf */
780 zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */
781
782 /*
783 * This is a set of item lists that are contained in this pbuf. Each
784 * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
785 * This serves as a soft timeout feature - once the limit of the
786 * number of item lists that a pbuf can hold is reached, the pbuf is
787 * flushed to stable storage, regardless of its total size.
788 */
789 list_t *pb_buflists_list;
790
791 /*
792 * Number of compressed bytes referenced by items in this pbuf and
793 * the number of lists present.
794 * This is not actually written to storage, it is only used by
795 * internal algorithms which check for when a pbuf reaches a
796 * certain size limit, after which it is flushed in a write.
797 */
798 uint64_t pb_payload_asz;
799 /* Same thing for number of buflists */
800 int pb_nbuflists;
801
802 /*
803 * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
804 * This is then used by l2arc_pbuf_restore to update used space
805 * on the L2ARC vdev.
806 */
807 size_t pb_asize;
808 } l2pbuf_t;
809
810 typedef struct l2pbuf_buf l2pbuf_buf_t;
811 typedef struct l2pbuf_buflist {
812 uint32_t l2pbl_nbufs;
813 l2pbuf_buf_t *l2pbl_bufs;
814 list_node_t l2pbl_node;
815 } l2pbuf_buflist_t;
816
817 struct l2pbuf_buf {
818 dva_t b_dva; /* dva of buffer */
819 uint64_t b_birth; /* birth txg of buffer */
820 uint64_t b_cksum0;
821 zio_cksum_t b_freeze_cksum;
822 uint32_t b_size; /* uncompressed buf size */
823 uint64_t b_l2daddr; /* buf location on l2dev */
824 uint32_t b_l2asize; /* actual buf data size */
825 enum zio_compress b_l2compress; /* compression applied */
826 uint16_t b_contents_type;
827 uint32_t b_flags;
828 };
829
830 struct l2arc_dev {
831 vdev_t *l2ad_vdev; /* vdev */
832 spa_t *l2ad_spa; /* spa */
833 uint64_t l2ad_hand; /* next write location */
834 uint64_t l2ad_start; /* first addr on device */
835 uint64_t l2ad_end; /* last addr on device */
836 uint64_t l2ad_evict; /* last addr eviction reached */
837 boolean_t l2ad_first; /* first sweep through */
838 boolean_t l2ad_writing; /* currently writing */
839 list_t *l2ad_buflist; /* buffer list */
840 list_node_t l2ad_node; /* device list node */
841 l2pbuf_t l2ad_pbuf; /* currently open pbuf */
842 uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */
843 uint64_t l2ad_pbuf_asize; /* prev pbuf asize */
844 zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */
845 /* uberblock birth counter - incremented for each committed uberblk */
846 uint64_t l2ad_uberblock_birth;
847 /* flag indicating whether a rebuild is currently going on */
848 boolean_t l2ad_rebuilding;
849 };
850
851 /* Stores information about an L2ARC prefetch zio */
852 typedef struct l2arc_prefetch_info {
853 uint8_t *pi_buf; /* where the zio writes to */
854 uint64_t pi_buflen; /* length of `buf' */
855 zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */
856 } l2arc_prefetch_info_t;
857
858 /* 256 x 4k of l2uberblocks */
859 #define L2UBERBLOCK_SIZE 4096
860 #define L2UBERBLOCK_MAGIC 0x12bab10c
861 #define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */
862 #define L2PBUF_MAGIC 0xdb0faba6
863 #define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */
864 #define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */
865 #define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */
866 #define L2PBUF_ENCODED_SIZE(_pb) \
867 (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
868 /*
869 * Allocation limit for the payload of a pbuf. This also fundamentally
870 * limits the number of bufs we can reference in a pbuf.
871 */
872 #define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
873 #define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
874 #define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */
875 #define L2PBUF_MAXSZ (100 * 1024 * 1024) /* maximum pbuf size */
876 #define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */
877 #define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */
878 #define L2PBUF_IS_FULL(_pb) \
879 ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
880 (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
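/*
 * Sanity arithmetic for the limits above, derived from the on-disk item
 * layout documented in the large comment later in this file: one encoded
 * buf entry is 16 (dva) + 8 (birth) + 8 (cksum0) + 32 (freeze_cksum) +
 * 4 (size) + 8 (l2daddr) + 4 (l2asize) + 1 (compress) + 1 (contents_type) +
 * 2 (reserved) + 4 (flags) = 88 bytes = L2PBUF_BUF_SIZE, so a payload
 * capped at 24MB can reference at most 24MB / 88 = 285,975 buffers
 * (L2PBUF_MAX_BUFS).
 */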
881 /*
882 * These are the flags we allow to persist in L2ARC pbufs. The other flags
883 * of an ARC buffer pertain to the buffer's runtime behavior.
884 */
885 #define L2ARC_PERSIST_FLAGS \
886 (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
887
888 /*
889 * Used during L2ARC rebuild after each read operation to check whether we
890 * haven't exceeded the rebuild timeout value.
891 */
892 #define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
893 do { \
894 if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
895 __VA_ARGS__; \
896 ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
897 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
898 "dropping remaining L2ARC metadata."); \
899 return; \
900 } \
901 _NOTE(NOTREACHED) \
902 _NOTE(CONSTCOND) \
903 } while (0)
904
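/*
 * Usage sketch (the shape of the rebuild loop is assumed here; the real
 * l2arc_rebuild() body is not part of this excerpt): the caller computes
 * a deadline once and re-checks it after every read, passing any cleanup
 * it needs as the variadic part:
 *
 *	int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
 *	for (;;) {
 *		... read and restore one pbuf ...
 *		L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
 *	}
 */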
905 /*
906 * Performance tuning of L2ARC persistency:
907 *
908 * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
909 * compressing it.
910 * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
911 * referenced from a pbuf. Once a pbuf reaches this size, it is
912 * committed to stable storage. Ideally, there should be approx.
913 * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
914 * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
915 * be buffered in a pbuf before it is committed to L2ARC. This
916 * puts a soft temporal upper bound on pbuf commit intervals.
917 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
918 * pool import or when adding one manually later) will attempt
919 * to rebuild L2ARC buffer contents. In special circumstances,
920 * the administrator may want to set this to B_FALSE, if they
921 * are having trouble importing a pool or attaching an L2ARC
922 * device (e.g. the L2ARC device is slow to read in stored pbuf
923 * metadata, or the metadata has become somehow
924 * fragmented/unusable).
925 * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
926 * avoid a slow L2ARC device from preventing pool import. If we
927 * are not done rebuilding an L2ARC device by this time, we
928 * stop the rebuild and return immediately.
929 */
930 uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
931 uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
932 uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
933 boolean_t l2arc_rebuild_enabled = B_TRUE;
934 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
935
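/*
 * Example (a sketch using illumos /etc/system syntax): an administrator
 * who cannot import a pool because of a slow or damaged cache device
 * could disable the rebuild step for the next boot with
 *
 *	set zfs:l2arc_rebuild_enabled = 0
 *
 * or shorten the rebuild deadline, e.g. to 10 seconds:
 *
 *	set zfs:l2arc_rebuild_timeout = 10
 */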
936 static void l2arc_rebuild_start(l2arc_dev_t *dev);
937 static void l2arc_rebuild(l2arc_dev_t *dev);
938 static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
939 static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
940 uint64_t guid);
941
942 static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
943 static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
944 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
945 static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
946 uint32_t asize);
947 static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
948 static void l2arc_pbuf_prefetch_abort(zio_t *zio);
949
950 static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
951 static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
952 static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
953 uint64_t guid);
954 static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
955 l2arc_write_callback_t *cb);
956
957 static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
958 static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
959 l2pbuf_t *pbuf);
960 static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
961 uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
962 static void l2arc_pbuf_init(l2pbuf_t *pb);
963 static void l2arc_pbuf_destroy(l2pbuf_t *pb);
964 static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
965 l2arc_write_callback_t *cb);
966 static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
967 static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
968 const arc_buf_hdr_t *ab, int index);
969 static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
970
971 static uint64_t
972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
973 {
974 uint8_t *vdva = (uint8_t *)dva;
975 uint64_t crc = -1ULL;
976 int i;
977
978 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
979
980 for (i = 0; i < sizeof (dva_t); i++)
981 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
982
983 crc ^= (spa>>8) ^ birth;
984
985 return (crc);
986 }
987
988 #define BUF_EMPTY(buf) \
989 ((buf)->b_dva.dva_word[0] == 0 && \
990 (buf)->b_dva.dva_word[1] == 0 && \
1491 if (use_mutex)
1492 mutex_exit(&new_state->arcs_mtx);
1493 }
1494 }
1495
1496 ASSERT(!BUF_EMPTY(ab));
1497 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1498 buf_hash_remove(ab);
1499
1500 /* adjust state sizes */
1501 if (to_delta)
1502 atomic_add_64(&new_state->arcs_size, to_delta);
1503 if (from_delta) {
1504 ASSERT3U(old_state->arcs_size, >=, from_delta);
1505 atomic_add_64(&old_state->arcs_size, -from_delta);
1506 }
1507 ab->b_state = new_state;
1508
1509 /* adjust l2arc hdr stats */
1510 if (new_state == arc_l2c_only)
1511 l2arc_hdr_stat_add(old_state != arc_anon);
1512 else if (old_state == arc_l2c_only)
1513 l2arc_hdr_stat_remove();
1514 }
1515
1516 void
1517 arc_space_consume(uint64_t space, arc_space_type_t type)
1518 {
1519 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1520
1521 switch (type) {
1522 case ARC_SPACE_DATA:
1523 ARCSTAT_INCR(arcstat_data_size, space);
1524 break;
1525 case ARC_SPACE_OTHER:
1526 ARCSTAT_INCR(arcstat_other_size, space);
1527 break;
1528 case ARC_SPACE_HDRS:
1529 ARCSTAT_INCR(arcstat_hdr_size, space);
1530 break;
1531 case ARC_SPACE_L2HDRS:
1595 hdr->b_type = type;
1596 hdr->b_spa = spa_load_guid(spa);
1597 hdr->b_state = arc_anon;
1598 hdr->b_arc_access = 0;
1599 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1600 buf->b_hdr = hdr;
1601 buf->b_data = NULL;
1602 buf->b_efunc = NULL;
1603 buf->b_private = NULL;
1604 buf->b_next = NULL;
1605 hdr->b_buf = buf;
1606 arc_get_data_buf(buf);
1607 hdr->b_datacnt = 1;
1608 hdr->b_flags = 0;
1609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1610 (void) refcount_add(&hdr->b_refcnt, tag);
1611
1612 return (buf);
1613 }
1614
1615 /*
1616 * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1617 * This is used during l2arc reconstruction to make empty ARC buffers
1618 * which circumvent the regular disk->arc->l2arc path and instead come
1619 * into being in the reverse order, i.e. l2arc->arc->(disk).
1620 */
1621 arc_buf_hdr_t *
1622 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1623 {
1624 arc_buf_hdr_t *hdr;
1625
1626 ASSERT3U(size, >, 0);
1627 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1628 ASSERT(BUF_EMPTY(hdr));
1629 hdr->b_size = size;
1630 hdr->b_type = type;
1631 hdr->b_spa = guid;
1632 hdr->b_state = arc_anon;
1633 hdr->b_arc_access = 0;
1634 hdr->b_buf = NULL;
1635 hdr->b_datacnt = 0;
1636 hdr->b_flags = 0;
1637 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1638
1639 return (hdr);
1640 }
1641
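/*
 * Consumer sketch: the rebuild path (l2arc_hdr_restore(), declared earlier
 * but whose body is not part of this excerpt) is expected to wire a
 * persisted pbuf entry into such a header roughly as follows -- treat the
 * exact field assignments as an illustration, not the implementation:
 *
 *	hdr = arc_buf_hdr_alloc(guid, buf->b_size, buf->b_contents_type);
 *	hdr->b_dva = buf->b_dva;
 *	hdr->b_birth = buf->b_birth;
 *	hdr->b_flags = buf->b_flags & L2ARC_PERSIST_FLAGS;
 *	(then attach an l2arc_buf_hdr_t carrying b_l2daddr/b_l2asize, hash
 *	 the header in, and move it to the arc_l2c_only state)
 *
 * No data buffer is allocated; the data stays on the cache device until a
 * read actually needs it.
 */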
1642 static char *arc_onloan_tag = "onloan";
1643
1644 /*
1645 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1646 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1647 * buffers must be returned to the arc before they can be used by the DMU or
1648 * freed.
1649 */
1650 arc_buf_t *
1651 arc_loan_buf(spa_t *spa, int size)
1652 {
1653 arc_buf_t *buf;
1654
1655 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1656
1657 atomic_add_64(&arc_loaned_bytes, size);
1658 return (buf);
1659 }
1660
1661 /*
4254 * l2arc_noprefetch skip caching prefetched buffers
4255 * l2arc_headroom number of max device writes to precache
4256 * l2arc_headroom_boost when we find compressed buffers during ARC
4257 * scanning, we multiply headroom by this
4258 * percentage factor for the next scan cycle,
4259 * since more compressed buffers are likely to
4260 * be present
4261 * l2arc_feed_secs seconds between L2ARC writing
4262 *
4263 * Tunables may be removed or added as future performance improvements are
4264 * integrated, and also may become zpool properties.
4265 *
4266 * There are three key functions that control how the L2ARC warms up:
4267 *
4268 * l2arc_write_eligible() check if a buffer is eligible to cache
4269 * l2arc_write_size() calculate how much to write
4270 * l2arc_write_interval() calculate sleep delay between writes
4271 *
4272 * These three functions determine what to write, how much, and how quickly
4273 * to send writes.
4274 *
4275 * L2ARC persistency:
4276 *
4277 * When writing buffers to L2ARC, we periodically add some metadata to
4278 * make sure we can pick them up after reboot, thus dramatically reducing
4279 * the impact that any downtime has on the performance of storage systems
4280 * with large caches.
4281 *
4282 * The implementation works fairly simply by integrating the following two
4283 * modifications:
4284 *
4285 * *) Every now and then, at end of an L2ARC feed cycle, we append a piece
4286 * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
4287 * write. This allows us to understand what's been written, so that
4288 * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
4289 * The pbuf also includes a "back-reference" pointer to the previous
4290 * pbuf, forming a linked list of pbufs on the L2ARC device.
4291 *
4292 * *) We reserve 4k of space at the start of each L2ARC device for our
4293 * header bookkeeping purposes. This contains a single 4k uberblock, which
4294 * contains our top-level reference structures. We update it on each pbuf
4295 * write. If this write results in an inconsistent uberblock (e.g. due to
4296 * power failure), we detect this by verifying the uberblock's checksum
4297 * and simply drop the entries from L2ARC. Once an L2ARC pbuf update
4298 * completes, we update the uberblock to point to it.
4299 *
4300 * Implementation diagram:
4301 *
4302 * +=== L2ARC device (not to scale) ======================================+
4303 * | ____________newest pbuf pointer_____________ |
4304 * | / \ |
4305 * | / V |
4306 * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
4307 * | ^ / ^ / ^ / |
4308 * | `-prev-' `-prev-' `-prev-' |
4309 * | pbuf pbuf pbuf |
4310 * +======================================================================+
4311 *
4312 * On-device data structures:
4313 *
4314 * (L2ARC persistent uberblock)
4315 * struct l2uberblock {
4316 * (these fields are in network byte order)
4317 * uint32_t magic = 0x12bab10c; l2-ber-block
4318 * uint8_t version = 0x1;
4319 * uint8_t reserved = 0x0;
4320 * uint16_t ublk_flags; see l2uberblock_flags_t
4321 *
4322 * (byte order of fields below determined by `ublk_flags')
4323 * uint64_t spa_guid; what pool this l2arc dev belongs to
4324 * uint64_t birth_txg; ublk with highest birth_txg is newest
4325 * uint64_t evict_tail; current evict pointer on l2arc dev
4326 * uint64_t alloc_space; how much space is alloc'd on the dev
4327 * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t
4328 * uint32_t pbuf_asize; size of newest pbuf
4329 * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf
4330 *
4331 * uint8_t reserved[3980] = {0x0, 0x0, ... 0x0};
4332 *
4333 * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
4334 * } l2dev_uberblock;
4335 *
4336 * (L2ARC persistent buffer list)
4337 * typedef struct l2pbuf_t {
4338 * (these fields are in network byte order)
4339 * uint32_t magic = 0xdb0faba6; the-buffer-bag
4340 * uint8_t version = 0x1;
4341 * uint8_t reserved = 0x0;
4342 * uint16_t pbuf_flags; see l2pbuf_flags_t
4343 *
4344 * (byte order of fields below determined by `pbuf_flags')
4345 * uint64_t prev_pbuf_daddr; previous pbuf dev addr
4346 * uint32_t prev_pbuf_asize; previous pbuf size
4347 * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf)
4348 *
4349 * uint32_t items_size; uncompressed size of `items' below
4350 * (if (pbuf_flags & compress) decompress `items' prior to decoding)
4351 * struct l2pbuf_buf_item {
4352 * (these fields mirror [l2]arc_buf_hdr fields)
4353 * uint64_t dva[2]; buffer's DVA
4354 * uint64_t birth; buffer's birth TXG in ARC
4355 * uint64_t cksum0; lower 64-bits of buffer's cksum
4356 * uint64_t freeze_cksum[4]; buffer's freeze cksum
4357 * uint32_t size; uncompressed buffer data size
4358 * uint64_t l2daddr; device address (offset) of buf
4359 * uint32_t l2asize; actual space occupied by buf
4360 * uint8_t compress; compress algo used on data
4361 * uint8_t contents_type; buffer's contents type
4362 * uint16_t reserved = 0x0; for alignment and future use
4363 * uint32_t flags; buffer's persistent flags
4364 * } items[]; continues for remainder of pbuf
4365 * } l2pbuf_t;
4366 *
4367 * L2ARC reconstruction:
4368 *
4369 * When writing data, we simply write in the standard rotary fashion,
4370 * evicting buffers as we go and simply writing new data over them (appending
4371 * an updated l2pbuf_t every now and then). This obviously means that once we
4372 * loop around the end of the device, we will start cutting into an already
4373 * committed l2pbuf (and its referenced data buffers), like so:
4374 *
4375 * current write head__ __old tail
4376 * \ /
4377 * V V
4378 * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|-->
4379 * ^ ^^^^^^^^^_____________________________
4380 * | \
4381 * <<nextwrite>> - will overwrite this pbuf --/
4382 *
4383 * When importing the pool, we detect this situation and use it to stop
4384 * our scanning process:
4385 * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
4386 * previous one.
4387 * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
4388 *	then prev_pbuf is invalid, so stop scanning (go to step 3 below).
4389 * 3) if (this is the last valid pbuf)
4390 * discard this pbuf as well (its ARC bufs may have been damaged by a
4391 * partial overwrite).
4392 * (We could potentially salvage the remaining good arc bufs above in step 3,
4393 * but the cost of doing so probably outweighs the value of the entire pbuf).
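 *
 * In pseudo-code, the rebuild scan therefore looks roughly like this
 * (error handling, prefetching and timeouts omitted; see l2arc_rebuild()
 * for the real logic):
 *
 *	read(l2uberblock); read(newest pbuf) -> pb;
 *	while (read_and_verify(pb.prev_pbuf) -> pb_prev succeeds) {
 *		restore pb's buffers to ARC;
 *		pb = pb_prev;
 *	}
 *	discard pb;	(the last valid pbuf, per step 3 above)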
4394 *
4395 * There is one significant caveat to consider when rebuilding ARC contents
4396 * from an L2ARC device: what about invalidated buffers? Given the above
4397 * construction, we cannot update pbufs which we've already written to amend
4398 * them to remove buffers which were invalidated. Thus, during reconstruction,
4399 * we might be populating the cache with buffers for data that's not on the
4400 * main pool anymore, or may have been overwritten!
4401 *
4402 * As it turns out, this isn't a problem. Every arc_read request includes
4403 * both the DVA and, crucially, the birth TXG of the BP the caller is
4404 * looking for. So even if the cache were populated by completely rotten
4405 * blocks for data that had been long deleted and/or overwritten, we'll
4406 * never actually return bad data from the cache, since the DVA with the
4407 * birth TXG uniquely identify a block in space and time - once created,
4408 * a block is immutable on disk. The worst thing we have done is wasted
4409 * some time and memory at l2arc rebuild to reconstruct outdated ARC
4410 * entries that will get dropped from the l2arc as it is being updated
4411 * with new blocks.
4412 */
4413
4414 static boolean_t
4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4416 {
4417 /*
4418 * A buffer is *not* eligible for the L2ARC if it:
4419 * 1. belongs to a different spa.
4420 * 2. is already cached on the L2ARC.
4421 * 3. has an I/O in progress (it may be an incomplete read).
4422 * 4. is flagged not eligible (zfs property).
4423 */
4424 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4425 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4426 return (B_FALSE);
4427
4428 return (B_TRUE);
4429 }
4430
4431 static uint64_t
4458 clock_t interval, next, now;
4459
4460 /*
4461 * If the ARC lists are busy, increase our write rate; if the
4462 * lists are stale, idle back. This is achieved by checking
4463 * how much we previously wrote - if it was more than half of
4464 * what we wanted, schedule the next write much sooner.
4465 */
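	/*
	 * Example: with l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200
	 * (the usual defaults), a busy cycle that wrote more than half of
	 * its target sleeps only ~200ms (hz / 5 ticks) before the next
	 * write, while an idle cycle waits the full second.
	 */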
4466 if (l2arc_feed_again && wrote > (wanted / 2))
4467 interval = (hz * l2arc_feed_min_ms) / 1000;
4468 else
4469 interval = hz * l2arc_feed_secs;
4470
4471 now = ddi_get_lbolt();
4472 next = MAX(now, MIN(now + interval, began + interval));
4473
4474 return (next);
4475 }
4476
4477 static void
4478 l2arc_hdr_stat_add(boolean_t from_arc)
4479 {
4480 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4481 if (from_arc)
4482 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4483 }
4484
4485 static void
4486 l2arc_hdr_stat_remove(void)
4487 {
4488 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4489 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4490 }
4491
4492 /*
4493 * Cycle through L2ARC devices. This is how L2ARC load balances.
4494 * If a device is returned, this also returns holding the spa config lock.
4495 */
4496 static l2arc_dev_t *
4497 l2arc_dev_get_next(void)
4498 {
4499 l2arc_dev_t *first, *next = NULL;
4500
4501 /*
4502 * Lock out the removal of spas (spa_namespace_lock), then removal
4503 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4504 * both locks will be dropped and a spa config lock held instead.
4505 */
4506 mutex_enter(&spa_namespace_lock);
4507 mutex_enter(&l2arc_dev_mtx);
4508
4509 /* if there are no vdevs, there is nothing to do */
4510 if (l2arc_ndev == 0)
4511 goto out;
4512
4513 first = NULL;
4514 next = l2arc_dev_last;
4515 do {
4516 /*
4517 * Loop around the list looking for a non-faulted vdev
4518 * and one that isn't currently doing an L2ARC rebuild.
4519 */
4520 if (next == NULL) {
4521 next = list_head(l2arc_dev_list);
4522 } else {
4523 next = list_next(l2arc_dev_list, next);
4524 if (next == NULL)
4525 next = list_head(l2arc_dev_list);
4526 }
4527
4528 /* if we have come back to the start, bail out */
4529 if (first == NULL)
4530 first = next;
4531 else if (next == first)
4532 break;
4533
4534 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4535
4536 /* if we were unable to find any usable vdevs, return NULL */
4537 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4538 next = NULL;
4539
4540 l2arc_dev_last = next;
4541
4542 out:
4543 mutex_exit(&l2arc_dev_mtx);
4544
4545 /*
4546 * Grab the config lock to prevent the 'next' device from being
4547 * removed while we are writing to it.
4548 */
4549 if (next != NULL)
4550 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4551 mutex_exit(&spa_namespace_lock);
4552
4553 return (next);
4554 }
4555
4556 /*
4557 * Free buffers that were tagged for destruction.
4595 ASSERT(cb != NULL);
4596 dev = cb->l2wcb_dev;
4597 ASSERT(dev != NULL);
4598 head = cb->l2wcb_head;
4599 ASSERT(head != NULL);
4600 buflist = dev->l2ad_buflist;
4601 ASSERT(buflist != NULL);
4602 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4603 l2arc_write_callback_t *, cb);
4604
4605 if (zio->io_error != 0)
4606 ARCSTAT_BUMP(arcstat_l2_writes_error);
4607
4608 mutex_enter(&l2arc_buflist_mtx);
4609
4610 /*
4611 * All writes completed, or an error was hit.
4612 */
4613 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4614 ab_prev = list_prev(buflist, ab);
4615 abl2 = ab->b_l2hdr;
4616
4617 /*
4618 * Release the temporary compressed buffer as soon as possible.
4619 */
4620 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4621 l2arc_release_cdata_buf(ab);
4622
4623 hash_lock = HDR_LOCK(ab);
4624 if (!mutex_tryenter(hash_lock)) {
4625 /*
4626 * This buffer misses out. It may be in a stage
4627 * of eviction. Its ARC_L2_WRITING flag will be
4628 * left set, denying reads to this buffer.
4629 */
4630 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4631 continue;
4632 }
4633
4634 if (zio->io_error != 0) {
4635 /*
4636 * Error - drop L2ARC entry.
4637 */
4638 list_remove(buflist, ab);
4639 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4640 ab->b_l2hdr = NULL;
4641 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4642 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4643 }
4644
4645 /*
4646 * Allow ARC to begin reads to this L2ARC entry.
4647 */
4648 ab->b_flags &= ~ARC_L2_WRITING;
4649
4650 mutex_exit(hash_lock);
4651 }
4652
4653 atomic_inc_64(&l2arc_writes_done);
4654 list_remove(buflist, head);
4655 kmem_cache_free(hdr_cache, head);
4656 mutex_exit(&l2arc_buflist_mtx);
4657
4658 l2arc_do_free_on_write();
4659
4660 if (cb->l2wcb_pbuf)
4661 kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
4662 if (cb->l2wcb_ub_buf)
4663 kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4664 kmem_free(cb, sizeof (l2arc_write_callback_t));
4665 }
4666
4667 /*
4668 * A read to a cache device completed. Validate buffer contents before
4669 * handing over to the regular ARC routines.
4670 */
4671 static void
4672 l2arc_read_done(zio_t *zio)
4673 {
4674 l2arc_read_callback_t *cb;
4675 arc_buf_hdr_t *hdr;
4676 arc_buf_t *buf;
4677 kmutex_t *hash_lock;
4678 int equal;
4679
4680 ASSERT(zio->io_vd != NULL);
4681 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4682
4683 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4925 *
4926 * Returns the number of bytes actually written (which may be smaller than
4927 * the delta by which the device hand has changed due to alignment).
4928 */
4929 static uint64_t
4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4931 boolean_t *headroom_boost)
4932 {
4933 arc_buf_hdr_t *ab, *ab_prev, *head;
4934 list_t *list;
4935 uint64_t write_asize, write_psize, write_sz, headroom,
4936 buf_compress_minsz;
4937 void *buf_data;
4938 kmutex_t *list_lock;
4939 boolean_t full;
4940 l2arc_write_callback_t *cb;
4941 zio_t *pio, *wzio;
4942 uint64_t guid = spa_load_guid(spa);
4943 const boolean_t do_headroom_boost = *headroom_boost;
4944
4945 /* persistency-related */
4946 l2pbuf_t *pb;
4947 l2pbuf_buflist_t *pb_buflist;
4948 int num_bufs, buf_index;
4949
4950 ASSERT(dev->l2ad_vdev != NULL);
4951
4952 /* Lower the flag now, we might want to raise it again later. */
4953 *headroom_boost = B_FALSE;
4954
4955 pio = NULL;
4956 cb = NULL;
4957 write_sz = write_asize = write_psize = 0;
4958 full = B_FALSE;
4959 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4960 head->b_flags |= ARC_L2_WRITE_HEAD;
4961
4962 /*
4963 * We will want to try to compress buffers that are at least 2x the
4964 * device sector size.
4965 */
4966 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4967
4968 pb = &dev->l2ad_pbuf;
4969 num_bufs = 0;
4970
4977 /*
4978 * Copy buffers for L2ARC writing.
4979 */
4980 mutex_enter(&l2arc_buflist_mtx);
4981 for (int try = 0; try <= 3; try++) {
4982 uint64_t passed_sz = 0;
4983
4984 list = l2arc_list_locked(try, &list_lock);
4985
4986 /*
4987 * L2ARC fast warmup.
4988 *
4989 * Until the ARC is warm and starts to evict, read from the
4990 * head of the ARC lists rather than the tail.
4991 */
4992 if (arc_warm == B_FALSE)
4993 ab = list_head(list);
4994 else
4995 ab = list_tail(list);
4996
4997 headroom = target_sz * l2arc_headroom;
5027
5028 if (!l2arc_write_eligible(guid, ab)) {
5029 mutex_exit(hash_lock);
5030 continue;
5031 }
5032
5033 if ((write_sz + ab->b_size) > target_sz) {
5034 full = B_TRUE;
5035 mutex_exit(hash_lock);
5036 break;
5037 }
5038
5039 if (pio == NULL) {
5040 /*
5041 * Insert a dummy header on the buflist so
5042 * l2arc_write_done() can find where the
5043 * write buffers begin without searching.
5044 */
5045 list_insert_head(dev->l2ad_buflist, head);
5046
5047 cb = kmem_zalloc(
5048 sizeof (l2arc_write_callback_t), KM_SLEEP);
5049 cb->l2wcb_dev = dev;
5050 cb->l2wcb_head = head;
5051 pio = zio_root(spa, l2arc_write_done, cb,
5052 ZIO_FLAG_CANFAIL);
5053 }
5054
5055 /*
5056 * Create and add a new L2ARC header.
5057 */
5058 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5059 l2hdr->b_dev = dev;
5060 ab->b_flags |= ARC_L2_WRITING;
5061
5062 /*
5063 * Temporarily stash the data buffer in b_tmp_cdata.
5064 * The subsequent write step will pick it up from
5065 * there. This is because we can't access ab->b_buf
5066 * without holding the hash_lock, which we in turn
5067 * can't access without holding the ARC list locks
5069 */
5070 l2hdr->b_compress = ZIO_COMPRESS_OFF;
5071 l2hdr->b_asize = ab->b_size;
5072 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5073
5074 buf_sz = ab->b_size;
5075 ab->b_l2hdr = l2hdr;
5076
5077 list_insert_head(dev->l2ad_buflist, ab);
5078
5079 /*
5080 * Compute and store the buffer cksum before
5081 * writing. On debug the cksum is verified first.
5082 */
5083 arc_cksum_verify(ab->b_buf);
5084 arc_cksum_compute(ab->b_buf, B_TRUE);
5085
5086 mutex_exit(hash_lock);
5087
5088 write_sz += buf_sz;
5089 num_bufs++;
5090 }
5091
5092 mutex_exit(list_lock);
5093
5094 if (full == B_TRUE)
5095 break;
5096 }
5097
5098 /* No buffers selected for writing? */
5099 if (pio == NULL) {
5100 ASSERT0(write_sz);
5101 mutex_exit(&l2arc_buflist_mtx);
5102 kmem_cache_free(hdr_cache, head);
5103 return (0);
5104 }
5105
5106 /* expand the pbuf to include a new list */
5107 pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
5108
5109 /*
5110 * Now start writing the buffers. We're starting at the write head
5111 * and work backwards, retracing the course of the buffer selector
5112 * loop above.
5113 */
5114 for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
5115 ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
5116 l2arc_buf_hdr_t *l2hdr;
5117 uint64_t buf_sz;
5118
5119 /*
5120 * We shouldn't need to lock the buffer here, since we flagged
5121 * it as ARC_L2_WRITING in the previous step, but we must take
5122 * care to only access its L2 cache parameters. In particular,
5123 * ab->b_buf may be invalid by now due to ARC eviction.
5124 */
5125 l2hdr = ab->b_l2hdr;
5126 l2hdr->b_daddr = dev->l2ad_hand;
5127
5128 if ((ab->b_flags & ARC_L2COMPRESS) &&
5129 l2hdr->b_asize >= buf_compress_minsz) {
5130 if (l2arc_compress_buf(l2hdr)) {
5131 /*
5132 * If compression succeeded, enable headroom
5133 * boost on the next scan cycle.
5134 */
5135 *headroom_boost = B_TRUE;
5147 if (buf_sz != 0) {
5148 uint64_t buf_p_sz;
5149
5150 wzio = zio_write_phys(pio, dev->l2ad_vdev,
5151 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5152 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5153 ZIO_FLAG_CANFAIL, B_FALSE);
5154
5155 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5156 zio_t *, wzio);
5157 (void) zio_nowait(wzio);
5158
5159 write_asize += buf_sz;
5160 /*
5161 * Keep the clock hand suitably device-aligned.
5162 */
5163 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5164 write_psize += buf_p_sz;
5165 dev->l2ad_hand += buf_p_sz;
5166 }
5167
5168 l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
5169 }
5170 ASSERT(buf_index == num_bufs);
5171 mutex_exit(&l2arc_buflist_mtx);
5172
5173 ASSERT3U(write_asize, <=, target_sz);
5174 ARCSTAT_BUMP(arcstat_l2_writes_sent);
5175 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5176 ARCSTAT_INCR(arcstat_l2_size, write_sz);
5177 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5178 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5179
5180 /* Is it time to commit this pbuf? */
5181 if (L2PBUF_IS_FULL(pb) &&
5182 dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
5183 l2arc_pbuf_commit(dev, pio, cb);
5184 l2arc_pbuf_destroy(pb);
5185 l2arc_pbuf_init(pb);
5186 }
5187
5188 /*
5189 * Bump device hand to the device start if it is approaching the end.
5190 * l2arc_evict() will already have evicted ahead for this case.
5191 */
5192 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5193 vdev_space_update(dev->l2ad_vdev,
5194 dev->l2ad_end - dev->l2ad_hand, 0, 0);
5195 dev->l2ad_hand = dev->l2ad_start;
5196 dev->l2ad_evict = dev->l2ad_start;
5197 dev->l2ad_first = B_FALSE;
5198 }
5199
5200 dev->l2ad_writing = B_TRUE;
5201 (void) zio_wait(pio);
5202 dev->l2ad_writing = B_FALSE;
5203
5204 return (write_asize);
5205 }
5206
5207 /*
5449 }
5450
5451 boolean_t
5452 l2arc_vdev_present(vdev_t *vd)
5453 {
5454 l2arc_dev_t *dev;
5455
5456 mutex_enter(&l2arc_dev_mtx);
5457 for (dev = list_head(l2arc_dev_list); dev != NULL;
5458 dev = list_next(l2arc_dev_list, dev)) {
5459 if (dev->l2ad_vdev == vd)
5460 break;
5461 }
5462 mutex_exit(&l2arc_dev_mtx);
5463
5464 return (dev != NULL);
5465 }
5466
5467 /*
5468 * Add a vdev for use by the L2ARC. By this point the spa has already
5469 * validated the vdev and opened it. The `rebuild' flag indicates whether
5470 * we should attempt an L2ARC persistency rebuild.
5471 */
5472 void
5473 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5474 {
5475 l2arc_dev_t *adddev;
5476
5477 ASSERT(!l2arc_vdev_present(vd));
5478
5479 /*
5480 * Create a new l2arc device entry.
5481 */
5482 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5483 adddev->l2ad_spa = spa;
5484 adddev->l2ad_vdev = vd;
5485 adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5486 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5487 adddev->l2ad_hand = adddev->l2ad_start;
5488 adddev->l2ad_evict = adddev->l2ad_start;
5489 adddev->l2ad_first = B_TRUE;
5490 adddev->l2ad_writing = B_FALSE;
5491 l2arc_pbuf_init(&adddev->l2ad_pbuf);
5492
5493 /*
5494 * This is a list of all ARC buffers that are still valid on the
5495 * device.
5496 */
5497 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5498 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5499 offsetof(arc_buf_hdr_t, b_l2node));
5500
5501 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5502
5503 /*
5504 * Add device to global list
5505 */
5506 mutex_enter(&l2arc_dev_mtx);
5507 list_insert_head(l2arc_dev_list, adddev);
5508 atomic_inc_64(&l2arc_ndev);
5509 if (rebuild && l2arc_rebuild_enabled) {
5510 adddev->l2ad_rebuilding = B_TRUE;
5511 (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
5512 0, &p0, TS_RUN, minclsyspri);
5513 }
5514 mutex_exit(&l2arc_dev_mtx);
5515 }
5516
5517 /*
5518 * Remove a vdev from the L2ARC.
5519 */
5520 void
5521 l2arc_remove_vdev(vdev_t *vd)
5522 {
5523 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5524
5525 /*
5526 * Find the device by vdev
5527 */
5528 mutex_enter(&l2arc_dev_mtx);
5529 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5530 nextdev = list_next(l2arc_dev_list, dev);
5531 if (vd == dev->l2ad_vdev) {
5532 remdev = dev;
5533 break;
5534 }
5535 }
5536 ASSERT(remdev != NULL);
5537
5538 /*
5539 * Remove device from global list
5540 */
5541 list_remove(l2arc_dev_list, remdev);
5542 l2arc_dev_last = NULL; /* may have been invalidated */
5543 atomic_dec_64(&l2arc_ndev);
5544 mutex_exit(&l2arc_dev_mtx);
5545
5546 /*
5547 * Clear all buflists and ARC references. L2ARC device flush.
5548 */
5549 l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5550 l2arc_evict(remdev, 0, B_TRUE);
5551 list_destroy(remdev->l2ad_buflist);
5552 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5553 kmem_free(remdev, sizeof (l2arc_dev_t));
5554 }
5555
5556 void
5557 l2arc_init(void)
5558 {
5559 l2arc_thread_exit = 0;
5560 l2arc_ndev = 0;
5561 l2arc_writes_sent = 0;
5562 l2arc_writes_done = 0;
5563
5564 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5565 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5566 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5567 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5568 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5569
5601 {
5602 if (!(spa_mode_global & FWRITE))
5603 return;
5604
5605 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5606 TS_RUN, minclsyspri);
5607 }
5608
5609 void
5610 l2arc_stop(void)
5611 {
5612 if (!(spa_mode_global & FWRITE))
5613 return;
5614
5615 mutex_enter(&l2arc_feed_thr_lock);
5616 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5617 l2arc_thread_exit = 1;
5618 while (l2arc_thread_exit != 0)
5619 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5620 mutex_exit(&l2arc_feed_thr_lock);
5621 }
5622
5623 /*
5624 * Main entry point for L2ARC metadata rebuilding. This function must be
5625 * called via thread_create so that the L2ARC metadata rebuild doesn't block
5626 * pool import and may proceed in parallel on all available L2ARC devices.
5627 */
5628 static void
5629 l2arc_rebuild_start(l2arc_dev_t *dev)
5630 {
5631 vdev_t *vd = dev->l2ad_vdev;
5632 spa_t *spa = dev->l2ad_spa;
5633
5634 /* Lock out device removal. */
5635 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5636 ASSERT(dev->l2ad_rebuilding == B_TRUE);
5637 l2arc_rebuild(dev);
5638 dev->l2ad_rebuilding = B_FALSE;
5639 spa_config_exit(spa, SCL_L2ARC, vd);
5640 thread_exit();
5641 }
5642
5643 /*
5644 * This function implements the actual L2ARC metadata rebuild. It:
5645 *
5646 * 1) scans the device for valid l2uberblocks
5647 * 2) if it finds a good uberblock, starts reading the pbuf chain
5648 * 3) restores each pbuf's contents to memory
5649 *
5650 * Operation stops under any of the following conditions:
5651 *
5652 * 1) We reach the end of the pbuf chain (the previous-buffer reference
5653 * in the pbuf is zero).
5654 * 2) We encounter *any* error condition (cksum errors, io errors, looped
5655 * pbufs, etc.).
5656 * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
5657 * severely fragmented L2ARC pbufs or slow L2ARC devices from
5658 * preventing a machine from importing the pool (and to let the
5659 * administrator take corrective action, e.g. by kicking the misbehaving
5660 * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661 * rebuilding disabled).
5662 */
5663 static void
5664 l2arc_rebuild(l2arc_dev_t *dev)
5665 {
5666 int err;
5667 l2uberblock_t ub;
5668 l2pbuf_t pb;
5669 zio_t *this_io = NULL, *next_io = NULL;
5670 int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671
5672 if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673 return;
5674 L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675
5676 /* set up uberblock update info */
5677 dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678
5679 /* initial sanity checks */
5680 l2arc_pbuf_init(&pb);
5681 if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682 ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683 /* root pbuf is bad, we can't do anything about that */
5684 if (err == EINVAL) {
5685 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686 } else {
5687 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688 }
5689 l2arc_pbuf_destroy(&pb);
5690 return;
5691 }
5692 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693
5694 dev->l2ad_evict = ub.ub_evict_tail;
5695
5696 /* keep on chaining in new blocks */
5697 dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698 dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699 dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701 ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702 dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703
5704 /* start the rebuild process */
5705 for (;;) {
5706 l2pbuf_t pb_prev;
5707
5708 l2arc_pbuf_init(&pb_prev);
5709 if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710 pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711 &next_io)) != 0) {
5712 /*
5713 * We are done reading, discard the last good buffer.
5714 */
5715 if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716 pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717 /* this is an error, we stopped too early */
5718 if (err == EINVAL) {
5719 ARCSTAT_BUMP(
5720 arcstat_l2_rebuild_cksum_errors);
5721 } else {
5722 ARCSTAT_BUMP(
5723 arcstat_l2_rebuild_io_errors);
5724 }
5725 }
5726 l2arc_pbuf_destroy(&pb_prev);
5727 l2arc_pbuf_destroy(&pb);
5728 break;
5729 }
5730
5731 /*
5732 * Protection against infinite loops of pbufs. This is also
5733 * our primary termination mechanism - once the buffer list
5734 * loops around our starting pbuf, we can stop.
5735 */
5736 if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737 pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739 l2arc_pbuf_destroy(&pb);
5740 l2arc_pbuf_destroy(&pb_prev);
5741 if (next_io)
5742 l2arc_pbuf_prefetch_abort(next_io);
5743 return;
5744 }
5745
5746 /*
5747 * Our memory pressure valve. If the system is running low
5748 * on memory, rather than swamping memory with new ARC buf
5749 * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750 * however, we have already set up our L2ARC dev to chain in
5751 * new metadata pbufs, so the user may choose to re-add the
5752 * L2ARC dev at a later time to reconstruct it (when there's
5753 * less memory pressure).
5754 */
5755 if (arc_reclaim_needed()) {
5756 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5757 cmn_err(CE_NOTE, "System running low on memory, "
5758 "aborting L2ARC rebuild.");
5759 l2arc_pbuf_destroy(&pb);
5760 l2arc_pbuf_destroy(&pb_prev);
5761 if (next_io)
5762 l2arc_pbuf_prefetch_abort(next_io);
5763 break;
5764 }
5765
5766 /*
5767 * Now that we know that the prev_pbuf checks out alright, we
5768 * can start reconstruction from this pbuf - we can be sure
5769 * that the L2ARC write hand has not yet reached any of our
5770 * buffers.
5771 */
5772 l2arc_pbuf_restore(dev, &pb);
5773
5774 /* pbuf restored, continue with next one in the list */
5775 l2arc_pbuf_destroy(&pb);
5776 pb = pb_prev;
5777 this_io = next_io;
5778 next_io = NULL;
5779
5780 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5781 }
5782
5783 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5784 }
5785
5786 /*
5787 * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
5788 * which only contain an l2arc hdr, essentially restoring the buffers to
5789 * their L2ARC evicted state. This function also updates space usage on the
5790 * L2ARC vdev to make sure it tracks restored buffers.
5791 */
5792 static void
5793 l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
5794 {
5795 spa_t *spa;
5796 uint64_t guid;
5797 list_t *buflists_list;
5798 l2pbuf_buflist_t *buflist;
5799
5800 mutex_enter(&l2arc_buflist_mtx);
5801 spa = dev->l2ad_vdev->vdev_spa;
5802 guid = spa_load_guid(spa);
5803 buflists_list = pb->pb_buflists_list;
5804 for (buflist = list_head(buflists_list); buflist;
5805 buflist = list_next(buflists_list, buflist)) {
5806 int i;
5807 uint64_t size, asize, psize;
5808
5809 size = asize = psize = 0;
5810 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
5811 l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
5812 guid);
5813 size += buflist->l2pbl_bufs[i].b_size;
5814 asize += buflist->l2pbl_bufs[i].b_l2asize;
5815 psize += vdev_psize_to_asize(dev->l2ad_vdev,
5816 buflist->l2pbl_bufs[i].b_l2asize);
5817 }
5818 ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
5819 ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
5820 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
5821 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5822 }
5823 mutex_exit(&l2arc_buflist_mtx);
5824 ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
5825 vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
5826 pb->pb_asize), 0, 0);
5827 }
5828
5829 /*
5830 * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
5831 * a state indicating that it has been evicted to L2ARC.
5832 * The `guid' here is the ARC-load-guid from spa_load_guid.
5833 */
5834 static void
5835 l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
5836 {
5837 arc_buf_hdr_t *hdr;
5838 kmutex_t *hash_lock;
5839 dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
5840
5841 hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
5842 if (hdr == NULL) {
5843 /* not in cache, try to insert */
5844 arc_buf_hdr_t *exists;
5845 arc_buf_contents_t type = buf->b_contents_type;
5846 l2arc_buf_hdr_t *l2hdr;
5847
5848 hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
5849 hdr->b_dva = buf->b_dva;
5850 hdr->b_birth = buf->b_birth;
5851 hdr->b_cksum0 = buf->b_cksum0;
5852 hdr->b_size = buf->b_size;
5853 exists = buf_hash_insert(hdr, &hash_lock);
5854 if (exists) {
5855 /* somebody beat us to the hash insert */
5856 mutex_exit(hash_lock);
5857 arc_hdr_destroy(hdr);
5858 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5859 return;
5860 }
5861 hdr->b_flags = buf->b_flags;
5862 mutex_enter(&hdr->b_freeze_lock);
5863 ASSERT(hdr->b_freeze_cksum == NULL);
5864 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
5865 KM_SLEEP);
5866 *hdr->b_freeze_cksum = buf->b_freeze_cksum;
5867 mutex_exit(&hdr->b_freeze_lock);
5868
5869 /* now rebuild the l2arc entry */
5870 ASSERT(hdr->b_l2hdr == NULL);
5871 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5872 l2hdr->b_dev = dev;
5873 l2hdr->b_daddr = buf->b_l2daddr;
5874 l2hdr->b_asize = buf->b_l2asize;
5875 l2hdr->b_compress = buf->b_l2compress;
5876 hdr->b_l2hdr = l2hdr;
5877 list_insert_head(dev->l2ad_buflist, hdr);
5878 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5879 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5880
5881 arc_change_state(arc_l2c_only, hdr, hash_lock);
5882 }
5883 mutex_exit(hash_lock);
5884 }
5885
5886 /*
5887 * Attempts to locate and read the newest valid uberblock on the provided
5888 * L2ARC device and writes it to `ub'. On success, this function returns 0,
5889 * otherwise the appropriate error code is returned.
5890 */
5891 static int
5892 l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
5893 {
5894 int err = 0;
5895 uint8_t *ub_buf;
5896 uint64_t guid;
5897
5898 ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
5899 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
5900 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
5901
5902 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
5903 VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
5904 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
5905 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
5906 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
5907 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5908 goto cleanup;
5909 }
5910
5911 /*
5912 * Initial peek - does the device even have any usable uberblocks?
5913 * If not, don't bother continuing.
5914 */
5915 l2arc_uberblock_decode(ub_buf, ub);
5916 if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
5917 ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
5918 ub->ub_spa_guid != guid) {
5919 err = ENOTSUP;
5920 ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
5921 goto cleanup;
5922 }
5923
5924 /* now check to make sure that what we selected is okay */
5925 if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
5926 if (err == EINVAL) {
5927 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5928 } else {
5929 ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
5930 }
5931 goto cleanup;
5932 }
5933
5934 /* this uberblock is valid */
5935
5936 cleanup:
5937 kmem_free(ub_buf, L2UBERBLOCK_SIZE);
5938 return (err);
5939 }
5940
5941 /*
5942 * Reads a pbuf from storage, decodes it and validates its contents against
5943 * the provided checksum. The result is placed in `pb'.
5944 *
5945 * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
5946 * When issuing the first pbuf IO during rebuild, you should pass NULL for
5947 * `this_io'. This function will then issue a sync IO to read the pbuf and
5948 * also issue an async IO to fetch the next pbuf in the pbuf chain. The
5949 * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
5950 * function, pass the value returned in `prefetch_io' from the previous
5951 * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
5952 * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
5953 * pointer to be NULL. If no prefetch IO was issued, the pointer is left
5954 * set at NULL.
5955 *
5956 * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
5957 * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
5958 * IO is used internally in this function to be able to `peek' at the next
5959 * buffer's header before the main IO to read it in completely has finished.
5960 * We can then begin to issue the IO for the next buffer in the chain before
5961 * we are done reading, keeping the L2ARC device's pipeline saturated with
5962 * reads (rather than issuing an IO, waiting for it to complete, validating
5963 * the returned buffer and issuing the next one). This will make sure that
5964 * the rebuild proceeds at maximum read throughput.
5965 *
5966 * On success, this function returns 0, otherwise it returns an appropriate
5967 * error code. On error the prefetching IO is aborted and cleared before
5968 * returning from this function. Therefore, if we return `success', the
5969 * caller can assume that we have taken care of cleanup of prefetch IOs.
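 *
 * Typical call pattern during rebuild (error handling omitted; see
 * l2arc_rebuild() for the complete logic):
 *
 *	zio_t *this_io = NULL, *next_io = NULL;
 *
 *	err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
 *	    ub.ub_pbuf_cksum, &pb, NULL, &this_io);
 *	for (;;) {
 *		err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
 *		    pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev,
 *		    this_io, &next_io);
 *		...
 *		this_io = next_io;	(advance the prefetch pipeline)
 *		next_io = NULL;
 *	}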
5970 */
5971 static int
5972 l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
5973 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
5974 {
5975 int err = 0;
5976 uint64_t prev_pb_start;
5977 uint32_t prev_pb_asize;
5978 zio_cksum_t calc_cksum, prev_pb_cksum;
5979 l2arc_prefetch_info_t *pi = NULL;
5980
5981 ASSERT(dev != NULL);
5982 ASSERT(pb != NULL);
5983 ASSERT(*prefetch_io == NULL);
5984
5985 if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
5986 /* We could not have issued a prefetch IO for this */
5987 ASSERT(this_io == NULL);
5988 return (EINVAL);
5989 }
5990
5991 /*
5992 * Check to see if we have issued the IO for this pbuf in a previous
5993 * run. If not, issue it now.
5994 */
5995 if (this_io == NULL)
5996 this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
5997
5998 /* Pick up the prefetch info buffer and read its contents */
5999 pi = this_io->io_private;
6000 ASSERT(pi != NULL);
6001 ASSERT(asize <= pi->pi_buflen);
6002
6003 /* Wait for the IO to read this pbuf's header to complete */
6004 if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
6005 (void) zio_wait(this_io);
6006 goto cleanup;
6007 }
6008
6009 /*
6010 * Peek to see if we can start issuing the next pbuf IO immediately.
6011 * At this point, only the current pbuf's header has been read.
6012 */
6013 if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
6014 &prev_pb_asize, &prev_pb_cksum) == 0) {
6015 uint64_t this_pb_start, this_pb_end, prev_pb_end;
6016 /* Detect malformed pbuf references and loops */
6017 this_pb_start = daddr;
6018 this_pb_end = daddr + asize;
6019 prev_pb_end = prev_pb_start + prev_pb_asize;
6020 if ((prev_pb_start >= this_pb_start && prev_pb_start <
6021 this_pb_end) ||
6022 (prev_pb_end >= this_pb_start && prev_pb_end <
6023 this_pb_end)) {
6024 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
6025 cmn_err(CE_WARN, "Looping L2ARC metadata reference "
6026 "detected, aborting rebuild.");
6027 err = EINVAL;
6028 goto cleanup;
6029 }
6030 /*
6031 * Start issuing IO for the next pbuf early - this should
6032 * help keep the L2ARC device busy while we read, decode
6033 * and restore this pbuf.
6034 */
6035 if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
6036 *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
6037 prev_pb_start, prev_pb_asize);
6038 }
6039
6040 /* Wait for the main pbuf IO to complete */
6041 if ((err = zio_wait(this_io)) != 0)
6042 goto cleanup;
6043
6044 /* Make sure the buffer checks out ok */
6045 fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
6046 if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
6047 err = EINVAL;
6048 goto cleanup;
6049 }
6050
6051 /* Now we can take our time decoding this buffer */
6052 if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
6053 goto cleanup;
6054
6055 /* This will be used in l2arc_pbuf_restore for space accounting */
6056 pb->pb_asize = asize;
6057
6058 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
6059 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
6060 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6061 pb->pb_payload_asz / asize);
6062
6063 cleanup:
6064 kmem_free(pi->pi_buf, pi->pi_buflen);
6065 pi->pi_buf = NULL;
6066 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6067 /* Abort an in-flight prefetch in case of error */
6068 if (err != 0 && *prefetch_io != NULL) {
6069 l2arc_pbuf_prefetch_abort(*prefetch_io);
6070 *prefetch_io = NULL;
6071 }
6072 return (err);
6073 }
6074
6075 /*
6076 * Validates a pbuf device address to make sure that it can be read
6077 * from the provided L2ARC device. Returns 1 if the pbuf lies within the
6078 * device's bounds, has a sane size and is correctly aligned, or 0 if not.
6079 */
6080 static int
6081 l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
6082 {
6083 uint32_t psize;
6084 uint64_t end;
6085
6086 psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
6087 end = daddr + psize;
6088
6089 if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
6090 asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
6091 /* check that the buffer address is correctly aligned */
6092 (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
6093 SPA_MINBLOCKSIZE) - 1)) != 0)
6094 return (0);
6095 else
6096 return (1);
6097 }
6098
6099 /*
6100 * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
6101 * reconstruction to start reading the next pbuf before we are done
6102 * decoding and reconstructing the current pbuf, to keep the l2arc device
6103 * nice and hot with read IO to process.
6104 * The returned zio will contain newly allocated memory buffers for the IO
6105 * data which should then be freed by the caller once the zio is no longer
6106 * needed (i.e. once it has completed). If you wish to abort this
6107 * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
6108 * of disposing of the allocated buffers correctly.
6109 */
6110 static zio_t *
6111 l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
6112 {
6113 uint32_t i, psize;
6114 zio_t *pio, *hdr_io;
6115 uint64_t hdr_rsize;
6116 uint8_t *buf;
6117 l2arc_prefetch_info_t *pinfo;
6118
6119 psize = vdev_psize_to_asize(vd, asize);
6120 buf = kmem_alloc(psize, KM_SLEEP);
6121 pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
6122 pinfo->pi_buf = buf;
6123 pinfo->pi_buflen = psize;
6124
6125 /*
6126 * We start issuing the IO for the pbuf header early. This
6127 * allows l2arc_pbuf_read to start issuing IO for the next
6128 * buffer before the current pbuf is read in completely.
6129 */
6130
6131 hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
6132 ASSERT(hdr_rsize <= psize);
6133 pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
6134 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6135 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
6136 hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
6137 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
6138 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6139 ZIO_FLAG_DONT_RETRY, B_FALSE);
6140 (void) zio_nowait(hdr_io);
6141
6142 /*
6143 * Read in the rest of the pbuf - this can take longer than just
6144 * having a peek at the header.
6145 */
6146 pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
6147 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6148 ZIO_FLAG_DONT_RETRY);
6149 for (i = hdr_rsize; i < psize; ) {
6150 uint64_t rsize = psize - i;
6151 zio_t *rzio;
6152
6153 if (psize - i > SPA_MAXBLOCKSIZE)
6154 rsize = SPA_MAXBLOCKSIZE;
6155 ASSERT(rsize >= SPA_MINBLOCKSIZE);
6156 rzio = zio_read_phys(pio, vd, daddr + i,
6157 rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6158 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
6159 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6160 ZIO_FLAG_DONT_RETRY, B_FALSE);
6161 (void) zio_nowait(rzio);
6162 i += rsize;
6163 }
6164
6165 return (pio);
6166 }
6167
6168 /*
6169 * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
6170 * buffers allocated for it.
6171 */
6172 static void
6173 l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 {
6175 l2arc_prefetch_info_t *pi;
6176
6177 pi = zio->io_private;
6178 ASSERT(pi != NULL);
6179 if (pi->pi_hdr_io != NULL)
6180 (void) zio_wait(pi->pi_hdr_io);
6181 (void) zio_wait(zio);
6182 kmem_free(pi->pi_buf, pi->pi_buflen);
6183 pi->pi_buf = NULL;
6184 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 }
6186
6187 /*
6188 * Encodes an l2uberblock_t structure into a destination buffer. This
6189 * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190 * uberblock is always of this constant size.
6191 */
6192 static void
6193 l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 {
6195 zio_cksum_t cksum;
6196
6197 bzero(buf, L2UBERBLOCK_SIZE);
6198
6199 #if defined(_BIG_ENDIAN)
6200 *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
6201 *(uint16_t *)(buf + 6) = L2UB_BIG_ENDIAN;
6202 #else /* !defined(_BIG_ENDIAN) */
6203 *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
6204 /* zero flags is ok */
6205 #endif /* !defined(_BIG_ENDIAN) */
6206 buf[4] = L2UBERBLOCK_MAX_VERSION;
6207
6208 /* rest in native byte order */
6209 *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210 *(uint64_t *)(buf + 16) = ub->ub_birth;
6211 *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212 *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213 *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214 *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215 bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216
6217 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218 bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 }
6220
6221 /*
6222 * Decodes an l2uberblock_t from an on-disk representation. Please note
6223 * that this function does not perform any uberblock validation and
6224 * checksumming - call l2arc_uberblock_verify() for that.
6225 */
6226 static void
6227 l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 {
6229 boolean_t bswap_needed;
6230
6231 /* these always come in big endian */
6232 #if defined(_BIG_ENDIAN)
6233 ub->ub_magic = *(uint32_t *)buf;
6234 ub->ub_flags = *(uint16_t *)(buf + 6);
6235 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 1);
6236 #else /* !defined(_BIG_ENDIAN) */
6237 ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238 ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 #endif /* !defined(_BIG_ENDIAN) */
6241 ub->ub_version = buf[4];
6242
6243 ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244 ub->ub_birth = *(uint64_t *)(buf + 16);
6245 ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246 ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247 ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248 ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
6249 bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250 bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251
6252 /* swap the rest if endianness doesn't match us */
6253 if (bswap_needed) {
6254 ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255 ub->ub_birth = BSWAP_64(ub->ub_birth);
6256 ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257 ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258 ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259 ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260 ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261 ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262 }
6263 }
6264
6265 /*
6266 * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267 * valid and matches its checksum.
6268 */
6269 static int
6270 l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
6271 uint64_t guid)
6272 {
6273 zio_cksum_t cksum;
6274
6275 if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
6276 ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
6277 /*
6278 * bad magic or invalid version => persistent l2arc not
6279 * supported
6280 */
6281 return (ENOTSUP);
6282
6283 if (ub->ub_spa_guid != guid)
6284 /* this l2arc dev isn't ours */
6285 return (EINVAL);
6286
6287 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6288 if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
6289 /* bad checksum, corrupt uberblock */
6290 return (EINVAL);
6291
6292 return (0);
6293 }
6294
6295 /*
6296 * Schedules a zio to update the uberblock on an l2arc device. The zio is
6297 * initiated as a child of `pio' and `cb' is filled with the information
6298 * needed to free the uberblock data buffer after writing.
6299 */
6300 static void
6301 l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6302 {
6303 uint8_t *ub_buf;
6304 l2uberblock_t ub;
6305 zio_t *wzio;
6306 vdev_stat_t st;
6307
6308 ASSERT(cb->l2wcb_ub_buf == NULL);
6309 vdev_get_stats(dev->l2ad_vdev, &st);
6310
6311 bzero(&ub, sizeof (ub));
6312 ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6313 ub.ub_birth = dev->l2ad_uberblock_birth++;
6314 ub.ub_evict_tail = dev->l2ad_evict;
6315 ub.ub_alloc_space = st.vs_alloc;
6316 ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
6317 ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
6318 ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
6319 if (dev->l2ad_first)
6320 ub.ub_flags |= L2UBLK_EVICT_FIRST;
6321
6322 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
6323 cb->l2wcb_ub_buf = ub_buf;
6324 l2arc_uberblock_encode(&ub, ub_buf);
6325 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6326 L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
6327 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6328 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6329 zio_t *, wzio);
6330 (void) zio_nowait(wzio);
6331 }
6332
6333 /*
6334 * Encodes a l2pbuf_t structure into the portable on-disk format. The
6335 * `buf' buffer must be suitably sized to hold the entire uncompressed
6336 * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
6337 * also compresses the buffer.
6338 *
6339 * The return value is the length of the resulting encoded pbuf structure.
6340 * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
6341 * was applied, or smaller if compression was applied. In either case,
6342 * prior to writing to disk, the caller must suitably pad the output
6343 * buffer so that it is aligned on a multiple of the underlying storage
6344 * system's block size.
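 *
 * A padding sketch (the actual code lives in l2arc_pbuf_commit()):
 *
 *	bufsize = vdev_psize_to_asize(dev->l2ad_vdev, L2PBUF_ENCODED_SIZE(pb));
 *	buf = kmem_zalloc(bufsize, KM_SLEEP);	(zero-filled, i.e. pre-padded)
 *	encsize = l2arc_pbuf_encode(pb, buf, bufsize);
 *	io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
 *	(then write `io_size' bytes of `buf' via zio_write_phys())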
6345 */
6346 static uint32_t
6347 l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
6348 {
6349 uint16_t flags = 0;
6350 uint8_t *dst_buf;
6351 uint32_t enclen;
6352 l2pbuf_buflist_t *buflist;
6353
6354 enclen = L2PBUF_ENCODED_SIZE(pb);
6355 ASSERT(buflen >= enclen);
6356 bzero(buf, enclen);
6357
6358 /* non-header portions of pbufs are in native byte order */
6359 *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
6360 *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
6361 bcopy(&pb->pb_prev_cksum, buf + 20, 32);
6362 *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
6363
6364 /* first we encode the buflists uncompressed */
6365 dst_buf = buf + L2PBUF_HDR_SIZE;
6366 for (buflist = list_head(pb->pb_buflists_list); buflist;
6367 buflist = list_next(pb->pb_buflists_list, buflist)) {
6368 int i;
6369
6370 ASSERT(buflist->l2pbl_nbufs != 0);
6371 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
6372 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6373
6374 ASSERT(pbl_buf->b_size != 0);
6375 *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
6376 *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
6377 *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
6378 *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
6379 bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
6380 *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
6381 *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
6382 *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
6383 dst_buf[80] = pbl_buf->b_l2compress;
6384 dst_buf[81] = pbl_buf->b_contents_type;
6385 *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
6386 dst_buf += L2PBUF_BUF_SIZE;
6387 }
6388 }
6389 ASSERT((uint32_t)(dst_buf - buf) == enclen);
6390
6391 /* and then compress them if necessary */
6392 if (enclen >= l2arc_pbuf_compress_minsz) {
6393 uint8_t *cbuf;
6394 size_t slen, clen;
6395
6396 slen = l2arc_pbuf_items_encoded_size(pb);
6397 cbuf = kmem_alloc(slen, KM_SLEEP);
6398 clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
6399 ASSERT(clen != 0);
6400 if (clen < slen) {
6401 bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
6402 flags |= L2PBUF_COMPRESSED;
6403 /* zero out the rest of the input buffer */
6404 bzero(buf + L2PBUF_HDR_SIZE + clen,
6405 buflen - (L2PBUF_HDR_SIZE + clen));
6406 /* adjust our buffer length now that it's shortened */
6407 enclen = L2PBUF_HDR_SIZE + clen;
6408 }
6409 kmem_free(cbuf, slen);
6410 }
6411
6412 /* the header goes last since `flags' may change due to compression */
6413 #if defined(_BIG_ENDIAN)
6414 *(uint32_t *)buf = L2PBUF_MAGIC;
6415 flags |= L2PBUF_BIG_ENDIAN;
6416 *(uint16_t *)(buf + 6) = flags;
6417 #else /* !defined(_BIG_ENDIAN) */
6418 *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
6419 *(uint16_t *)(buf + 6) = BSWAP_16(flags);
6420 #endif /* !defined(_BIG_ENDIAN) */
6421 buf[4] = L2PBUF_MAX_VERSION;
6422
6423 return (enclen);
6424 }
6425
6426 /*
6427 * Decodes a stored l2pbuf_t structure previously encoded using
6428 * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
6429 * must be initialized by l2arc_pbuf_init by the caller beforehand, but
6430 * must not have been used to store any buffers yet.
6431 *
6432 * Please note that we don't do checksum verification here, as we don't
6433 * know our own checksum (that's known by the previous block in the linked
6434 * list, or by the uberblock). This should be performed by the caller
6435 * prior to calling l2arc_pbuf_decode.
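 * (l2arc_pbuf_read() does this: it verifies the fletcher4 checksum of the
 * raw buffer before handing it to us.)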
6436 */
6437 static int
6438 l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 {
6440 boolean_t bswap_needed;
6441 uint32_t payload_sz, payload_asz;
6442 uint8_t *src_bufs;
6443 l2pbuf_buflist_t *buflist;
6444 int i, nbufs;
6445
6446 ASSERT(input_buf != NULL);
6447 ASSERT(pb != NULL);
6448 ASSERT(pb->pb_version != 0);
6449 ASSERT(pb->pb_nbuflists == 0);
6450
6451 /* no valid buffer can be this small */
6452 if (buflen < L2PBUF_HDR_SIZE)
6453 return (EINVAL);
6454
6455 /* these always come in big endian */
6456 #if defined(_BIG_ENDIAN)
6457 pb->pb_magic = *(uint32_t *)input_buf;
6458 pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 1);
6460 #else /* !defined(_BIG_ENDIAN) */
6461 pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462 pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 #endif /* !defined(_BIG_ENDIAN) */
6465 pb->pb_version = input_buf[4];
6466
6467 if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468 return (EINVAL);
6469 if (pb->pb_version > L2PBUF_MAX_VERSION)
6470 return (ENOTSUP);
6471
6472 /* remainder of pbuf may need bswap'ping */
6473 pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
6474 pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475 bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476 payload_sz = *(uint32_t *)(input_buf + 52);
6477 payload_asz = buflen - L2PBUF_HDR_SIZE;
6478
6479 if (bswap_needed) {
6480 pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
6481 pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482 ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483 payload_sz = BSWAP_32(payload_sz);
6484 }
6485
6486 /* check for sensible buffer allocation limits */
6487 if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488 (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489 (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490 return (EINVAL);
6491 nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492
6493 /* decompression might be needed */
6494 if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495 src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496 if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497 payload_asz, payload_sz, 0) != 0) {
6498 kmem_free(src_bufs, payload_sz);
6499 return (EINVAL);
6500 }
6501 } else {
6502 src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503 }
6504
6505 /* Decode individual pbuf items from our source buffer. */
6506 buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507 for (i = 0; i < nbufs; i++) {
6508 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509 const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510
6511 pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512 pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513 pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514 pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515 bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516 pbl_buf->b_size = *(uint32_t *)(src + 64);
6517 pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518 pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519 pbl_buf->b_l2compress = src[80];
6520 pbl_buf->b_contents_type = src[81];
6521 pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522
6523 if (bswap_needed) {
6524 pbl_buf->b_dva.dva_word[0] =
6525 BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526 pbl_buf->b_dva.dva_word[1] =
6527 BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528 pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529 pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530 ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531 pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532 pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533 pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534 pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535 }
6536
6537 pb->pb_payload_asz += pbl_buf->b_l2asize;
6538 }
6539
6540 if (pb->pb_flags & L2PBUF_COMPRESSED)
6541 kmem_free(src_bufs, payload_sz);
6542
6543 return (0);
6544 }
6545
6546 /*
6547 * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548 * during L2ARC reconstruction to "peek" at the next buffer and start
6549 * issuing IO to fetch it early, before decoding of the current buffer
6550 * is done (which can take time due to decompression).
6551 * Returns 0 on success (and fills in the return parameters `daddr',
6552 * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553 * on error.
6554 */
6555 static int
6556 l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557 uint32_t *asize, zio_cksum_t *cksum)
6558 {
6559 boolean_t bswap_needed;
6560 uint16_t version, flags;
6561 uint32_t magic;
6562
6563 ASSERT(buf != NULL);
6564
6565 /* no valid buffer can be this small */
6566 if (buflen <= L2PBUF_HDR_SIZE)
6567 return (EINVAL);
6568
6569 /* these always come in big endian */
6570 #if defined(_BIG_ENDIAN)
6571 magic = *(uint32_t *)buf;
6572 flags = *(uint16_t *)(buf + 6);
6573 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 1);
6574 #else /* !defined(_BIG_ENDIAN) */
6575 magic = BSWAP_32(*(uint32_t *)buf);
6576 flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 #endif /* !defined(_BIG_ENDIAN) */
6579 version = buf[4];
6580
6581 if (magic != L2PBUF_MAGIC || version == 0)
6582 return (EINVAL);
6583 if (version > L2PBUF_MAX_VERSION)
6584 return (ENOTSUP);
6585
6586 *daddr = *(uint64_t *)(buf + 8);
6587 *asize = *(uint32_t *)(buf + 16);
6588 bcopy(buf + 20, cksum, 32);
6589
6590 if (bswap_needed) {
6591 *daddr = BSWAP_64(*daddr);
6592 *asize = BSWAP_32(*asize);
6593 ZIO_CHECKSUM_BSWAP(cksum);
6594 }
6595
6596 return (0);
6597 }
6598
6599 /*
6600 * Initializes a pbuf structure into a clean state. All version and flags
6601 * fields are filled in as appropriate for this architecture.
6602 * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603 * as this function assumes the structure is uninitialized.
6604 */
6605 static void
6606 l2arc_pbuf_init(l2pbuf_t *pb)
6607 {
6608 bzero(pb, sizeof (l2pbuf_t));
6609 pb->pb_version = L2PBUF_MAX_VERSION;
6610 #if defined(_BIG_ENDIAN)
6611 	pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 #endif
6613 pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614 list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615 offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 }
6617
6618 /*
6619 * Destroys a pbuf structure and puts it into a clean state ready to be
6620 * initialized by l2arc_pbuf_init. All buflists created by
6621 * l2arc_pbuf_buflist_alloc are released as well.
6622 */
6623 static void
6624 l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 {
6626 list_t *buflist_list = pb->pb_buflists_list;
6627 l2pbuf_buflist_t *buflist;
6628
6629 while ((buflist = list_head(buflist_list)) != NULL) {
6630 ASSERT(buflist->l2pbl_nbufs > 0);
6631 kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632 buflist->l2pbl_nbufs);
6633 list_remove(buflist_list, buflist);
6634 kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635 }
6636 pb->pb_nbuflists = 0;
6637 list_destroy(pb->pb_buflists_list);
6638 kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639 bzero(pb, sizeof (l2pbuf_t));
6640 }
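/*
 * Illustrative sketch only (not compiled): the expected init/destroy
 * lifecycle of a pbuf. Because l2arc_pbuf_init() assumes an
 * uninitialized structure, a previously used pbuf must be destroyed
 * before it can be initialized again.
 */
#if 0
static void
l2arc_pbuf_lifecycle_sketch(void)
{
	l2pbuf_t pb;

	l2arc_pbuf_init(&pb);		/* clean state, empty buflist list */
	/* ... fill buflists and commit during L2ARC write cycles ... */
	l2arc_pbuf_destroy(&pb);	/* frees every buflist */
	l2arc_pbuf_init(&pb);		/* safe to reuse only after destroy */
	l2arc_pbuf_destroy(&pb);
}
#endif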
6641
6642 /*
6643 * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644 * buffers. This is used during the buffer write cycle - each cycle allocates
6645 * a new buflist and fills it with buffers it writes. Then, when the pbuf
6646  * reaches its buflist limit, it is committed to stable storage.
6647 */
6648 static l2pbuf_buflist_t *
6649 l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 {
6651 l2pbuf_buflist_t *buflist;
6652
6653 ASSERT(pb->pb_buflists_list != NULL);
6654 buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655 buflist->l2pbl_nbufs = nbufs;
6656 buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657 KM_SLEEP);
6658 list_insert_tail(pb->pb_buflists_list, buflist);
6659 pb->pb_nbuflists++;
6660
6661 return (buflist);
6662 }
6663
6664 /*
6665  * Inserts ARC buffer `ab' into buflist `pbl' of pbuf `pb' at position `index'.
6666 * The buffer being inserted must be present in L2ARC.
6667 */
6668 static void
6669 l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670 const arc_buf_hdr_t *ab, int index)
6671 {
6672 l2pbuf_buf_t *pb_buf;
6673 const l2arc_buf_hdr_t *l2hdr;
6674
6675 l2hdr = ab->b_l2hdr;
6676 ASSERT(l2hdr != NULL);
6677 ASSERT(pbl->l2pbl_nbufs > index);
6678
6679 pb_buf = &pbl->l2pbl_bufs[index];
6680 pb_buf->b_dva = ab->b_dva;
6681 pb_buf->b_birth = ab->b_birth;
6682 pb_buf->b_cksum0 = ab->b_cksum0;
6683 pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684 pb_buf->b_size = ab->b_size;
6685 pb_buf->b_l2daddr = l2hdr->b_daddr;
6686 pb_buf->b_l2asize = l2hdr->b_asize;
6687 pb_buf->b_l2compress = l2hdr->b_compress;
6688 pb_buf->b_contents_type = ab->b_type;
6689 pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690 pb->pb_payload_asz += l2hdr->b_asize;
6691 }
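/*
 * Illustrative sketch only (not compiled): one L2ARC write cycle from
 * the pbuf's point of view. A buflist sized for the headers selected in
 * this cycle is allocated, each header is recorded as it is written,
 * and the pbuf is committed once it reaches its limits. `nbufs' and the
 * `headers' array are hypothetical stand-ins for the write loop's state.
 */
#if 0
static void
l2arc_pbuf_write_cycle_sketch(l2pbuf_t *pb, arc_buf_hdr_t **headers,
    int nbufs)
{
	l2pbuf_buflist_t *pbl;
	int i;

	pbl = l2arc_pbuf_buflist_alloc(pb, nbufs);
	for (i = 0; i < nbufs; i++)
		l2arc_pbuflist_insert(pb, pbl, headers[i], i);
	/* l2arc_pbuf_commit() is called once the pbuf hits its limits */
}
#endif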
6692
6693 /*
6694 * Commits a pbuf to stable storage. This routine is invoked when writing
6695 * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696 * has reached its limits (either in size or in number of writes), it is
6697 * scheduled here for writing.
6698  * This function allocates a temporary buffer to hold the serialized
6699  * pbuf to be written; that buffer is released in l2arc_write_done.
6700 */
6701 static void
6702 l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6703 {
6704 l2pbuf_t *pb = &dev->l2ad_pbuf;
6705 uint64_t i, est_encsize, bufsize, encsize, io_size;
6706 uint8_t *pb_buf;
6707
6708 pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
6709 pb->pb_prev_asize = dev->l2ad_pbuf_asize;
6710 pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
6711
6712 est_encsize = L2PBUF_ENCODED_SIZE(pb);
6713 bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
6714 pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
6715 encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
6716 cb->l2wcb_pbuf = pb_buf;
6717 cb->l2wcb_pbuf_size = bufsize;
6718
6719 dev->l2ad_pbuf_daddr = dev->l2ad_hand;
6720 dev->l2ad_pbuf_asize = encsize;
6721 fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
6722
6723 io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
6724 for (i = 0; i < io_size; ) {
6725 zio_t *wzio;
6726 uint64_t wsize = io_size - i;
6727
6728 if (wsize > SPA_MAXBLOCKSIZE)
6729 wsize = SPA_MAXBLOCKSIZE;
6730 ASSERT(wsize >= SPA_MINBLOCKSIZE);
6731 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
6732 wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6733 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6734 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6735 zio_t *, wzio);
6736 (void) zio_nowait(wzio);
6737 i += wsize;
6738 }
6739
6740 dev->l2ad_hand += io_size;
6741 vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
6742 l2arc_uberblock_update(dev, pio, cb);
6743
6744 ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
6745 ARCSTAT_BUMP(arcstat_l2_meta_writes);
6746 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
6747 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
6748 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6749 pb->pb_payload_asz / encsize);
6750 }
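/*
 * Illustrative sketch only (not compiled): the chunking pattern used by
 * l2arc_pbuf_commit() above. A serialized pbuf larger than
 * SPA_MAXBLOCKSIZE is issued as several child writes at consecutive
 * device offsets, each at most SPA_MAXBLOCKSIZE bytes.
 */
#if 0
static void
l2arc_chunked_write_sketch(uint64_t start, uint64_t io_size)
{
	uint64_t i;

	for (i = 0; i < io_size; ) {
		uint64_t wsize = MIN(SPA_MAXBLOCKSIZE, io_size - i);

		/* issue one child write of `wsize' bytes at start + i */
		i += wsize;
	}
}
#endif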
6751
6752 /*
6753 * Returns the number of bytes occupied by the payload buffer items of
6754  * a pbuf in portable (on-disk) encoded form, i.e. the bytes that follow
6755  * the L2PBUF_HDR_SIZE-byte header.
6756 */
6757 static uint32_t
6758 l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
6759 {
6760 uint32_t size = 0;
6761 l2pbuf_buflist_t *buflist;
6762
6763 for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
6764 buflist = list_next(pb->pb_buflists_list, buflist))
6765 size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
6766
6767 return (size);
6768 }
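/*
 * Worked example (a sketch, assuming L2PBUF_BUF_SIZE is 88 bytes, which
 * is where the last field decoded in the item loop above ends): a pbuf
 * holding two buflists of 100 and 28 buffers has an encoded payload of
 * (100 + 28) * 88 = 11264 bytes; L2PBUF_HDR_SIZE is added on top of
 * that (and compression may later shrink the payload) to obtain the
 * on-disk size.
 */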