100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()). Note however that the data associated
104 * with the buffer may be evicted prior to the callback. The callback
105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139
605 /*
606  * If we discover any buffers to be compressed during an ARC scan, we boost
607  * our headroom for the next scanning cycle by this percentage multiplier.
608 */
609 #define L2ARC_HEADROOM_BOOST 200
610 #define L2ARC_FEED_SECS 1 /* caching interval secs */
611 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
612
613 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
614 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
615
616 /* L2ARC Performance Tunables */
617 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
618 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
619 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
620 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
621 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
622 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
623 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
624 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
625 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
626
627 /*
628 * L2ARC Internals
629 */
630 typedef struct l2arc_dev {
631 vdev_t *l2ad_vdev; /* vdev */
632 spa_t *l2ad_spa; /* spa */
633 uint64_t l2ad_hand; /* next write location */
634 uint64_t l2ad_start; /* first addr on device */
635 uint64_t l2ad_end; /* last addr on device */
636 uint64_t l2ad_evict; /* last addr eviction reached */
637 boolean_t l2ad_first; /* first sweep through */
638 boolean_t l2ad_writing; /* currently writing */
639 list_t *l2ad_buflist; /* buffer list */
640 list_node_t l2ad_node; /* device list node */
641 } l2arc_dev_t;
642
643 static list_t L2ARC_dev_list; /* device list */
644 static list_t *l2arc_dev_list; /* device list pointer */
645 static kmutex_t l2arc_dev_mtx; /* device list mutex */
655 spa_t *l2rcb_spa; /* spa */
656 blkptr_t l2rcb_bp; /* original blkptr */
657 zbookmark_t l2rcb_zb; /* original bookmark */
658 int l2rcb_flags; /* original flags */
659 enum zio_compress l2rcb_compress; /* applied compress */
660 } l2arc_read_callback_t;
661
662 typedef struct l2arc_write_callback {
663 l2arc_dev_t *l2wcb_dev; /* device info */
664 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
665 } l2arc_write_callback_t;
666
667 struct l2arc_buf_hdr {
668 /* protected by arc_buf_hdr mutex */
669 l2arc_dev_t *b_dev; /* L2ARC device */
670 uint64_t b_daddr; /* disk address, offset byte */
671 /* compression applied to buffer data */
672 enum zio_compress b_compress;
673 /* real alloc'd buffer size depending on b_compress applied */
674 int b_asize;
675 /* temporary buffer holder for in-flight compressed data */
676 void *b_tmp_cdata;
677 };
678
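/*
 * An entry on the l2arc_free_on_write list: a data buffer whose release is
 * deferred until l2arc_do_free_on_write() runs once the in-flight L2ARC
 * write that still references it has completed.
 */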
679 typedef struct l2arc_data_free {
680 /* protected by l2arc_free_on_write_mtx */
681 void *l2df_data;
682 size_t l2df_size;
683 void (*l2df_func)(void *, size_t);
684 list_node_t l2df_list_node;
685 } l2arc_data_free_t;
686
687 static kmutex_t l2arc_feed_thr_lock;
688 static kcondvar_t l2arc_feed_thr_cv;
689 static uint8_t l2arc_thread_exit;
690
691 static void l2arc_read_done(zio_t *zio);
692 static void l2arc_hdr_stat_add(void);
693 static void l2arc_hdr_stat_remove(void);
694
695 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
696 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
697 enum zio_compress c);
698 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
699
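/*
 * Fold a buffer's identity (spa identifier, DVA and birth txg) into a
 * 64-bit value using the ZFS CRC-64 table; the result is used to index
 * the ARC buffer header hash table.
 */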
700 static uint64_t
701 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
702 {
703 uint8_t *vdva = (uint8_t *)dva;
704 uint64_t crc = -1ULL;
705 int i;
706
707 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
708
709 for (i = 0; i < sizeof (dva_t); i++)
710 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
711
712 crc ^= (spa>>8) ^ birth;
713
714 return (crc);
715 }
716
717 #define BUF_EMPTY(buf) \
718 ((buf)->b_dva.dva_word[0] == 0 && \
4100 l2arc_dev_last = next;
4101
4102 out:
4103 mutex_exit(&l2arc_dev_mtx);
4104
4105 /*
4106 * Grab the config lock to prevent the 'next' device from being
4107 * removed while we are writing to it.
4108 */
4109 if (next != NULL)
4110 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111 mutex_exit(&spa_namespace_lock);
4112
4113 return (next);
4114 }
4115
4116 /*
4117 * Free buffers that were tagged for destruction.
4118 */
4119 static void
4120 l2arc_do_free_on_write()
4121 {
4122 list_t *buflist;
4123 l2arc_data_free_t *df, *df_prev;
4124
4125 mutex_enter(&l2arc_free_on_write_mtx);
4126 buflist = l2arc_free_on_write;
4127
4128 for (df = list_tail(buflist); df; df = df_prev) {
4129 df_prev = list_prev(buflist, df);
4130 ASSERT(df->l2df_data != NULL);
4131 ASSERT(df->l2df_func != NULL);
4132 df->l2df_func(df->l2df_data, df->l2df_size);
4133 list_remove(buflist, df);
4134 kmem_free(df, sizeof (l2arc_data_free_t));
4135 }
4136
4137 mutex_exit(&l2arc_free_on_write_mtx);
4138 }
4139
4140 /*
4141 * A write to a cache device has completed. Update all headers to allow
4142 * reads from these buffers to begin.
4143 */
4144 static void
4145 l2arc_write_done(zio_t *zio)
4146 {
4147 l2arc_write_callback_t *cb;
4148 l2arc_dev_t *dev;
4149 list_t *buflist;
4150 arc_buf_hdr_t *head, *ab, *ab_prev;
4151 l2arc_buf_hdr_t *abl2;
4152 kmutex_t *hash_lock;
4153
4154 cb = zio->io_private;
4155 ASSERT(cb != NULL);
4156 dev = cb->l2wcb_dev;
4157 ASSERT(dev != NULL);
4158 head = cb->l2wcb_head;
4159 ASSERT(head != NULL);
4160 buflist = dev->l2ad_buflist;
4161 ASSERT(buflist != NULL);
4162 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163 l2arc_write_callback_t *, cb);
4164
4165 if (zio->io_error != 0)
4166 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167
4168 mutex_enter(&l2arc_buflist_mtx);
4169
4170 /*
4171 * All writes completed, or an error was hit.
4172 */
4173 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174 ab_prev = list_prev(buflist, ab);
4175
4176 hash_lock = HDR_LOCK(ab);
4177 if (!mutex_tryenter(hash_lock)) {
4178 /*
4179 * This buffer misses out. It may be in a stage
4180 * of eviction. Its ARC_L2_WRITING flag will be
4181 * left set, denying reads to this buffer.
4182 */
4183 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184 continue;
4185 }
4186
4187 abl2 = ab->b_l2hdr;
4188
4189 /*
4190 * Release the temporary compressed buffer as soon as possible.
4191 */
4192 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193 l2arc_release_cdata_buf(ab);
4194
4195 if (zio->io_error != 0) {
4196 /*
4197 * Error - drop L2ARC entry.
4198 */
4199 list_remove(buflist, ab);
4200 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201 ab->b_l2hdr = NULL;
4202 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204 }
4205
4206 /*
4207 * Allow ARC to begin reads to this L2ARC entry.
4208 */
4209 ab->b_flags &= ~ARC_L2_WRITING;
4210
4211 mutex_exit(hash_lock);
4212 }
4213
4214 atomic_inc_64(&l2arc_writes_done);
4215 list_remove(buflist, head);
4216 kmem_cache_free(hdr_cache, head);
4217 mutex_exit(&l2arc_buflist_mtx);
4218
4219 l2arc_do_free_on_write();
4220
4221 kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 }
4223
4224 /*
4225 * A read to a cache device completed. Validate buffer contents before
4226 * handing over to the regular ARC routines.
4227 */
4228 static void
4229 l2arc_read_done(zio_t *zio)
4230 {
4231 l2arc_read_callback_t *cb;
4232 arc_buf_hdr_t *hdr;
4233 arc_buf_t *buf;
4234 kmutex_t *hash_lock;
4235 int equal;
4236
4237 ASSERT(zio->io_vd != NULL);
4238 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4333 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4334 *lock = &arc_mru->arcs_mtx;
4335 break;
4336 }
4337
4338 ASSERT(!(MUTEX_HELD(*lock)));
4339 mutex_enter(*lock);
4340 return (list);
4341 }
4342
4343 /*
4344 * Evict buffers from the device write hand to the distance specified in
4345 * bytes. This distance may span populated buffers, it may span nothing.
4346 * This is clearing a region on the L2ARC device ready for writing.
4347 * If the 'all' boolean is set, every buffer is evicted.
4348 */
4349 static void
4350 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4351 {
4352 list_t *buflist;
4353 l2arc_buf_hdr_t *abl2;
4354 arc_buf_hdr_t *ab, *ab_prev;
4355 kmutex_t *hash_lock;
4356 uint64_t taddr;
4357
4358 buflist = dev->l2ad_buflist;
4359
4360 if (buflist == NULL)
4361 return;
4362
4363 if (!all && dev->l2ad_first) {
4364 /*
4365 * This is the first sweep through the device. There is
4366 * nothing to evict.
4367 */
4368 return;
4369 }
4370
4371 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4372 /*
4373 * When nearing the end of the device, evict to the end
4433 * arc_hdr_destroy() will call list_remove()
4434 * and decrement arcstat_l2_size.
4435 */
4436 arc_change_state(arc_anon, ab, hash_lock);
4437 arc_hdr_destroy(ab);
4438 } else {
4439 /*
4440 * Invalidate issued or about to be issued
4441 * reads, since we may be about to write
4442 * over this location.
4443 */
4444 if (HDR_L2_READING(ab)) {
4445 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4446 ab->b_flags |= ARC_L2_EVICTED;
4447 }
4448
4449 /*
4450 * Tell ARC this no longer exists in L2ARC.
4451 */
4452 if (ab->b_l2hdr != NULL) {
4453 abl2 = ab->b_l2hdr;
4454 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4455 ab->b_l2hdr = NULL;
4456 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4457 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4458 }
4459 list_remove(buflist, ab);
4460
4461 /*
4462 * This may have been leftover after a
4463 * failed write.
4464 */
4465 ab->b_flags &= ~ARC_L2_WRITING;
4466 }
4467 mutex_exit(hash_lock);
4468 }
4469 mutex_exit(&l2arc_buflist_mtx);
4470
4471 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4472 dev->l2ad_evict = taddr;
4473 }
4474
4475 /*
4476 * Find and write ARC buffers to the L2ARC device.
4481 * state between calls to this function.
4482 *
4483 * Returns the number of bytes actually written (which may be smaller than
4484 * the delta by which the device hand has changed due to alignment).
4485 */
4486 static uint64_t
4487 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488 boolean_t *headroom_boost)
4489 {
4490 arc_buf_hdr_t *ab, *ab_prev, *head;
4491 list_t *list;
4492 uint64_t write_asize, write_psize, write_sz, headroom,
4493 buf_compress_minsz;
4494 void *buf_data;
4495 kmutex_t *list_lock;
4496 boolean_t full;
4497 l2arc_write_callback_t *cb;
4498 zio_t *pio, *wzio;
4499 uint64_t guid = spa_load_guid(spa);
4500 const boolean_t do_headroom_boost = *headroom_boost;
4501
4502 ASSERT(dev->l2ad_vdev != NULL);
4503
4504 /* Lower the flag now, we might want to raise it again later. */
4505 *headroom_boost = B_FALSE;
4506
4507 pio = NULL;
4508 write_sz = write_asize = write_psize = 0;
4509 full = B_FALSE;
4510 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511 head->b_flags |= ARC_L2_WRITE_HEAD;
4512
4513 /*
4514 * We will want to try to compress buffers that are at least 2x the
4515 * device sector size.
4516 */
4517 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518
4519 /*
4520 * Copy buffers for L2ARC writing.
4521 */
4522 mutex_enter(&l2arc_buflist_mtx);
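	/*
	 * l2arc_list_locked() hands back one of the four ARC lists (MFU and
	 * MRU, metadata and data) per pass, with its list lock already held.
	 */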
4523 for (int try = 0; try <= 3; try++) {
4524 uint64_t passed_sz = 0;
4525
4526 list = l2arc_list_locked(try, &list_lock);
4527
4528 /*
4529 * L2ARC fast warmup.
4530 *
4531 * Until the ARC is warm and starts to evict, read from the
4532 * head of the ARC lists rather than the tail.
4533 */
4534 if (arc_warm == B_FALSE)
4535 ab = list_head(list);
4536 else
4537 ab = list_tail(list);
4538
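		/*
		 * headroom bounds how far down the list this pass may search
		 * for candidates: l2arc_headroom device writes' worth of
		 * data, optionally scaled up by the boost percentage.
		 */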
4539 headroom = target_sz * l2arc_headroom;
4540 if (do_headroom_boost)
4541 headroom = (headroom * l2arc_headroom_boost) / 100;
4542
4543 for (; ab; ab = ab_prev) {
4544 l2arc_buf_hdr_t *l2hdr;
4545 kmutex_t *hash_lock;
4546 uint64_t buf_sz;
4547
4548 if (arc_warm == B_FALSE)
4549 ab_prev = list_next(list, ab);
4550 else
4551 ab_prev = list_prev(list, ab);
4552
4553 hash_lock = HDR_LOCK(ab);
4554 if (!mutex_tryenter(hash_lock)) {
4555 /*
4556 * Skip this buffer rather than waiting.
4557 */
4558 continue;
4559 }
4560
4561 passed_sz += ab->b_size;
4562 if (passed_sz > headroom) {
4563 /*
4564 * Searched too far.
4565 */
4566 mutex_exit(hash_lock);
4583 * Insert a dummy header on the buflist so
4584 * l2arc_write_done() can find where the
4585 * write buffers begin without searching.
4586 */
4587 list_insert_head(dev->l2ad_buflist, head);
4588
4589 cb = kmem_alloc(
4590 sizeof (l2arc_write_callback_t), KM_SLEEP);
4591 cb->l2wcb_dev = dev;
4592 cb->l2wcb_head = head;
4593 pio = zio_root(spa, l2arc_write_done, cb,
4594 ZIO_FLAG_CANFAIL);
4595 }
4596
4597 /*
4598 * Create and add a new L2ARC header.
4599 */
4600 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601 l2hdr->b_dev = dev;
4602 ab->b_flags |= ARC_L2_WRITING;
4603
4604 /*
4605 * Temporarily stash the data buffer in b_tmp_cdata.
4606 * The subsequent write step will pick it up from
4607 			 * there. This is because we can't access ab->b_buf
4608 * without holding the hash_lock, which we in turn
4609 * can't access without holding the ARC list locks
4610 * (which we want to avoid during compression/writing).
4611 */
4612 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613 l2hdr->b_asize = ab->b_size;
4614 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4615
4616 buf_sz = ab->b_size;
4617 ab->b_l2hdr = l2hdr;
4618
4619 list_insert_head(dev->l2ad_buflist, ab);
4620
4621 /*
4622 * Compute and store the buffer cksum before
4623 * writing. On debug the cksum is verified first.
4624 */
4625 arc_cksum_verify(ab->b_buf);
4626 arc_cksum_compute(ab->b_buf, B_TRUE);
4627
4628 mutex_exit(hash_lock);
4629
4630 write_sz += buf_sz;
4631 }
4632
4633 mutex_exit(list_lock);
4634
4635 if (full == B_TRUE)
4636 break;
4637 }
4638
4639 /* No buffers selected for writing? */
4640 if (pio == NULL) {
4641 ASSERT0(write_sz);
4642 mutex_exit(&l2arc_buflist_mtx);
4643 kmem_cache_free(hdr_cache, head);
4644 return (0);
4645 }
4646
4647 /*
4648 * Now start writing the buffers. We're starting at the write head
4649 * and work backwards, retracing the course of the buffer selector
4650 * loop above.
4651 */
4652 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653 ab = list_prev(dev->l2ad_buflist, ab)) {
4654 l2arc_buf_hdr_t *l2hdr;
4655 uint64_t buf_sz;
4656
4657 /*
4658 * We shouldn't need to lock the buffer here, since we flagged
4659 * it as ARC_L2_WRITING in the previous step, but we must take
4660 * care to only access its L2 cache parameters. In particular,
4661 * ab->b_buf may be invalid by now due to ARC eviction.
4662 */
4663 l2hdr = ab->b_l2hdr;
4664 l2hdr->b_daddr = dev->l2ad_hand;
4665
4666 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667 l2hdr->b_asize >= buf_compress_minsz) {
4668 if (l2arc_compress_buf(l2hdr)) {
4669 /*
4670 * If compression succeeded, enable headroom
4671 * boost on the next scan cycle.
4672 */
4673 *headroom_boost = B_TRUE;
4674 }
4675 }
4676
4677 /*
4678 * Pick up the buffer data we had previously stashed away
4679 * (and now potentially also compressed).
4680 */
4681 buf_data = l2hdr->b_tmp_cdata;
4682 buf_sz = l2hdr->b_asize;
4683
4684 /* Compression may have squashed the buffer to zero length. */
4685 if (buf_sz != 0) {
4686 uint64_t buf_p_sz;
4687
4688 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691 ZIO_FLAG_CANFAIL, B_FALSE);
4692
4693 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694 zio_t *, wzio);
4695 (void) zio_nowait(wzio);
4696
4697 write_asize += buf_sz;
4698 /*
4699 * Keep the clock hand suitably device-aligned.
4700 */
4701 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702 write_psize += buf_p_sz;
4703 dev->l2ad_hand += buf_p_sz;
4704 }
4705 }
4706
4707 mutex_exit(&l2arc_buflist_mtx);
4708
4709 ASSERT3U(write_asize, <=, target_sz);
4710 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715
4716 /*
4717 * Bump device hand to the device start if it is approaching the end.
4718 * l2arc_evict() will already have evicted ahead for this case.
4719 */
4720 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721 vdev_space_update(dev->l2ad_vdev,
4722 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723 dev->l2ad_hand = dev->l2ad_start;
4724 dev->l2ad_evict = dev->l2ad_start;
4725 dev->l2ad_first = B_FALSE;
4726 }
4727
4728 dev->l2ad_writing = B_TRUE;
4729 (void) zio_wait(pio);
4730 dev->l2ad_writing = B_FALSE;
4731
4732 return (write_asize);
4733 }
4734
4735 /*
4736 * Compresses an L2ARC buffer.
4737 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4738 * size in l2hdr->b_asize. This routine tries to compress the data and
4739 * depending on the compression result there are three possible outcomes:
4740 * *) The buffer was incompressible. The original l2hdr contents were left
4741 * untouched and are ready for writing to an L2 device.
4742 * *) The buffer was all-zeros, so there is no need to write it to an L2
4743 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4744 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4745 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4746 * data buffer which holds the compressed data to be written, and b_asize
4747 * tells us how much data there is. b_compress is set to the appropriate
4748 * compression algorithm. Once writing is done, invoke
4749 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4750 *
4751 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4752 * buffer was incompressible).
4753 */
4754 static boolean_t
4755 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4756 {
4757 void *cdata;
4758 size_t csize, len;
4759
4760 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4761 ASSERT(l2hdr->b_tmp_cdata != NULL);
4762
4763 len = l2hdr->b_asize;
4764 cdata = zio_data_buf_alloc(len);
4765 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4766 cdata, l2hdr->b_asize);
4767
4768 if (csize == 0) {
4769 /* zero block, indicate that there's nothing to write */
4770 zio_data_buf_free(cdata, len);
4771 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4772 l2hdr->b_asize = 0;
4773 l2hdr->b_tmp_cdata = NULL;
4774 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4775 return (B_TRUE);
4776 } else if (csize > 0 && csize < len) {
4777 /*
4778 * Compression succeeded, we'll keep the cdata around for
4779 * writing and release it afterwards.
4780 */
4781 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4782 l2hdr->b_asize = csize;
4783 l2hdr->b_tmp_cdata = cdata;
4784 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4785 return (B_TRUE);
4786 } else {
4787 /*
4788 * Compression failed, release the compressed buffer.
4789 * l2hdr will be left unmodified.
4790 */
4791 zio_data_buf_free(cdata, len);
4792 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4793 return (B_FALSE);
4794 }
4795 }
4796
4797 /*
4798 * Decompresses a zio read back from an l2arc device. On success, the
4799 * underlying zio's io_data buffer is overwritten by the uncompressed
4800 * version. On decompression error (corrupt compressed stream), the
4801 * zio->io_error value is set to signal an I/O error.
4802 *
4803  * Please note that the compressed data stream is not checksummed, so
4804  * if the underlying device is experiencing data corruption, we may feed
4805  * corrupt data to the decompressor; the decompressor therefore needs to
4806  * be able to handle this situation (LZ4 does).
4807 */
4808 static void
4809 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4810 {
4811 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4839 * original compressed data (rather than decompressing to an
4840 * aux buffer and then copying back the uncompressed buffer,
4841 * which is likely to be much larger).
4842 */
4843 uint64_t csize;
4844 void *cdata;
4845
4846 csize = zio->io_size;
4847 cdata = zio_data_buf_alloc(csize);
4848 bcopy(zio->io_data, cdata, csize);
4849 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4850 hdr->b_size) != 0)
4851 zio->io_error = EIO;
4852 zio_data_buf_free(cdata, csize);
4853 }
4854
4855 /* Restore the expected uncompressed IO size. */
4856 zio->io_orig_size = zio->io_size = hdr->b_size;
4857 }
4858
4859 /*
4860 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4861 * This buffer serves as a temporary holder of compressed data while
4862 * the buffer entry is being written to an l2arc device. Once that is
4863 * done, we can dispose of it.
4864 */
4865 static void
4866 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4867 {
4868 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4869
4870 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4871 /*
4872 * If the data was compressed, then we've allocated a
4873 * temporary buffer for it, so now we need to release it.
4874 */
4875 ASSERT(l2hdr->b_tmp_cdata != NULL);
4876 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4877 }
4878 l2hdr->b_tmp_cdata = NULL;
4879 }
4880
4881 /*
4882 * This thread feeds the L2ARC at regular intervals. This is the beating
4883 * heart of the L2ARC.
4884 */
4885 static void
4886 l2arc_feed_thread(void)
4887 {
4888 callb_cpr_t cpr;
4889 l2arc_dev_t *dev;
4890 spa_t *spa;
4891 uint64_t size, wrote;
4892 clock_t begin, next = ddi_get_lbolt();
4893 boolean_t headroom_boost = B_FALSE;
4894
4895 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4896
4897 mutex_enter(&l2arc_feed_thr_lock);
4898
4899 while (l2arc_thread_exit == 0) {
4900 CALLB_CPR_SAFE_BEGIN(&cpr);
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()). Note however that the data associated
104 * with the buffer may be evicted prior to the callback. The callback
105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 *
121 * Please note that if you first grab the l2arc_buflist_mtx, you can't do a
122 * mutex_enter on a buffer's hash_lock anymore due to lock inversion. To grab
123 * the hash_lock you must use mutex_tryenter and possibly deal with the buffer
124 * not being available (due to e.g. some other thread holding it while trying
125 * to unconditionally grab the l2arc_buflist_mtx which you are holding). The
126 * inverse situation (first grab hash_lock, then l2arc_buflist_mtx) is safe.
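 *
 * A minimal illustrative sketch of the required pattern (not verbatim from
 * the functions below, which also defer work or bump a stat when the
 * tryenter fails):
 *
 *	mutex_enter(&l2arc_buflist_mtx);
 *	for (ab = list_tail(buflist); ab != NULL; ab = list_prev(buflist, ab)) {
 *		hash_lock = HDR_LOCK(ab);
 *		if (!mutex_tryenter(hash_lock))
 *			continue;
 *		... operate on ab ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&l2arc_buflist_mtx);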
127 */
128
129 #include <sys/spa.h>
130 #include <sys/zio.h>
131 #include <sys/zio_compress.h>
132 #include <sys/zfs_context.h>
133 #include <sys/arc.h>
134 #include <sys/refcount.h>
135 #include <sys/vdev.h>
136 #include <sys/vdev_impl.h>
137 #ifdef _KERNEL
138 #include <sys/vmsystm.h>
139 #include <vm/anon.h>
140 #include <sys/fs/swapnode.h>
141 #include <sys/dnlc.h>
142 #endif
143 #include <sys/callb.h>
144 #include <sys/kstat.h>
145 #include <zfs_fletcher.h>
146
612 /*
613  * If we discover any buffers to be compressed during an ARC scan, we boost
614  * our headroom for the next scanning cycle by this percentage multiplier.
615 */
616 #define L2ARC_HEADROOM_BOOST 200
617 #define L2ARC_FEED_SECS 1 /* caching interval secs */
618 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
619
620 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
621 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
622
623 /* L2ARC Performance Tunables */
624 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
625 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
626 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
627 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
628 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
629 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
630 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
631 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
632 boolean_t l2arc_norw = B_FALSE; /* no reads during writes */
633
634 /*
635 * L2ARC Internals
636 */
637 typedef struct l2arc_dev {
638 vdev_t *l2ad_vdev; /* vdev */
639 spa_t *l2ad_spa; /* spa */
640 uint64_t l2ad_hand; /* next write location */
641 uint64_t l2ad_start; /* first addr on device */
642 uint64_t l2ad_end; /* last addr on device */
643 uint64_t l2ad_evict; /* last addr eviction reached */
644 boolean_t l2ad_first; /* first sweep through */
645 boolean_t l2ad_writing; /* currently writing */
646 list_t *l2ad_buflist; /* buffer list */
647 list_node_t l2ad_node; /* device list node */
648 } l2arc_dev_t;
649
650 static list_t L2ARC_dev_list; /* device list */
651 static list_t *l2arc_dev_list; /* device list pointer */
652 static kmutex_t l2arc_dev_mtx; /* device list mutex */
662 spa_t *l2rcb_spa; /* spa */
663 blkptr_t l2rcb_bp; /* original blkptr */
664 zbookmark_t l2rcb_zb; /* original bookmark */
665 int l2rcb_flags; /* original flags */
666 enum zio_compress l2rcb_compress; /* applied compress */
667 } l2arc_read_callback_t;
668
669 typedef struct l2arc_write_callback {
670 l2arc_dev_t *l2wcb_dev; /* device info */
671 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
672 } l2arc_write_callback_t;
673
674 struct l2arc_buf_hdr {
675 /* protected by arc_buf_hdr mutex */
676 l2arc_dev_t *b_dev; /* L2ARC device */
677 uint64_t b_daddr; /* disk address, offset byte */
678 /* compression applied to buffer data */
679 enum zio_compress b_compress;
680 /* real alloc'd buffer size depending on b_compress applied */
681 int b_asize;
682 };
683
684 typedef struct l2arc_data_free {
685 /* protected by l2arc_free_on_write_mtx */
686 void *l2df_data;
687 size_t l2df_size;
688 void (*l2df_func)(void *, size_t);
689 list_node_t l2df_list_node;
690 } l2arc_data_free_t;
691
692 static kmutex_t l2arc_feed_thr_lock;
693 static kcondvar_t l2arc_feed_thr_cv;
694 static uint8_t l2arc_thread_exit;
695
696 static void l2arc_read_done(zio_t *zio);
697 static void l2arc_hdr_stat_add(void);
698 static void l2arc_hdr_stat_remove(void);
699
700 static boolean_t l2arc_compress_buf(void *in_data, uint64_t in_sz,
701 void **out_data, uint64_t *out_sz, enum zio_compress *compress);
702 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
703 enum zio_compress c);
704
705 static uint64_t
706 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
707 {
708 uint8_t *vdva = (uint8_t *)dva;
709 uint64_t crc = -1ULL;
710 int i;
711
712 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
713
714 for (i = 0; i < sizeof (dva_t); i++)
715 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
716
717 crc ^= (spa>>8) ^ birth;
718
719 return (crc);
720 }
721
722 #define BUF_EMPTY(buf) \
723 ((buf)->b_dva.dva_word[0] == 0 && \
4105 l2arc_dev_last = next;
4106
4107 out:
4108 mutex_exit(&l2arc_dev_mtx);
4109
4110 /*
4111 * Grab the config lock to prevent the 'next' device from being
4112 * removed while we are writing to it.
4113 */
4114 if (next != NULL)
4115 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4116 mutex_exit(&spa_namespace_lock);
4117
4118 return (next);
4119 }
4120
4121 /*
4122 * Free buffers that were tagged for destruction.
4123 */
4124 static void
4125 l2arc_do_free_on_write(void)
4126 {
4127 list_t *buflist;
4128 l2arc_data_free_t *df, *df_prev;
4129
4130 mutex_enter(&l2arc_free_on_write_mtx);
4131 buflist = l2arc_free_on_write;
4132
4133 for (df = list_tail(buflist); df; df = df_prev) {
4134 df_prev = list_prev(buflist, df);
4135 ASSERT(df->l2df_data != NULL);
4136 ASSERT(df->l2df_func != NULL);
4137 df->l2df_func(df->l2df_data, df->l2df_size);
4138 list_remove(buflist, df);
4139 kmem_free(df, sizeof (l2arc_data_free_t));
4140 }
4141
4142 mutex_exit(&l2arc_free_on_write_mtx);
4143 }
4144
4145 /*
4146 * A write to a cache device has completed. Update all headers to allow
4147 * reads from these buffers to begin.
4148 */
4149 static void
4150 l2arc_write_done(zio_t *zio)
4151 {
4152 l2arc_write_callback_t *cb;
4153 l2arc_dev_t *dev;
4154 list_t *buflist;
4155 arc_buf_hdr_t *head, *ab;
4156
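	/*
	 * Buffers found on the L2ARC buflist are queued on this local list
	 * so that their hash locks can be taken with a regular (blocking)
	 * mutex_enter once l2arc_buflist_mtx has been dropped, preserving
	 * the lock order described at the top of this file.
	 */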
4157 struct defer_done_entry {
4158 arc_buf_hdr_t *dde_buf;
4159 list_node_t dde_node;
4160 } *dde, *dde_next;
4161 list_t defer_done_list;
4162
4163 cb = zio->io_private;
4164 ASSERT(cb != NULL);
4165 dev = cb->l2wcb_dev;
4166 ASSERT(dev != NULL);
4167 head = cb->l2wcb_head;
4168 ASSERT(head != NULL);
4169 buflist = dev->l2ad_buflist;
4170 ASSERT(buflist != NULL);
4171 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4172 l2arc_write_callback_t *, cb);
4173
4174 if (zio->io_error != 0)
4175 ARCSTAT_BUMP(arcstat_l2_writes_error);
4176
4177 mutex_enter(&l2arc_buflist_mtx);
4178
4179 /*
4180 * All writes completed, or an error was hit.
4181 */
4182 list_create(&defer_done_list, sizeof (*dde),
4183 offsetof(struct defer_done_entry, dde_node));
4184 for (ab = list_prev(buflist, head); ab; ab = list_prev(buflist, ab)) {
4185 /*
4186 * Can't pause here to grab hash_lock while also holding
4187 * l2arc_buflist_mtx, so place the buffers on a temporary
4188 * thread-local list for later processing.
4189 */
4190 dde = kmem_alloc(sizeof (*dde), KM_SLEEP);
4191 dde->dde_buf = ab;
4192 list_insert_tail(&defer_done_list, dde);
4193 }
4194
4195 atomic_inc_64(&l2arc_writes_done);
4196 list_remove(buflist, head);
4197 kmem_cache_free(hdr_cache, head);
4198 mutex_exit(&l2arc_buflist_mtx);
4199
4200 /*
4201 * Now process the buffers. We're not holding l2arc_buflist_mtx
4202 * anymore, so we can do a regular mutex_enter on the hash_lock.
4203 */
4204 for (dde = list_head(&defer_done_list); dde != NULL; dde = dde_next) {
4205 kmutex_t *hash_lock;
4206
4207 dde_next = list_next(&defer_done_list, dde);
4208 ab = dde->dde_buf;
4209 hash_lock = HDR_LOCK(ab);
4210
4211 mutex_enter(hash_lock);
4212
4213 if (zio->io_error != 0) {
4214 /*
4215 * Error - drop L2ARC entry.
4216 */
4217 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4218 mutex_enter(&l2arc_buflist_mtx);
4219 list_remove(buflist, ab);
4220 mutex_exit(&l2arc_buflist_mtx);
4221 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4222 ab->b_l2hdr = NULL;
4223 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
4224 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4225 }
4226
4227 /*
4228 * Allow ARC to begin reads to this L2ARC entry.
4229 */
4230 ab->b_flags &= ~ARC_L2_WRITING;
4231
4232 mutex_exit(hash_lock);
4233
4234 list_remove(&defer_done_list, dde);
4235 }
4236 list_destroy(&defer_done_list);
4237
4238 l2arc_do_free_on_write();
4239
4240 kmem_free(cb, sizeof (l2arc_write_callback_t));
4241 }
4242
4243 /*
4244 * A read to a cache device completed. Validate buffer contents before
4245 * handing over to the regular ARC routines.
4246 */
4247 static void
4248 l2arc_read_done(zio_t *zio)
4249 {
4250 l2arc_read_callback_t *cb;
4251 arc_buf_hdr_t *hdr;
4252 arc_buf_t *buf;
4253 kmutex_t *hash_lock;
4254 int equal;
4255
4256 ASSERT(zio->io_vd != NULL);
4257 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4352 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4353 *lock = &arc_mru->arcs_mtx;
4354 break;
4355 }
4356
4357 ASSERT(!(MUTEX_HELD(*lock)));
4358 mutex_enter(*lock);
4359 return (list);
4360 }
4361
4362 /*
4363 * Evict buffers from the device write hand to the distance specified in
4364 * bytes. This distance may span populated buffers, it may span nothing.
4365 * This is clearing a region on the L2ARC device ready for writing.
4366 * If the 'all' boolean is set, every buffer is evicted.
4367 */
4368 static void
4369 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4370 {
4371 list_t *buflist;
4372 l2arc_buf_hdr_t *l2hdr;
4373 arc_buf_hdr_t *ab, *ab_prev;
4374 kmutex_t *hash_lock;
4375 uint64_t taddr;
4376
4377 buflist = dev->l2ad_buflist;
4378
4379 if (buflist == NULL)
4380 return;
4381
4382 if (!all && dev->l2ad_first) {
4383 /*
4384 * This is the first sweep through the device. There is
4385 * nothing to evict.
4386 */
4387 return;
4388 }
4389
4390 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4391 /*
4392 * When nearing the end of the device, evict to the end
4452 * arc_hdr_destroy() will call list_remove()
4453 * and decrement arcstat_l2_size.
4454 */
4455 arc_change_state(arc_anon, ab, hash_lock);
4456 arc_hdr_destroy(ab);
4457 } else {
4458 /*
4459 * Invalidate issued or about to be issued
4460 * reads, since we may be about to write
4461 * over this location.
4462 */
4463 if (HDR_L2_READING(ab)) {
4464 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4465 ab->b_flags |= ARC_L2_EVICTED;
4466 }
4467
4468 /*
4469 * Tell ARC this no longer exists in L2ARC.
4470 */
4471 if (ab->b_l2hdr != NULL) {
4472 l2hdr = ab->b_l2hdr;
4473 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4474 ab->b_l2hdr = NULL;
4475 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
4476 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4477 }
4478 list_remove(buflist, ab);
4479
4480 /*
4481 * This may have been leftover after a
4482 * failed write.
4483 */
4484 ab->b_flags &= ~ARC_L2_WRITING;
4485 }
4486 mutex_exit(hash_lock);
4487 }
4488 mutex_exit(&l2arc_buflist_mtx);
4489
4490 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4491 dev->l2ad_evict = taddr;
4492 }
4493
4494 /*
4495 * Find and write ARC buffers to the L2ARC device.
4500 * state between calls to this function.
4501 *
4502 * Returns the number of bytes actually written (which may be smaller than
4503 * the delta by which the device hand has changed due to alignment).
4504 */
4505 static uint64_t
4506 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4507 boolean_t *headroom_boost)
4508 {
4509 arc_buf_hdr_t *ab, *ab_prev, *head;
4510 list_t *list;
4511 uint64_t write_asize, write_psize, write_sz, headroom,
4512 buf_compress_minsz;
4513 void *buf_data;
4514 kmutex_t *list_lock;
4515 boolean_t full;
4516 l2arc_write_callback_t *cb;
4517 zio_t *pio, *wzio;
4518 uint64_t guid = spa_load_guid(spa);
4519 const boolean_t do_headroom_boost = *headroom_boost;
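	/*
	 * Each selected buffer's original data pointer and size are stashed
	 * in one of these entries while its hash_lock is held, so the write
	 * loop below can compress and issue the I/O without re-taking the
	 * ARC list locks.
	 */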
4520 struct defer_write_entry {
4521 arc_buf_hdr_t *dwe_buf;
4522 void *dwe_orig_data;
4523 uint64_t dwe_orig_size;
4524 		list_node_t dwe_node;
4525 } *dwe, *dwe_next;
4526 list_t defer_write_list;
4527
4528 ASSERT(dev->l2ad_vdev != NULL);
4529
4530 /* Lower the flag now, we might want to raise it again later. */
4531 *headroom_boost = B_FALSE;
4532
4533 pio = NULL;
4534 write_sz = write_asize = write_psize = 0;
4535 full = B_FALSE;
4536 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4537 head->b_flags |= ARC_L2_WRITE_HEAD;
4538
4539 /*
4540 * We will want to try to compress buffers that are at least 2x the
4541 * device sector size.
4542 */
4543 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4544
4545 /*
4546 * Copy buffers for L2ARC writing.
4547 */
4548 list_create(&defer_write_list, sizeof (*dwe),
4549 offsetof(struct defer_write_entry, dwe_node));
4550 mutex_enter(&l2arc_buflist_mtx);
4551 for (int try = 0; try <= 3; try++) {
4552 uint64_t passed_sz = 0;
4553
4554 list = l2arc_list_locked(try, &list_lock);
4555
4556 /*
4557 * L2ARC fast warmup.
4558 *
4559 * Until the ARC is warm and starts to evict, read from the
4560 * head of the ARC lists rather than the tail.
4561 */
4562 if (arc_warm == B_FALSE)
4563 ab = list_head(list);
4564 else
4565 ab = list_tail(list);
4566
4567 headroom = target_sz * l2arc_headroom;
4568 if (do_headroom_boost)
4569 headroom = (headroom * l2arc_headroom_boost) / 100;
4570
4571 for (; ab; ab = ab_prev) {
4572 l2arc_buf_hdr_t *l2hdr;
4573 kmutex_t *hash_lock;
4574
4575 if (arc_warm == B_FALSE)
4576 ab_prev = list_next(list, ab);
4577 else
4578 ab_prev = list_prev(list, ab);
4579
4580 hash_lock = HDR_LOCK(ab);
4581 if (!mutex_tryenter(hash_lock)) {
4582 /*
4583 * Skip this buffer rather than waiting.
4584 */
4585 continue;
4586 }
4587
4588 passed_sz += ab->b_size;
4589 if (passed_sz > headroom) {
4590 /*
4591 * Searched too far.
4592 */
4593 mutex_exit(hash_lock);
4610 * Insert a dummy header on the buflist so
4611 * l2arc_write_done() can find where the
4612 * write buffers begin without searching.
4613 */
4614 list_insert_head(dev->l2ad_buflist, head);
4615
4616 cb = kmem_alloc(
4617 sizeof (l2arc_write_callback_t), KM_SLEEP);
4618 cb->l2wcb_dev = dev;
4619 cb->l2wcb_head = head;
4620 pio = zio_root(spa, l2arc_write_done, cb,
4621 ZIO_FLAG_CANFAIL);
4622 }
4623
4624 /*
4625 * Create and add a new L2ARC header.
4626 */
4627 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4628 l2hdr->b_dev = dev;
4629 ab->b_flags |= ARC_L2_WRITING;
4630 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4631 l2hdr->b_asize = ab->b_size;
4632
4633 /*
4634 			 * Temporarily stash the buffer in a defer_write_entry.
4635 * The subsequent write step will pick it up from
4636 * there. This is because we can't access ab->b_buf
4637 * without holding the hash_lock, which we in turn
4638 * can't access without holding the ARC list locks
4639 * while walking the ARC lists (we want to avoid
4640 * holding these locks during compression/writing).
4641 */
4642 dwe = kmem_alloc(sizeof (*dwe), KM_SLEEP);
4643 dwe->dwe_buf = ab;
4644 dwe->dwe_orig_data = ab->b_buf->b_data;
4645 dwe->dwe_orig_size = ab->b_size;
4646
4647 ab->b_l2hdr = l2hdr;
4648
4649 list_insert_head(dev->l2ad_buflist, ab);
4650 list_insert_tail(&defer_write_list, dwe);
4651
4652 /*
4653 * Compute and store the buffer cksum before
4654 * writing. On debug the cksum is verified first.
4655 */
4656 arc_cksum_verify(ab->b_buf);
4657 arc_cksum_compute(ab->b_buf, B_TRUE);
4658
4659 mutex_exit(hash_lock);
4660
4661 write_sz += dwe->dwe_orig_size;
4662 }
4663
4664 mutex_exit(list_lock);
4665
4666 if (full == B_TRUE)
4667 break;
4668 }
4669
4670 /* No buffers selected for writing? */
4671 if (pio == NULL) {
4672 ASSERT0(write_sz);
4673 mutex_exit(&l2arc_buflist_mtx);
4674 kmem_cache_free(hdr_cache, head);
4675 list_destroy(&defer_write_list);
4676 return (0);
4677 }
4678
4679 mutex_exit(&l2arc_buflist_mtx);
4680
4681 /*
4682 * Now start writing the buffers. We're starting at the write head
4683 * and work backwards, retracing the course of the buffer selector
4684 * loop above.
4685 */
4686 for (dwe = list_head(&defer_write_list); dwe != NULL; dwe = dwe_next) {
4687 l2arc_buf_hdr_t *l2hdr;
4688 uint64_t buf_sz;
4689
4690 dwe_next = list_next(&defer_write_list, dwe);
4691 ab = dwe->dwe_buf;
4692
4693 		/*
4694 		 * We shouldn't need to lock the buffer here, since we flagged
4695 		 * it as ARC_L2_WRITING in the previous step, but we must take
4696 		 * care to only access its L2 cache parameters. In particular,
4697 		 * ab->b_buf may be invalid by now due to ARC eviction.
4698 		 */
4699 l2hdr = ab->b_l2hdr;
4700 l2hdr->b_daddr = dev->l2ad_hand;
4701
4702 if ((ab->b_flags & ARC_L2COMPRESS) &&
4703 l2hdr->b_asize >= buf_compress_minsz &&
4704 l2arc_compress_buf(dwe->dwe_orig_data, dwe->dwe_orig_size,
4705 &buf_data, &buf_sz, &l2hdr->b_compress)) {
4706 /*
4707 * If compression succeeded, enable headroom
4708 * boost on the next scan cycle.
4709 */
4710 *headroom_boost = B_TRUE;
4711 l2hdr->b_asize = buf_sz;
4712 } else {
4713 buf_data = dwe->dwe_orig_data;
4714 buf_sz = dwe->dwe_orig_size;
4715 l2hdr->b_asize = dwe->dwe_orig_size;
4716 }
4717
4718 /* Compression may have squashed the buffer to zero length. */
4719 if (buf_sz != 0) {
4720 uint64_t buf_p_sz;
4721
4722 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4723 dev->l2ad_hand, l2hdr->b_asize, buf_data,
4724 ZIO_CHECKSUM_OFF, NULL, NULL,
4725 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4726 B_FALSE);
4727
4728 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4729 zio_t *, wzio);
4730 (void) zio_nowait(wzio);
4731
4732 write_asize += l2hdr->b_asize;
4733 /*
4734 * Keep the clock hand suitably device-aligned.
4735 */
4736 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4737 write_psize += buf_p_sz;
4738 dev->l2ad_hand += buf_p_sz;
4739 }
4740
4741 list_remove(&defer_write_list, dwe);
4742 kmem_free(dwe, sizeof (*dwe));
4743 }
4744
4745 list_destroy(&defer_write_list);
4746
4747 ASSERT3U(write_asize, <=, target_sz);
4748 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4749 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4750 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4751 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4752 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4753
4754 /*
4755 * Bump device hand to the device start if it is approaching the end.
4756 * l2arc_evict() will already have evicted ahead for this case.
4757 */
4758 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4759 vdev_space_update(dev->l2ad_vdev,
4760 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4761 dev->l2ad_hand = dev->l2ad_start;
4762 dev->l2ad_evict = dev->l2ad_start;
4763 dev->l2ad_first = B_FALSE;
4764 }
4765
4766 dev->l2ad_writing = B_TRUE;
4767 (void) zio_wait(pio);
4768 dev->l2ad_writing = B_FALSE;
4769
4770 return (write_asize);
4771 }
4772
4773 /*
4774 * Compresses an L2ARC buffer.
4775 * The data to be compressed is in in_data and its size in in_sz. This routine
4776 * tries to compress the data and depending on the compression result there
4777 * are three possible outcomes:
4778 * *) The buffer was incompressible. The function returns with B_FALSE and
4779 * does nothing else.
4780 * *) The buffer was all-zeros, so there is no need to write it to an L2
4781 * device. To indicate this situation, the *out_data is set to NULL,
4782 * *out_sz is set to zero, *compress is set to ZIO_COMPRESS_EMPTY and
4783 * the function returns B_TRUE.
4784 * *) Compression succeeded and *out_data was set to point to a buffer holding
4785  *    the compressed data, *out_sz was set to indicate the output size,
4786 * *compress was set to the appropriate compression algorithm and B_TRUE is
4787 * returned. Once writing is done the buffer will be automatically freed by
4788 * l2arc_do_free_on_write().
4789 */
4790 static boolean_t
4791 l2arc_compress_buf(void *in_data, uint64_t in_sz, void **out_data,
4792 uint64_t *out_sz, enum zio_compress *compress)
4793 {
4794 void *cdata;
4795
4796 cdata = zio_data_buf_alloc(in_sz);
4797 *out_sz = zio_compress_data(ZIO_COMPRESS_LZ4, in_data, cdata, in_sz);
4798
4799 if (*out_sz == 0) {
4800 /* Zero block, indicate that there's nothing to write. */
4801 zio_data_buf_free(cdata, in_sz);
4802 *compress = ZIO_COMPRESS_EMPTY;
4803 *out_data = NULL;
4804 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4805 return (B_TRUE);
4806 } else if (*out_sz > 0 && *out_sz < in_sz) {
4807 /*
4808 * Compression succeeded, we'll keep the cdata around for
4809 * writing and release it after writing.
4810 */
4811 l2arc_data_free_t *df;
4812
4813 *compress = ZIO_COMPRESS_LZ4;
4814 *out_data = cdata;
4815
4816 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
4817 df->l2df_data = cdata;
4818 df->l2df_size = *out_sz;
4819 df->l2df_func = zio_data_buf_free;
4820 mutex_enter(&l2arc_free_on_write_mtx);
4821 list_insert_head(l2arc_free_on_write, df);
4822 mutex_exit(&l2arc_free_on_write_mtx);
4823
4824 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4825 ARCSTAT_BUMP(arcstat_l2_free_on_write);
4826 return (B_TRUE);
4827 } else {
4828 /*
4829 * Compression failed, release the compressed buffer.
4830 */
4831 zio_data_buf_free(cdata, in_sz);
4832 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4833 return (B_FALSE);
4834 }
4835 }
4836
4837 /*
4838 * Decompresses a zio read back from an l2arc device. On success, the
4839 * underlying zio's io_data buffer is overwritten by the uncompressed
4840 * version. On decompression error (corrupt compressed stream), the
4841 * zio->io_error value is set to signal an I/O error.
4842 *
4843  * Please note that the compressed data stream is not checksummed, so
4844  * if the underlying device is experiencing data corruption, we may feed
4845  * corrupt data to the decompressor; the decompressor therefore needs to
4846  * be able to handle this situation (LZ4 does).
4847 */
4848 static void
4849 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4850 {
4851 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4879 * original compressed data (rather than decompressing to an
4880 * aux buffer and then copying back the uncompressed buffer,
4881 * which is likely to be much larger).
4882 */
4883 uint64_t csize;
4884 void *cdata;
4885
4886 csize = zio->io_size;
4887 cdata = zio_data_buf_alloc(csize);
4888 bcopy(zio->io_data, cdata, csize);
4889 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4890 hdr->b_size) != 0)
4891 zio->io_error = EIO;
4892 zio_data_buf_free(cdata, csize);
4893 }
4894
4895 /* Restore the expected uncompressed IO size. */
4896 zio->io_orig_size = zio->io_size = hdr->b_size;
4897 }
4898
4899 /*
4900 * This thread feeds the L2ARC at regular intervals. This is the beating
4901 * heart of the L2ARC.
4902 */
4903 static void
4904 l2arc_feed_thread(void)
4905 {
4906 callb_cpr_t cpr;
4907 l2arc_dev_t *dev;
4908 spa_t *spa;
4909 uint64_t size, wrote;
4910 clock_t begin, next = ddi_get_lbolt();
4911 boolean_t headroom_boost = B_FALSE;
4912
4913 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4914
4915 mutex_enter(&l2arc_feed_thr_lock);
4916
4917 while (l2arc_thread_exit == 0) {
4918 CALLB_CPR_SAFE_BEGIN(&cpr);