3995 Memory leak of compressed buffers in l2arc_write_done
3997 ZFS L2ARC default behavior should allow reading while writing


 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_buf_evict()
 108  * and arc_do_user_evicts().
 109  *
 110  * Note that the majority of the performance stats are manipulated
 111  * with atomic operations.
 112  *
 113  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 114  *
 115  *      - L2ARC buflist creation
 116  *      - L2ARC buflist eviction
 117  *      - L2ARC write completion, which walks L2ARC buflists
 118  *      - ARC header destruction, as it removes from L2ARC buflists
 119  *      - ARC header release, as it removes from L2ARC buflists

 120  */
 121 
 122 #include <sys/spa.h>
 123 #include <sys/zio.h>
 124 #include <sys/zio_compress.h>
 125 #include <sys/zfs_context.h>
 126 #include <sys/arc.h>
 127 #include <sys/refcount.h>
 128 #include <sys/vdev.h>
 129 #include <sys/vdev_impl.h>
 130 #ifdef _KERNEL
 131 #include <sys/vmsystm.h>
 132 #include <vm/anon.h>
 133 #include <sys/fs/swapnode.h>
 134 #include <sys/dnlc.h>
 135 #endif
 136 #include <sys/callb.h>
 137 #include <sys/kstat.h>
 138 #include <zfs_fletcher.h>
 139 


 605 /*
 606  * If we discover during ARC scan any buffers to be compressed, we boost
 607  * our headroom for the next scanning cycle by this percentage multiple.
 608  */
 609 #define L2ARC_HEADROOM_BOOST    200
 610 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 611 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 612 
 613 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 614 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 615 
 616 /* L2ARC Performance Tunables */
 617 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 618 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 619 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 620 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 621 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 622 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 623 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 624 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 625 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 626 
 627 /*
 628  * L2ARC Internals
 629  */
 630 typedef struct l2arc_dev {
 631         vdev_t                  *l2ad_vdev;     /* vdev */
 632         spa_t                   *l2ad_spa;      /* spa */
 633         uint64_t                l2ad_hand;      /* next write location */
 634         uint64_t                l2ad_start;     /* first addr on device */
 635         uint64_t                l2ad_end;       /* last addr on device */
 636         uint64_t                l2ad_evict;     /* last addr eviction reached */
 637         boolean_t               l2ad_first;     /* first sweep through */
 638         boolean_t               l2ad_writing;   /* currently writing */
 639         list_t                  *l2ad_buflist;  /* buffer list */
 640         list_node_t             l2ad_node;      /* device list node */
 641 } l2arc_dev_t;
 642 
 643 static list_t L2ARC_dev_list;                   /* device list */
 644 static list_t *l2arc_dev_list;                  /* device list pointer */
 645 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */


 655         spa_t                   *l2rcb_spa;             /* spa */
 656         blkptr_t                l2rcb_bp;               /* original blkptr */
 657         zbookmark_t             l2rcb_zb;               /* original bookmark */
 658         int                     l2rcb_flags;            /* original flags */
 659         enum zio_compress       l2rcb_compress;         /* applied compress */
 660 } l2arc_read_callback_t;
 661 
 662 typedef struct l2arc_write_callback {
 663         l2arc_dev_t     *l2wcb_dev;             /* device info */
 664         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 665 } l2arc_write_callback_t;
 666 
 667 struct l2arc_buf_hdr {
 668         /* protected by arc_buf_hdr  mutex */
 669         l2arc_dev_t             *b_dev;         /* L2ARC device */
 670         uint64_t                b_daddr;        /* disk address, offset byte */
 671         /* compression applied to buffer data */
 672         enum zio_compress       b_compress;
 673         /* real alloc'd buffer size depending on b_compress applied */
 674         int                     b_asize;
 675         /* temporary buffer holder for in-flight compressed data */
 676         void                    *b_tmp_cdata;
 677 };
 678 
 679 typedef struct l2arc_data_free {
 680         /* protected by l2arc_free_on_write_mtx */
 681         void            *l2df_data;
 682         size_t          l2df_size;
 683         void            (*l2df_func)(void *, size_t);
 684         list_node_t     l2df_list_node;
 685 } l2arc_data_free_t;
 686 
 687 static kmutex_t l2arc_feed_thr_lock;
 688 static kcondvar_t l2arc_feed_thr_cv;
 689 static uint8_t l2arc_thread_exit;
 690 
 691 static void l2arc_read_done(zio_t *zio);
 692 static void l2arc_hdr_stat_add(void);
 693 static void l2arc_hdr_stat_remove(void);
 694 
 695 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);

 696 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 697     enum zio_compress c);
 698 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 699 
 700 static uint64_t
 701 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 702 {
 703         uint8_t *vdva = (uint8_t *)dva;
 704         uint64_t crc = -1ULL;
 705         int i;
 706 
 707         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 708 
 709         for (i = 0; i < sizeof (dva_t); i++)
 710                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 711 
 712         crc ^= (spa>>8) ^ birth;
 713 
 714         return (crc);
 715 }
 716 
 717 #define BUF_EMPTY(buf)                                          \
 718         ((buf)->b_dva.dva_word[0] == 0 &&                    \


4100         l2arc_dev_last = next;
4101 
4102 out:
4103         mutex_exit(&l2arc_dev_mtx);
4104 
4105         /*
4106          * Grab the config lock to prevent the 'next' device from being
4107          * removed while we are writing to it.
4108          */
4109         if (next != NULL)
4110                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111         mutex_exit(&spa_namespace_lock);
4112 
4113         return (next);
4114 }
4115 
4116 /*
4117  * Free buffers that were tagged for destruction.
4118  */
4119 static void
4120 l2arc_do_free_on_write()
4121 {
4122         list_t *buflist;
4123         l2arc_data_free_t *df, *df_prev;
4124 
4125         mutex_enter(&l2arc_free_on_write_mtx);
4126         buflist = l2arc_free_on_write;
4127 
4128         for (df = list_tail(buflist); df; df = df_prev) {
4129                 df_prev = list_prev(buflist, df);
4130                 ASSERT(df->l2df_data != NULL);
4131                 ASSERT(df->l2df_func != NULL);
4132                 df->l2df_func(df->l2df_data, df->l2df_size);
4133                 list_remove(buflist, df);
4134                 kmem_free(df, sizeof (l2arc_data_free_t));
4135         }
4136 
4137         mutex_exit(&l2arc_free_on_write_mtx);
4138 }
4139 
4140 /*
4141  * A write to a cache device has completed.  Update all headers to allow
4142  * reads from these buffers to begin.
4143  */
4144 static void
4145 l2arc_write_done(zio_t *zio)
4146 {
4147         l2arc_write_callback_t *cb;
4148         l2arc_dev_t *dev;
4149         list_t *buflist;
4150         arc_buf_hdr_t *head, *ab, *ab_prev;
4151         l2arc_buf_hdr_t *abl2;
4152         kmutex_t *hash_lock;
4153 

4154         cb = zio->io_private;
4155         ASSERT(cb != NULL);
4156         dev = cb->l2wcb_dev;
4157         ASSERT(dev != NULL);
4158         head = cb->l2wcb_head;
4159         ASSERT(head != NULL);
4160         buflist = dev->l2ad_buflist;
4161         ASSERT(buflist != NULL);
4162         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163             l2arc_write_callback_t *, cb);
4164 
4165         if (zio->io_error != 0)
4166                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 
4168         mutex_enter(&l2arc_buflist_mtx);
4169 
4170         /*
4171          * All writes completed, or an error was hit.
4172          */
4173         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174                 ab_prev = list_prev(buflist, ab);
4175 
4176                 hash_lock = HDR_LOCK(ab);
4177                 if (!mutex_tryenter(hash_lock)) {
4178                         /*
4179                          * This buffer misses out.  It may be in a stage
4180                          * of eviction.  Its ARC_L2_WRITING flag will be
4181                          * left set, denying reads to this buffer.
4182                          */
4183                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184                         continue;

4185                 }
4186 
4187                 abl2 = ab->b_l2hdr;



4188 
4189                 /*
4190                  * Release the temporary compressed buffer as soon as possible.

4191                  */
4192                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193                         l2arc_release_cdata_buf(ab);
4194 

4195                 if (zio->io_error != 0) {
4196                         /*
4197                          * Error - drop L2ARC entry.
4198                          */


4199                         list_remove(buflist, ab);
4200                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);

4201                         ab->b_l2hdr = NULL;
4202                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204                 }
4205 
4206                 /*
4207                  * Allow ARC to begin reads to this L2ARC entry.
4208                  */
4209                 ab->b_flags &= ~ARC_L2_WRITING;
4210 
4211                 mutex_exit(hash_lock);


4212         }

4213 
4214         atomic_inc_64(&l2arc_writes_done);
4215         list_remove(buflist, head);
4216         kmem_cache_free(hdr_cache, head);
4217         mutex_exit(&l2arc_buflist_mtx);
4218 
4219         l2arc_do_free_on_write();
4220 
4221         kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 }
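The leak fixed by 3995 is in the loop above: when mutex_tryenter(hash_lock) fails, the iteration continues without ever reaching the l2arc_release_cdata_buf() call, so a compressed buffer's b_tmp_cdata is orphaned. Condensed from the loop above, the leaking path is:

	hash_lock = HDR_LOCK(ab);
	if (!mutex_tryenter(hash_lock)) {
		/*
		 * Skipping here also skips l2arc_release_cdata_buf(),
		 * so any compressed b_tmp_cdata allocated for this
		 * header is never freed.
		 */
		ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
		continue;
	}

The rewritten l2arc_write_done() further below sidesteps this by deferring per-buffer work to a point where the hash locks can be taken unconditionally.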
4223 
4224 /*
4225  * A read to a cache device completed.  Validate buffer contents before
4226  * handing over to the regular ARC routines.
4227  */
4228 static void
4229 l2arc_read_done(zio_t *zio)
4230 {
4231         l2arc_read_callback_t *cb;
4232         arc_buf_hdr_t *hdr;
4233         arc_buf_t *buf;
4234         kmutex_t *hash_lock;
4235         int equal;
4236 
4237         ASSERT(zio->io_vd != NULL);
4238         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);


4333                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4334                 *lock = &arc_mru->arcs_mtx;
4335                 break;
4336         }
4337 
4338         ASSERT(!(MUTEX_HELD(*lock)));
4339         mutex_enter(*lock);
4340         return (list);
4341 }
4342 
4343 /*
4344  * Evict buffers from the device write hand to the distance specified in
 4345  * bytes.  This distance may span populated buffers, or it may span nothing.
4346  * This is clearing a region on the L2ARC device ready for writing.
4347  * If the 'all' boolean is set, every buffer is evicted.
4348  */
4349 static void
4350 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4351 {
4352         list_t *buflist;
4353         l2arc_buf_hdr_t *abl2;
4354         arc_buf_hdr_t *ab, *ab_prev;
4355         kmutex_t *hash_lock;
4356         uint64_t taddr;
4357 
4358         buflist = dev->l2ad_buflist;
4359 
4360         if (buflist == NULL)
4361                 return;
4362 
4363         if (!all && dev->l2ad_first) {
4364                 /*
4365                  * This is the first sweep through the device.  There is
4366                  * nothing to evict.
4367                  */
4368                 return;
4369         }
4370 
4371         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4372                 /*
4373                  * When nearing the end of the device, evict to the end


4433                          * arc_hdr_destroy() will call list_remove()
4434                          * and decrement arcstat_l2_size.
4435                          */
4436                         arc_change_state(arc_anon, ab, hash_lock);
4437                         arc_hdr_destroy(ab);
4438                 } else {
4439                         /*
4440                          * Invalidate issued or about to be issued
4441                          * reads, since we may be about to write
4442                          * over this location.
4443                          */
4444                         if (HDR_L2_READING(ab)) {
4445                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4446                                 ab->b_flags |= ARC_L2_EVICTED;
4447                         }
4448 
4449                         /*
4450                          * Tell ARC this no longer exists in L2ARC.
4451                          */
4452                         if (ab->b_l2hdr != NULL) {
4453                                 abl2 = ab->b_l2hdr;
4454                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4455                                 ab->b_l2hdr = NULL;
4456                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4457                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4458                         }
4459                         list_remove(buflist, ab);
4460 
4461                         /*
4462                          * This may have been leftover after a
4463                          * failed write.
4464                          */
4465                         ab->b_flags &= ~ARC_L2_WRITING;
4466                 }
4467                 mutex_exit(hash_lock);
4468         }
4469         mutex_exit(&l2arc_buflist_mtx);
4470 
4471         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4472         dev->l2ad_evict = taddr;
4473 }
4474 
4475 /*
4476  * Find and write ARC buffers to the L2ARC device.


4481  * state between calls to this function.
4482  *
4483  * Returns the number of bytes actually written (which may be smaller than
4484  * the delta by which the device hand has changed due to alignment).
4485  */
4486 static uint64_t
4487 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488     boolean_t *headroom_boost)
4489 {
4490         arc_buf_hdr_t *ab, *ab_prev, *head;
4491         list_t *list;
4492         uint64_t write_asize, write_psize, write_sz, headroom,
4493             buf_compress_minsz;
4494         void *buf_data;
4495         kmutex_t *list_lock;
4496         boolean_t full;
4497         l2arc_write_callback_t *cb;
4498         zio_t *pio, *wzio;
4499         uint64_t guid = spa_load_guid(spa);
4500         const boolean_t do_headroom_boost = *headroom_boost;

4501 
4502         ASSERT(dev->l2ad_vdev != NULL);
4503 
4504         /* Lower the flag now, we might want to raise it again later. */
4505         *headroom_boost = B_FALSE;
4506 
4507         pio = NULL;
4508         write_sz = write_asize = write_psize = 0;
4509         full = B_FALSE;
4510         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511         head->b_flags |= ARC_L2_WRITE_HEAD;
4512 
4513         /*
4514          * We will want to try to compress buffers that are at least 2x the
4515          * device sector size.
4516          */
4517         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518 
4519         /*
4520          * Copy buffers for L2ARC writing.
4521          */


4522         mutex_enter(&l2arc_buflist_mtx);
4523         for (int try = 0; try <= 3; try++) {
4524                 uint64_t passed_sz = 0;
4525 
4526                 list = l2arc_list_locked(try, &list_lock);
4527 
4528                 /*
4529                  * L2ARC fast warmup.
4530                  *
4531                  * Until the ARC is warm and starts to evict, read from the
4532                  * head of the ARC lists rather than the tail.
4533                  */
4534                 if (arc_warm == B_FALSE)
4535                         ab = list_head(list);
4536                 else
4537                         ab = list_tail(list);
4538 
4539                 headroom = target_sz * l2arc_headroom;
4540                 if (do_headroom_boost)
4541                         headroom = (headroom * l2arc_headroom_boost) / 100;
4542 
4543                 for (; ab; ab = ab_prev) {
4544                         l2arc_buf_hdr_t *l2hdr;
4545                         kmutex_t *hash_lock;
4546                         uint64_t buf_sz;
4547 
4548                         if (arc_warm == B_FALSE)
4549                                 ab_prev = list_next(list, ab);
4550                         else
4551                                 ab_prev = list_prev(list, ab);
4552 
4553                         hash_lock = HDR_LOCK(ab);
4554                         if (!mutex_tryenter(hash_lock)) {
4555                                 /*
4556                                  * Skip this buffer rather than waiting.
4557                                  */
4558                                 continue;
4559                         }
4560 
4561                         passed_sz += ab->b_size;
4562                         if (passed_sz > headroom) {
4563                                 /*
4564                                  * Searched too far.
4565                                  */
4566                                 mutex_exit(hash_lock);


4583                                  * Insert a dummy header on the buflist so
4584                                  * l2arc_write_done() can find where the
4585                                  * write buffers begin without searching.
4586                                  */
4587                                 list_insert_head(dev->l2ad_buflist, head);
4588 
4589                                 cb = kmem_alloc(
4590                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4591                                 cb->l2wcb_dev = dev;
4592                                 cb->l2wcb_head = head;
4593                                 pio = zio_root(spa, l2arc_write_done, cb,
4594                                     ZIO_FLAG_CANFAIL);
4595                         }
4596 
4597                         /*
4598                          * Create and add a new L2ARC header.
4599                          */
4600                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601                         l2hdr->b_dev = dev;
4602                         ab->b_flags |= ARC_L2_WRITING;


4603 
4604                         /*
4605                          * Temporarily stash the data buffer in b_tmp_cdata.
4606                          * The subsequent write step will pick it up from
 4607                          * there. This is because we can't access ab->b_buf
4608                          * without holding the hash_lock, which we in turn
4609                          * can't access without holding the ARC list locks
4610                          * (which we want to avoid during compression/writing).

4611                          */
4612                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613                         l2hdr->b_asize = ab->b_size;
4614                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;

4615 
4616                         buf_sz = ab->b_size;
4617                         ab->b_l2hdr = l2hdr;
4618 
4619                         list_insert_head(dev->l2ad_buflist, ab);

4620 
4621                         /*
4622                          * Compute and store the buffer cksum before
4623                          * writing.  On debug the cksum is verified first.
4624                          */
4625                         arc_cksum_verify(ab->b_buf);
4626                         arc_cksum_compute(ab->b_buf, B_TRUE);
4627 
4628                         mutex_exit(hash_lock);
4629 
4630                         write_sz += buf_sz;
4631                 }
4632 
4633                 mutex_exit(list_lock);
4634 
4635                 if (full == B_TRUE)
4636                         break;
4637         }
4638 
4639         /* No buffers selected for writing? */
4640         if (pio == NULL) {
4641                 ASSERT0(write_sz);
4642                 mutex_exit(&l2arc_buflist_mtx);
4643                 kmem_cache_free(hdr_cache, head);

4644                 return (0);
4645         }
4646 


4647         /*
4648          * Now start writing the buffers. We're starting at the write head
4649          * and work backwards, retracing the course of the buffer selector
4650          * loop above.
4651          */
4652         for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653             ab = list_prev(dev->l2ad_buflist, ab)) {
4654                 l2arc_buf_hdr_t *l2hdr;
4655                 uint64_t buf_sz;
4656 



4657                 /*
4658                  * We shouldn't need to lock the buffer here, since we flagged
4659                  * it as ARC_L2_WRITING in the previous step, but we must take
4660                  * care to only access its L2 cache parameters. In particular,
4661                  * ab->b_buf may be invalid by now due to ARC eviction.
4662                  */
4663                 l2hdr = ab->b_l2hdr;
4664                 l2hdr->b_daddr = dev->l2ad_hand;
4665 
4666                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667                     l2hdr->b_asize >= buf_compress_minsz) {
4668                         if (l2arc_compress_buf(l2hdr)) {

4669                                 /*
4670                                  * If compression succeeded, enable headroom
4671                                  * boost on the next scan cycle.
4672                                  */
4673                                 *headroom_boost = B_TRUE;

4674                         }
4675                 }
4676 
4677                 /*
4678                  * Pick up the buffer data we had previously stashed away
4679                  * (and now potentially also compressed).
4680                  */
4681                 buf_data = l2hdr->b_tmp_cdata;
4682                 buf_sz = l2hdr->b_asize;
4683 
4684                 /* Compression may have squashed the buffer to zero length. */
4685                 if (buf_sz != 0) {
4686                         uint64_t buf_p_sz;
4687 
4688                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691                             ZIO_FLAG_CANFAIL, B_FALSE);

4692 
4693                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694                             zio_t *, wzio);
4695                         (void) zio_nowait(wzio);
4696 
4697                         write_asize += buf_sz;
4698                         /*
4699                          * Keep the clock hand suitably device-aligned.
4700                          */
4701                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702                         write_psize += buf_p_sz;
4703                         dev->l2ad_hand += buf_p_sz;
4704                 }



4705         }
4706 
4707         mutex_exit(&l2arc_buflist_mtx);
4708 
4709         ASSERT3U(write_asize, <=, target_sz);
4710         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715 
4716         /*
4717          * Bump device hand to the device start if it is approaching the end.
4718          * l2arc_evict() will already have evicted ahead for this case.
4719          */
4720         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721                 vdev_space_update(dev->l2ad_vdev,
4722                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723                 dev->l2ad_hand = dev->l2ad_start;
4724                 dev->l2ad_evict = dev->l2ad_start;
4725                 dev->l2ad_first = B_FALSE;
4726         }
4727 
4728         dev->l2ad_writing = B_TRUE;
4729         (void) zio_wait(pio);
4730         dev->l2ad_writing = B_FALSE;
4731 
4732         return (write_asize);
4733 }
4734 
4735 /*
4736  * Compresses an L2ARC buffer.
4737  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4738  * size in l2hdr->b_asize. This routine tries to compress the data and
4739  * depending on the compression result there are three possible outcomes:
4740  * *) The buffer was incompressible. The original l2hdr contents were left
4741  *    untouched and are ready for writing to an L2 device.
4742  * *) The buffer was all-zeros, so there is no need to write it to an L2
4743  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4744  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4745  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4746  *    data buffer which holds the compressed data to be written, and b_asize
4747  *    tells us how much data there is. b_compress is set to the appropriate
4748  *    compression algorithm. Once writing is done, invoke
4749  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4750  *
4751  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4752  * buffer was incompressible).
4753  */
4754 static boolean_t
4755 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)

4756 {
4757         void *cdata;
4758         size_t csize, len;
4759 
4760         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4761         ASSERT(l2hdr->b_tmp_cdata != NULL);
4762 
4763         len = l2hdr->b_asize;
4764         cdata = zio_data_buf_alloc(len);
4765         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4766             cdata, l2hdr->b_asize);
4767 
4768         if (csize == 0) {
4769                 /* zero block, indicate that there's nothing to write */
4770                 zio_data_buf_free(cdata, len);
4771                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4772                 l2hdr->b_asize = 0;
4773                 l2hdr->b_tmp_cdata = NULL;
4774                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4775                 return (B_TRUE);
4776         } else if (csize > 0 && csize < len) {
4777                 /*
4778                  * Compression succeeded, we'll keep the cdata around for
4779                  * writing and release it afterwards.
4780                  */
4781                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4782                 l2hdr->b_asize = csize;
4783                 l2hdr->b_tmp_cdata = cdata;

4784                 ARCSTAT_BUMP(arcstat_l2_compress_successes);

4785                 return (B_TRUE);
4786         } else {
4787                 /*
4788                  * Compression failed, release the compressed buffer.
4789                  * l2hdr will be left unmodified.
4790                  */
4791                 zio_data_buf_free(cdata, len);
4792                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4793                 return (B_FALSE);
4794         }
4795 }
4796 
4797 /*
4798  * Decompresses a zio read back from an l2arc device. On success, the
4799  * underlying zio's io_data buffer is overwritten by the uncompressed
4800  * version. On decompression error (corrupt compressed stream), the
4801  * zio->io_error value is set to signal an I/O error.
4802  *
 4803  * Please note that the compressed data stream is not checksummed, so
 4804  * if the underlying device is experiencing data corruption, we may feed
 4805  * corrupt data to the decompressor; the decompressor therefore needs to
 4806  * be able to handle this situation (LZ4 does).
4807  */
4808 static void
4809 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4810 {
4811         ASSERT(L2ARC_IS_VALID_COMPRESS(c));


4839                  * original compressed data (rather than decompressing to an
4840                  * aux buffer and then copying back the uncompressed buffer,
4841                  * which is likely to be much larger).
4842                  */
4843                 uint64_t csize;
4844                 void *cdata;
4845 
4846                 csize = zio->io_size;
4847                 cdata = zio_data_buf_alloc(csize);
4848                 bcopy(zio->io_data, cdata, csize);
4849                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4850                     hdr->b_size) != 0)
4851                         zio->io_error = EIO;
4852                 zio_data_buf_free(cdata, csize);
4853         }
4854 
4855         /* Restore the expected uncompressed IO size. */
4856         zio->io_orig_size = zio->io_size = hdr->b_size;
4857 }
4858 
4859 /*
4860  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4861  * This buffer serves as a temporary holder of compressed data while
4862  * the buffer entry is being written to an l2arc device. Once that is
4863  * done, we can dispose of it.
4864  */
4865 static void
4866 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4867 {
4868         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4869 
4870         if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4871                 /*
4872                  * If the data was compressed, then we've allocated a
4873                  * temporary buffer for it, so now we need to release it.
4874                  */
4875                 ASSERT(l2hdr->b_tmp_cdata != NULL);
4876                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4877         }
4878         l2hdr->b_tmp_cdata = NULL;
4879 }
4880 
4881 /*
4882  * This thread feeds the L2ARC at regular intervals.  This is the beating
4883  * heart of the L2ARC.
4884  */
4885 static void
4886 l2arc_feed_thread(void)
4887 {
4888         callb_cpr_t cpr;
4889         l2arc_dev_t *dev;
4890         spa_t *spa;
4891         uint64_t size, wrote;
4892         clock_t begin, next = ddi_get_lbolt();
4893         boolean_t headroom_boost = B_FALSE;
4894 
4895         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4896 
4897         mutex_enter(&l2arc_feed_thr_lock);
4898 
4899         while (l2arc_thread_exit == 0) {
4900                 CALLB_CPR_SAFE_BEGIN(&cpr);




 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_buf_evict()
 108  * and arc_do_user_evicts().
 109  *
 110  * Note that the majority of the performance stats are manipulated
 111  * with atomic operations.
 112  *
 113  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 114  *
 115  *      - L2ARC buflist creation
 116  *      - L2ARC buflist eviction
 117  *      - L2ARC write completion, which walks L2ARC buflists
 118  *      - ARC header destruction, as it removes from L2ARC buflists
 119  *      - ARC header release, as it removes from L2ARC buflists
 120  *
 121  * Please note that if you first grab the l2arc_buflist_mtx, you can't do a
 122  * mutex_enter on a buffer's hash_lock anymore due to lock inversion. To grab
 123  * the hash_lock you must use mutex_tryenter and possibly deal with the buffer
 124  * not being available (due to e.g. some other thread holding it while trying
 125  * to unconditionally grab the l2arc_buflist_mtx which you are holding). The
 126  * inverse situation (first grab hash_lock, then l2arc_buflist_mtx) is safe.
 127  */
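To illustrate the ordering rule (a minimal sketch, not code from this file): a walker that already holds l2arc_buflist_mtx must probe each buffer's hash lock with mutex_tryenter() and tolerate failure, while the inverse order may block freely:

	arc_buf_hdr_t *ab;

	mutex_enter(&l2arc_buflist_mtx);
	for (ab = list_head(buflist); ab != NULL;
	    ab = list_next(buflist, ab)) {
		kmutex_t *hash_lock = HDR_LOCK(ab);

		if (!mutex_tryenter(hash_lock)) {
			/* Busy; skip rather than risk deadlock. */
			continue;
		}
		/* ... safe to touch the buffer here ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&l2arc_buflist_mtx);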
 128 
 129 #include <sys/spa.h>
 130 #include <sys/zio.h>
 131 #include <sys/zio_compress.h>
 132 #include <sys/zfs_context.h>
 133 #include <sys/arc.h>
 134 #include <sys/refcount.h>
 135 #include <sys/vdev.h>
 136 #include <sys/vdev_impl.h>
 137 #ifdef _KERNEL
 138 #include <sys/vmsystm.h>
 139 #include <vm/anon.h>
 140 #include <sys/fs/swapnode.h>
 141 #include <sys/dnlc.h>
 142 #endif
 143 #include <sys/callb.h>
 144 #include <sys/kstat.h>
 145 #include <zfs_fletcher.h>
 146 


 612 /*
 613  * If we discover during ARC scan any buffers to be compressed, we boost
 614  * our headroom for the next scanning cycle by this percentage multiple.
 615  */
 616 #define L2ARC_HEADROOM_BOOST    200
 617 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 618 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
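With the default L2ARC_HEADROOM_BOOST of 200, a scan cycle that found compressible buffers doubles the next cycle's headroom: headroom = (headroom * 200) / 100 (see the computation in l2arc_write_buffers()).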
 619 
 620 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 621 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 622 
 623 /* L2ARC Performance Tunables */
 624 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 625 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 626 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 627 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 628 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 629 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 630 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 631 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 632 boolean_t l2arc_norw = B_FALSE;                 /* no reads during writes */
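Flipping l2arc_norw to B_FALSE is the 3997 change: reads from a cache device are no longer refused while the feed thread is writing to it. Roughly, arc_read() gates the L2ARC read path on the device's l2ad_writing flag only when this tunable is set (a condensed paraphrase of the arc_read() check, not its exact text):

	/* devw is hdr->b_l2hdr->b_dev->l2ad_writing, sampled at lookup */
	if (vd != NULL && !(l2arc_norw && devw)) {
		/* issue the read against the L2ARC device */
	} else {
		/* fall back to reading from the main pool */
	}

With l2arc_norw == B_FALSE the (l2arc_norw && devw) term is always false, so a device remains readable during its write phase.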
 633 
 634 /*
 635  * L2ARC Internals
 636  */
 637 typedef struct l2arc_dev {
 638         vdev_t                  *l2ad_vdev;     /* vdev */
 639         spa_t                   *l2ad_spa;      /* spa */
 640         uint64_t                l2ad_hand;      /* next write location */
 641         uint64_t                l2ad_start;     /* first addr on device */
 642         uint64_t                l2ad_end;       /* last addr on device */
 643         uint64_t                l2ad_evict;     /* last addr eviction reached */
 644         boolean_t               l2ad_first;     /* first sweep through */
 645         boolean_t               l2ad_writing;   /* currently writing */
 646         list_t                  *l2ad_buflist;  /* buffer list */
 647         list_node_t             l2ad_node;      /* device list node */
 648 } l2arc_dev_t;
 649 
 650 static list_t L2ARC_dev_list;                   /* device list */
 651 static list_t *l2arc_dev_list;                  /* device list pointer */
 652 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */


 662         spa_t                   *l2rcb_spa;             /* spa */
 663         blkptr_t                l2rcb_bp;               /* original blkptr */
 664         zbookmark_t             l2rcb_zb;               /* original bookmark */
 665         int                     l2rcb_flags;            /* original flags */
 666         enum zio_compress       l2rcb_compress;         /* applied compress */
 667 } l2arc_read_callback_t;
 668 
 669 typedef struct l2arc_write_callback {
 670         l2arc_dev_t     *l2wcb_dev;             /* device info */
 671         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 672 } l2arc_write_callback_t;
 673 
 674 struct l2arc_buf_hdr {
 675         /* protected by arc_buf_hdr  mutex */
 676         l2arc_dev_t             *b_dev;         /* L2ARC device */
 677         uint64_t                b_daddr;        /* disk address, offset byte */
 678         /* compression applied to buffer data */
 679         enum zio_compress       b_compress;
 680         /* real alloc'd buffer size depending on b_compress applied */
 681         int                     b_asize;


 682 };
 683 
 684 typedef struct l2arc_data_free {
 685         /* protected by l2arc_free_on_write_mtx */
 686         void            *l2df_data;
 687         size_t          l2df_size;
 688         void            (*l2df_func)(void *, size_t);
 689         list_node_t     l2df_list_node;
 690 } l2arc_data_free_t;
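Entries are queued onto l2arc_free_on_write by the ARC when a buffer's data must be freed while an L2ARC write to it is still in flight; the free is then deferred to l2arc_do_free_on_write(). The producer side looks roughly like this (simplified from the arc_buf_data_free() path; buf, hdr and free_func come from the caller's context):

	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);

	df->l2df_data = buf->b_data;
	df->l2df_size = hdr->b_size;
	df->l2df_func = free_func;	/* e.g. zio_data_buf_free */
	mutex_enter(&l2arc_free_on_write_mtx);
	list_insert_head(l2arc_free_on_write, df);
	mutex_exit(&l2arc_free_on_write_mtx);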
 691 
 692 static kmutex_t l2arc_feed_thr_lock;
 693 static kcondvar_t l2arc_feed_thr_cv;
 694 static uint8_t l2arc_thread_exit;
 695 
 696 static void l2arc_read_done(zio_t *zio);
 697 static void l2arc_hdr_stat_add(void);
 698 static void l2arc_hdr_stat_remove(void);
 699 
 700 static boolean_t l2arc_compress_buf(void *in_data, uint64_t in_sz,
 701     void **out_data, uint64_t *out_sz, enum zio_compress *compress);
 702 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 703     enum zio_compress c);

 704 
 705 static uint64_t
 706 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 707 {
 708         uint8_t *vdva = (uint8_t *)dva;
 709         uint64_t crc = -1ULL;
 710         int i;
 711 
 712         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 713 
 714         for (i = 0; i < sizeof (dva_t); i++)
 715                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 716 
 717         crc ^= (spa>>8) ^ birth;
 718 
 719         return (crc);
 720 }
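Callers fold this hash into the hash table with a mask; elsewhere in arc.c that looks along the lines of:

	#define	BUF_HASH_INDEX(spa, dva, birth) \
		(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)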
 721 
 722 #define BUF_EMPTY(buf)                                          \
 723         ((buf)->b_dva.dva_word[0] == 0 &&                    \


4105         l2arc_dev_last = next;
4106 
4107 out:
4108         mutex_exit(&l2arc_dev_mtx);
4109 
4110         /*
4111          * Grab the config lock to prevent the 'next' device from being
4112          * removed while we are writing to it.
4113          */
4114         if (next != NULL)
4115                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4116         mutex_exit(&spa_namespace_lock);
4117 
4118         return (next);
4119 }
4120 
4121 /*
4122  * Free buffers that were tagged for destruction.
4123  */
4124 static void
4125 l2arc_do_free_on_write(void)
4126 {
4127         list_t *buflist;
4128         l2arc_data_free_t *df, *df_prev;
4129 
4130         mutex_enter(&l2arc_free_on_write_mtx);
4131         buflist = l2arc_free_on_write;
4132 
4133         for (df = list_tail(buflist); df; df = df_prev) {
4134                 df_prev = list_prev(buflist, df);
4135                 ASSERT(df->l2df_data != NULL);
4136                 ASSERT(df->l2df_func != NULL);
4137                 df->l2df_func(df->l2df_data, df->l2df_size);
4138                 list_remove(buflist, df);
4139                 kmem_free(df, sizeof (l2arc_data_free_t));
4140         }
4141 
4142         mutex_exit(&l2arc_free_on_write_mtx);
4143 }
4144 
4145 /*
4146  * A write to a cache device has completed.  Update all headers to allow
4147  * reads from these buffers to begin.
4148  */
4149 static void
4150 l2arc_write_done(zio_t *zio)
4151 {
4152         l2arc_write_callback_t *cb;
4153         l2arc_dev_t *dev;
4154         list_t *buflist;
4155         arc_buf_hdr_t *head, *ab;


4156 
4157         struct defer_done_entry {
4158                 arc_buf_hdr_t *dde_buf;
4159                 list_node_t dde_node;
4160         } *dde, *dde_next;
4161         list_t defer_done_list;
4162 
4163         cb = zio->io_private;
4164         ASSERT(cb != NULL);
4165         dev = cb->l2wcb_dev;
4166         ASSERT(dev != NULL);
4167         head = cb->l2wcb_head;
4168         ASSERT(head != NULL);
4169         buflist = dev->l2ad_buflist;
4170         ASSERT(buflist != NULL);
4171         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4172             l2arc_write_callback_t *, cb);
4173 
4174         if (zio->io_error != 0)
4175                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4176 
4177         mutex_enter(&l2arc_buflist_mtx);
4178 
4179         /*
4180          * All writes completed, or an error was hit.
4181          */
4182         list_create(&defer_done_list, sizeof (*dde),
4183             offsetof(struct defer_done_entry, dde_node));
4184         for (ab = list_prev(buflist, head); ab; ab = list_prev(buflist, ab)) {


4185                 /*
4186                  * Can't pause here to grab hash_lock while also holding
4187                  * l2arc_buflist_mtx, so place the buffers on a temporary
4188                  * thread-local list for later processing.
4189                  */
4190                 dde = kmem_alloc(sizeof (*dde), KM_SLEEP);
4191                 dde->dde_buf = ab;
4192                 list_insert_tail(&defer_done_list, dde);
4193         }
4194 
4195         atomic_inc_64(&l2arc_writes_done);
4196         list_remove(buflist, head);
4197         kmem_cache_free(hdr_cache, head);
4198         mutex_exit(&l2arc_buflist_mtx);
4199 
4200         /*
4201          * Now process the buffers. We're not holding l2arc_buflist_mtx
4202          * anymore, so we can do a regular mutex_enter on the hash_lock.
4203          */
4204         for (dde = list_head(&defer_done_list); dde != NULL; dde = dde_next) {
4205                 kmutex_t *hash_lock;
4206 
4207                 dde_next = list_next(&defer_done_list, dde);
4208                 ab = dde->dde_buf;
4209                 hash_lock = HDR_LOCK(ab);
4210 
4211                 mutex_enter(hash_lock);
4212 
4213                 if (zio->io_error != 0) {
4214                         /*
4215                          * Error - drop L2ARC entry.
4216                          */
4217                         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4218                         mutex_enter(&l2arc_buflist_mtx);
4219                         list_remove(buflist, ab);
4220                         mutex_exit(&l2arc_buflist_mtx);
4221                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4222                         ab->b_l2hdr = NULL;
4223                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
4224                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4225                 }
4226 
4227                 /*
4228                  * Allow ARC to begin reads to this L2ARC entry.
4229                  */
4230                 ab->b_flags &= ~ARC_L2_WRITING;
4231 
4232                 mutex_exit(hash_lock);
4233 
4234                 list_remove(&defer_done_list, dde);
4235         }
4236         list_destroy(&defer_done_list);
4237 

4238         l2arc_do_free_on_write();
4239 
4240         kmem_free(cb, sizeof (l2arc_write_callback_t));
4241 }
4242 
4243 /*
4244  * A read to a cache device completed.  Validate buffer contents before
4245  * handing over to the regular ARC routines.
4246  */
4247 static void
4248 l2arc_read_done(zio_t *zio)
4249 {
4250         l2arc_read_callback_t *cb;
4251         arc_buf_hdr_t *hdr;
4252         arc_buf_t *buf;
4253         kmutex_t *hash_lock;
4254         int equal;
4255 
4256         ASSERT(zio->io_vd != NULL);
4257         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);


4352                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4353                 *lock = &arc_mru->arcs_mtx;
4354                 break;
4355         }
4356 
4357         ASSERT(!(MUTEX_HELD(*lock)));
4358         mutex_enter(*lock);
4359         return (list);
4360 }
4361 
4362 /*
4363  * Evict buffers from the device write hand to the distance specified in
 4364  * bytes.  This distance may span populated buffers, or it may span nothing.
4365  * This is clearing a region on the L2ARC device ready for writing.
4366  * If the 'all' boolean is set, every buffer is evicted.
4367  */
4368 static void
4369 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4370 {
4371         list_t *buflist;
4372         l2arc_buf_hdr_t *l2hdr;
4373         arc_buf_hdr_t *ab, *ab_prev;
4374         kmutex_t *hash_lock;
4375         uint64_t taddr;
4376 
4377         buflist = dev->l2ad_buflist;
4378 
4379         if (buflist == NULL)
4380                 return;
4381 
4382         if (!all && dev->l2ad_first) {
4383                 /*
4384                  * This is the first sweep through the device.  There is
4385                  * nothing to evict.
4386                  */
4387                 return;
4388         }
4389 
4390         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4391                 /*
4392                  * When nearing the end of the device, evict to the end


4452                          * arc_hdr_destroy() will call list_remove()
4453                          * and decrement arcstat_l2_size.
4454                          */
4455                         arc_change_state(arc_anon, ab, hash_lock);
4456                         arc_hdr_destroy(ab);
4457                 } else {
4458                         /*
4459                          * Invalidate issued or about to be issued
4460                          * reads, since we may be about to write
4461                          * over this location.
4462                          */
4463                         if (HDR_L2_READING(ab)) {
4464                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4465                                 ab->b_flags |= ARC_L2_EVICTED;
4466                         }
4467 
4468                         /*
4469                          * Tell ARC this no longer exists in L2ARC.
4470                          */
4471                         if (ab->b_l2hdr != NULL) {
4472                                 l2hdr = ab->b_l2hdr;
4473                                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4474                                 ab->b_l2hdr = NULL;
4475                                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
4476                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4477                         }
4478                         list_remove(buflist, ab);
4479 
4480                         /*
4481                          * This may have been leftover after a
4482                          * failed write.
4483                          */
4484                         ab->b_flags &= ~ARC_L2_WRITING;
4485                 }
4486                 mutex_exit(hash_lock);
4487         }
4488         mutex_exit(&l2arc_buflist_mtx);
4489 
4490         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4491         dev->l2ad_evict = taddr;
4492 }
4493 
4494 /*
4495  * Find and write ARC buffers to the L2ARC device.


4500  * state between calls to this function.
4501  *
4502  * Returns the number of bytes actually written (which may be smaller than
4503  * the delta by which the device hand has changed due to alignment).
4504  */
4505 static uint64_t
4506 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4507     boolean_t *headroom_boost)
4508 {
4509         arc_buf_hdr_t *ab, *ab_prev, *head;
4510         list_t *list;
4511         uint64_t write_asize, write_psize, write_sz, headroom,
4512             buf_compress_minsz;
4513         void *buf_data;
4514         kmutex_t *list_lock;
4515         boolean_t full;
4516         l2arc_write_callback_t *cb;
4517         zio_t *pio, *wzio;
4518         uint64_t guid = spa_load_guid(spa);
4519         const boolean_t do_headroom_boost = *headroom_boost;
4520         struct defer_write_entry {
4521                 arc_buf_hdr_t *dwe_buf;
4522                 void *dwe_orig_data;
4523                 uint64_t dwe_orig_size;
 4524                 list_node_t dwe_node;
4525         } *dwe, *dwe_next;
4526         list_t defer_write_list;
4527 
4528         ASSERT(dev->l2ad_vdev != NULL);
4529 
4530         /* Lower the flag now, we might want to raise it again later. */
4531         *headroom_boost = B_FALSE;
4532 
4533         pio = NULL;
4534         write_sz = write_asize = write_psize = 0;
4535         full = B_FALSE;
4536         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4537         head->b_flags |= ARC_L2_WRITE_HEAD;
4538 
4539         /*
4540          * We will want to try to compress buffers that are at least 2x the
4541          * device sector size.
4542          */
4543         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4544 
4545         /*
4546          * Copy buffers for L2ARC writing.
4547          */
4548         list_create(&defer_write_list, sizeof (*dwe),
4549             offsetof(struct defer_write_entry, dwe_node));
4550         mutex_enter(&l2arc_buflist_mtx);
4551         for (int try = 0; try <= 3; try++) {
4552                 uint64_t passed_sz = 0;
4553 
4554                 list = l2arc_list_locked(try, &list_lock);
4555 
4556                 /*
4557                  * L2ARC fast warmup.
4558                  *
4559                  * Until the ARC is warm and starts to evict, read from the
4560                  * head of the ARC lists rather than the tail.
4561                  */
4562                 if (arc_warm == B_FALSE)
4563                         ab = list_head(list);
4564                 else
4565                         ab = list_tail(list);
4566 
4567                 headroom = target_sz * l2arc_headroom;
4568                 if (do_headroom_boost)
4569                         headroom = (headroom * l2arc_headroom_boost) / 100;
4570 
4571                 for (; ab; ab = ab_prev) {
4572                         l2arc_buf_hdr_t *l2hdr;
4573                         kmutex_t *hash_lock;

4574 
4575                         if (arc_warm == B_FALSE)
4576                                 ab_prev = list_next(list, ab);
4577                         else
4578                                 ab_prev = list_prev(list, ab);
4579 
4580                         hash_lock = HDR_LOCK(ab);
4581                         if (!mutex_tryenter(hash_lock)) {
4582                                 /*
4583                                  * Skip this buffer rather than waiting.
4584                                  */
4585                                 continue;
4586                         }
4587 
4588                         passed_sz += ab->b_size;
4589                         if (passed_sz > headroom) {
4590                                 /*
4591                                  * Searched too far.
4592                                  */
4593                                 mutex_exit(hash_lock);


4610                                  * Insert a dummy header on the buflist so
4611                                  * l2arc_write_done() can find where the
4612                                  * write buffers begin without searching.
4613                                  */
4614                                 list_insert_head(dev->l2ad_buflist, head);
4615 
4616                                 cb = kmem_alloc(
4617                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4618                                 cb->l2wcb_dev = dev;
4619                                 cb->l2wcb_head = head;
4620                                 pio = zio_root(spa, l2arc_write_done, cb,
4621                                     ZIO_FLAG_CANFAIL);
4622                         }
4623 
4624                         /*
4625                          * Create and add a new L2ARC header.
4626                          */
4627                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4628                         l2hdr->b_dev = dev;
4629                         ab->b_flags |= ARC_L2_WRITING;
4630                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4631                         l2hdr->b_asize = ab->b_size;
4632 
4633                         /*
4634                          * Temporarily stash the buffer in defer_write_entries.
4635                          * The subsequent write step will pick it up from
4636                          * there. This is because we can't access ab->b_buf
4637                          * without holding the hash_lock, which we in turn
4638                          * can't access without holding the ARC list locks
4639                          * while walking the ARC lists (we want to avoid
4640                          * holding these locks during compression/writing).
4641                          */
4642                         dwe = kmem_alloc(sizeof (*dwe), KM_SLEEP);
4643                         dwe->dwe_buf = ab;
4644                         dwe->dwe_orig_data = ab->b_buf->b_data;
4645                         dwe->dwe_orig_size = ab->b_size;
4646 

4647                         ab->b_l2hdr = l2hdr;
4648 
4649                         list_insert_head(dev->l2ad_buflist, ab);
4650                         list_insert_tail(&defer_write_list, dwe);
4651 
4652                         /*
4653                          * Compute and store the buffer cksum before
4654                          * writing.  On debug the cksum is verified first.
4655                          */
4656                         arc_cksum_verify(ab->b_buf);
4657                         arc_cksum_compute(ab->b_buf, B_TRUE);
4658 
4659                         mutex_exit(hash_lock);
4660 
4661                         write_sz += dwe->dwe_orig_size;
4662                 }
4663 
4664                 mutex_exit(list_lock);
4665 
4666                 if (full == B_TRUE)
4667                         break;
4668         }
4669 
4670         /* No buffers selected for writing? */
4671         if (pio == NULL) {
4672                 ASSERT0(write_sz);
4673                 mutex_exit(&l2arc_buflist_mtx);
4674                 kmem_cache_free(hdr_cache, head);
4675                 list_destroy(&defer_write_list);
4676                 return (0);
4677         }
4678 
4679         mutex_exit(&l2arc_buflist_mtx);
4680 
4681         /*
4682          * Now start writing the buffers. We're starting at the write head
4683          * and work backwards, retracing the course of the buffer selector
4684          * loop above.
4685          */
4686         for (dwe = list_head(&defer_write_list); dwe != NULL; dwe = dwe_next) {

4687                 l2arc_buf_hdr_t *l2hdr;
4688                 uint64_t buf_sz;
4689 
4690                 dwe_next = list_next(&defer_write_list, dwe);
4691                 ab = dwe->dwe_buf;
4692 
 4693                 /*
 4694                  * Accessing ab->b_l2hdr without locking is safe: we
 4695                  * flagged this buffer as ARC_L2_WRITING in the selection
 4696                  * loop, so no other thread modifies its L2 fields. Note
 4697                  * ab->b_buf may be invalid by now due to ARC eviction.
 4698                  */
4699                 l2hdr = ab->b_l2hdr;
4700                 l2hdr->b_daddr = dev->l2ad_hand;
4701 
4702                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4703                     l2hdr->b_asize >= buf_compress_minsz &&
4704                     l2arc_compress_buf(dwe->dwe_orig_data, dwe->dwe_orig_size,
4705                     &buf_data, &buf_sz, &l2hdr->b_compress)) {
4706                         /*
4707                          * If compression succeeded, enable headroom
4708                          * boost on the next scan cycle.
4709                          */
4710                         *headroom_boost = B_TRUE;
4711                         l2hdr->b_asize = buf_sz;
4712                 } else {
4713                         buf_data = dwe->dwe_orig_data;
4714                         buf_sz = dwe->dwe_orig_size;
4715                         l2hdr->b_asize = dwe->dwe_orig_size;
4716                 }

4717 

4718                 /* Compression may have squashed the buffer to zero length. */
4719                 if (buf_sz != 0) {
4720                         uint64_t buf_p_sz;
4721 
4722                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4723                             dev->l2ad_hand, l2hdr->b_asize, buf_data,
4724                             ZIO_CHECKSUM_OFF, NULL, NULL,
4725                             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4726                             B_FALSE);
4727 
4728                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4729                             zio_t *, wzio);
4730                         (void) zio_nowait(wzio);
4731 
4732                         write_asize += l2hdr->b_asize;
4733                         /*
4734                          * Keep the clock hand suitably device-aligned.
4735                          */
4736                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4737                         write_psize += buf_p_sz;
4738                         dev->l2ad_hand += buf_p_sz;
4739                 }
4740 
4741                 list_remove(&defer_write_list, dwe);
4742                 kmem_free(dwe, sizeof (*dwe));
4743         }
4744 
4745         list_destroy(&defer_write_list);
4746 
4747         ASSERT3U(write_asize, <=, target_sz);
4748         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4749         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4750         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4751         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4752         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4753 
4754         /*
4755          * Bump device hand to the device start if it is approaching the end.
4756          * l2arc_evict() will already have evicted ahead for this case.
4757          */
4758         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4759                 vdev_space_update(dev->l2ad_vdev,
4760                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4761                 dev->l2ad_hand = dev->l2ad_start;
4762                 dev->l2ad_evict = dev->l2ad_start;
4763                 dev->l2ad_first = B_FALSE;
4764         }
4765 
4766         dev->l2ad_writing = B_TRUE;
4767         (void) zio_wait(pio);
4768         dev->l2ad_writing = B_FALSE;
4769 
4770         return (write_asize);
4771 }
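
The tail of l2arc_write_buffers() above treats the cache device as a
circular log: each write advances the hand by a device-aligned amount
(vdev_psize_to_asize() rounds the payload up to the vdev's allocation
granularity), and once the hand comes within one target-size write of
l2ad_end it snaps back to l2ad_start, where l2arc_evict() has already
cleared the way.  A toy userland sketch of that allocation discipline
follows; the struct, the ashift value and the sizes are invented for
illustration, and this is not the kernel code:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uint64_t start, end;    /* usable byte range on the device */
            uint64_t hand;          /* next write offset */
    } ring_t;

    /* Round a payload up to the device allocation granularity. */
    static uint64_t
    psize_to_asize(uint64_t psize, unsigned ashift)
    {
            uint64_t align = 1ULL << ashift;
            return ((psize + align - 1) & ~(align - 1));
    }

    /* Reserve space at the hand; wrap when < max_write remains. */
    static uint64_t
    ring_reserve(ring_t *r, uint64_t psize, unsigned ashift,
        uint64_t max_write)
    {
            uint64_t off = r->hand;
            r->hand += psize_to_asize(psize, ashift);
            if (r->hand >= r->end - max_write)
                    r->hand = r->start;  /* region ahead already evicted */
            return (off);
    }

    int
    main(void)
    {
            ring_t r = { 0, 1 << 20, 0 };
            for (int i = 0; i < 5; i++)
                    printf("write at %llu\n", (unsigned long long)
                        ring_reserve(&r, 300000, 12, 1 << 18));
            return (0);
    }

With a 1 MB device, 4K sectors (ashift 12) and a 256K maximum write, the
third 300000-byte reservation pushes the hand past the wrap threshold and
the fourth write lands back at offset 0, overwriting the oldest data.
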
4772 
4773 /*
4774  * Compresses an L2ARC buffer.
4775  * The data to be compressed is in in_data and its size in in_sz. This routine
4776  * tries to compress the data and, depending on the result, there are three
4777  * possible outcomes:
4778  * *) The buffer was incompressible. The function returns with B_FALSE and
4779  *    does nothing else.
4780  * *) The buffer was all-zeros, so there is no need to write it to an L2
4781  *    device. To indicate this situation, the *out_data is set to NULL,
4782  *    *out_sz is set to zero, *compress is set to ZIO_COMPRESS_EMPTY and
4783  *    the function returns B_TRUE.
4784  * *) Compression succeeded and *out_data is set to point to a buffer holding
4785  *    the compressed data, *out_sz is set to the output size, *compress is
4786  *    set to the appropriate compression algorithm and B_TRUE is returned.
4787  *    Once writing is done the buffer will be automatically freed by
4788  *    l2arc_do_free_on_write().
4789  */
4790 static boolean_t
4791 l2arc_compress_buf(void *in_data, uint64_t in_sz, void **out_data,
4792     uint64_t *out_sz, enum zio_compress *compress)
4793 {
4794         void *cdata;
4795 
4796         cdata = zio_data_buf_alloc(in_sz);
4797         *out_sz = zio_compress_data(ZIO_COMPRESS_LZ4, in_data, cdata, in_sz);
4798 
4799         if (*out_sz == 0) {
4800                 /* Zero block, indicate that there's nothing to write. */
4801                 zio_data_buf_free(cdata, in_sz);
4802                 *compress = ZIO_COMPRESS_EMPTY;
4803                 *out_data = NULL;
4804                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4805                 return (B_TRUE);
4806         } else if (*out_sz > 0 && *out_sz < in_sz) {
4807                 /*
4808                  * Compression succeeded, we'll keep the cdata around for
4809                  * writing and release it after writing.
4810                  */
4811                 l2arc_data_free_t *df;
4812 
4813                 *compress = ZIO_COMPRESS_LZ4;
4814                 *out_data = cdata;
4815 
4816                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
4817                 df->l2df_data = cdata;
4818                 df->l2df_size = *out_sz;
4819                 df->l2df_func = zio_data_buf_free;
4820                 mutex_enter(&l2arc_free_on_write_mtx);
4821                 list_insert_head(l2arc_free_on_write, df);
4822                 mutex_exit(&l2arc_free_on_write_mtx);
4823 
4824                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4825                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
4826                 return (B_TRUE);
4827         } else {
4828                 /*
4829                  * Compression failed; release the temporary buffer.
4830                  */
4831                 zio_data_buf_free(cdata, in_sz);
4832                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4833                 return (B_FALSE);
4834         }
4835 }
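
The three-way contract documented above (empty, compressed, incompressible)
is what callers branch on.  The sketch below mimics that contract in
userland with a stand-in compressor that only detects all-zero input; the
names and the toy codec are invented, and the real code goes through
zio_compress_data() as shown above:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    enum outcome { OUT_EMPTY, OUT_COMPRESSED, OUT_INCOMPRESSIBLE };

    /* Stand-in for zio_compress_data(): returns 0 for all-zero input. */
    static uint64_t
    toy_compress(const void *in, void *out, uint64_t in_sz)
    {
            const uint8_t *p = in;
            for (uint64_t i = 0; i < in_sz; i++) {
                    if (p[i] != 0) {
                            memcpy(out, in, in_sz);  /* never shrinks */
                            return (in_sz);
                    }
            }
            return (0);     /* all zeros: nothing needs to be stored */
    }

    static enum outcome
    compress_buf(const void *in, uint64_t in_sz, void **out,
        uint64_t *out_sz)
    {
            void *cdata = malloc(in_sz);

            *out_sz = toy_compress(in, cdata, in_sz);
            if (*out_sz == 0) {             /* zero block: drop temp buf */
                    free(cdata);
                    *out = NULL;
                    return (OUT_EMPTY);
            } else if (*out_sz < in_sz) {   /* caller frees *out later */
                    *out = cdata;
                    return (OUT_COMPRESSED);
            }
            free(cdata);    /* no gain: the original gets written instead */
            return (OUT_INCOMPRESSIBLE);
    }

    int
    main(void)
    {
            uint8_t zeros[64] = { 0 }, data[64] = { 1 };
            void *out;
            uint64_t out_sz;

            printf("zeros -> %d\n",
                compress_buf(zeros, sizeof (zeros), &out, &out_sz));
            printf("data  -> %d\n",
                compress_buf(data, sizeof (data), &out, &out_sz));
            return (0);
    }

The deferred free via l2arc_free_on_write in the real routine exists
precisely because the compressed buffer must outlive this function: it is
only safe to free after the asynchronous write completes.
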
4836 
4837 /*
4838  * Decompresses a zio read back from an l2arc device. On success, the
4839  * underlying zio's io_data buffer is overwritten by the uncompressed
4840  * version. On decompression error (corrupt compressed stream), the
4841  * zio->io_error value is set to signal an I/O error.
4842  *
4843  * Please note that the compressed data stream is not checksummed, so
4844  * if the underlying device is experiencing data corruption we may feed
4845  * corrupt data to the decompressor.  The decompressor must therefore
4846  * handle this situation gracefully (LZ4 does).
4847  */
4848 static void
4849 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4850 {
4851         ASSERT(L2ARC_IS_VALID_COMPRESS(c));


4879                  * original compressed data (rather than decompressing to an
4880                  * aux buffer and then copying back the uncompressed buffer,
4881                  * which is likely to be much larger).
4882                  */
4883                 uint64_t csize;
4884                 void *cdata;
4885 
4886                 csize = zio->io_size;
4887                 cdata = zio_data_buf_alloc(csize);
4888                 bcopy(zio->io_data, cdata, csize);
4889                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4890                     hdr->b_size) != 0)
4891                         zio->io_error = EIO;
4892                 zio_data_buf_free(cdata, csize);
4893         }
4894 
4895         /* Restore the expected uncompressed IO size. */
4896         zio->io_orig_size = zio->io_size = hdr->b_size;
4897 }
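
The interesting move in l2arc_decompress_zio() is decompressing in place:
the short compressed stream sits at the front of a full-size buffer, so it
is copied aside and then expanded back into the original buffer, avoiding a
second full-size allocation.  The standalone sketch below shows the same
pattern with a trivial stand-in codec (a single-run RLE, not LZ4);
everything in it is invented for illustration:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Toy codec: the compressed form is two bytes, {value, count}. */
    static int
    toy_decompress(const uint8_t *in, uint8_t *out, uint64_t out_sz)
    {
            if (in[1] != out_sz)
                    return (-1);    /* corrupt stream: length mismatch */
            memset(out, in[0], out_sz);
            return (0);
    }

    int
    main(void)
    {
            uint64_t csize = 2, usize = 16;
            uint8_t *buf = malloc(usize);   /* full-size I/O buffer */

            buf[0] = 0xab;                  /* compressed stream sits */
            buf[1] = usize;                 /* at the buffer's front */

            uint8_t *cdata = malloc(csize); /* copy the stream aside... */
            memcpy(cdata, buf, csize);
            if (toy_decompress(cdata, buf, usize) != 0)  /* ...expand */
                    fprintf(stderr, "EIO: corrupt stream\n");
            free(cdata);

            printf("buf[15] = 0x%02x\n", buf[15]);  /* prints 0xab */
            free(buf);
            return (0);
    }

As in the kernel routine, a corrupt stream surfaces as an I/O error rather
than a crash, which is why the decompressor itself must tolerate garbage
input.
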
4898 
4899 /*
4900  * This thread feeds the L2ARC at regular intervals.  This is the beating
4901  * heart of the L2ARC.
4902  */
4903 static void
4904 l2arc_feed_thread(void)
4905 {
4906         callb_cpr_t cpr;
4907         l2arc_dev_t *dev;
4908         spa_t *spa;
4909         uint64_t size, wrote;
4910         clock_t begin, next = ddi_get_lbolt();
4911         boolean_t headroom_boost = B_FALSE;
4912 
4913         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4914 
4915         mutex_enter(&l2arc_feed_thr_lock);
4916 
4917         while (l2arc_thread_exit == 0) {
4918                 CALLB_CPR_SAFE_BEGIN(&cpr);