6214 zpools going south


 727 
 728         /* updated atomically */
 729         clock_t                 b_arc_access;
 730 
 731         /* self protecting */
 732         refcount_t              b_refcnt;
 733 
 734         arc_callback_t          *b_acb;
 735         /* temporary buffer holder for in-flight compressed data */
 736         void                    *b_tmp_cdata;
 737 } l1arc_buf_hdr_t;
 738 
 739 typedef struct l2arc_dev l2arc_dev_t;
 740 
 741 typedef struct l2arc_buf_hdr {
 742         /* protected by arc_buf_hdr mutex */
 743         l2arc_dev_t             *b_dev;         /* L2ARC device */
 744         uint64_t                b_daddr;        /* disk address, offset byte */
 745         /* actual allocated buffer size, which depends on b_compress */
 746         int32_t                 b_asize;
 747 
 748         list_node_t             b_l2node;
 749 } l2arc_buf_hdr_t;
 750 
 751 struct arc_buf_hdr {
 752         /* protected by hash lock */
 753         dva_t                   b_dva;
 754         uint64_t                b_birth;
 755         /*
 756          * Even though this checksum is only set/verified when a buffer is in
 757          * the L1 cache, it needs to be in the set of common fields because it
 758          * must be preserved from the time before a buffer is written out to
 759          * L2ARC until after it is read back in.
 760          */
 761         zio_cksum_t             *b_freeze_cksum;
 762 
 763         arc_buf_hdr_t           *b_hash_next;
 764         arc_flags_t             b_flags;
 765 
 766         /* immutable */


 786 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 787 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
 788 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
 789 
 790 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 791 #define HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 792 #define HDR_L2_READING(hdr)     \
 793             (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&       \
 794             ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 795 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 796 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 797 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 798 
 799 #define HDR_ISTYPE_METADATA(hdr)        \
 800             ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 801 #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 802 
 803 #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 804 #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 805 
 806 /* For storing compression mode in b_flags */
 807 #define HDR_COMPRESS_OFFSET     24
 808 #define HDR_COMPRESS_NBITS      7
 809 
 810 #define HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET(hdr->b_flags, \
 811             HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
 812 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
 813             HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
 814 
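These two macros are central to the change under review: the compression mode is packed into seven bits of b_flags starting at bit 24, which means the L2ARC write path must rewrite the shared flags word just to record a compression result. The revised version later in this review drops this packing in favor of a dedicated b_compress byte in the L2ARC header. As a minimal user-land sketch of the packing semantics (simplified reimplementations for illustration; the real BF32_* macros also assert that the value fits):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins for the BF32_GET/BF32_SET bitfield macros. */
    #define BF32_GET(x, low, len)   (((x) >> (low)) & ((1U << (len)) - 1))
    #define BF32_SET(x, low, len, val) \
            ((x) = ((x) & ~(((1U << (len)) - 1) << (low))) | \
            ((uint32_t)(val) << (low)))

    int
    main(void)
    {
            uint32_t b_flags = 0x3;         /* two unrelated flag bits set */

            /* Pack a compression mode into 7 bits at offset 24. */
            BF32_SET(b_flags, 24, 7, 15);   /* 15 == ZIO_COMPRESS_LZ4 */
            assert(BF32_GET(b_flags, 24, 7) == 15);
            assert((b_flags & 0x3) == 0x3); /* flag bits are untouched */

            (void) printf("b_flags = 0x%08x\n", b_flags);
            return (0);
    }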
 815 /*
 816  * Other sizes
 817  */
 818 
 819 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 820 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 821 
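HDR_L2ONLY_SIZE uses offsetof so that a header resident only in the L2ARC can be allocated without the trailing l1arc_buf_hdr_t: everything up to b_l1hdr is the common (plus L2) portion, and the L1 fields exist only when the full size was allocated (cf. HDR_HAS_L1HDR above). A user-land sketch of this truncated-allocation trick, with made-up field names:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy header: common fields first, the large L1-only part last. */
    typedef struct hdr {
            unsigned long   h_dva;          /* common */
            unsigned int    h_flags;        /* common */
            struct {
                    char    h_payload[256]; /* only for L1-resident headers */
            } h_l1;
    } hdr_t;

    #define HDR_FULL_SIZE   sizeof (hdr_t)
    #define HDR_L2ONLY_SIZE offsetof(hdr_t, h_l1)

    int
    main(void)
    {
            /* An L2-only header omits the L1 tail entirely. */
            hdr_t *l2only = calloc(1, HDR_L2ONLY_SIZE);

            (void) printf("full %zu vs l2only %zu bytes\n",
                (size_t)HDR_FULL_SIZE, (size_t)HDR_L2ONLY_SIZE);
            /* Only the common fields may be touched on an L2-only header. */
            l2only->h_flags = 1;
            free(l2only);
            return (0);
    }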
 822 /*
 823  * Hash table routines
 824  */
 825 
 826 #define HT_LOCK_PAD     64
 827 
 828 struct ht_lock {
 829         kmutex_t        ht_lock;
 830 #ifdef _KERNEL
 831         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 832 #endif
 833 };
 834 
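The pad[] member is a false-sharing guard: each hash-chain mutex is padded out to HT_LOCK_PAD bytes (one presumed 64-byte cache line) so that CPUs contending on adjacent locks in the hash table don't bounce a single cache line between them. A small user-land sketch of the same idiom, assuming sizeof (pthread_mutex_t) is under 64 bytes on the build machine:

    #include <pthread.h>
    #include <stdio.h>

    #define HT_LOCK_PAD     64      /* assumed cache-line size, as above */

    /* Pad each mutex to a full cache line so neighbors never share one. */
    struct ht_lock {
            pthread_mutex_t ht_lock;
            unsigned char   pad[HT_LOCK_PAD - sizeof (pthread_mutex_t)];
    };

    int
    main(void)
    {
            /* Expect exactly one cache line per lock. */
            (void) printf("sizeof (struct ht_lock) = %zu\n",
                sizeof (struct ht_lock));
            return (0);
    }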


2025                 return;
2026 
2027         /*
2028          * The header isn't being written to the l2arc device, thus it
2029          * shouldn't have a b_tmp_cdata to free.
2030          */
2031         if (!HDR_L2_WRITING(hdr)) {
2032                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2033                 return;
2034         }
2035 
2036         /*
 2037          * The header does not have compression enabled. This can be
 2038          * because the buffer wasn't compressible, or because we're
 2039          * freeing the buffer before the second phase of
 2040          * l2arc_write_buffers() has started (which does the compression
2041          * step). In either case, b_tmp_cdata does not point to a
2042          * separately compressed buffer, so there's nothing to free (it
2043          * points to the same buffer as the arc_buf_t's b_data field).
2044          */
2045         if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
2046                 hdr->b_l1hdr.b_tmp_cdata = NULL;
2047                 return;
2048         }
2049 
2050         /*
 2051          * There's nothing to free since the buffer was all zeros and
 2052          * compressed to a zero-length buffer.
2053          */
2054         if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
2055                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2056                 return;
2057         }
2058 
2059         ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
2060 
2061         arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
2062             hdr->b_size, zio_data_buf_free);
2063 
2064         ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2065         hdr->b_l1hdr.b_tmp_cdata = NULL;
2066 }
2067 
2068 /*
 2069  * Free up buf->b_data and, if 'remove' is set, pull the
 2070  * arc_buf_t off the arc_buf_hdr_t's list and free it.
2071  */
2072 static void
2073 arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
2074 {
2075         arc_buf_t **bufp;
2076 
2077         /* free up data associated with the buf */
2078         if (buf->b_data != NULL) {
2079                 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;


4152                         ASSERT0(hdr->b_l1hdr.b_datacnt);
4153                         hdr->b_l1hdr.b_datacnt = 1;
4154                         arc_get_data_buf(buf);
4155                         arc_access(hdr, hash_lock);
4156                 }
4157 
4158                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4159 
4160                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4161                 acb->acb_done = done;
4162                 acb->acb_private = private;
4163 
4164                 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4165                 hdr->b_l1hdr.b_acb = acb;
4166                 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4167 
4168                 if (HDR_HAS_L2HDR(hdr) &&
4169                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4170                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4171                         addr = hdr->b_l2hdr.b_daddr;
4172                         b_compress = HDR_GET_COMPRESS(hdr);
4173                         b_asize = hdr->b_l2hdr.b_asize;
4174                         /*
4175                          * Lock out device removal.
4176                          */
4177                         if (vdev_is_dead(vd) ||
4178                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4179                                 vd = NULL;
4180                 }
4181 
4182                 if (hash_lock != NULL)
4183                         mutex_exit(hash_lock);
4184 
4185                 /*
4186                  * At this point, we have a level 1 cache miss.  Try again in
4187                  * L2ARC if possible.
4188                  */
4189                 ASSERT3U(hdr->b_size, ==, size);
4190                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4191                     uint64_t, size, zbookmark_phys_t *, zb);
4192                 ARCSTAT_BUMP(arcstat_misses);


5613         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5614 
5615         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5616 
5617         cb = zio->io_private;
5618         ASSERT(cb != NULL);
5619         buf = cb->l2rcb_buf;
5620         ASSERT(buf != NULL);
5621 
5622         hash_lock = HDR_LOCK(buf->b_hdr);
5623         mutex_enter(hash_lock);
5624         hdr = buf->b_hdr;
5625         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5626 
5627         /*
5628          * If the buffer was compressed, decompress it first.
5629          */
5630         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5631                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5632         ASSERT(zio->io_data != NULL);
5633 
5634         /*
5635          * Check this survived the L2ARC journey.
5636          */
5637         equal = arc_cksum_equal(buf);
5638         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5639                 mutex_exit(hash_lock);
5640                 zio->io_private = buf;
5641                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
5642                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
5643                 arc_read_done(zio);
5644         } else {
5645                 mutex_exit(hash_lock);
5646                 /*
5647                  * Buffer didn't survive caching.  Increment stats and
5648                  * reissue to the original storage device.
5649                  */
5650                 if (zio->io_error != 0) {
5651                         ARCSTAT_BUMP(arcstat_l2_io_error);
5652                 } else {
5653                         zio->io_error = SET_ERROR(EIO);
5654                 }
5655                 if (!equal)
5656                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5657 
5658                 /*
5659                  * If there's no waiter, issue an async i/o to the primary
5660                  * storage now.  If there *is* a waiter, the caller must
5661                  * issue the i/o in a context where it's OK to block.
5662                  */
5663                 if (zio->io_waiter == NULL) {
5664                         zio_t *pio = zio_unique_parent(zio);
5665 
5666                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5667 
5668                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5669                             buf->b_data, zio->io_size, arc_read_done, buf,
5670                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5671                 }
5672         }
5673 
5674         kmem_free(cb, sizeof (l2arc_read_callback_t));
5675 }
5676 
5677 /*
5678  * This is the list priority from which the L2ARC will search for pages to
5679  * cache.  This is used within loops (0..3) to cycle through lists in the
5680  * desired order.  This order can have a significant effect on cache
5681  * performance.
5682  *
5683  * Currently the metadata lists are hit first, MFU then MRU, followed by
5684  * the data lists.  This function returns a locked list, and also returns
5685  * the lock pointer.
5686  */
5687 static multilist_sublist_t *
5688 l2arc_sublist_lock(int list_num)
5689 {


5947                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5948                                 cb->l2wcb_dev = dev;
5949                                 cb->l2wcb_head = head;
5950                                 pio = zio_root(spa, l2arc_write_done, cb,
5951                                     ZIO_FLAG_CANFAIL);
5952                         }
5953 
5954                         /*
5955                          * Create and add a new L2ARC header.
5956                          */
5957                         hdr->b_l2hdr.b_dev = dev;
5958                         hdr->b_flags |= ARC_FLAG_L2_WRITING;
5959                         /*
5960                          * Temporarily stash the data buffer in b_tmp_cdata.
5961                          * The subsequent write step will pick it up from
 5962                          * there. This is because we can't access b_l1hdr.b_buf
 5963                          * without holding the hash_lock, which in turn we
 5964                          * can't acquire without holding the ARC list locks
5965                          * (which we want to avoid during compression/writing).
5966                          */
5967                         HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5968                         hdr->b_l2hdr.b_asize = hdr->b_size;
5969                         hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5970 
5971                         /*
5972                          * Explicitly set the b_daddr field to a known
5973                          * value which means "invalid address". This
5974                          * enables us to differentiate which stage of
5975                          * l2arc_write_buffers() the particular header
 5976                          * is in (i.e. this loop, or the one below).
5977                          * ARC_FLAG_L2_WRITING is not enough to make
5978                          * this distinction, and we need to know in
5979                          * order to do proper l2arc vdev accounting in
5980                          * arc_release() and arc_hdr_destroy().
5981                          *
5982                          * Note, we can't use a new flag to distinguish
5983                          * the two stages because we don't hold the
5984                          * header's hash_lock below, in the second stage
5985                          * of this function. Thus, we can't simply
5986                          * change the b_flags field to denote that the
5987                          * IO has been sent. We can change the b_daddr


 6137  *    device. To indicate this situation, b_tmp_cdata is NULLed, b_asize is
 6138  *    set to zero, and b_compress is set to ZIO_COMPRESS_EMPTY.
6139  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6140  *    data buffer which holds the compressed data to be written, and b_asize
6141  *    tells us how much data there is. b_compress is set to the appropriate
6142  *    compression algorithm. Once writing is done, invoke
6143  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6144  *
6145  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6146  * buffer was incompressible).
6147  */
6148 static boolean_t
6149 l2arc_compress_buf(arc_buf_hdr_t *hdr)
6150 {
6151         void *cdata;
6152         size_t csize, len, rounded;
6153         ASSERT(HDR_HAS_L2HDR(hdr));
6154         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6155 
6156         ASSERT(HDR_HAS_L1HDR(hdr));
6157         ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6158         ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6159 
6160         len = l2hdr->b_asize;
6161         cdata = zio_data_buf_alloc(len);
6162         ASSERT3P(cdata, !=, NULL);
6163         csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6164             cdata, l2hdr->b_asize);
6165 
6166         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6167         if (rounded > csize) {
6168                 bzero((char *)cdata + csize, rounded - csize);
6169                 csize = rounded;
6170         }
6171 
6172         if (csize == 0) {
6173                 /* zero block, indicate that there's nothing to write */
6174                 zio_data_buf_free(cdata, len);
6175                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
6176                 l2hdr->b_asize = 0;
6177                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6178                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6179                 return (B_TRUE);
6180         } else if (csize > 0 && csize < len) {
6181                 /*
6182                  * Compression succeeded, we'll keep the cdata around for
6183                  * writing and release it afterwards.
6184                  */
6185                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
6186                 l2hdr->b_asize = csize;
6187                 hdr->b_l1hdr.b_tmp_cdata = cdata;
6188                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6189                 return (B_TRUE);
6190         } else {
6191                 /*
6192                  * Compression failed, release the compressed buffer.
6193                  * l2hdr will be left unmodified.
6194                  */
6195                 zio_data_buf_free(cdata, len);
6196                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6197                 return (B_FALSE);
6198         }
6199 }
6200 
6201 /*
6202  * Decompresses a zio read back from an l2arc device. On success, the
6203  * underlying zio's io_data buffer is overwritten by the uncompressed
6204  * version. On decompression error (corrupt compressed stream), the
6205  * zio->io_error value is set to signal an I/O error.


6252                 bcopy(zio->io_data, cdata, csize);
6253                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6254                     hdr->b_size) != 0)
6255                         zio->io_error = EIO;
6256                 zio_data_buf_free(cdata, csize);
6257         }
6258 
6259         /* Restore the expected uncompressed IO size. */
6260         zio->io_orig_size = zio->io_size = hdr->b_size;
6261 }
6262 
6263 /*
6264  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6265  * This buffer serves as a temporary holder of compressed data while
6266  * the buffer entry is being written to an l2arc device. Once that is
6267  * done, we can dispose of it.
6268  */
6269 static void
6270 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6271 {
6272         enum zio_compress comp = HDR_GET_COMPRESS(hdr);
6273 
6274         ASSERT(HDR_HAS_L1HDR(hdr));
6275         ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6276 
6277         if (comp == ZIO_COMPRESS_OFF) {
6278                 /*
6279                  * In this case, b_tmp_cdata points to the same buffer
6280                  * as the arc_buf_t's b_data field. We don't want to
6281                  * free it, since the arc_buf_t will handle that.
6282                  */
6283                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6284         } else if (comp == ZIO_COMPRESS_EMPTY) {
6285                 /*
6286                  * In this case, b_tmp_cdata was compressed to an empty
6287                  * buffer, thus there's nothing to free and b_tmp_cdata
6288                  * should have been set to NULL in l2arc_write_buffers().
6289                  */
6290                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6291         } else {
6292                 /*


[Revised version: the same regions after the fix, with the compression mode moved out of b_flags into a dedicated b_compress field in the L2ARC header.]

 727 
 728         /* updated atomically */
 729         clock_t                 b_arc_access;
 730 
 731         /* self protecting */
 732         refcount_t              b_refcnt;
 733 
 734         arc_callback_t          *b_acb;
 735         /* temporary buffer holder for in-flight compressed data */
 736         void                    *b_tmp_cdata;
 737 } l1arc_buf_hdr_t;
 738 
 739 typedef struct l2arc_dev l2arc_dev_t;
 740 
 741 typedef struct l2arc_buf_hdr {
 742         /* protected by arc_buf_hdr mutex */
 743         l2arc_dev_t             *b_dev;         /* L2ARC device */
 744         uint64_t                b_daddr;        /* disk address, offset byte */
 745         /* actual allocated buffer size, which depends on b_compress */
 746         int32_t                 b_asize;
 747         uint8_t                 b_compress;
 748 
 749         list_node_t             b_l2node;
 750 } l2arc_buf_hdr_t;
 751 
 752 struct arc_buf_hdr {
 753         /* protected by hash lock */
 754         dva_t                   b_dva;
 755         uint64_t                b_birth;
 756         /*
 757          * Even though this checksum is only set/verified when a buffer is in
 758          * the L1 cache, it needs to be in the set of common fields because it
 759          * must be preserved from the time before a buffer is written out to
 760          * L2ARC until after it is read back in.
 761          */
 762         zio_cksum_t             *b_freeze_cksum;
 763 
 764         arc_buf_hdr_t           *b_hash_next;
 765         arc_flags_t             b_flags;
 766 
 767         /* immutable */


 787 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 788 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
 789 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
 790 
 791 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 792 #define HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 793 #define HDR_L2_READING(hdr)     \
 794             (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&       \
 795             ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 796 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 797 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 798 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 799 
 800 #define HDR_ISTYPE_METADATA(hdr)        \
 801             ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 802 #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 803 
 804 #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 805 #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 806 
 807 /*
 808  * Other sizes
 809  */
 810 
 811 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 812 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 813 
 814 /*
 815  * Hash table routines
 816  */
 817 
 818 #define HT_LOCK_PAD     64
 819 
 820 struct ht_lock {
 821         kmutex_t        ht_lock;
 822 #ifdef _KERNEL
 823         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 824 #endif
 825 };
 826 


2017                 return;
2018 
2019         /*
2020          * The header isn't being written to the l2arc device, thus it
2021          * shouldn't have a b_tmp_cdata to free.
2022          */
2023         if (!HDR_L2_WRITING(hdr)) {
2024                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2025                 return;
2026         }
2027 
2028         /*
 2029          * The header does not have compression enabled. This can be
 2030          * because the buffer wasn't compressible, or because we're
 2031          * freeing the buffer before the second phase of
 2032          * l2arc_write_buffers() has started (which does the compression
2033          * step). In either case, b_tmp_cdata does not point to a
2034          * separately compressed buffer, so there's nothing to free (it
2035          * points to the same buffer as the arc_buf_t's b_data field).
2036          */
2037         if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
2038                 hdr->b_l1hdr.b_tmp_cdata = NULL;
2039                 return;
2040         }
2041 
2042         /*
 2043          * There's nothing to free since the buffer was all zeros and
 2044          * compressed to a zero-length buffer.
2045          */
2046         if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2047                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2048                 return;
2049         }
2050 
2051         ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
2052 
2053         arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
2054             hdr->b_size, zio_data_buf_free);
2055 
2056         ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2057         hdr->b_l1hdr.b_tmp_cdata = NULL;
2058 }
2059 
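Taken together, the early returns above encode a three-way ownership rule for b_tmp_cdata: with ZIO_COMPRESS_OFF it merely aliases the arc_buf_t's b_data, with ZIO_COMPRESS_EMPTY it was never allocated, and only a real compression algorithm leaves a separately allocated buffer to free. A stand-alone model of that rule (illustrative names, not from the source):

    #include <stdbool.h>

    /* Illustrative mirror of the three compression states handled above. */
    enum cdata_state {
            CDATA_OFF,      /* ZIO_COMPRESS_OFF: b_tmp_cdata aliases b_data */
            CDATA_EMPTY,    /* ZIO_COMPRESS_EMPTY: zero block, never allocated */
            CDATA_VALID     /* real algorithm: separately allocated buffer */
    };

    /*
     * True only in the one case where arc_buf_l2_cdata_free() must call
     * arc_buf_free_on_write(); the other two states own no storage.
     */
    static bool
    cdata_needs_free(enum cdata_state s)
    {
            return (s == CDATA_VALID);
    }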
2060 /*
 2061  * Free up buf->b_data and, if 'remove' is set, pull the
 2062  * arc_buf_t off the arc_buf_hdr_t's list and free it.
2063  */
2064 static void
2065 arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
2066 {
2067         arc_buf_t **bufp;
2068 
2069         /* free up data associated with the buf */
2070         if (buf->b_data != NULL) {
2071                 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;


4144                         ASSERT0(hdr->b_l1hdr.b_datacnt);
4145                         hdr->b_l1hdr.b_datacnt = 1;
4146                         arc_get_data_buf(buf);
4147                         arc_access(hdr, hash_lock);
4148                 }
4149 
4150                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4151 
4152                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4153                 acb->acb_done = done;
4154                 acb->acb_private = private;
4155 
4156                 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4157                 hdr->b_l1hdr.b_acb = acb;
4158                 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4159 
4160                 if (HDR_HAS_L2HDR(hdr) &&
4161                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4162                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4163                         addr = hdr->b_l2hdr.b_daddr;
4164                         b_compress = hdr->b_l2hdr.b_compress;
4165                         b_asize = hdr->b_l2hdr.b_asize;
4166                         /*
4167                          * Lock out device removal.
4168                          */
4169                         if (vdev_is_dead(vd) ||
4170                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4171                                 vd = NULL;
4172                 }
4173 
4174                 if (hash_lock != NULL)
4175                         mutex_exit(hash_lock);
4176 
4177                 /*
4178                  * At this point, we have a level 1 cache miss.  Try again in
4179                  * L2ARC if possible.
4180                  */
4181                 ASSERT3U(hdr->b_size, ==, size);
4182                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4183                     uint64_t, size, zbookmark_phys_t *, zb);
4184                 ARCSTAT_BUMP(arcstat_misses);


5605         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5606 
5607         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5608 
5609         cb = zio->io_private;
5610         ASSERT(cb != NULL);
5611         buf = cb->l2rcb_buf;
5612         ASSERT(buf != NULL);
5613 
5614         hash_lock = HDR_LOCK(buf->b_hdr);
5615         mutex_enter(hash_lock);
5616         hdr = buf->b_hdr;
5617         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5618 
5619         /*
5620          * If the buffer was compressed, decompress it first.
5621          */
5622         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5623                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5624         ASSERT(zio->io_data != NULL);
5625         ASSERT3U(zio->io_size, ==, hdr->b_size);
5626         ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
5627 
5628         /*
5629          * Check this survived the L2ARC journey.
5630          */
5631         equal = arc_cksum_equal(buf);
5632         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5633                 mutex_exit(hash_lock);
5634                 zio->io_private = buf;
5635                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
5636                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
5637                 arc_read_done(zio);
5638         } else {
5639                 mutex_exit(hash_lock);
5640                 /*
5641                  * Buffer didn't survive caching.  Increment stats and
5642                  * reissue to the original storage device.
5643                  */
5644                 if (zio->io_error != 0) {
5645                         ARCSTAT_BUMP(arcstat_l2_io_error);
5646                 } else {
5647                         zio->io_error = SET_ERROR(EIO);
5648                 }
5649                 if (!equal)
5650                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5651 
5652                 /*
5653                  * If there's no waiter, issue an async i/o to the primary
5654                  * storage now.  If there *is* a waiter, the caller must
5655                  * issue the i/o in a context where it's OK to block.
5656                  */
5657                 if (zio->io_waiter == NULL) {
5658                         zio_t *pio = zio_unique_parent(zio);
5659 
5660                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5661 
5662                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5663                             buf->b_data, hdr->b_size, arc_read_done, buf,
5664                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5665                 }
5666         }
5667 
5668         kmem_free(cb, sizeof (l2arc_read_callback_t));
5669 }
5670 
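The shape of l2arc_read_done() above is what makes a cache device safe to lose: an L2ARC read that fails its checksum, returns an error, or raced with eviction is simply reissued against the authoritative copy in the primary pool. A compact model of that disposition check (hypothetical helper, not the zio API):

    #include <stdbool.h>

    enum l2_read_action {
            L2_READ_DONE,           /* verified; hand off to arc_read_done() */
            L2_READ_REISSUE         /* re-read from the primary pool copy */
    };

    /* Mirrors the condition guarding the success branch above. */
    static enum l2_read_action
    l2arc_read_disposition(bool cksum_equal, int io_error, bool l2_evicted)
    {
            if (cksum_equal && io_error == 0 && !l2_evicted)
                    return (L2_READ_DONE);
            return (L2_READ_REISSUE);
    }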
5671 /*
5672  * This is the list priority from which the L2ARC will search for pages to
5673  * cache.  This is used within loops (0..3) to cycle through lists in the
5674  * desired order.  This order can have a significant effect on cache
5675  * performance.
5676  *
5677  * Currently the metadata lists are hit first, MFU then MRU, followed by
5678  * the data lists.  This function returns a locked list, and also returns
5679  * the lock pointer.
5680  */
5681 static multilist_sublist_t *
5682 l2arc_sublist_lock(int list_num)
5683 {
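The fragment stops at the opening brace. As an illustrative reconstruction of the body the comment describes (a sketch consistent with multilist-era illumos code, not the text under review), list_num 0..3 selects MFU metadata, MRU metadata, MFU data, then MRU data, and locks a randomly chosen sublist to spread contention across feed threads:

    static multilist_sublist_t *
    l2arc_sublist_lock(int list_num)
    {
            multilist_t *ml = NULL;
            unsigned int idx;

            ASSERT(list_num >= 0 && list_num <= 3);

            switch (list_num) {
            case 0:
                    ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
                    break;
            case 1:
                    ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
                    break;
            case 2:
                    ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
                    break;
            case 3:
                    ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
                    break;
            }

            /* Randomize the starting sublist to reduce lock contention. */
            idx = multilist_get_random_index(ml);
            return (multilist_sublist_lock(ml, idx));
    }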


5941                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5942                                 cb->l2wcb_dev = dev;
5943                                 cb->l2wcb_head = head;
5944                                 pio = zio_root(spa, l2arc_write_done, cb,
5945                                     ZIO_FLAG_CANFAIL);
5946                         }
5947 
5948                         /*
5949                          * Create and add a new L2ARC header.
5950                          */
5951                         hdr->b_l2hdr.b_dev = dev;
5952                         hdr->b_flags |= ARC_FLAG_L2_WRITING;
5953                         /*
5954                          * Temporarily stash the data buffer in b_tmp_cdata.
5955                          * The subsequent write step will pick it up from
 5956                          * there. This is because we can't access b_l1hdr.b_buf
 5957                          * without holding the hash_lock, which in turn we
 5958                          * can't acquire without holding the ARC list locks
5959                          * (which we want to avoid during compression/writing).
5960                          */
5961                         hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
5962                         hdr->b_l2hdr.b_asize = hdr->b_size;
5963                         hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5964 
5965                         /*
5966                          * Explicitly set the b_daddr field to a known
5967                          * value which means "invalid address". This
5968                          * enables us to differentiate which stage of
5969                          * l2arc_write_buffers() the particular header
 5970                          * is in (i.e. this loop, or the one below).
5971                          * ARC_FLAG_L2_WRITING is not enough to make
5972                          * this distinction, and we need to know in
5973                          * order to do proper l2arc vdev accounting in
5974                          * arc_release() and arc_hdr_destroy().
5975                          *
5976                          * Note, we can't use a new flag to distinguish
5977                          * the two stages because we don't hold the
5978                          * header's hash_lock below, in the second stage
5979                          * of this function. Thus, we can't simply
5980                          * change the b_flags field to denote that the
5981                          * IO has been sent. We can change the b_daddr


 6131  *    device. To indicate this situation, b_tmp_cdata is NULLed, b_asize is
 6132  *    set to zero, and b_compress is set to ZIO_COMPRESS_EMPTY.
6133  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6134  *    data buffer which holds the compressed data to be written, and b_asize
6135  *    tells us how much data there is. b_compress is set to the appropriate
6136  *    compression algorithm. Once writing is done, invoke
6137  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6138  *
6139  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6140  * buffer was incompressible).
6141  */
6142 static boolean_t
6143 l2arc_compress_buf(arc_buf_hdr_t *hdr)
6144 {
6145         void *cdata;
6146         size_t csize, len, rounded;
6147         ASSERT(HDR_HAS_L2HDR(hdr));
6148         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6149 
6150         ASSERT(HDR_HAS_L1HDR(hdr));
6151         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
6152         ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6153 
6154         len = l2hdr->b_asize;
6155         cdata = zio_data_buf_alloc(len);
6156         ASSERT3P(cdata, !=, NULL);
6157         csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6158             cdata, l2hdr->b_asize);
6159 
6160         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6161         if (rounded > csize) {
6162                 bzero((char *)cdata + csize, rounded - csize);
6163                 csize = rounded;
6164         }
6165 
6166         if (csize == 0) {
6167                 /* zero block, indicate that there's nothing to write */
6168                 zio_data_buf_free(cdata, len);
6169                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
6170                 l2hdr->b_asize = 0;
6171                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6172                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6173                 return (B_TRUE);
6174         } else if (csize > 0 && csize < len) {
6175                 /*
6176                  * Compression succeeded, we'll keep the cdata around for
6177                  * writing and release it afterwards.
6178                  */
6179                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
6180                 l2hdr->b_asize = csize;
6181                 hdr->b_l1hdr.b_tmp_cdata = cdata;
6182                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6183                 return (B_TRUE);
6184         } else {
6185                 /*
6186                  * Compression failed, release the compressed buffer.
6187                  * l2hdr will be left unmodified.
6188                  */
6189                 zio_data_buf_free(cdata, len);
6190                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6191                 return (B_FALSE);
6192         }
6193 }
6194 
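A subtlety in l2arc_compress_buf() above: csize is rounded up to SPA_MINBLOCKSIZE with P2ROUNDUP (zero-filling the pad) before it is compared against len, so a buffer that shrinks by less than one 512-byte unit rounds back to its original size and takes the failure branch. A quick user-land check of that arithmetic, with P2ROUNDUP reimplemented as in <sys/sysmacros.h>:

    #include <stddef.h>
    #include <stdio.h>

    /* Power-of-two round-up, as defined in <sys/sysmacros.h>. */
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))
    #define SPA_MINBLOCKSIZE        512

    int
    main(void)
    {
            size_t csize;

            for (csize = 0; csize <= 1500; csize += 300)
                    (void) printf("csize %4zu -> rounded %4zu\n", csize,
                        (size_t)P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE));

            /*
             * E.g. a 1024-byte buffer whose payload compresses to 900
             * bytes rounds back up to 1024 and is treated as
             * incompressible by the csize < len test above.
             */
            return (0);
    }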
6195 /*
6196  * Decompresses a zio read back from an l2arc device. On success, the
6197  * underlying zio's io_data buffer is overwritten by the uncompressed
6198  * version. On decompression error (corrupt compressed stream), the
6199  * zio->io_error value is set to signal an I/O error.


6246                 bcopy(zio->io_data, cdata, csize);
6247                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6248                     hdr->b_size) != 0)
6249                         zio->io_error = EIO;
6250                 zio_data_buf_free(cdata, csize);
6251         }
6252 
6253         /* Restore the expected uncompressed IO size. */
6254         zio->io_orig_size = zio->io_size = hdr->b_size;
6255 }
6256 
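Because the compressed stream and its uncompressed output share zio->io_data, the code above must copy the compressed bytes out to a scratch buffer first and then inflate back over the original, restoring io_size afterwards. A stand-alone sketch of that copy-then-overwrite pattern (memcpy plus a made-up decompressor type standing in for zio_decompress_data()):

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical decompressor; stands in for zio_decompress_data(). */
    typedef int (*decompress_fn)(const void *src, void *dst,
        size_t srclen, size_t dstlen);

    /*
     * 'buf' holds csize compressed bytes on entry and dsize uncompressed
     * bytes on successful return. Returns 0 on success, -1 on failure
     * (mirroring the EIO path above).
     */
    static int
    decompress_in_place(void *buf, size_t csize, size_t dsize,
        decompress_fn fn)
    {
            void *scratch;
            int err;

            /* Output overlaps input, so stage the input elsewhere first. */
            if ((scratch = malloc(csize)) == NULL)
                    return (-1);
            (void) memcpy(scratch, buf, csize);
            err = fn(scratch, buf, csize, dsize);
            free(scratch);
            return (err == 0 ? 0 : -1);
    }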
6257 /*
6258  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6259  * This buffer serves as a temporary holder of compressed data while
6260  * the buffer entry is being written to an l2arc device. Once that is
6261  * done, we can dispose of it.
6262  */
6263 static void
6264 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6265 {
6266         ASSERT(HDR_HAS_L2HDR(hdr));
6267         enum zio_compress comp = hdr->b_l2hdr.b_compress;
6268 
6269         ASSERT(HDR_HAS_L1HDR(hdr));
6270         ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6271 
6272         if (comp == ZIO_COMPRESS_OFF) {
6273                 /*
6274                  * In this case, b_tmp_cdata points to the same buffer
6275                  * as the arc_buf_t's b_data field. We don't want to
6276                  * free it, since the arc_buf_t will handle that.
6277                  */
6278                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6279         } else if (comp == ZIO_COMPRESS_EMPTY) {
6280                 /*
6281                  * In this case, b_tmp_cdata was compressed to an empty
6282                  * buffer, thus there's nothing to free and b_tmp_cdata
6283                  * should have been set to NULL in l2arc_write_buffers().
6284                  */
6285                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6286         } else {
6287                 /*