Print this page
3995 Memory leak of compressed buffers in l2arc_write_done
3997 ZFS L2ARC default behavior should allow reading while writing

@@ -115,10 +115,17 @@
  *      - L2ARC buflist creation
  *      - L2ARC buflist eviction
  *      - L2ARC write completion, which walks L2ARC buflists
  *      - ARC header destruction, as it removes from L2ARC buflists
  *      - ARC header release, as it removes from L2ARC buflists
+ *
+ * Please note that if you first grab the l2arc_buflist_mtx, you can't do a
+ * mutex_enter on a buffer's hash_lock anymore due to lock inversion. To grab
+ * the hash_lock you must use mutex_tryenter and possibly deal with the buffer
+ * not being available (due to e.g. some other thread holding it while trying
+ * to unconditionally grab the l2arc_buflist_mtx which you are holding). The
+ * inverse situation (first grab hash_lock, then l2arc_buflist_mtx) is safe.
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zio_compress.h>

@@ -620,11 +627,11 @@
 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
+boolean_t l2arc_norw = B_FALSE;                 /* no reads during writes */
 
 /*
  * L2ARC Internals
  */
 typedef struct l2arc_dev {

@@ -670,12 +677,10 @@
         uint64_t                b_daddr;        /* disk address, offset byte */
         /* compression applied to buffer data */
         enum zio_compress       b_compress;
         /* real alloc'd buffer size depending on b_compress applied */
         int                     b_asize;
-        /* temporary buffer holder for in-flight compressed data */
-        void                    *b_tmp_cdata;
 };
 
 typedef struct l2arc_data_free {
         /* protected by l2arc_free_on_write_mtx */
         void            *l2df_data;

@@ -690,14 +695,14 @@
 
 static void l2arc_read_done(zio_t *zio);
 static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static boolean_t l2arc_compress_buf(void *in_data, uint64_t in_sz,
+    void **out_data, uint64_t *out_sz, enum zio_compress *compress);
 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
     enum zio_compress c);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
         uint8_t *vdva = (uint8_t *)dva;

@@ -4115,11 +4120,11 @@
 
 /*
  * Free buffers that were tagged for destruction.
  */
 static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
 {
         list_t *buflist;
         l2arc_data_free_t *df, *df_prev;
 
         mutex_enter(&l2arc_free_on_write_mtx);

@@ -4145,14 +4150,18 @@
 l2arc_write_done(zio_t *zio)
 {
         l2arc_write_callback_t *cb;
         l2arc_dev_t *dev;
         list_t *buflist;
-        arc_buf_hdr_t *head, *ab, *ab_prev;
-        l2arc_buf_hdr_t *abl2;
-        kmutex_t *hash_lock;
+        arc_buf_hdr_t *head, *ab;
 
+        struct defer_done_entry {
+                arc_buf_hdr_t *dde_buf;
+                list_node_t dde_node;
+        } *dde, *dde_next;
+        list_t defer_done_list;
+
         cb = zio->io_private;
         ASSERT(cb != NULL);
         dev = cb->l2wcb_dev;
         ASSERT(dev != NULL);
         head = cb->l2wcb_head;

@@ -4168,56 +4177,66 @@
         mutex_enter(&l2arc_buflist_mtx);
 
         /*
          * All writes completed, or an error was hit.
          */
-        for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
-                ab_prev = list_prev(buflist, ab);
-
-                hash_lock = HDR_LOCK(ab);
-                if (!mutex_tryenter(hash_lock)) {
+        list_create(&defer_done_list, sizeof (*dde),
+            offsetof(struct defer_done_entry, dde_node));
+        for (ab = list_prev(buflist, head); ab; ab = list_prev(buflist, ab)) {
                         /*
-                         * This buffer misses out.  It may be in a stage
-                         * of eviction.  Its ARC_L2_WRITING flag will be
-                         * left set, denying reads to this buffer.
+                 * Can't pause here to grab hash_lock while also holding
+                 * l2arc_buflist_mtx, so place the buffers on a temporary
+                 * stack-local list for later processing.
                          */
-                        ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
-                        continue;
+                dde = kmem_alloc(sizeof (*dde), KM_SLEEP);
+                dde->dde_buf = ab;
+                list_insert_tail(&defer_done_list, dde);
                 }
 
-                abl2 = ab->b_l2hdr;
+        atomic_inc_64(&l2arc_writes_done);
+        list_remove(buflist, head);
+        kmem_cache_free(hdr_cache, head);
+        mutex_exit(&l2arc_buflist_mtx);
 
                 /*
-                 * Release the temporary compressed buffer as soon as possible.
+         * Now process the buffers. We're not holding l2arc_buflist_mtx
+         * anymore, so we can do a regular mutex_enter on the hash_lock.
                  */
-                if (abl2->b_compress != ZIO_COMPRESS_OFF)
-                        l2arc_release_cdata_buf(ab);
+        for (dde = list_head(&defer_done_list); dde != NULL; dde = dde_next) {
+                kmutex_t *hash_lock;
 
+                dde_next = list_next(&defer_done_list, dde);
+                ab = dde->dde_buf;
+                hash_lock = HDR_LOCK(ab);
+
+                mutex_enter(hash_lock);
+
                 if (zio->io_error != 0) {
                         /*
                          * Error - drop L2ARC entry.
                          */
+                        l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+                        mutex_enter(&l2arc_buflist_mtx);
                         list_remove(buflist, ab);
-                        ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+                        mutex_exit(&l2arc_buflist_mtx);
+                        ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
                         ab->b_l2hdr = NULL;
-                        kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+                        kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
                 }
 
                 /*
                  * Allow ARC to begin reads to this L2ARC entry.
                  */
                 ab->b_flags &= ~ARC_L2_WRITING;
 
                 mutex_exit(hash_lock);
+
+                list_remove(&defer_done_list, dde);
         }
+        list_destroy(&defer_done_list);
 
-        atomic_inc_64(&l2arc_writes_done);
-        list_remove(buflist, head);
-        kmem_cache_free(hdr_cache, head);
-        mutex_exit(&l2arc_buflist_mtx);
-
         l2arc_do_free_on_write();
 
         kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 

@@ -4348,11 +4367,11 @@
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
         list_t *buflist;
-        l2arc_buf_hdr_t *abl2;
+        l2arc_buf_hdr_t *l2hdr;
         arc_buf_hdr_t *ab, *ab_prev;
         kmutex_t *hash_lock;
         uint64_t taddr;
 
         buflist = dev->l2ad_buflist;

@@ -4448,14 +4467,14 @@
 
                         /*
                          * Tell ARC this no longer exists in L2ARC.
                          */
                         if (ab->b_l2hdr != NULL) {
-                                abl2 = ab->b_l2hdr;
-                                ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+                                l2hdr = ab->b_l2hdr;
+                                ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
                                 ab->b_l2hdr = NULL;
-                                kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+                                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
                         }
                         list_remove(buflist, ab);
 
                         /*

@@ -4496,10 +4515,17 @@
         boolean_t full;
         l2arc_write_callback_t *cb;
         zio_t *pio, *wzio;
         uint64_t guid = spa_load_guid(spa);
         const boolean_t do_headroom_boost = *headroom_boost;
+        struct defer_write_entry {
+                arc_buf_hdr_t *dwe_buf;
+                void *dwe_orig_data;
+                uint64_t dwe_orig_size;
+                list_node_t dwe_node;
+        } *dwe, *dwe_next;
+        list_t defer_write_list;
 
         ASSERT(dev->l2ad_vdev != NULL);
 
         /* Lower the flag now, we might want to raise it again later. */
         *headroom_boost = B_FALSE;

@@ -4517,10 +4543,12 @@
         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
 
         /*
          * Copy buffers for L2ARC writing.
          */
+        list_create(&defer_write_list, sizeof (*dwe),
+            offsetof(struct defer_write_entry, dwe_node));
         mutex_enter(&l2arc_buflist_mtx);
         for (int try = 0; try <= 3; try++) {
                 uint64_t passed_sz = 0;
 
                 list = l2arc_list_locked(try, &list_lock);

@@ -4541,11 +4569,10 @@
                         headroom = (headroom * l2arc_headroom_boost) / 100;
 
                 for (; ab; ab = ab_prev) {
                         l2arc_buf_hdr_t *l2hdr;
                         kmutex_t *hash_lock;
-                        uint64_t buf_sz;
 
                         if (arc_warm == B_FALSE)
                                 ab_prev = list_next(list, ab);
                         else
                                 ab_prev = list_prev(list, ab);

@@ -4598,27 +4625,31 @@
                          * Create and add a new L2ARC header.
                          */
                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
                         l2hdr->b_dev = dev;
                         ab->b_flags |= ARC_L2_WRITING;
+                        l2hdr->b_compress = ZIO_COMPRESS_OFF;
+                        l2hdr->b_asize = ab->b_size;
 
                         /*
-                         * Temporarily stash the data buffer in b_tmp_cdata.
+                         * Temporarily stash the buffer in defer_write_entries.
                          * The subsequent write step will pick it up from
-                         * there. This is because can't access ab->b_buf
+                         * there. This is because we can't access ab->b_buf
                          * without holding the hash_lock, which we in turn
                          * can't access without holding the ARC list locks
-                         * (which we want to avoid during compression/writing).
+                         * while walking the ARC lists (we want to avoid
+                         * holding these locks during compression/writing).
                          */
-                        l2hdr->b_compress = ZIO_COMPRESS_OFF;
-                        l2hdr->b_asize = ab->b_size;
-                        l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+                        dwe = kmem_alloc(sizeof (*dwe), KM_SLEEP);
+                        dwe->dwe_buf = ab;
+                        dwe->dwe_orig_data = ab->b_buf->b_data;
+                        dwe->dwe_orig_size = ab->b_size;
 
-                        buf_sz = ab->b_size;
                         ab->b_l2hdr = l2hdr;
 
                         list_insert_head(dev->l2ad_buflist, ab);
+                        list_insert_tail(&defer_write_list, dwe);
 
                         /*
                          * Compute and store the buffer cksum before
                          * writing.  On debug the cksum is verified first.
                          */

@@ -4625,11 +4656,11 @@
                         arc_cksum_verify(ab->b_buf);
                         arc_cksum_compute(ab->b_buf, B_TRUE);
 
                         mutex_exit(hash_lock);
 
-                        write_sz += buf_sz;
+                        write_sz += dwe->dwe_orig_size;
                 }
 
                 mutex_exit(list_lock);
 
                 if (full == B_TRUE)

@@ -4639,74 +4670,81 @@
         /* No buffers selected for writing? */
         if (pio == NULL) {
                 ASSERT0(write_sz);
                 mutex_exit(&l2arc_buflist_mtx);
                 kmem_cache_free(hdr_cache, head);
+                list_destroy(&defer_write_list);
                 return (0);
         }
 
+        mutex_exit(&l2arc_buflist_mtx);
+
         /*
          * Now start writing the buffers. We're starting at the write head
          * and work backwards, retracing the course of the buffer selector
          * loop above.
          */
-        for (ab = list_prev(dev->l2ad_buflist, head); ab;
-            ab = list_prev(dev->l2ad_buflist, ab)) {
+        for (dwe = list_head(&defer_write_list); dwe != NULL; dwe = dwe_next) {
                 l2arc_buf_hdr_t *l2hdr;
                 uint64_t buf_sz;
 
+                dwe_next = list_next(&defer_write_list, dwe);
+                ab = dwe->dwe_buf;
+
                 /*
-                 * We shouldn't need to lock the buffer here, since we flagged
-                 * it as ARC_L2_WRITING in the previous step, but we must take
-                 * care to only access its L2 cache parameters. In particular,
-                 * ab->b_buf may be invalid by now due to ARC eviction.
+                 * Accessing ab->b_l2hdr without locking is safe here because
+                 * we flagged the buffer ARC_L2_WRITING in the previous step
+                 * (l2arc_buflist_mtx has already been dropped above), so no
+                 * other thread will modify the L2-specific fields while the
+                 * write is in flight. In particular ab->b_buf may be invalid
+                 * by now due to ARC eviction.
                  */
                 l2hdr = ab->b_l2hdr;
                 l2hdr->b_daddr = dev->l2ad_hand;
 
                 if ((ab->b_flags & ARC_L2COMPRESS) &&
-                    l2hdr->b_asize >= buf_compress_minsz) {
-                        if (l2arc_compress_buf(l2hdr)) {
+                    l2hdr->b_asize >= buf_compress_minsz &&
+                    l2arc_compress_buf(dwe->dwe_orig_data, dwe->dwe_orig_size,
+                    &buf_data, &buf_sz, &l2hdr->b_compress)) {
                                 /*
                                  * If compression succeeded, enable headroom
                                  * boost on the next scan cycle.
                                  */
                                 *headroom_boost = B_TRUE;
+                        l2hdr->b_asize = buf_sz;
+                } else {
+                        buf_data = dwe->dwe_orig_data;
+                        buf_sz = dwe->dwe_orig_size;
+                        l2hdr->b_asize = dwe->dwe_orig_size;
                         }
-                }
 
-                /*
-                 * Pick up the buffer data we had previously stashed away
-                 * (and now potentially also compressed).
-                 */
-                buf_data = l2hdr->b_tmp_cdata;
-                buf_sz = l2hdr->b_asize;
-
                 /* Compression may have squashed the buffer to zero length. */
                 if (buf_sz != 0) {
                         uint64_t buf_p_sz;
 
                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
-                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
-                            NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
-                            ZIO_FLAG_CANFAIL, B_FALSE);
+                            dev->l2ad_hand, l2hdr->b_asize, buf_data,
+                            ZIO_CHECKSUM_OFF, NULL, NULL,
+                            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+                            B_FALSE);
 
                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
                             zio_t *, wzio);
                         (void) zio_nowait(wzio);
 
-                        write_asize += buf_sz;
+                        write_asize += l2hdr->b_asize;
                         /*
                          * Keep the clock hand suitably device-aligned.
                          */
                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
                         write_psize += buf_p_sz;
                         dev->l2ad_hand += buf_p_sz;
                 }
+
+                list_remove(&defer_write_list, dwe);
+                kmem_free(dwe, sizeof (*dwe));
         }
 
-        mutex_exit(&l2arc_buflist_mtx);
+        list_destroy(&defer_write_list);
 
         ASSERT3U(write_asize, <=, target_sz);
         ARCSTAT_BUMP(arcstat_l2_writes_sent);
         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
         ARCSTAT_INCR(arcstat_l2_size, write_sz);

@@ -4732,65 +4770,67 @@
         return (write_asize);
 }
 
 /*
  * Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
- * size in l2hdr->b_asize. This routine tries to compress the data and
- * depending on the compression result there are three possible outcomes:
- * *) The buffer was incompressible. The original l2hdr contents were left
- *    untouched and are ready for writing to an L2 device.
+ * The data to be compressed is in in_data and its size in in_sz. This routine
+ * tries to compress the data and depending on the compression result there
+ * are three possible outcomes:
+ * *) The buffer was incompressible. The function returns with B_FALSE and
+ *    does nothing else.
  * *) The buffer was all-zeros, so there is no need to write it to an L2
- *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
- *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
- * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
- *    data buffer which holds the compressed data to be written, and b_asize
- *    tells us how much data there is. b_compress is set to the appropriate
- *    compression algorithm. Once writing is done, invoke
- *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
- *
- * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
- * buffer was incompressible).
+ *    device. To indicate this situation, the *out_data is set to NULL,
+ *    *out_sz is set to zero, *compress is set to ZIO_COMPRESS_EMPTY and
+ *    the function returns B_TRUE.
+ * *) Compression succeeded and *out_data was set to point to a buffer holding
+ *    the compressed data buffer, *out_sz was set to indicate the output size,
+ *    *compress was set to the appropriate compression algorithm and B_TRUE is
+ *    returned. Once writing is done the buffer will be automatically freed by
+ *    l2arc_do_free_on_write().
  */
 static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+l2arc_compress_buf(void *in_data, uint64_t in_sz, void **out_data,
+    uint64_t *out_sz, enum zio_compress *compress)
 {
         void *cdata;
-        size_t csize, len;
 
-        ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
-        ASSERT(l2hdr->b_tmp_cdata != NULL);
+        cdata = zio_data_buf_alloc(in_sz);
+        *out_sz = zio_compress_data(ZIO_COMPRESS_LZ4, in_data, cdata, in_sz);
 
-        len = l2hdr->b_asize;
-        cdata = zio_data_buf_alloc(len);
-        csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
-            cdata, l2hdr->b_asize);
-
-        if (csize == 0) {
-                /* zero block, indicate that there's nothing to write */
-                zio_data_buf_free(cdata, len);
-                l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
-                l2hdr->b_asize = 0;
-                l2hdr->b_tmp_cdata = NULL;
+        if (*out_sz == 0) {
+                /* Zero block, indicate that there's nothing to write. */
+                zio_data_buf_free(cdata, in_sz);
+                *compress = ZIO_COMPRESS_EMPTY;
+                *out_data = NULL;
                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
                 return (B_TRUE);
-        } else if (csize > 0 && csize < len) {
+        } else if (*out_sz > 0 && *out_sz < in_sz) {
                 /*
                  * Compression succeeded, we'll keep the cdata around for
-                 * writing and release it afterwards.
+                 * writing and release it after writing.
                  */
-                l2hdr->b_compress = ZIO_COMPRESS_LZ4;
-                l2hdr->b_asize = csize;
-                l2hdr->b_tmp_cdata = cdata;
+                l2arc_data_free_t *df;
+
+                *compress = ZIO_COMPRESS_LZ4;
+                *out_data = cdata;
+
+                df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+                df->l2df_data = cdata;
+                df->l2df_size = *out_sz;
+                df->l2df_func = zio_data_buf_free;
+                mutex_enter(&l2arc_free_on_write_mtx);
+                list_insert_head(l2arc_free_on_write, df);
+                mutex_exit(&l2arc_free_on_write_mtx);
+
                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
+                ARCSTAT_BUMP(arcstat_l2_free_on_write);
                 return (B_TRUE);
         } else {
                 /*
                  * Compression failed, release the compressed buffer.
-                 * l2hdr will be left unmodified.
                  */
-                zio_data_buf_free(cdata, len);
+                zio_data_buf_free(cdata, in_sz);
                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
                 return (B_FALSE);
         }
 }
 

@@ -4854,32 +4894,10 @@
 
         /* Restore the expected uncompressed IO size. */
         zio->io_orig_size = zio->io_size = hdr->b_size;
 }
 
-/*
- * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
- * This buffer serves as a temporary holder of compressed data while
- * the buffer entry is being written to an l2arc device. Once that is
- * done, we can dispose of it.
- */
-static void
-l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
-{
-        l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
-
-        if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
-                /*
-                 * If the data was compressed, then we've allocated a
-                 * temporary buffer for it, so now we need to release it.
-                 */
-                ASSERT(l2hdr->b_tmp_cdata != NULL);
-                zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
-        }
-        l2hdr->b_tmp_cdata = NULL;
-}
-
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  */
 static void