3995 Memory leak of compressed buffers in l2arc_write_done
3997 ZFS L2ARC default behavior should allow reading while writing
@@ -115,10 +115,17 @@
* - L2ARC buflist creation
* - L2ARC buflist eviction
* - L2ARC write completion, which walks L2ARC buflists
* - ARC header destruction, as it removes from L2ARC buflists
* - ARC header release, as it removes from L2ARC buflists
+ *
+ * Please note that if you first grab the l2arc_buflist_mtx, you can't do a
+ * mutex_enter on a buffer's hash_lock anymore due to lock inversion. To grab
+ * the hash_lock you must use mutex_tryenter and possibly deal with the buffer
+ * not being available (due to e.g. some other thread holding it while trying
+ * to unconditionally grab the l2arc_buflist_mtx which you are holding). The
+ * inverse situation (first grab hash_lock, then l2arc_buflist_mtx) is safe.
*/
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
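The lock-ordering rule described in the comment above reduces to two legal patterns. Here is a minimal sketch using the illumos mutex_enter(9F)/mutex_tryenter(9F) primitives; the surrounding loop and error handling are elided, and hash_lock stands for a buffer's HDR_LOCK:

	/* Safe ordering: take the hash_lock first, then l2arc_buflist_mtx. */
	mutex_enter(hash_lock);
	mutex_enter(&l2arc_buflist_mtx);
	/* ... walk or modify the L2ARC buflists ... */
	mutex_exit(&l2arc_buflist_mtx);
	mutex_exit(hash_lock);

	/* While holding l2arc_buflist_mtx, only ever try-enter a hash_lock. */
	mutex_enter(&l2arc_buflist_mtx);
	if (mutex_tryenter(hash_lock)) {
		/* ... use the buffer ... */
		mutex_exit(hash_lock);
	} else {
		/*
		 * The holder of hash_lock may itself be blocked waiting
		 * for l2arc_buflist_mtx, which we hold, so a blocking
		 * mutex_enter() here could deadlock; skip or defer the
		 * buffer instead.
		 */
	}
	mutex_exit(&l2arc_buflist_mtx);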
@@ -620,11 +627,11 @@
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+boolean_t l2arc_norw = B_FALSE; /* no reads during writes */
/*
* L2ARC Internals
*/
typedef struct l2arc_dev {
@@ -670,12 +677,10 @@
uint64_t b_daddr; /* disk address, offset byte */
/* compression applied to buffer data */
enum zio_compress b_compress;
/* real alloc'd buffer size depending on b_compress applied */
int b_asize;
- /* temporary buffer holder for in-flight compressed data */
- void *b_tmp_cdata;
};
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
@@ -690,14 +695,14 @@
static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static boolean_t l2arc_compress_buf(void *in_data, uint64_t in_sz,
+ void **out_data, uint64_t *out_sz, enum zio_compress *compress);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
@@ -4115,11 +4120,11 @@
/*
* Free buffers that were tagged for destruction.
*/
static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
{
list_t *buflist;
l2arc_data_free_t *df, *df_prev;
mutex_enter(&l2arc_free_on_write_mtx);
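A note on the () to (void) change above: it is not purely cosmetic. In C (prior to C23), an empty parameter list in a function definition is an old-style declarator that leaves the parameters unspecified, while (void) declares a genuine zero-argument prototype, letting the compiler reject bogus calls:

	static void f();	/* old-style: parameters unspecified */
	static void g(void);	/* prototype: takes no arguments */

	f(42);	/* accepted by the compiler, undefined behavior */
	g(42);	/* compile-time error: too many arguments */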
@@ -4145,14 +4150,18 @@
l2arc_write_done(zio_t *zio)
{
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
- arc_buf_hdr_t *head, *ab, *ab_prev;
- l2arc_buf_hdr_t *abl2;
- kmutex_t *hash_lock;
+ arc_buf_hdr_t *head, *ab;
+ struct defer_done_entry {
+ arc_buf_hdr_t *dde_buf;
+ list_node_t dde_node;
+ } *dde, *dde_next;
+ list_t defer_done_list;
+
cb = zio->io_private;
ASSERT(cb != NULL);
dev = cb->l2wcb_dev;
ASSERT(dev != NULL);
head = cb->l2wcb_head;
@@ -4168,56 +4177,66 @@
mutex_enter(&l2arc_buflist_mtx);
/*
* All writes completed, or an error was hit.
*/
- for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
- ab_prev = list_prev(buflist, ab);
-
- hash_lock = HDR_LOCK(ab);
- if (!mutex_tryenter(hash_lock)) {
+ list_create(&defer_done_list, sizeof (*dde),
+ offsetof(struct defer_done_entry, dde_node));
+ for (ab = list_prev(buflist, head); ab; ab = list_prev(buflist, ab)) {
/*
- * This buffer misses out. It may be in a stage
- * of eviction. Its ARC_L2_WRITING flag will be
- * left set, denying reads to this buffer.
+ * Can't pause here to grab hash_lock while also holding
+ * l2arc_buflist_mtx, so place the buffers on a temporary
+ * thread-local list for later processing.
*/
- ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
- continue;
+ dde = kmem_alloc(sizeof (*dde), KM_SLEEP);
+ dde->dde_buf = ab;
+ list_insert_tail(&defer_done_list, dde);
}
- abl2 = ab->b_l2hdr;
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ kmem_cache_free(hdr_cache, head);
+ mutex_exit(&l2arc_buflist_mtx);
/*
- * Release the temporary compressed buffer as soon as possible.
+ * Now process the buffers. We're not holding l2arc_buflist_mtx
+ * anymore, so we can do a regular mutex_enter on the hash_lock.
*/
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(ab);
+ for (dde = list_head(&defer_done_list); dde != NULL; dde = dde_next) {
+ kmutex_t *hash_lock;
+ dde_next = list_next(&defer_done_list, dde);
+ ab = dde->dde_buf;
+ hash_lock = HDR_LOCK(ab);
+
+ mutex_enter(hash_lock);
+
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
+ l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+ mutex_enter(&l2arc_buflist_mtx);
list_remove(buflist, ab);
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+ mutex_exit(&l2arc_buflist_mtx);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
/*
* Allow ARC to begin reads to this L2ARC entry.
*/
ab->b_flags &= ~ARC_L2_WRITING;
mutex_exit(hash_lock);
+
+ list_remove(&defer_done_list, dde);
+ kmem_free(dde, sizeof (*dde));
}
+ list_destroy(&defer_done_list);
- atomic_inc_64(&l2arc_writes_done);
- list_remove(buflist, head);
- kmem_cache_free(hdr_cache, head);
- mutex_exit(&l2arc_buflist_mtx);
-
l2arc_do_free_on_write();
kmem_free(cb, sizeof (l2arc_write_callback_t));
}
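The restructured l2arc_write_done() above is an instance of a common deferral pattern: record the work items while the contended lock is held, then drop it so that each item's own lock can be taken with a blocking mutex_enter(). Condensed to its skeleton (per-buffer completion work elided; the list(9F) and kmem(9F) interfaces are the ones the patch already uses):

	/* Phase 1: under l2arc_buflist_mtx, only record which buffers to visit. */
	mutex_enter(&l2arc_buflist_mtx);
	for (ab = list_prev(buflist, head); ab; ab = list_prev(buflist, ab)) {
		dde = kmem_alloc(sizeof (*dde), KM_SLEEP);
		dde->dde_buf = ab;
		list_insert_tail(&defer_done_list, dde);
	}
	mutex_exit(&l2arc_buflist_mtx);

	/* Phase 2: with l2arc_buflist_mtx dropped, blocking on each
	 * hash_lock is legal again, so no buffer ever "misses out". */
	for (dde = list_head(&defer_done_list); dde != NULL; dde = dde_next) {
		dde_next = list_next(&defer_done_list, dde);
		mutex_enter(HDR_LOCK(dde->dde_buf));
		/* ... per-buffer completion work ... */
		mutex_exit(HDR_LOCK(dde->dde_buf));
		list_remove(&defer_done_list, dde);
		kmem_free(dde, sizeof (*dde));
	}

This also removes the old failure mode in which a failed mutex_tryenter() left ARC_L2_WRITING permanently set on a buffer, denying all subsequent reads from it.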
@@ -4348,11 +4367,11 @@
*/
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
- l2arc_buf_hdr_t *abl2;
+ l2arc_buf_hdr_t *l2hdr;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
uint64_t taddr;
buflist = dev->l2ad_buflist;
@@ -4448,14 +4467,14 @@
/*
* Tell ARC this no longer exists in L2ARC.
*/
if (ab->b_l2hdr != NULL) {
- abl2 = ab->b_l2hdr;
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
+ l2hdr = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
/*
@@ -4496,10 +4515,17 @@
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
const boolean_t do_headroom_boost = *headroom_boost;
+ struct defer_write_entry {
+ arc_buf_hdr_t *dwe_buf;
+ void *dwe_orig_data;
+ uint64_t dwe_orig_size;
+ list_node_t dwe_node;
+ } *dwe, *dwe_next;
+ list_t defer_write_list;
ASSERT(dev->l2ad_vdev != NULL);
/* Lower the flag now, we might want to raise it again later. */
*headroom_boost = B_FALSE;
@@ -4517,10 +4543,12 @@
buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
/*
* Copy buffers for L2ARC writing.
*/
+ list_create(&defer_write_list, sizeof (*dwe),
+ offsetof(struct defer_write_entry, dwe_node));
mutex_enter(&l2arc_buflist_mtx);
for (int try = 0; try <= 3; try++) {
uint64_t passed_sz = 0;
list = l2arc_list_locked(try, &list_lock);
@@ -4541,11 +4569,10 @@
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; ab; ab = ab_prev) {
l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
- uint64_t buf_sz;
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
@@ -4598,27 +4625,31 @@
* Create and add a new L2ARC header.
*/
l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
l2hdr->b_dev = dev;
ab->b_flags |= ARC_L2_WRITING;
+ l2hdr->b_compress = ZIO_COMPRESS_OFF;
+ l2hdr->b_asize = ab->b_size;
/*
- * Temporarily stash the data buffer in b_tmp_cdata.
+ * Temporarily stash the buffer in defer_write_entries.
* The subsequent write step will pick it up from
- * there. This is because can't access ab->b_buf
+ * there. This is because we can't access ab->b_buf
* without holding the hash_lock, which we in turn
* can't access without holding the ARC list locks
- * (which we want to avoid during compression/writing).
+ * while walking the ARC lists (we want to avoid
+ * holding these locks during compression/writing).
*/
- l2hdr->b_compress = ZIO_COMPRESS_OFF;
- l2hdr->b_asize = ab->b_size;
- l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+ dwe = kmem_alloc(sizeof (*dwe), KM_SLEEP);
+ dwe->dwe_buf = ab;
+ dwe->dwe_orig_data = ab->b_buf->b_data;
+ dwe->dwe_orig_size = ab->b_size;
- buf_sz = ab->b_size;
ab->b_l2hdr = l2hdr;
list_insert_head(dev->l2ad_buflist, ab);
+ list_insert_tail(&defer_write_list, dwe);
/*
* Compute and store the buffer cksum before
* writing. On debug the cksum is verified first.
*/
@@ -4625,11 +4656,11 @@
arc_cksum_verify(ab->b_buf);
arc_cksum_compute(ab->b_buf, B_TRUE);
mutex_exit(hash_lock);
- write_sz += buf_sz;
+ write_sz += dwe->dwe_orig_size;
}
mutex_exit(list_lock);
if (full == B_TRUE)
@@ -4639,74 +4670,81 @@
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_sz);
mutex_exit(&l2arc_buflist_mtx);
kmem_cache_free(hdr_cache, head);
+ list_destroy(&defer_write_list);
return (0);
}
+ mutex_exit(&l2arc_buflist_mtx);
+
/*
* Now start writing the buffers. We're starting at the write head
* and work backwards, retracing the course of the buffer selector
* loop above.
*/
- for (ab = list_prev(dev->l2ad_buflist, head); ab;
- ab = list_prev(dev->l2ad_buflist, ab)) {
+ for (dwe = list_head(&defer_write_list); dwe != NULL; dwe = dwe_next) {
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_sz;
+ dwe_next = list_next(&defer_write_list, dwe);
+ ab = dwe->dwe_buf;
+
/*
- * We shouldn't need to lock the buffer here, since we flagged
- * it as ARC_L2_WRITING in the previous step, but we must take
- * care to only access its L2 cache parameters. In particular,
- * ab->b_buf may be invalid by now due to ARC eviction.
+ * Accessing ab->b_l2hdr without locking is safe here because we
+ * flagged the buffer as ARC_L2_WRITING in the previous step, so no
+ * other thread will modify its L2 fields while the write is in
+ * flight. In particular ab->b_buf may be invalid by now due to ARC
+ * eviction.
*/
l2hdr = ab->b_l2hdr;
l2hdr->b_daddr = dev->l2ad_hand;
if ((ab->b_flags & ARC_L2COMPRESS) &&
- l2hdr->b_asize >= buf_compress_minsz) {
- if (l2arc_compress_buf(l2hdr)) {
+ l2hdr->b_asize >= buf_compress_minsz &&
+ l2arc_compress_buf(dwe->dwe_orig_data, dwe->dwe_orig_size,
+ &buf_data, &buf_sz, &l2hdr->b_compress)) {
/*
* If compression succeeded, enable headroom
* boost on the next scan cycle.
*/
*headroom_boost = B_TRUE;
+ l2hdr->b_asize = buf_sz;
+ } else {
+ buf_data = dwe->dwe_orig_data;
+ buf_sz = dwe->dwe_orig_size;
+ l2hdr->b_asize = dwe->dwe_orig_size;
}
- }
- /*
- * Pick up the buffer data we had previously stashed away
- * (and now potentially also compressed).
- */
- buf_data = l2hdr->b_tmp_cdata;
- buf_sz = l2hdr->b_asize;
-
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
uint64_t buf_p_sz;
wzio = zio_write_phys(pio, dev->l2ad_vdev,
- dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
- NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_CANFAIL, B_FALSE);
+ dev->l2ad_hand, l2hdr->b_asize, buf_data,
+ ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+ B_FALSE);
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
zio_t *, wzio);
(void) zio_nowait(wzio);
- write_asize += buf_sz;
+ write_asize += l2hdr->b_asize;
/*
* Keep the clock hand suitably device-aligned.
*/
buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
write_psize += buf_p_sz;
dev->l2ad_hand += buf_p_sz;
}
+
+ list_remove(&defer_write_list, dwe);
+ kmem_free(dwe, sizeof (*dwe));
}
- mutex_exit(&l2arc_buflist_mtx);
+ list_destroy(&defer_write_list);
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
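Note that the write loop above never dereferences ab->b_buf: the selector loop captured the data pointer and size into the defer entry while the hash_lock was still held, and the write loop consumes only that copy. A condensed sketch of the handoff (buf_data and buf_sz are locals declared earlier in l2arc_write_buffers(), outside this hunk):

	/* Selector loop, hash_lock held: capture what the writer will need. */
	dwe = kmem_alloc(sizeof (*dwe), KM_SLEEP);
	dwe->dwe_buf = ab;
	dwe->dwe_orig_data = ab->b_buf->b_data;	/* valid only under hash_lock */
	dwe->dwe_orig_size = ab->b_size;
	list_insert_tail(&defer_write_list, dwe);

	/* Write loop, list locks dropped: compress and write from the copy. */
	if ((ab->b_flags & ARC_L2COMPRESS) &&
	    l2hdr->b_asize >= buf_compress_minsz &&
	    l2arc_compress_buf(dwe->dwe_orig_data, dwe->dwe_orig_size,
	    &buf_data, &buf_sz, &l2hdr->b_compress)) {
		l2hdr->b_asize = buf_sz;	/* write the compressed copy */
	} else {
		buf_data = dwe->dwe_orig_data;	/* incompressible: original */
		buf_sz = dwe->dwe_orig_size;
		l2hdr->b_asize = dwe->dwe_orig_size;
	}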
@@ -4732,65 +4770,67 @@
return (write_asize);
}
/*
* Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
- * size in l2hdr->b_asize. This routine tries to compress the data and
- * depending on the compression result there are three possible outcomes:
- * *) The buffer was incompressible. The original l2hdr contents were left
- * untouched and are ready for writing to an L2 device.
+ * The data to be compressed is in in_data and its size in in_sz. This routine
+ * tries to compress the data and depending on the compression result there
+ * are three possible outcomes:
+ * *) The buffer was incompressible. The function returns B_FALSE and
+ *    leaves the output parameters untouched.
* *) The buffer was all-zeros, so there is no need to write it to an L2
- * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
- * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
- * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
- * data buffer which holds the compressed data to be written, and b_asize
- * tells us how much data there is. b_compress is set to the appropriate
- * compression algorithm. Once writing is done, invoke
- * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
- *
- * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
- * buffer was incompressible).
+ * device. To indicate this situation, *out_data is set to NULL,
+ * *out_sz is set to zero, *compress is set to ZIO_COMPRESS_EMPTY and
+ * the function returns B_TRUE.
+ * *) Compression succeeded: *out_data is set to point to a buffer holding
+ * the compressed data, *out_sz is set to the compressed size, *compress
+ * is set to the appropriate compression algorithm and B_TRUE is
+ * returned. Once writing is done the buffer is automatically freed by
+ * l2arc_do_free_on_write().
*/
static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+l2arc_compress_buf(void *in_data, uint64_t in_sz, void **out_data,
+ uint64_t *out_sz, enum zio_compress *compress)
{
void *cdata;
- size_t csize, len;
- ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
- ASSERT(l2hdr->b_tmp_cdata != NULL);
+ cdata = zio_data_buf_alloc(in_sz);
+ *out_sz = zio_compress_data(ZIO_COMPRESS_LZ4, in_data, cdata, in_sz);
- len = l2hdr->b_asize;
- cdata = zio_data_buf_alloc(len);
- csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
- cdata, l2hdr->b_asize);
-
- if (csize == 0) {
- /* zero block, indicate that there's nothing to write */
- zio_data_buf_free(cdata, len);
- l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
- l2hdr->b_asize = 0;
- l2hdr->b_tmp_cdata = NULL;
+ if (*out_sz == 0) {
+ /* Zero block, indicate that there's nothing to write. */
+ zio_data_buf_free(cdata, in_sz);
+ *compress = ZIO_COMPRESS_EMPTY;
+ *out_data = NULL;
ARCSTAT_BUMP(arcstat_l2_compress_zeros);
return (B_TRUE);
- } else if (csize > 0 && csize < len) {
+ } else if (*out_sz > 0 && *out_sz < in_sz) {
/*
* Compression succeeded, we'll keep the cdata around for
- * writing and release it afterwards.
+ * writing and release it after writing.
*/
- l2hdr->b_compress = ZIO_COMPRESS_LZ4;
- l2hdr->b_asize = csize;
- l2hdr->b_tmp_cdata = cdata;
+ l2arc_data_free_t *df;
+
+ *compress = ZIO_COMPRESS_LZ4;
+ *out_data = cdata;
+
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_data = cdata;
+ df->l2df_size = *out_sz;
+ df->l2df_func = zio_data_buf_free;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+
ARCSTAT_BUMP(arcstat_l2_compress_successes);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
return (B_TRUE);
} else {
/*
* Compression failed, release the compressed buffer.
- * l2hdr will be left unmodified.
*/
- zio_data_buf_free(cdata, len);
+ zio_data_buf_free(cdata, in_sz);
ARCSTAT_BUMP(arcstat_l2_compress_failures);
return (B_FALSE);
}
}
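Given the new contract, a caller handles the three possible outcomes as follows (a sketch with illustrative variable names, not code from the patch):

	void *out_data;
	uint64_t out_sz;
	enum zio_compress c = ZIO_COMPRESS_OFF;

	if (l2arc_compress_buf(in_data, in_sz, &out_data, &out_sz, &c)) {
		if (c == ZIO_COMPRESS_EMPTY) {
			/* All-zero input: out_data == NULL, out_sz == 0;
			 * record the entry as empty and write nothing. */
		} else {
			/* c == ZIO_COMPRESS_LZ4: write out_sz bytes from
			 * out_data. The buffer is already queued on
			 * l2arc_free_on_write, so it is freed automatically
			 * once the write completes; do not free it here. */
		}
	} else {
		/* Incompressible: write the original in_data/in_sz and
		 * leave the compression type as ZIO_COMPRESS_OFF. */
	}

Queueing the cdata on the free-on-write list at compression time is what fixes 3995: previously the temporary buffer was freed in l2arc_write_done() via l2arc_release_cdata_buf(), and any buffer whose hash_lock could not be acquired there skipped that call and leaked its compressed copy.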
@@ -4854,32 +4894,10 @@
/* Restore the expected uncompressed IO size. */
zio->io_orig_size = zio->io_size = hdr->b_size;
}
-/*
- * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
- * This buffer serves as a temporary holder of compressed data while
- * the buffer entry is being written to an l2arc device. Once that is
- * done, we can dispose of it.
- */
-static void
-l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
-{
- l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
-
- if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
- /*
- * If the data was compressed, then we've allocated a
- * temporary buffer for it, so now we need to release it.
- */
- ASSERT(l2hdr->b_tmp_cdata != NULL);
- zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
- }
- l2hdr->b_tmp_cdata = NULL;
-}
-
/*
* This thread feeds the L2ARC at regular intervals. This is the beating
* heart of the L2ARC.
*/
static void