3995 Memory leak of compressed buffers in l2arc_write_done
3997 ZFS L2ARC default behavior should allow reading while writing

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
[... 109 lines elided ...]
 110  110   * Note that the majority of the performance stats are manipulated
 111  111   * with atomic operations.
 112  112   *
 113  113   * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 114  114   *
 115  115   *      - L2ARC buflist creation
 116  116   *      - L2ARC buflist eviction
 117  117   *      - L2ARC write completion, which walks L2ARC buflists
 118  118   *      - ARC header destruction, as it removes from L2ARC buflists
 119  119   *      - ARC header release, as it removes from L2ARC buflists
      120 + *
       121 + * Note that once you hold the l2arc_buflist_mtx, you may no longer do a
       122 + * mutex_enter on a buffer's hash_lock, as that inverts the lock order. To
       123 + * grab the hash_lock you must use mutex_tryenter and be prepared for the
       124 + * buffer to be unavailable (e.g. because another thread holds its hash_lock
       125 + * while blocking unconditionally on the l2arc_buflist_mtx you hold). The
       126 + * inverse order (first grab hash_lock, then l2arc_buflist_mtx) is safe.
 120  127   */
 121  128  
 122  129  #include <sys/spa.h>
 123  130  #include <sys/zio.h>
 124  131  #include <sys/zio_compress.h>
 125  132  #include <sys/zfs_context.h>
 126  133  #include <sys/arc.h>
 127  134  #include <sys/refcount.h>
 128  135  #include <sys/vdev.h>
 129  136  #include <sys/vdev_impl.h>
[... 485 lines elided ...]
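
The lock-ordering rule introduced in the comment block above is the crux of this change. Below is a minimal userland model of the pattern, using pthread mutexes in place of the kernel's kmutex_t; the names and the "defer" action are illustrative, not the kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t buflist_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

static void
walk_buflist(void)
{
	pthread_mutex_lock(&buflist_mtx);
	/*
	 * hash_lock -> buflist_mtx is the legal acquisition order, so with
	 * buflist_mtx already held we may only *try* the hash lock: another
	 * thread may hold it while blocked on buflist_mtx, and a blocking
	 * acquire here would deadlock.
	 */
	if (pthread_mutex_trylock(&hash_lock) == 0) {
		/* ... safe to touch the buffer ... */
		pthread_mutex_unlock(&hash_lock);
	} else {
		(void) printf("buffer busy, skip or defer it\n");
	}
	pthread_mutex_unlock(&buflist_mtx);
}

int
main(void)
{
	walk_buflist();
	return (0);
}
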
 615  622  
 616  623  /* L2ARC Performance Tunables */
 617  624  uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 618  625  uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 619  626  uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 620  627  uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 621  628  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 622  629  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 623  630  boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 624  631  boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 625      -boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
      632 +boolean_t l2arc_norw = B_FALSE;                 /* no reads during writes */
 626  633  
 627  634  /*
 628  635   * L2ARC Internals
 629  636   */
 630  637  typedef struct l2arc_dev {
 631  638          vdev_t                  *l2ad_vdev;     /* vdev */
 632  639          spa_t                   *l2ad_spa;      /* spa */
 633  640          uint64_t                l2ad_hand;      /* next write location */
 634  641          uint64_t                l2ad_start;     /* first addr on device */
 635  642          uint64_t                l2ad_end;       /* last addr on device */
[... 29 lines elided ...]
 665  672  } l2arc_write_callback_t;
 666  673  
 667  674  struct l2arc_buf_hdr {
 668  675          /* protected by arc_buf_hdr  mutex */
 669  676          l2arc_dev_t             *b_dev;         /* L2ARC device */
 670  677          uint64_t                b_daddr;        /* disk address, offset byte */
 671  678          /* compression applied to buffer data */
 672  679          enum zio_compress       b_compress;
 673  680          /* real alloc'd buffer size depending on b_compress applied */
 674  681          int                     b_asize;
 675      -        /* temporary buffer holder for in-flight compressed data */
 676      -        void                    *b_tmp_cdata;
 677  682  };
 678  683  
 679  684  typedef struct l2arc_data_free {
 680  685          /* protected by l2arc_free_on_write_mtx */
 681  686          void            *l2df_data;
 682  687          size_t          l2df_size;
 683  688          void            (*l2df_func)(void *, size_t);
 684  689          list_node_t     l2df_list_node;
 685  690  } l2arc_data_free_t;
 686  691  
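
For orientation, l2arc_data_free_t above implements a deferred-free queue: a buffer that may still be under I/O is queued together with its release function, and only freed later from a safe context by l2arc_do_free_on_write(). A hedged userland sketch of that enqueue/drain shape, with malloc/free and pthreads standing in for the kernel allocators and a bare linked list for list_t:

#include <stdlib.h>
#include <pthread.h>

typedef struct data_free {
	void	*df_data;			/* buffer to release */
	size_t	df_size;			/* its size */
	void	(*df_func)(void *, size_t);	/* release function */
	struct data_free *df_next;
} data_free_t;

static pthread_mutex_t free_on_write_mtx = PTHREAD_MUTEX_INITIALIZER;
static data_free_t *free_on_write;		/* head of the queue */

/* Queue a buffer that may still be in flight; never blocks on I/O. */
static void
defer_free(void *data, size_t size, void (*func)(void *, size_t))
{
	data_free_t *df = malloc(sizeof (*df));

	df->df_data = data;
	df->df_size = size;
	df->df_func = func;
	pthread_mutex_lock(&free_on_write_mtx);
	df->df_next = free_on_write;
	free_on_write = df;
	pthread_mutex_unlock(&free_on_write_mtx);
}

/* Drain the queue once all in-flight writes have completed. */
static void
do_free_on_write(void)
{
	pthread_mutex_lock(&free_on_write_mtx);
	while (free_on_write != NULL) {
		data_free_t *df = free_on_write;

		free_on_write = df->df_next;
		df->df_func(df->df_data, df->df_size);
		free(df);
	}
	pthread_mutex_unlock(&free_on_write_mtx);
}

static void
release(void *data, size_t size)
{
	(void) size;
	free(data);
}

int
main(void)
{
	defer_free(malloc(64), 64, release);
	do_free_on_write();
	return (0);
}
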
 687  692  static kmutex_t l2arc_feed_thr_lock;
 688  693  static kcondvar_t l2arc_feed_thr_cv;
 689  694  static uint8_t l2arc_thread_exit;
 690  695  
 691  696  static void l2arc_read_done(zio_t *zio);
 692  697  static void l2arc_hdr_stat_add(void);
 693  698  static void l2arc_hdr_stat_remove(void);
 694  699  
 695      -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
      700 +static boolean_t l2arc_compress_buf(void *in_data, uint64_t in_sz,
      701 +    void **out_data, uint64_t *out_sz, enum zio_compress *compress);
 696  702  static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 697  703      enum zio_compress c);
 698      -static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 699  704  
 700  705  static uint64_t
 701  706  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 702  707  {
 703  708          uint8_t *vdva = (uint8_t *)dva;
 704  709          uint64_t crc = -1ULL;
 705  710          int i;
 706  711  
 707  712          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 708  713  
[... 3401 lines elided ...]
4110 4115                  spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111 4116          mutex_exit(&spa_namespace_lock);
4112 4117  
4113 4118          return (next);
4114 4119  }
4115 4120  
4116 4121  /*
4117 4122   * Free buffers that were tagged for destruction.
4118 4123   */
4119 4124  static void
4120      -l2arc_do_free_on_write()
     4125 +l2arc_do_free_on_write(void)
4121 4126  {
4122 4127          list_t *buflist;
4123 4128          l2arc_data_free_t *df, *df_prev;
4124 4129  
4125 4130          mutex_enter(&l2arc_free_on_write_mtx);
4126 4131          buflist = l2arc_free_on_write;
4127 4132  
4128 4133          for (df = list_tail(buflist); df; df = df_prev) {
4129 4134                  df_prev = list_prev(buflist, df);
4130 4135                  ASSERT(df->l2df_data != NULL);
[... 9 lines elided ...]
4140 4145  /*
4141 4146   * A write to a cache device has completed.  Update all headers to allow
4142 4147   * reads from these buffers to begin.
4143 4148   */
4144 4149  static void
4145 4150  l2arc_write_done(zio_t *zio)
4146 4151  {
4147 4152          l2arc_write_callback_t *cb;
4148 4153          l2arc_dev_t *dev;
4149 4154          list_t *buflist;
4150      -        arc_buf_hdr_t *head, *ab, *ab_prev;
4151      -        l2arc_buf_hdr_t *abl2;
4152      -        kmutex_t *hash_lock;
     4155 +        arc_buf_hdr_t *head, *ab;
4153 4156  
     4157 +        struct defer_done_entry {
     4158 +                arc_buf_hdr_t *dde_buf;
     4159 +                list_node_t dde_node;
     4160 +        } *dde, *dde_next;
     4161 +        list_t defer_done_list;
     4162 +
4154 4163          cb = zio->io_private;
4155 4164          ASSERT(cb != NULL);
4156 4165          dev = cb->l2wcb_dev;
4157 4166          ASSERT(dev != NULL);
4158 4167          head = cb->l2wcb_head;
4159 4168          ASSERT(head != NULL);
4160 4169          buflist = dev->l2ad_buflist;
4161 4170          ASSERT(buflist != NULL);
4162 4171          DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163 4172              l2arc_write_callback_t *, cb);
4164 4173  
4165 4174          if (zio->io_error != 0)
4166 4175                  ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 4176  
4168 4177          mutex_enter(&l2arc_buflist_mtx);
4169 4178  
4170 4179          /*
4171 4180           * All writes completed, or an error was hit.
4172 4181           */
4173      -        for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174      -                ab_prev = list_prev(buflist, ab);
     4182 +        list_create(&defer_done_list, sizeof (*dde),
     4183 +            offsetof(struct defer_done_entry, dde_node));
     4184 +        for (ab = list_prev(buflist, head); ab; ab = list_prev(buflist, ab)) {
     4185 +                /*
     4186 +                 * Can't pause here to grab hash_lock while also holding
     4187 +                 * l2arc_buflist_mtx, so place the buffers on a temporary
     4188 +                 * thread-local list for later processing.
     4189 +                 */
     4190 +                dde = kmem_alloc(sizeof (*dde), KM_SLEEP);
     4191 +                dde->dde_buf = ab;
     4192 +                list_insert_tail(&defer_done_list, dde);
     4193 +        }
4175 4194  
4176      -                hash_lock = HDR_LOCK(ab);
4177      -                if (!mutex_tryenter(hash_lock)) {
4178      -                        /*
4179      -                         * This buffer misses out.  It may be in a stage
4180      -                         * of eviction.  Its ARC_L2_WRITING flag will be
4181      -                         * left set, denying reads to this buffer.
4182      -                         */
4183      -                        ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184      -                        continue;
4185      -                }
     4195 +        atomic_inc_64(&l2arc_writes_done);
     4196 +        list_remove(buflist, head);
     4197 +        kmem_cache_free(hdr_cache, head);
     4198 +        mutex_exit(&l2arc_buflist_mtx);
4186 4199  
4187      -                abl2 = ab->b_l2hdr;
     4200 +        /*
     4201 +         * Now process the buffers. We're not holding l2arc_buflist_mtx
     4202 +         * anymore, so we can do a regular mutex_enter on the hash_lock.
     4203 +         */
     4204 +        for (dde = list_head(&defer_done_list); dde != NULL; dde = dde_next) {
     4205 +                kmutex_t *hash_lock;
4188 4206  
4189      -                /*
4190      -                 * Release the temporary compressed buffer as soon as possible.
4191      -                 */
4192      -                if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193      -                        l2arc_release_cdata_buf(ab);
     4207 +                dde_next = list_next(&defer_done_list, dde);
     4208 +                ab = dde->dde_buf;
     4209 +                hash_lock = HDR_LOCK(ab);
4194 4210  
     4211 +                mutex_enter(hash_lock);
     4212 +
4195 4213                  if (zio->io_error != 0) {
4196 4214                          /*
4197 4215                           * Error - drop L2ARC entry.
4198 4216                           */
     4217 +                        l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
     4218 +                        mutex_enter(&l2arc_buflist_mtx);
4199 4219                          list_remove(buflist, ab);
4200      -                        ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
     4220 +                        mutex_exit(&l2arc_buflist_mtx);
     4221 +                        ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4201 4222                          ab->b_l2hdr = NULL;
4202      -                        kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
     4223 +                        kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
4203 4224                          ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204 4225                  }
4205 4226  
4206 4227                  /*
4207 4228                   * Allow ARC to begin reads to this L2ARC entry.
4208 4229                   */
4209 4230                  ab->b_flags &= ~ARC_L2_WRITING;
4210 4231  
4211 4232                  mutex_exit(hash_lock);
     4233 +
      4234 +                list_remove(&defer_done_list, dde);
           +                kmem_free(dde, sizeof (*dde));
4212 4235          }
     4236 +        list_destroy(&defer_done_list);
4213 4237  
4214      -        atomic_inc_64(&l2arc_writes_done);
4215      -        list_remove(buflist, head);
4216      -        kmem_cache_free(hdr_cache, head);
4217      -        mutex_exit(&l2arc_buflist_mtx);
4218      -
4219 4238          l2arc_do_free_on_write();
4220 4239  
4221 4240          kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 4241  }
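
The restructured l2arc_write_done() above is an instance of a general two-phase shape: collect the headers under the coarse lock, drop it, then take each per-buffer lock with a plain blocking acquire. A compact userland sketch under those assumptions (pthreads and malloc stand in for the kernel primitives; field names only echo the code above):

#include <stdlib.h>
#include <pthread.h>

struct buf {
	pthread_mutex_t	b_hash_lock;
	struct buf	*b_next;	/* link in the shared buflist */
	int		b_writing;	/* models the ARC_L2_WRITING flag */
};

static pthread_mutex_t buflist_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct buf *buflist;		/* shared list, protected above */

struct defer_entry {
	struct buf *de_buf;
	struct defer_entry *de_next;
};

static void
write_done(void)
{
	struct defer_entry *d, *d_next, *deferred = NULL;
	struct buf *b;

	/* Phase 1: under buflist_mtx, only collect; no hash locks here. */
	pthread_mutex_lock(&buflist_mtx);
	for (b = buflist; b != NULL; b = b->b_next) {
		d = malloc(sizeof (*d));
		d->de_buf = b;
		d->de_next = deferred;
		deferred = d;
	}
	pthread_mutex_unlock(&buflist_mtx);

	/* Phase 2: the coarse lock is dropped; blocking acquires are legal. */
	for (d = deferred; d != NULL; d = d_next) {
		d_next = d->de_next;
		pthread_mutex_lock(&d->de_buf->b_hash_lock);
		d->de_buf->b_writing = 0;	/* allow reads to begin */
		pthread_mutex_unlock(&d->de_buf->b_hash_lock);
		free(d);	/* entries must be freed, or they leak */
	}
}

int
main(void)
{
	write_done();	/* empty buflist: both phases are no-ops */
	return (0);
}

Note that phase 2 frees each deferred entry as it goes; in the kernel version the error path additionally re-takes l2arc_buflist_mtx while holding the hash_lock, which is the safe direction of the lock order.
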
4223 4242  
4224 4243  /*
4225 4244   * A read to a cache device completed.  Validate buffer contents before
4226 4245   * handing over to the regular ARC routines.
4227 4246   */
4228 4247  static void
[... 114 lines elided ...]
4343 4362  /*
4344 4363   * Evict buffers from the device write hand to the distance specified in
4345 4364   * bytes.  This distance may span populated buffers, it may span nothing.
4346 4365   * This is clearing a region on the L2ARC device ready for writing.
4347 4366   * If the 'all' boolean is set, every buffer is evicted.
4348 4367   */
4349 4368  static void
4350 4369  l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4351 4370  {
4352 4371          list_t *buflist;
4353      -        l2arc_buf_hdr_t *abl2;
     4372 +        l2arc_buf_hdr_t *l2hdr;
4354 4373          arc_buf_hdr_t *ab, *ab_prev;
4355 4374          kmutex_t *hash_lock;
4356 4375          uint64_t taddr;
4357 4376  
4358 4377          buflist = dev->l2ad_buflist;
4359 4378  
4360 4379          if (buflist == NULL)
4361 4380                  return;
4362 4381  
4363 4382          if (!all && dev->l2ad_first) {
[... 79 lines elided ...]
4443 4462                           */
4444 4463                          if (HDR_L2_READING(ab)) {
4445 4464                                  ARCSTAT_BUMP(arcstat_l2_evict_reading);
4446 4465                                  ab->b_flags |= ARC_L2_EVICTED;
4447 4466                          }
4448 4467  
4449 4468                          /*
4450 4469                           * Tell ARC this no longer exists in L2ARC.
4451 4470                           */
4452 4471                          if (ab->b_l2hdr != NULL) {
4453      -                                abl2 = ab->b_l2hdr;
4454      -                                ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
     4472 +                                l2hdr = ab->b_l2hdr;
     4473 +                                ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4455 4474                                  ab->b_l2hdr = NULL;
4456      -                                kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
     4475 +                                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
4457 4476                                  ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4458 4477                          }
4459 4478                          list_remove(buflist, ab);
4460 4479  
4461 4480                          /*
4462 4481                           * This may have been leftover after a
4463 4482                           * failed write.
4464 4483                           */
4465 4484                          ab->b_flags &= ~ARC_L2_WRITING;
4466 4485                  }
[... 24 lines elided ...]
4491 4510          list_t *list;
4492 4511          uint64_t write_asize, write_psize, write_sz, headroom,
4493 4512              buf_compress_minsz;
4494 4513          void *buf_data;
4495 4514          kmutex_t *list_lock;
4496 4515          boolean_t full;
4497 4516          l2arc_write_callback_t *cb;
4498 4517          zio_t *pio, *wzio;
4499 4518          uint64_t guid = spa_load_guid(spa);
4500 4519          const boolean_t do_headroom_boost = *headroom_boost;
     4520 +        struct defer_write_entry {
     4521 +                arc_buf_hdr_t *dwe_buf;
     4522 +                void *dwe_orig_data;
     4523 +                uint64_t dwe_orig_size;
      4524 +                list_node_t dwe_node;
     4525 +        } *dwe, *dwe_next;
     4526 +        list_t defer_write_list;
4501 4527  
4502 4528          ASSERT(dev->l2ad_vdev != NULL);
4503 4529  
4504 4530          /* Lower the flag now, we might want to raise it again later. */
4505 4531          *headroom_boost = B_FALSE;
4506 4532  
4507 4533          pio = NULL;
4508 4534          write_sz = write_asize = write_psize = 0;
4509 4535          full = B_FALSE;
4510 4536          head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
[... 1 line elided ...]
4512 4538  
4513 4539          /*
4514 4540           * We will want to try to compress buffers that are at least 2x the
4515 4541           * device sector size.
4516 4542           */
4517 4543          buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518 4544  
4519 4545          /*
4520 4546           * Copy buffers for L2ARC writing.
4521 4547           */
     4548 +        list_create(&defer_write_list, sizeof (*dwe),
     4549 +            offsetof(struct defer_write_entry, dwe_node));
4522 4550          mutex_enter(&l2arc_buflist_mtx);
4523 4551          for (int try = 0; try <= 3; try++) {
4524 4552                  uint64_t passed_sz = 0;
4525 4553  
4526 4554                  list = l2arc_list_locked(try, &list_lock);
4527 4555  
4528 4556                  /*
4529 4557                   * L2ARC fast warmup.
4530 4558                   *
4531 4559                   * Until the ARC is warm and starts to evict, read from the
[... 4 lines elided ...]
4536 4564                  else
4537 4565                          ab = list_tail(list);
4538 4566  
4539 4567                  headroom = target_sz * l2arc_headroom;
4540 4568                  if (do_headroom_boost)
4541 4569                          headroom = (headroom * l2arc_headroom_boost) / 100;
4542 4570  
4543 4571                  for (; ab; ab = ab_prev) {
4544 4572                          l2arc_buf_hdr_t *l2hdr;
4545 4573                          kmutex_t *hash_lock;
4546      -                        uint64_t buf_sz;
4547 4574  
4548 4575                          if (arc_warm == B_FALSE)
4549 4576                                  ab_prev = list_next(list, ab);
4550 4577                          else
4551 4578                                  ab_prev = list_prev(list, ab);
4552 4579  
4553 4580                          hash_lock = HDR_LOCK(ab);
4554 4581                          if (!mutex_tryenter(hash_lock)) {
4555 4582                                  /*
4556 4583                                   * Skip this buffer rather than waiting.
[... 36 lines elided ...]
4593 4620                                  pio = zio_root(spa, l2arc_write_done, cb,
4594 4621                                      ZIO_FLAG_CANFAIL);
4595 4622                          }
4596 4623  
4597 4624                          /*
4598 4625                           * Create and add a new L2ARC header.
4599 4626                           */
4600 4627                          l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601 4628                          l2hdr->b_dev = dev;
4602 4629                          ab->b_flags |= ARC_L2_WRITING;
     4630 +                        l2hdr->b_compress = ZIO_COMPRESS_OFF;
     4631 +                        l2hdr->b_asize = ab->b_size;
4603 4632  
4604 4633                          /*
4605      -                         * Temporarily stash the data buffer in b_tmp_cdata.
      4634 +                         * Temporarily stash the buffer on the defer_write_list.
4606 4635                           * The subsequent write step will pick it up from
4607      -                         * there. This is because can't access ab->b_buf
     4636 +                         * there. This is because we can't access ab->b_buf
4608 4637                           * without holding the hash_lock, which we in turn
4609 4638                           * can't access without holding the ARC list locks
4610      -                         * (which we want to avoid during compression/writing).
     4639 +                         * while walking the ARC lists (we want to avoid
     4640 +                         * holding these locks during compression/writing).
4611 4641                           */
4612      -                        l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613      -                        l2hdr->b_asize = ab->b_size;
4614      -                        l2hdr->b_tmp_cdata = ab->b_buf->b_data;
     4642 +                        dwe = kmem_alloc(sizeof (*dwe), KM_SLEEP);
     4643 +                        dwe->dwe_buf = ab;
     4644 +                        dwe->dwe_orig_data = ab->b_buf->b_data;
     4645 +                        dwe->dwe_orig_size = ab->b_size;
4615 4646  
4616      -                        buf_sz = ab->b_size;
4617 4647                          ab->b_l2hdr = l2hdr;
4618 4648  
4619 4649                          list_insert_head(dev->l2ad_buflist, ab);
     4650 +                        list_insert_tail(&defer_write_list, dwe);
4620 4651  
4621 4652                          /*
4622 4653                           * Compute and store the buffer cksum before
4623 4654                           * writing.  On debug the cksum is verified first.
4624 4655                           */
4625 4656                          arc_cksum_verify(ab->b_buf);
4626 4657                          arc_cksum_compute(ab->b_buf, B_TRUE);
4627 4658  
4628 4659                          mutex_exit(hash_lock);
4629 4660  
4630      -                        write_sz += buf_sz;
     4661 +                        write_sz += dwe->dwe_orig_size;
4631 4662                  }
4632 4663  
4633 4664                  mutex_exit(list_lock);
4634 4665  
4635 4666                  if (full == B_TRUE)
4636 4667                          break;
4637 4668          }
4638 4669  
4639 4670          /* No buffers selected for writing? */
4640 4671          if (pio == NULL) {
4641 4672                  ASSERT0(write_sz);
4642 4673                  mutex_exit(&l2arc_buflist_mtx);
4643 4674                  kmem_cache_free(hdr_cache, head);
     4675 +                list_destroy(&defer_write_list);
4644 4676                  return (0);
4645 4677          }
4646 4678  
     4679 +        mutex_exit(&l2arc_buflist_mtx);
     4680 +
4647 4681          /*
4648 4682           * Now start writing the buffers. We're starting at the write head
4649 4683           * and work backwards, retracing the course of the buffer selector
4650 4684           * loop above.
4651 4685           */
4652      -        for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653      -            ab = list_prev(dev->l2ad_buflist, ab)) {
     4686 +        for (dwe = list_head(&defer_write_list); dwe != NULL; dwe = dwe_next) {
4654 4687                  l2arc_buf_hdr_t *l2hdr;
4655 4688                  uint64_t buf_sz;
4656 4689  
     4690 +                dwe_next = list_next(&defer_write_list, dwe);
     4691 +                ab = dwe->dwe_buf;
     4692 +
4657 4693                  /*
4658      -                 * We shouldn't need to lock the buffer here, since we flagged
4659      -                 * it as ARC_L2_WRITING in the previous step, but we must take
4660      -                 * care to only access its L2 cache parameters. In particular,
4661      -                 * ab->b_buf may be invalid by now due to ARC eviction.
      4694 +                 * Accessing ab->b_l2hdr without locking is safe here: the
      4695 +                 * buffer was flagged ARC_L2_WRITING in the previous step, so
      4696 +                 * no other thread will modify its L2 fields. In particular
      4697 +                 * ab->b_buf may be invalid by now due to ARC eviction.
4662 4698                   */
4663 4699                  l2hdr = ab->b_l2hdr;
4664 4700                  l2hdr->b_daddr = dev->l2ad_hand;
4665 4701  
4666 4702                  if ((ab->b_flags & ARC_L2COMPRESS) &&
4667      -                    l2hdr->b_asize >= buf_compress_minsz) {
4668      -                        if (l2arc_compress_buf(l2hdr)) {
4669      -                                /*
4670      -                                 * If compression succeeded, enable headroom
4671      -                                 * boost on the next scan cycle.
4672      -                                 */
4673      -                                *headroom_boost = B_TRUE;
4674      -                        }
     4703 +                    l2hdr->b_asize >= buf_compress_minsz &&
     4704 +                    l2arc_compress_buf(dwe->dwe_orig_data, dwe->dwe_orig_size,
     4705 +                    &buf_data, &buf_sz, &l2hdr->b_compress)) {
     4706 +                        /*
     4707 +                         * If compression succeeded, enable headroom
     4708 +                         * boost on the next scan cycle.
     4709 +                         */
     4710 +                        *headroom_boost = B_TRUE;
     4711 +                        l2hdr->b_asize = buf_sz;
     4712 +                } else {
     4713 +                        buf_data = dwe->dwe_orig_data;
     4714 +                        buf_sz = dwe->dwe_orig_size;
     4715 +                        l2hdr->b_asize = dwe->dwe_orig_size;
4675 4716                  }
4676 4717  
4677      -                /*
4678      -                 * Pick up the buffer data we had previously stashed away
4679      -                 * (and now potentially also compressed).
4680      -                 */
4681      -                buf_data = l2hdr->b_tmp_cdata;
4682      -                buf_sz = l2hdr->b_asize;
4683      -
4684 4718                  /* Compression may have squashed the buffer to zero length. */
4685 4719                  if (buf_sz != 0) {
4686 4720                          uint64_t buf_p_sz;
4687 4721  
4688 4722                          wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689      -                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690      -                            NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691      -                            ZIO_FLAG_CANFAIL, B_FALSE);
     4723 +                            dev->l2ad_hand, l2hdr->b_asize, buf_data,
     4724 +                            ZIO_CHECKSUM_OFF, NULL, NULL,
     4725 +                            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
     4726 +                            B_FALSE);
4692 4727  
4693 4728                          DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694 4729                              zio_t *, wzio);
4695 4730                          (void) zio_nowait(wzio);
4696 4731  
4697      -                        write_asize += buf_sz;
     4732 +                        write_asize += l2hdr->b_asize;
4698 4733                          /*
4699 4734                           * Keep the clock hand suitably device-aligned.
4700 4735                           */
4701 4736                          buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702 4737                          write_psize += buf_p_sz;
4703 4738                          dev->l2ad_hand += buf_p_sz;
4704 4739                  }
     4740 +
     4741 +                list_remove(&defer_write_list, dwe);
     4742 +                kmem_free(dwe, sizeof (*dwe));
4705 4743          }
4706 4744  
4707      -        mutex_exit(&l2arc_buflist_mtx);
     4745 +        list_destroy(&defer_write_list);
4708 4746  
4709 4747          ASSERT3U(write_asize, <=, target_sz);
4710 4748          ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711 4749          ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712 4750          ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713 4751          ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714 4752          vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715 4753  
4716 4754          /*
4717 4755           * Bump device hand to the device start if it is approaching the end.
[... 9 lines elided ...]
4727 4765  
4728 4766          dev->l2ad_writing = B_TRUE;
4729 4767          (void) zio_wait(pio);
4730 4768          dev->l2ad_writing = B_FALSE;
4731 4769  
4732 4770          return (write_asize);
4733 4771  }
4734 4772  
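
As a worked example of the clock-hand arithmetic at the end of the write loop above: the kernel rounds each physical write up to the device allocation size via vdev_psize_to_asize() before advancing l2ad_hand. Assuming a plain power-of-two ashift, a stand-in looks like this:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round psize up to the device allocation granularity (1 << ashift). */
static uint64_t
psize_to_asize(uint64_t psize, int ashift)
{
	uint64_t align = 1ULL << ashift;

	return ((psize + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t hand = 0;

	/* A 5000-byte compressed buffer on a 4K-sector (ashift=12) device. */
	hand += psize_to_asize(5000, 12);
	assert(hand == 8192);
	(void) printf("write hand advanced to %llu\n",
	    (unsigned long long)hand);
	return (0);
}
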
4735 4773  /*
4736 4774   * Compresses an L2ARC buffer.
4737      - * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4738      - * size in l2hdr->b_asize. This routine tries to compress the data and
4739      - * depending on the compression result there are three possible outcomes:
4740      - * *) The buffer was incompressible. The original l2hdr contents were left
4741      - *    untouched and are ready for writing to an L2 device.
     4775 + * The data to be compressed is in in_data and its size in in_sz. This routine
     4776 + * tries to compress the data and depending on the compression result there
     4777 + * are three possible outcomes:
      4778 + * *) The buffer was incompressible. The function returns B_FALSE and
      4779 + *    leaves the output parameters untouched.
4742 4780   * *) The buffer was all-zeros, so there is no need to write it to an L2
4743      - *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4744      - *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4745      - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4746      - *    data buffer which holds the compressed data to be written, and b_asize
4747      - *    tells us how much data there is. b_compress is set to the appropriate
4748      - *    compression algorithm. Once writing is done, invoke
4749      - *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4750      - *
4751      - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4752      - * buffer was incompressible).
     4781 + *    device. To indicate this situation, the *out_data is set to NULL,
     4782 + *    *out_sz is set to zero, *compress is set to ZIO_COMPRESS_EMPTY and
     4783 + *    the function returns B_TRUE.
      4784 + * *) Compression succeeded: *out_data is set to point to a buffer holding
      4785 + *    the compressed data, *out_sz is set to the output size, *compress is
      4786 + *    set to the appropriate compression algorithm and B_TRUE is returned.
      4787 + *    Once writing is done the buffer will be automatically freed by
      4788 + *    l2arc_do_free_on_write().
4753 4789   */
4754 4790  static boolean_t
4755      -l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
     4791 +l2arc_compress_buf(void *in_data, uint64_t in_sz, void **out_data,
     4792 +    uint64_t *out_sz, enum zio_compress *compress)
4756 4793  {
4757 4794          void *cdata;
4758      -        size_t csize, len;
4759 4795  
4760      -        ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4761      -        ASSERT(l2hdr->b_tmp_cdata != NULL);
     4796 +        cdata = zio_data_buf_alloc(in_sz);
     4797 +        *out_sz = zio_compress_data(ZIO_COMPRESS_LZ4, in_data, cdata, in_sz);
4762 4798  
4763      -        len = l2hdr->b_asize;
4764      -        cdata = zio_data_buf_alloc(len);
4765      -        csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4766      -            cdata, l2hdr->b_asize);
4767      -
4768      -        if (csize == 0) {
4769      -                /* zero block, indicate that there's nothing to write */
4770      -                zio_data_buf_free(cdata, len);
4771      -                l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4772      -                l2hdr->b_asize = 0;
4773      -                l2hdr->b_tmp_cdata = NULL;
     4799 +        if (*out_sz == 0) {
     4800 +                /* Zero block, indicate that there's nothing to write. */
     4801 +                zio_data_buf_free(cdata, in_sz);
     4802 +                *compress = ZIO_COMPRESS_EMPTY;
     4803 +                *out_data = NULL;
4774 4804                  ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4775 4805                  return (B_TRUE);
4776      -        } else if (csize > 0 && csize < len) {
     4806 +        } else if (*out_sz > 0 && *out_sz < in_sz) {
4777 4807                  /*
4778 4808                   * Compression succeeded, we'll keep the cdata around for
4779      -                 * writing and release it afterwards.
      4809 +                 * writing and release it once the write completes.
4780 4810                   */
4781      -                l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4782      -                l2hdr->b_asize = csize;
4783      -                l2hdr->b_tmp_cdata = cdata;
     4811 +                l2arc_data_free_t *df;
     4812 +
     4813 +                *compress = ZIO_COMPRESS_LZ4;
     4814 +                *out_data = cdata;
     4815 +
     4816 +                df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
     4817 +                df->l2df_data = cdata;
     4818 +                df->l2df_size = *out_sz;
     4819 +                df->l2df_func = zio_data_buf_free;
     4820 +                mutex_enter(&l2arc_free_on_write_mtx);
     4821 +                list_insert_head(l2arc_free_on_write, df);
     4822 +                mutex_exit(&l2arc_free_on_write_mtx);
     4823 +
4784 4824                  ARCSTAT_BUMP(arcstat_l2_compress_successes);
     4825 +                ARCSTAT_BUMP(arcstat_l2_free_on_write);
4785 4826                  return (B_TRUE);
4786 4827          } else {
4787 4828                  /*
4788 4829                   * Compression failed, release the compressed buffer.
4789      -                 * l2hdr will be left unmodified.
4790 4830                   */
4791      -                zio_data_buf_free(cdata, len);
     4831 +                zio_data_buf_free(cdata, in_sz);
4792 4832                  ARCSTAT_BUMP(arcstat_l2_compress_failures);
4793 4833                  return (B_FALSE);
4794 4834          }
4795 4835  }
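
To make the new three-outcome contract concrete, here is a hypothetical, self-contained caller; the stub below always reports "incompressible", and the enum and boolean_t definitions are local stand-ins rather than the ZFS headers:

#include <stdint.h>
#include <stdio.h>

typedef int boolean_t;
#define	B_FALSE	0
#define	B_TRUE	1

enum zio_compress { ZIO_COMPRESS_OFF, ZIO_COMPRESS_EMPTY, ZIO_COMPRESS_LZ4 };

/* Stub standing in for l2arc_compress_buf(); always "incompressible". */
static boolean_t
compress_buf_stub(void *in_data, uint64_t in_sz, void **out_data,
    uint64_t *out_sz, enum zio_compress *compress)
{
	(void) in_data; (void) in_sz;
	(void) out_data; (void) out_sz; (void) compress;
	return (B_FALSE);
}

static void
stage_write(void *orig, uint64_t orig_sz)
{
	void *buf_data;
	uint64_t buf_sz;
	enum zio_compress c = ZIO_COMPRESS_OFF;

	if (compress_buf_stub(orig, orig_sz, &buf_data, &buf_sz, &c)) {
		if (c == ZIO_COMPRESS_EMPTY) {
			/* All-zero block: buf_sz == 0, nothing is written. */
			(void) printf("empty block, skipping write\n");
		} else {
			/*
			 * Compressed copy; per the contract it is already on
			 * the free-on-write list, so do not free it here.
			 */
			(void) printf("writing %llu compressed bytes\n",
			    (unsigned long long)buf_sz);
		}
	} else {
		/* Incompressible: write the original buffer unchanged. */
		buf_data = orig;
		buf_sz = orig_sz;
		(void) printf("writing %llu raw bytes from %p\n",
		    (unsigned long long)buf_sz, buf_data);
	}
}

int
main(void)
{
	char block[512] = { 0 };

	stage_write(block, sizeof (block));
	return (0);
}
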
4796 4836  
4797 4837  /*
4798 4838   * Decompresses a zio read back from an l2arc device. On success, the
4799 4839   * underlying zio's io_data buffer is overwritten by the uncompressed
4800 4840   * version. On decompression error (corrupt compressed stream), the
4801 4841   * zio->io_error value is set to signal an I/O error.
[... 47 lines elided ...]
4849 4889                  if (zio_decompress_data(c, cdata, zio->io_data, csize,
4850 4890                      hdr->b_size) != 0)
4851 4891                          zio->io_error = EIO;
4852 4892                  zio_data_buf_free(cdata, csize);
4853 4893          }
4854 4894  
4855 4895          /* Restore the expected uncompressed IO size. */
4856 4896          zio->io_orig_size = zio->io_size = hdr->b_size;
4857 4897  }
4858 4898  
4859      -/*
4860      - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4861      - * This buffer serves as a temporary holder of compressed data while
4862      - * the buffer entry is being written to an l2arc device. Once that is
4863      - * done, we can dispose of it.
4864      - */
4865      -static void
4866      -l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4867      -{
4868      -        l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4869      -
4870      -        if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4871      -                /*
4872      -                 * If the data was compressed, then we've allocated a
4873      -                 * temporary buffer for it, so now we need to release it.
4874      -                 */
4875      -                ASSERT(l2hdr->b_tmp_cdata != NULL);
4876      -                zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4877      -        }
4878      -        l2hdr->b_tmp_cdata = NULL;
4879      -}
4880      -
4881 4899  /*
4882 4900   * This thread feeds the L2ARC at regular intervals.  This is the beating
4883 4901   * heart of the L2ARC.
4884 4902   */
4885 4903  static void
4886 4904  l2arc_feed_thread(void)
4887 4905  {
4888 4906          callb_cpr_t cpr;
4889 4907          l2arc_dev_t *dev;
4890 4908          spa_t *spa;
[... 251 lines elided ...]