3525 Persistent L2ARC


 119  *      - ARC header release, as it removes from L2ARC buflists
 120  */
 121 
 122 #include <sys/spa.h>
 123 #include <sys/zio.h>
 124 #include <sys/zio_compress.h>
 125 #include <sys/zfs_context.h>
 126 #include <sys/arc.h>
 127 #include <sys/refcount.h>
 128 #include <sys/vdev.h>
 129 #include <sys/vdev_impl.h>
 130 #ifdef _KERNEL
 131 #include <sys/vmsystm.h>
 132 #include <vm/anon.h>
 133 #include <sys/fs/swapnode.h>
 134 #include <sys/dnlc.h>
 135 #endif
 136 #include <sys/callb.h>
 137 #include <sys/kstat.h>
 138 #include <zfs_fletcher.h>
 139 
 140 #ifndef _KERNEL
 141 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 142 boolean_t arc_watch = B_FALSE;
 143 int arc_procfd;
 144 #endif
 145 
 146 static kmutex_t         arc_reclaim_thr_lock;
 147 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 148 static uint8_t          arc_thread_exit;
 149 
 150 extern int zfs_write_limit_shift;
 151 extern uint64_t zfs_write_limit_max;
 152 extern kmutex_t zfs_write_limit_lock;
 153 
 154 #define ARC_REDUCE_DNLC_PERCENT 3
 155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 156 
 157 typedef enum arc_reclaim_strategy {
 158         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */


 290         kstat_named_t arcstat_l2_feeds;
 291         kstat_named_t arcstat_l2_rw_clash;
 292         kstat_named_t arcstat_l2_read_bytes;
 293         kstat_named_t arcstat_l2_write_bytes;
 294         kstat_named_t arcstat_l2_writes_sent;
 295         kstat_named_t arcstat_l2_writes_done;
 296         kstat_named_t arcstat_l2_writes_error;
 297         kstat_named_t arcstat_l2_writes_hdr_miss;
 298         kstat_named_t arcstat_l2_evict_lock_retry;
 299         kstat_named_t arcstat_l2_evict_reading;
 300         kstat_named_t arcstat_l2_free_on_write;
 301         kstat_named_t arcstat_l2_abort_lowmem;
 302         kstat_named_t arcstat_l2_cksum_bad;
 303         kstat_named_t arcstat_l2_io_error;
 304         kstat_named_t arcstat_l2_size;
 305         kstat_named_t arcstat_l2_asize;
 306         kstat_named_t arcstat_l2_hdr_size;
 307         kstat_named_t arcstat_l2_compress_successes;
 308         kstat_named_t arcstat_l2_compress_zeros;
 309         kstat_named_t arcstat_l2_compress_failures;
 310         kstat_named_t arcstat_memory_throttle_count;
 311         kstat_named_t arcstat_duplicate_buffers;
 312         kstat_named_t arcstat_duplicate_buffers_size;
 313         kstat_named_t arcstat_duplicate_reads;
 314         kstat_named_t arcstat_meta_used;
 315         kstat_named_t arcstat_meta_limit;
 316         kstat_named_t arcstat_meta_max;
 317 } arc_stats_t;
 318 
 319 static arc_stats_t arc_stats = {
 320         { "hits",                       KSTAT_DATA_UINT64 },
 321         { "misses",                     KSTAT_DATA_UINT64 },
 322         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 323         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 324         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 325         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 326         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 327         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 328         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 329         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },


 356         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 357         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 358         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 359         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 360         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 361         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 362         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 363         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 364         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 365         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 366         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 367         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 368         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 369         { "l2_io_error",                KSTAT_DATA_UINT64 },
 370         { "l2_size",                    KSTAT_DATA_UINT64 },
 371         { "l2_asize",                   KSTAT_DATA_UINT64 },
 372         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 373         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 374         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 375         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 376         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 377         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 378         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 379         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 380         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 381         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 382         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 383 };
 384 
 385 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 386 
 387 #define ARCSTAT_INCR(stat, val) \
 388         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 389 
 390 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 391 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 392 
 393 #define ARCSTAT_MAX(stat, val) {                                        \
 394         uint64_t m;                                                     \
 395         while ((val) > (m = arc_stats.stat.value.ui64) &&            \


 403 /*
 404  * We define a macro to allow ARC hits/misses to be easily broken down by
 405  * two separate conditions, giving a total of four different subtypes for
 406  * each of hits and misses (so eight statistics total).
 407  */
 408 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 409         if (cond1) {                                                    \
 410                 if (cond2) {                                            \
 411                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 412                 } else {                                                \
 413                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 414                 }                                                       \
 415         } else {                                                        \
 416                 if (cond2) {                                            \
 417                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 418                 } else {                                                \
 419                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 420                 }                                                       \
 421         }
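
For instance, the read path can count a cache hit with a single invocation,
which expands to bump exactly one of arcstat_demand_data_hits,
arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
arcstat_prefetch_metadata_hits (a sketch of the call as used elsewhere in
this file):

        ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
            demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
            data, metadata, hits);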
 422 
 423 kstat_t                 *arc_ksp;
 424 static arc_state_t      *arc_anon;
 425 static arc_state_t      *arc_mru;
 426 static arc_state_t      *arc_mru_ghost;
 427 static arc_state_t      *arc_mfu;
 428 static arc_state_t      *arc_mfu_ghost;
 429 static arc_state_t      *arc_l2c_only;
 430 
 431 /*
 432  * There are several ARC variables that are critical to export as kstats --
 433  * but we don't want to have to grovel around in the kstat whenever we wish to
 434  * manipulate them.  For these variables, we therefore define them to be in
 435  * terms of the statistic variable.  This assures that we are not introducing
 436  * the possibility of inconsistency by having shadow copies of the variables,
 437  * while still allowing the code to be readable.
 438  */
 439 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 440 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 441 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 442 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */


 610 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 611 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 612 
 613 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 614 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 615 
 616 /* L2ARC Performance Tunables */
 617 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 618 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 619 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 620 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 621 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 622 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 623 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 624 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 625 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 626 
 627 /*
 628  * L2ARC Internals
 629  */
 630 typedef struct l2arc_dev {
 631         vdev_t                  *l2ad_vdev;     /* vdev */
 632         spa_t                   *l2ad_spa;      /* spa */
 633         uint64_t                l2ad_hand;      /* next write location */
 634         uint64_t                l2ad_start;     /* first addr on device */
 635         uint64_t                l2ad_end;       /* last addr on device */
 636         uint64_t                l2ad_evict;     /* last addr eviction reached */
 637         boolean_t               l2ad_first;     /* first sweep through */
 638         boolean_t               l2ad_writing;   /* currently writing */
 639         list_t                  *l2ad_buflist;  /* buffer list */
 640         list_node_t             l2ad_node;      /* device list node */
 641 } l2arc_dev_t;
 642 
 643 static list_t L2ARC_dev_list;                   /* device list */
 644 static list_t *l2arc_dev_list;                  /* device list pointer */
 645 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 646 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 647 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 648 static list_t L2ARC_free_on_write;              /* free after write buf list */
 649 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 650 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 651 static uint64_t l2arc_ndev;                     /* number of devices */
 652 
 653 typedef struct l2arc_read_callback {
 654         arc_buf_t               *l2rcb_buf;             /* read buffer */
 655         spa_t                   *l2rcb_spa;             /* spa */
 656         blkptr_t                l2rcb_bp;               /* original blkptr */
 657         zbookmark_t             l2rcb_zb;               /* original bookmark */
 658         int                     l2rcb_flags;            /* original flags */
 659         enum zio_compress       l2rcb_compress;         /* applied compress */
 660 } l2arc_read_callback_t;
 661 
 662 typedef struct l2arc_write_callback {
 663         l2arc_dev_t     *l2wcb_dev;             /* device info */
 664         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 665 } l2arc_write_callback_t;
 666 
 667 struct l2arc_buf_hdr {
 668         /* protected by arc_buf_hdr  mutex */
 669         l2arc_dev_t             *b_dev;         /* L2ARC device */
 670         uint64_t                b_daddr;        /* disk address, offset byte */
 671         /* compression applied to buffer data */
 672         enum zio_compress       b_compress;
 673         /* real alloc'd buffer size depending on b_compress applied */
 674         int                     b_asize;
 675         /* temporary buffer holder for in-flight compressed data */
 676         void                    *b_tmp_cdata;
 677 };
 678 
 679 typedef struct l2arc_data_free {
 680         /* protected by l2arc_free_on_write_mtx */
 681         void            *l2df_data;
 682         size_t          l2df_size;
 683         void            (*l2df_func)(void *, size_t);
 684         list_node_t     l2df_list_node;
 685 } l2arc_data_free_t;
 686 
 687 static kmutex_t l2arc_feed_thr_lock;
 688 static kcondvar_t l2arc_feed_thr_cv;
 689 static uint8_t l2arc_thread_exit;
 690 
 691 static void l2arc_read_done(zio_t *zio);
 692 static void l2arc_hdr_stat_add(void);
 693 static void l2arc_hdr_stat_remove(void);
 694 
 695 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 696 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 697     enum zio_compress c);
 698 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 699 
 700 static uint64_t
 701 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 702 {
 703         uint8_t *vdva = (uint8_t *)dva;
 704         uint64_t crc = -1ULL;
 705         int i;
 706 
 707         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 708 
 709         for (i = 0; i < sizeof (dva_t); i++)
 710                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 711 
 712         crc ^= (spa>>8) ^ birth;
 713 
 714         return (crc);
 715 }
 716 
 717 #define BUF_EMPTY(buf)                                          \
 718         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 719         (buf)->b_dva.dva_word[1] == 0 &&                     \


1220                         if (use_mutex)
1221                                 mutex_exit(&new_state->arcs_mtx);
1222                 }
1223         }
1224 
1225         ASSERT(!BUF_EMPTY(ab));
1226         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1227                 buf_hash_remove(ab);
1228 
1229         /* adjust state sizes */
1230         if (to_delta)
1231                 atomic_add_64(&new_state->arcs_size, to_delta);
1232         if (from_delta) {
1233                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1234                 atomic_add_64(&old_state->arcs_size, -from_delta);
1235         }
1236         ab->b_state = new_state;
1237 
1238         /* adjust l2arc hdr stats */
1239         if (new_state == arc_l2c_only)
1240                 l2arc_hdr_stat_add();
1241         else if (old_state == arc_l2c_only)
1242                 l2arc_hdr_stat_remove();
1243 }
1244 
1245 void
1246 arc_space_consume(uint64_t space, arc_space_type_t type)
1247 {
1248         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1249 
1250         switch (type) {
1251         case ARC_SPACE_DATA:
1252                 ARCSTAT_INCR(arcstat_data_size, space);
1253                 break;
1254         case ARC_SPACE_OTHER:
1255                 ARCSTAT_INCR(arcstat_other_size, space);
1256                 break;
1257         case ARC_SPACE_HDRS:
1258                 ARCSTAT_INCR(arcstat_hdr_size, space);
1259                 break;
1260         case ARC_SPACE_L2HDRS:


1324         hdr->b_type = type;
1325         hdr->b_spa = spa_load_guid(spa);
1326         hdr->b_state = arc_anon;
1327         hdr->b_arc_access = 0;
1328         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1329         buf->b_hdr = hdr;
1330         buf->b_data = NULL;
1331         buf->b_efunc = NULL;
1332         buf->b_private = NULL;
1333         buf->b_next = NULL;
1334         hdr->b_buf = buf;
1335         arc_get_data_buf(buf);
1336         hdr->b_datacnt = 1;
1337         hdr->b_flags = 0;
1338         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1339         (void) refcount_add(&hdr->b_refcnt, tag);
1340 
1341         return (buf);
1342 }
 1343 
 1344 static char *arc_onloan_tag = "onloan";
1345 
1346 /*
1347  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1348  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1349  * buffers must be returned to the arc before they can be used by the DMU or
1350  * freed.
1351  */
1352 arc_buf_t *
1353 arc_loan_buf(spa_t *spa, int size)
1354 {
1355         arc_buf_t *buf;
1356 
1357         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1358 
1359         atomic_add_64(&arc_loaned_bytes, size);
1360         return (buf);
1361 }
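
A minimal usage sketch (assuming a caller with spa, blksz and src in
scope; arc_return_buf() is the matching return path):

        arc_buf_t *abuf = arc_loan_buf(spa, blksz);

        /* fill the loaned buffer; it is not charged as in-flight data */
        bcopy(src, abuf->b_data, blksz);

        /* hand it back to the ARC before the DMU may use or free it */
        arc_return_buf(abuf, FTAG);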
1362 
1363 /*


3956  *      l2arc_noprefetch        skip caching prefetched buffers
3957  *      l2arc_headroom          number of max device writes to precache
3958  *      l2arc_headroom_boost    when we find compressed buffers during ARC
3959  *                              scanning, we multiply headroom by this
3960  *                              percentage factor for the next scan cycle,
3961  *                              since more compressed buffers are likely to
3962  *                              be present
3963  *      l2arc_feed_secs         seconds between L2ARC writing
3964  *
3965  * Tunables may be removed or added as future performance improvements are
3966  * integrated, and also may become zpool properties.
3967  *
3968  * There are three key functions that control how the L2ARC warms up:
3969  *
3970  *      l2arc_write_eligible()  check if a buffer is eligible to cache
3971  *      l2arc_write_size()      calculate how much to write
3972  *      l2arc_write_interval()  calculate sleep delay between writes
3973  *
3974  * These three functions determine what to write, how much, and how quickly
 3975  * to send writes.
 3976  */
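
A minimal sketch of one pass of the feed cycle, showing how the three
functions cooperate (simplified; the real l2arc_feed_thread in this file
also handles shutdown, spa health checks and low-memory aborts, and
l2arc_write_size() computes the size used below, adding l2arc_write_boost
while the device is still warming up):

        static void
        l2arc_feed_cycle_sketch(void)
        {
                l2arc_dev_t *dev;
                spa_t *spa;
                boolean_t headroom_boost = B_FALSE;
                uint64_t size, wrote;
                clock_t began = ddi_get_lbolt();

                /* pick a cache device; returns with SCL_L2ARC held */
                if ((dev = l2arc_dev_get_next()) == NULL)
                        return;
                spa = dev->l2ad_spa;

                /* what and how much: scan eligible buffers, issue writes */
                size = l2arc_write_max;
                wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
                spa_config_exit(spa, SCL_L2ARC, dev);

                /* how quickly: compute the deadline for the next pass */
                (void) l2arc_write_interval(began, size, wrote);
        }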
3977 
3978 static boolean_t
3979 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3980 {
3981         /*
3982          * A buffer is *not* eligible for the L2ARC if it:
3983          * 1. belongs to a different spa.
3984          * 2. is already cached on the L2ARC.
3985          * 3. has an I/O in progress (it may be an incomplete read).
3986          * 4. is flagged not eligible (zfs property).
3987          */
3988         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3989             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3990                 return (B_FALSE);
3991 
3992         return (B_TRUE);
3993 }
3994 
3995 static uint64_t


4022         clock_t interval, next, now;
4023 
4024         /*
4025          * If the ARC lists are busy, increase our write rate; if the
4026          * lists are stale, idle back.  This is achieved by checking
4027          * how much we previously wrote - if it was more than half of
4028          * what we wanted, schedule the next write much sooner.
4029          */
4030         if (l2arc_feed_again && wrote > (wanted / 2))
4031                 interval = (hz * l2arc_feed_min_ms) / 1000;
4032         else
4033                 interval = hz * l2arc_feed_secs;
4034 
4035         now = ddi_get_lbolt();
4036         next = MAX(now, MIN(now + interval, began + interval));
4037 
4038         return (next);
4039 }
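
(With hz = 100 and the default tunables above, the busy path yields
interval = 100 * 200 / 1000 = 20 ticks, i.e. 200 ms, versus 100 ticks,
i.e. 1 s, when the lists are stale; the MAX/MIN pair then clamps the
deadline so it is never in the past and never more than one interval
past 'began'.)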
4040 
4041 static void
4042 l2arc_hdr_stat_add(void)
4043 {
 4044         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
 4045         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4046 }
4047 
4048 static void
4049 l2arc_hdr_stat_remove(void)
4050 {
4051         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4052         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4053 }
4054 
4055 /*
4056  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4057  * If a device is returned, this also returns holding the spa config lock.
4058  */
4059 static l2arc_dev_t *
4060 l2arc_dev_get_next(void)
4061 {
4062         l2arc_dev_t *first, *next = NULL;
4063 
4064         /*
4065          * Lock out the removal of spas (spa_namespace_lock), then removal
4066          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4067          * both locks will be dropped and a spa config lock held instead.
4068          */
4069         mutex_enter(&spa_namespace_lock);
4070         mutex_enter(&l2arc_dev_mtx);
4071 
4072         /* if there are no vdevs, there is nothing to do */
4073         if (l2arc_ndev == 0)
4074                 goto out;
4075 
4076         first = NULL;
4077         next = l2arc_dev_last;
4078         do {
 4079                 /* loop around the list looking for a non-faulted vdev */
 4080                 if (next == NULL) {
4081                         next = list_head(l2arc_dev_list);
4082                 } else {
4083                         next = list_next(l2arc_dev_list, next);
4084                         if (next == NULL)
4085                                 next = list_head(l2arc_dev_list);
4086                 }
4087 
4088                 /* if we have come back to the start, bail out */
4089                 if (first == NULL)
4090                         first = next;
4091                 else if (next == first)
4092                         break;
4093 
4094         } while (vdev_is_dead(next->l2ad_vdev));
4095 
4096         /* if we were unable to find any usable vdevs, return NULL */
4097         if (vdev_is_dead(next->l2ad_vdev))
4098                 next = NULL;
4099 
4100         l2arc_dev_last = next;
4101 
4102 out:
4103         mutex_exit(&l2arc_dev_mtx);
4104 
4105         /*
4106          * Grab the config lock to prevent the 'next' device from being
4107          * removed while we are writing to it.
4108          */
4109         if (next != NULL)
4110                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111         mutex_exit(&spa_namespace_lock);
4112 
4113         return (next);
4114 }
4115 
4116 /*
4117  * Free buffers that were tagged for destruction.


4155         ASSERT(cb != NULL);
4156         dev = cb->l2wcb_dev;
4157         ASSERT(dev != NULL);
4158         head = cb->l2wcb_head;
4159         ASSERT(head != NULL);
4160         buflist = dev->l2ad_buflist;
4161         ASSERT(buflist != NULL);
4162         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163             l2arc_write_callback_t *, cb);
4164 
4165         if (zio->io_error != 0)
4166                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 
4168         mutex_enter(&l2arc_buflist_mtx);
4169 
4170         /*
4171          * All writes completed, or an error was hit.
4172          */
4173         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
 4174                 ab_prev = list_prev(buflist, ab);
 4175 
 4176                 hash_lock = HDR_LOCK(ab);
4177                 if (!mutex_tryenter(hash_lock)) {
4178                         /*
4179                          * This buffer misses out.  It may be in a stage
4180                          * of eviction.  Its ARC_L2_WRITING flag will be
4181                          * left set, denying reads to this buffer.
4182                          */
4183                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184                         continue;
4185                 }
4186 
4187                 abl2 = ab->b_l2hdr;
4188 
4189                 /*
4190                  * Release the temporary compressed buffer as soon as possible.
4191                  */
4192                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193                         l2arc_release_cdata_buf(ab);
4194 
4195                 if (zio->io_error != 0) {
4196                         /*
4197                          * Error - drop L2ARC entry.
4198                          */
4199                         list_remove(buflist, ab);
4200                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201                         ab->b_l2hdr = NULL;
4202                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204                 }
4205 
4206                 /*
4207                  * Allow ARC to begin reads to this L2ARC entry.
4208                  */
4209                 ab->b_flags &= ~ARC_L2_WRITING;
4210 
4211                 mutex_exit(hash_lock);
4212         }
4213 
4214         atomic_inc_64(&l2arc_writes_done);
4215         list_remove(buflist, head);
4216         kmem_cache_free(hdr_cache, head);
4217         mutex_exit(&l2arc_buflist_mtx);
4218 
4219         l2arc_do_free_on_write();
 4220 
 4221         kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 }
4223 
4224 /*
4225  * A read to a cache device completed.  Validate buffer contents before
4226  * handing over to the regular ARC routines.
4227  */
4228 static void
4229 l2arc_read_done(zio_t *zio)
4230 {
4231         l2arc_read_callback_t *cb;
4232         arc_buf_hdr_t *hdr;
4233         arc_buf_t *buf;
4234         kmutex_t *hash_lock;
4235         int equal;
4236 
4237         ASSERT(zio->io_vd != NULL);
4238         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4239 
4240         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);


4482  *
4483  * Returns the number of bytes actually written (which may be smaller than
4484  * the delta by which the device hand has changed due to alignment).
4485  */
4486 static uint64_t
4487 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488     boolean_t *headroom_boost)
4489 {
4490         arc_buf_hdr_t *ab, *ab_prev, *head;
4491         list_t *list;
4492         uint64_t write_asize, write_psize, write_sz, headroom,
4493             buf_compress_minsz;
4494         void *buf_data;
4495         kmutex_t *list_lock;
4496         boolean_t full;
4497         l2arc_write_callback_t *cb;
4498         zio_t *pio, *wzio;
4499         uint64_t guid = spa_load_guid(spa);
4500         const boolean_t do_headroom_boost = *headroom_boost;
 4501 
 4502         ASSERT(dev->l2ad_vdev != NULL);
4503 
4504         /* Lower the flag now, we might want to raise it again later. */
4505         *headroom_boost = B_FALSE;
4506 
4507         pio = NULL;

4508         write_sz = write_asize = write_psize = 0;
4509         full = B_FALSE;
4510         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511         head->b_flags |= ARC_L2_WRITE_HEAD;
4512 
4513         /*
4514          * We will want to try to compress buffers that are at least 2x the
4515          * device sector size.
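          * (for example, 2 << 9 = 1024 bytes on an ashift-9 device with
          * 512-byte sectors)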
4516          */
4517         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
 4518 
 4519         /*
 4520          * Copy buffers for L2ARC writing.
4521          */
4522         mutex_enter(&l2arc_buflist_mtx);
4523         for (int try = 0; try <= 3; try++) {
4524                 uint64_t passed_sz = 0;
4525 
4526                 list = l2arc_list_locked(try, &list_lock);
4527 
4528                 /*
4529                  * L2ARC fast warmup.
4530                  *
4531                  * Until the ARC is warm and starts to evict, read from the
4532                  * head of the ARC lists rather than the tail.
4533                  */
4534                 if (arc_warm == B_FALSE)
4535                         ab = list_head(list);
4536                 else
4537                         ab = list_tail(list);
4538 
4539                 headroom = target_sz * l2arc_headroom;


4569 
4570                         if (!l2arc_write_eligible(guid, ab)) {
4571                                 mutex_exit(hash_lock);
4572                                 continue;
4573                         }
4574 
4575                         if ((write_sz + ab->b_size) > target_sz) {
4576                                 full = B_TRUE;
4577                                 mutex_exit(hash_lock);
4578                                 break;
4579                         }
4580 
4581                         if (pio == NULL) {
4582                                 /*
4583                                  * Insert a dummy header on the buflist so
4584                                  * l2arc_write_done() can find where the
4585                                  * write buffers begin without searching.
4586                                  */
4587                                 list_insert_head(dev->l2ad_buflist, head);
4588 
4589                                 cb = kmem_alloc(
4590                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4591                                 cb->l2wcb_dev = dev;
4592                                 cb->l2wcb_head = head;
4593                                 pio = zio_root(spa, l2arc_write_done, cb,
4594                                     ZIO_FLAG_CANFAIL);
4595                         }
4596 
4597                         /*
4598                          * Create and add a new L2ARC header.
4599                          */
4600                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601                         l2hdr->b_dev = dev;
4602                         ab->b_flags |= ARC_L2_WRITING;
4603 
4604                         /*
4605                          * Temporarily stash the data buffer in b_tmp_cdata.
4606                          * The subsequent write step will pick it up from
 4607                          * there. This is because we can't access ab->b_buf
4608                          * without holding the hash_lock, which we in turn
4609                          * can't access without holding the ARC list locks


4611                          */
4612                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613                         l2hdr->b_asize = ab->b_size;
4614                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4615 
4616                         buf_sz = ab->b_size;
4617                         ab->b_l2hdr = l2hdr;
4618 
4619                         list_insert_head(dev->l2ad_buflist, ab);
4620 
4621                         /*
4622                          * Compute and store the buffer cksum before
4623                          * writing.  On debug the cksum is verified first.
4624                          */
4625                         arc_cksum_verify(ab->b_buf);
4626                         arc_cksum_compute(ab->b_buf, B_TRUE);
4627 
4628                         mutex_exit(hash_lock);
4629 
 4630                         write_sz += buf_sz;
 4631                 }
4632 
4633                 mutex_exit(list_lock);
4634 
4635                 if (full == B_TRUE)
4636                         break;
4637         }
4638 
4639         /* No buffers selected for writing? */
4640         if (pio == NULL) {
4641                 ASSERT0(write_sz);
4642                 mutex_exit(&l2arc_buflist_mtx);
4643                 kmem_cache_free(hdr_cache, head);
4644                 return (0);
4645         }
 4646 
 4647         /*
4648          * Now start writing the buffers. We're starting at the write head
4649          * and work backwards, retracing the course of the buffer selector
4650          * loop above.
4651          */
4652         for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653             ab = list_prev(dev->l2ad_buflist, ab)) {
4654                 l2arc_buf_hdr_t *l2hdr;
4655                 uint64_t buf_sz;
4656 
4657                 /*
4658                  * We shouldn't need to lock the buffer here, since we flagged
4659                  * it as ARC_L2_WRITING in the previous step, but we must take
4660                  * care to only access its L2 cache parameters. In particular,
4661                  * ab->b_buf may be invalid by now due to ARC eviction.
4662                  */
4663                 l2hdr = ab->b_l2hdr;
4664                 l2hdr->b_daddr = dev->l2ad_hand;
4665 
4666                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667                     l2hdr->b_asize >= buf_compress_minsz) {
4668                         if (l2arc_compress_buf(l2hdr)) {
4669                                 /*
4670                                  * If compression succeeded, enable headroom
4671                                  * boost on the next scan cycle.
4672                                  */
4673                                 *headroom_boost = B_TRUE;


4685                 if (buf_sz != 0) {
4686                         uint64_t buf_p_sz;
4687 
4688                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691                             ZIO_FLAG_CANFAIL, B_FALSE);
4692 
4693                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694                             zio_t *, wzio);
4695                         (void) zio_nowait(wzio);
4696 
4697                         write_asize += buf_sz;
4698                         /*
4699                          * Keep the clock hand suitably device-aligned.
4700                          */
4701                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702                         write_psize += buf_p_sz;
4703                         dev->l2ad_hand += buf_p_sz;
4704                 }
4705         }
 4706 
 4707         mutex_exit(&l2arc_buflist_mtx);
4708 
4709         ASSERT3U(write_asize, <=, target_sz);
4710         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
 4715 
 4716         /*
4717          * Bump device hand to the device start if it is approaching the end.
4718          * l2arc_evict() will already have evicted ahead for this case.
4719          */
4720         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721                 vdev_space_update(dev->l2ad_vdev,
4722                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723                 dev->l2ad_hand = dev->l2ad_start;
4724                 dev->l2ad_evict = dev->l2ad_start;
4725                 dev->l2ad_first = B_FALSE;
4726         }
4727 
4728         dev->l2ad_writing = B_TRUE;
4729         (void) zio_wait(pio);
4730         dev->l2ad_writing = B_FALSE;
4731 
4732         return (write_asize);
4733 }
4734 
4735 /*


4977 }
4978 
4979 boolean_t
4980 l2arc_vdev_present(vdev_t *vd)
4981 {
4982         l2arc_dev_t *dev;
4983 
4984         mutex_enter(&l2arc_dev_mtx);
4985         for (dev = list_head(l2arc_dev_list); dev != NULL;
4986             dev = list_next(l2arc_dev_list, dev)) {
4987                 if (dev->l2ad_vdev == vd)
4988                         break;
4989         }
4990         mutex_exit(&l2arc_dev_mtx);
4991 
4992         return (dev != NULL);
4993 }
4994 
4995 /*
4996  * Add a vdev for use by the L2ARC.  By this point the spa has already
 4997  * validated the vdev and opened it.
 4998  */
4999 void
5000 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5001 {
5002         l2arc_dev_t *adddev;
5003 
5004         ASSERT(!l2arc_vdev_present(vd));
5005 
5006         /*
5007          * Create a new l2arc device entry.
5008          */
5009         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5010         adddev->l2ad_spa = spa;
5011         adddev->l2ad_vdev = vd;
5012         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5013         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5014         adddev->l2ad_hand = adddev->l2ad_start;
5015         adddev->l2ad_evict = adddev->l2ad_start;
5016         adddev->l2ad_first = B_TRUE;
 5017         adddev->l2ad_writing = B_FALSE;
 5018 
5019         /*
5020          * This is a list of all ARC buffers that are still valid on the
5021          * device.
5022          */
5023         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5024         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5025             offsetof(arc_buf_hdr_t, b_l2node));
5026 
5027         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5028 
5029         /*
5030          * Add device to global list
5031          */
5032         mutex_enter(&l2arc_dev_mtx);
5033         list_insert_head(l2arc_dev_list, adddev);
 5034         atomic_inc_64(&l2arc_ndev);
 5035         mutex_exit(&l2arc_dev_mtx);
5036 }
5037 
5038 /*
5039  * Remove a vdev from the L2ARC.
5040  */
5041 void
5042 l2arc_remove_vdev(vdev_t *vd)
5043 {
5044         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5045 
5046         /*
5047          * Find the device by vdev
5048          */
5049         mutex_enter(&l2arc_dev_mtx);
5050         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5051                 nextdev = list_next(l2arc_dev_list, dev);
5052                 if (vd == dev->l2ad_vdev) {
5053                         remdev = dev;
5054                         break;
5055                 }
5056         }
5057         ASSERT(remdev != NULL);
5058 
5059         /*
5060          * Remove device from global list
5061          */
5062         list_remove(l2arc_dev_list, remdev);
5063         l2arc_dev_last = NULL;          /* may have been invalidated */
5064         atomic_dec_64(&l2arc_ndev);
5065         mutex_exit(&l2arc_dev_mtx);
5066 
5067         /*
5068          * Clear all buflists and ARC references.  L2ARC device flush.
 5069          */
 5070         l2arc_evict(remdev, 0, B_TRUE);
5071         list_destroy(remdev->l2ad_buflist);
5072         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5073         kmem_free(remdev, sizeof (l2arc_dev_t));
5074 }
5075 
5076 void
5077 l2arc_init(void)
5078 {
5079         l2arc_thread_exit = 0;
5080         l2arc_ndev = 0;
5081         l2arc_writes_sent = 0;
5082         l2arc_writes_done = 0;
5083 
5084         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5085         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5086         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5087         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5088         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5089 


5121 {
5122         if (!(spa_mode_global & FWRITE))
5123                 return;
5124 
5125         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5126             TS_RUN, minclsyspri);
5127 }
5128 
5129 void
5130 l2arc_stop(void)
5131 {
5132         if (!(spa_mode_global & FWRITE))
5133                 return;
5134 
5135         mutex_enter(&l2arc_feed_thr_lock);
5136         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5137         l2arc_thread_exit = 1;
5138         while (l2arc_thread_exit != 0)
5139                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 5140         mutex_exit(&l2arc_feed_thr_lock);
 5141 }


 119  *      - ARC header release, as it removes from L2ARC buflists
 120  */
 121 
 122 #include <sys/spa.h>
 123 #include <sys/zio.h>
 124 #include <sys/zio_compress.h>
 125 #include <sys/zfs_context.h>
 126 #include <sys/arc.h>
 127 #include <sys/refcount.h>
 128 #include <sys/vdev.h>
 129 #include <sys/vdev_impl.h>
 130 #ifdef _KERNEL
 131 #include <sys/vmsystm.h>
 132 #include <vm/anon.h>
 133 #include <sys/fs/swapnode.h>
 134 #include <sys/dnlc.h>
 135 #endif
 136 #include <sys/callb.h>
 137 #include <sys/kstat.h>
 138 #include <zfs_fletcher.h>
 139 #include <sys/byteorder.h>
 140 
 141 #ifndef _KERNEL
 142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 143 boolean_t arc_watch = B_FALSE;
 144 int arc_procfd;
 145 #endif
 146 
 147 static kmutex_t         arc_reclaim_thr_lock;
 148 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 149 static uint8_t          arc_thread_exit;
 150 
 151 extern int zfs_write_limit_shift;
 152 extern uint64_t zfs_write_limit_max;
 153 extern kmutex_t zfs_write_limit_lock;
 154 
 155 #define ARC_REDUCE_DNLC_PERCENT 3
 156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 157 
 158 typedef enum arc_reclaim_strategy {
 159         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */


 291         kstat_named_t arcstat_l2_feeds;
 292         kstat_named_t arcstat_l2_rw_clash;
 293         kstat_named_t arcstat_l2_read_bytes;
 294         kstat_named_t arcstat_l2_write_bytes;
 295         kstat_named_t arcstat_l2_writes_sent;
 296         kstat_named_t arcstat_l2_writes_done;
 297         kstat_named_t arcstat_l2_writes_error;
 298         kstat_named_t arcstat_l2_writes_hdr_miss;
 299         kstat_named_t arcstat_l2_evict_lock_retry;
 300         kstat_named_t arcstat_l2_evict_reading;
 301         kstat_named_t arcstat_l2_free_on_write;
 302         kstat_named_t arcstat_l2_abort_lowmem;
 303         kstat_named_t arcstat_l2_cksum_bad;
 304         kstat_named_t arcstat_l2_io_error;
 305         kstat_named_t arcstat_l2_size;
 306         kstat_named_t arcstat_l2_asize;
 307         kstat_named_t arcstat_l2_hdr_size;
 308         kstat_named_t arcstat_l2_compress_successes;
 309         kstat_named_t arcstat_l2_compress_zeros;
 310         kstat_named_t arcstat_l2_compress_failures;
 311         kstat_named_t arcstat_l2_meta_writes;
 312         kstat_named_t arcstat_l2_meta_avg_size;
 313         kstat_named_t arcstat_l2_meta_avg_asize;
 314         kstat_named_t arcstat_l2_asize_to_meta_ratio;
 315         kstat_named_t arcstat_l2_rebuild_attempts;
 316         kstat_named_t arcstat_l2_rebuild_successes;
 317         kstat_named_t arcstat_l2_rebuild_unsupported;
 318         kstat_named_t arcstat_l2_rebuild_timeout;
 319         kstat_named_t arcstat_l2_rebuild_arc_bytes;
 320         kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
 321         kstat_named_t arcstat_l2_rebuild_bufs;
 322         kstat_named_t arcstat_l2_rebuild_bufs_precached;
 323         kstat_named_t arcstat_l2_rebuild_metabufs;
 324         kstat_named_t arcstat_l2_rebuild_uberblk_errors;
 325         kstat_named_t arcstat_l2_rebuild_io_errors;
 326         kstat_named_t arcstat_l2_rebuild_cksum_errors;
 327         kstat_named_t arcstat_l2_rebuild_loop_errors;
 328         kstat_named_t arcstat_l2_rebuild_abort_lowmem;
 329         kstat_named_t arcstat_memory_throttle_count;
 330         kstat_named_t arcstat_duplicate_buffers;
 331         kstat_named_t arcstat_duplicate_buffers_size;
 332         kstat_named_t arcstat_duplicate_reads;
 333         kstat_named_t arcstat_meta_used;
 334         kstat_named_t arcstat_meta_limit;
 335         kstat_named_t arcstat_meta_max;
 336 } arc_stats_t;
 337 
 338 static arc_stats_t arc_stats = {
 339         { "hits",                       KSTAT_DATA_UINT64 },
 340         { "misses",                     KSTAT_DATA_UINT64 },
 341         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 342         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 343         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 344         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 345         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 346         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 347         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 348         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },


 375         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 376         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 377         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 378         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 379         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 380         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 381         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 382         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 383         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 384         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 385         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 386         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 387         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 388         { "l2_io_error",                KSTAT_DATA_UINT64 },
 389         { "l2_size",                    KSTAT_DATA_UINT64 },
 390         { "l2_asize",                   KSTAT_DATA_UINT64 },
 391         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 392         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 393         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 394         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 395         { "l2_meta_writes",             KSTAT_DATA_UINT64 },
 396         { "l2_meta_avg_size",           KSTAT_DATA_UINT64 },
 397         { "l2_meta_avg_asize",          KSTAT_DATA_UINT64 },
 398         { "l2_asize_to_meta_ratio",     KSTAT_DATA_UINT64 },
 399         { "l2_rebuild_attempts",        KSTAT_DATA_UINT64 },
 400         { "l2_rebuild_successes",       KSTAT_DATA_UINT64 },
 401         { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
 402         { "l2_rebuild_timeout",         KSTAT_DATA_UINT64 },
 403         { "l2_rebuild_arc_bytes",       KSTAT_DATA_UINT64 },
 404         { "l2_rebuild_l2arc_bytes",     KSTAT_DATA_UINT64 },
 405         { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
 406         { "l2_rebuild_precached",       KSTAT_DATA_UINT64 },
 407         { "l2_rebuild_metabufs",        KSTAT_DATA_UINT64 },
 408         { "l2_rebuild_uberblk_errors",  KSTAT_DATA_UINT64 },
 409         { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
 410         { "l2_rebuild_cksum_errors",    KSTAT_DATA_UINT64 },
 411         { "l2_rebuild_loop_errors",     KSTAT_DATA_UINT64 },
 412         { "l2_rebuild_abort_lowmem",    KSTAT_DATA_UINT64 },
 413         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 414         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 415         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 416         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 417         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 418         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 419         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 420 };
 421 
 422 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 423 
 424 #define ARCSTAT_INCR(stat, val) \
 425         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 426 
 427 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 428 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 429 
 430 #define ARCSTAT_MAX(stat, val) {                                        \
 431         uint64_t m;                                                     \
 432         while ((val) > (m = arc_stats.stat.value.ui64) &&            \


 440 /*
 441  * We define a macro to allow ARC hits/misses to be easily broken down by
 442  * two separate conditions, giving a total of four different subtypes for
 443  * each of hits and misses (so eight statistics total).
 444  */
 445 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 446         if (cond1) {                                                    \
 447                 if (cond2) {                                            \
 448                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 449                 } else {                                                \
 450                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 451                 }                                                       \
 452         } else {                                                        \
 453                 if (cond2) {                                            \
 454                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 455                 } else {                                                \
 456                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 457                 }                                                       \
 458         }
 459 
 460 /*
 461  * This macro allows us to use kstats as floating averages. Each time we
 462  * update this kstat, we first factor it and the update value by
 463  * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
 464  * average. This macro assumes that integer loads and stores are atomic, but
 465  * is not safe for multiple writers updating the kstat in parallel (only the
 466  * last writer's update will remain).
 467  */
 468 #define ARCSTAT_F_AVG_FACTOR    3
 469 #define ARCSTAT_F_AVG(stat, value) \
 470         do { \
 471                 uint64_t x = ARCSTAT(stat); \
 472                 x = x - x / ARCSTAT_F_AVG_FACTOR + \
 473                     (value) / ARCSTAT_F_AVG_FACTOR; \
 474                 ARCSTAT(stat) = x; \
 475                 _NOTE(NOTREACHED) \
 476                 _NOTE(CONSTCOND) \
 477         } while (0)
 478 
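With ARCSTAT_F_AVG_FACTOR of 3 this behaves like an exponential moving
average with weight 1/3: for example, an old value of 900 updated with
300 becomes 900 - 900/3 + 300/3 = 700.
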
 479 kstat_t                 *arc_ksp;
 480 static arc_state_t      *arc_anon;
 481 static arc_state_t      *arc_mru;
 482 static arc_state_t      *arc_mru_ghost;
 483 static arc_state_t      *arc_mfu;
 484 static arc_state_t      *arc_mfu_ghost;
 485 static arc_state_t      *arc_l2c_only;
 486 
 487 /*
 488  * There are several ARC variables that are critical to export as kstats --
 489  * but we don't want to have to grovel around in the kstat whenever we wish to
 490  * manipulate them.  For these variables, we therefore define them to be in
 491  * terms of the statistic variable.  This assures that we are not introducing
 492  * the possibility of inconsistency by having shadow copies of the variables,
 493  * while still allowing the code to be readable.
 494  */
 495 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 496 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 497 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 498 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */


 666 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 667 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 668 
 669 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 670 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 671 
 672 /* L2ARC Performance Tunables */
 673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 675 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 679 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 680 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 681 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 682 
 683 /*
 684  * L2ARC Internals
 685  */
 686 typedef struct l2arc_dev l2arc_dev_t;
 687 static list_t L2ARC_dev_list;                   /* device list */
 688 static list_t *l2arc_dev_list;                  /* device list pointer */
 689 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 690 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 691 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 692 static list_t L2ARC_free_on_write;              /* free after write buf list */
 693 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 694 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 695 static uint64_t l2arc_ndev;                     /* number of devices */
 696 
 697 typedef struct l2arc_read_callback {
 698         arc_buf_t               *l2rcb_buf;             /* read buffer */
 699         spa_t                   *l2rcb_spa;             /* spa */
 700         blkptr_t                l2rcb_bp;               /* original blkptr */
 701         zbookmark_t             l2rcb_zb;               /* original bookmark */
 702         int                     l2rcb_flags;            /* original flags */
 703         enum zio_compress       l2rcb_compress;         /* applied compress */
 704 } l2arc_read_callback_t;
 705 
 706 typedef struct l2arc_write_callback {
 707         l2arc_dev_t     *l2wcb_dev;             /* device info */
 708         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 709         uint8_t         *l2wcb_pbuf;            /* pbuf sent in this write */
 710         uint32_t        l2wcb_pbuf_size;        /* size of committed pbuf */
 711         uint8_t         *l2wcb_ub_buf;          /* uberblock in this write */
 712 } l2arc_write_callback_t;
 713 
 714 struct l2arc_buf_hdr {
 715         /* protected by arc_buf_hdr  mutex */
 716         l2arc_dev_t             *b_dev;         /* L2ARC device */
 717         uint64_t                b_daddr;        /* disk address, offset byte */
 718         /* compression applied to buffer data */
 719         enum zio_compress       b_compress;
 720         /* real alloc'd buffer size depending on b_compress applied */
 721         int                     b_asize;
 722         /* temporary buffer holder for in-flight compressed data */
 723         void                    *b_tmp_cdata;
 724 };
 725 
 726 typedef struct l2arc_data_free {
 727         /* protected by l2arc_free_on_write_mtx */
 728         void            *l2df_data;
 729         size_t          l2df_size;
 730         void            (*l2df_func)(void *, size_t);
 731         list_node_t     l2df_list_node;
 732 } l2arc_data_free_t;
 733 
 734 static kmutex_t l2arc_feed_thr_lock;
 735 static kcondvar_t l2arc_feed_thr_cv;
 736 static uint8_t l2arc_thread_exit;
 737 
 738 static void l2arc_read_done(zio_t *zio);
 739 static void l2arc_hdr_stat_add(boolean_t from_arc);
 740 static void l2arc_hdr_stat_remove(void);
 741 
 742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 744     enum zio_compress c);
 745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 746 
 747 typedef enum {
 748         L2UBLK_BIG_ENDIAN = (1 << 0),     /* little endian assumed otherwise */
 749         L2UBLK_EVICT_FIRST = (1 << 1)     /* mirror of l2ad_first in l2dev */
 750 } l2uberblock_flags_t;
 751 
 752 typedef struct l2uberblock {
 753         uint32_t                ub_magic;
 754         uint8_t                 ub_version;
 755         l2uberblock_flags_t     ub_flags;
 756 
 757         uint64_t                ub_spa_guid;
 758         uint64_t                ub_birth;
 759         uint64_t                ub_evict_tail;  /* current evict pointer */
 760         uint64_t                ub_alloc_space; /* vdev space alloc status */
 761         uint64_t                ub_pbuf_daddr;  /* address of newest pbuf */
 762         uint32_t                ub_pbuf_asize;  /* size of newest pbuf */
 763         zio_cksum_t             ub_pbuf_cksum;  /* fletcher4 of newest pbuf */
 764 
 765         zio_cksum_t             ub_cksum;       /* cksum of uberblock */
 766 } l2uberblock_t;
 767 
 768 typedef enum {
 769         L2PBUF_BIG_ENDIAN = (1 << 0),     /* little endian assumed otherwise */
 770         L2PBUF_COMPRESSED = (1 << 1)      /* pbuf data items are compressed */
 771 } l2pbuf_flags_t;
 772 
 773 typedef struct l2pbuf {
 774         uint32_t                pb_magic;
 775         unsigned int            pb_version;
 776         l2pbuf_flags_t          pb_flags;
 777 
 778         uint64_t                pb_prev_daddr;  /* address of previous pbuf */
 779         uint32_t                pb_prev_asize;  /* size of previous pbuf */
 780         zio_cksum_t             pb_prev_cksum;  /* fletcher4 of prev. pbuf */
 781 
 782         /*
 783          * This is a set of item lists that are contained in this pbuf. Each
 784          * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
 785          * This serves as a soft timeout feature - once the limit of the
 786          * number of item lists that a pbuf can hold is reached, the pbuf is
 787          * flushed to stable storage, regardless of its total size.
 788          */
 789         list_t                  *pb_buflists_list;
 790 
 791         /*
 792          * Number of compressed bytes referenced by items in this pbuf and
 793          * the number of lists present.
 794          * Neither field is actually written to storage; they are only
 795          * used by internal algorithms to check when a pbuf reaches a
 796          * certain size limit, after which it is flushed in a write.
 797          */
 798         uint64_t                pb_payload_asz;
 799         /* Same thing for number of buflists */
 800         int                     pb_nbuflists;
 801 
 802         /*
 803          * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
 804          * This is then used by l2arc_pbuf_restore to update used space
 805          * on the L2ARC vdev.
 806          */
 807         size_t                  pb_asize;
 808 } l2pbuf_t;
 809 
 810 typedef struct l2pbuf_buf l2pbuf_buf_t;
 811 typedef struct l2pbuf_buflist {
 812         uint32_t                l2pbl_nbufs;
 813         l2pbuf_buf_t            *l2pbl_bufs;
 814         list_node_t             l2pbl_node;
 815 } l2pbuf_buflist_t;
 816 
 817 struct l2pbuf_buf {
 818         dva_t                   b_dva;          /* dva of buffer */
 819         uint64_t                b_birth;        /* birth txg of buffer */
 820         uint64_t                b_cksum0;
 821         zio_cksum_t             b_freeze_cksum;
 822         uint32_t                b_size;         /* uncompressed buf size */
 823         uint64_t                b_l2daddr;      /* buf location on l2dev */
 824         uint32_t                b_l2asize;      /* actual buf data size */
 825         enum zio_compress       b_l2compress;   /* compression applied */
 826         uint16_t                b_contents_type;
 827         uint32_t                b_flags;
 828 };
 829 
 830 struct l2arc_dev {
 831         vdev_t                  *l2ad_vdev;     /* vdev */
 832         spa_t                   *l2ad_spa;      /* spa */
 833         uint64_t                l2ad_hand;      /* next write location */
 834         uint64_t                l2ad_start;     /* first addr on device */
 835         uint64_t                l2ad_end;       /* last addr on device */
 836         uint64_t                l2ad_evict;     /* last addr eviction reached */
 837         boolean_t               l2ad_first;     /* first sweep through */
 838         boolean_t               l2ad_writing;   /* currently writing */
 839         list_t                  *l2ad_buflist;  /* buffer list */
 840         list_node_t             l2ad_node;      /* device list node */
 841         l2pbuf_t                l2ad_pbuf;      /* currently open pbuf */
 842         uint64_t                l2ad_pbuf_daddr;        /* prev pbuf daddr */
 843         uint64_t                l2ad_pbuf_asize;        /* prev pbuf asize */
 844         zio_cksum_t             l2ad_pbuf_cksum;        /* prev pbuf cksum */
 845         /* uberblock birth counter - incremented for each committed uberblk */
 846         uint64_t                l2ad_uberblock_birth;
 847         /* flag indicating whether a rebuild is currently going on */
 848         boolean_t               l2ad_rebuilding;
 849 };
 850 
 851 /* Stores information about an L2ARC prefetch zio */
 852 typedef struct l2arc_prefetch_info {
 853         uint8_t                 *pi_buf;        /* where the zio writes to */
 854         uint64_t                pi_buflen;      /* length of `pi_buf' */
 855         zio_t                   *pi_hdr_io;     /* see l2arc_pbuf_read below */
 856 } l2arc_prefetch_info_t;
 857 
 858 /* a single 4k l2uberblock at the start of each L2ARC device */
 859 #define L2UBERBLOCK_SIZE        4096
 860 #define L2UBERBLOCK_MAGIC       0x12bab10c
 861 #define L2UBERBLOCK_MAX_VERSION 1       /* our maximum uberblock version */
 862 #define L2PBUF_MAGIC            0xdb0faba6
 863 #define L2PBUF_MAX_VERSION      1       /* our maximum pbuf version */
 864 #define L2PBUF_BUF_SIZE         88      /* size of one pbuf buf entry */
 865 #define L2PBUF_HDR_SIZE         56      /* pbuf header excluding any payload */
 866 #define L2PBUF_ENCODED_SIZE(_pb) \
 867         (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
 868 /*
 869  * Allocation limit for the payload of a pbuf. This also fundamentally
 870  * limits the number of bufs we can reference in a pbuf.
 871  */
 872 #define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
 873 #define L2PBUF_MAX_BUFS         (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
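/* i.e. at most 24 MiB / 88 B = 285,975 buf entries per pbuf */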
 874 #define L2PBUF_COMPRESS_MINSZ   8192    /* minimum size to compress a pbuf */
 875 #define L2PBUF_MAXSZ            (100 * 1024 * 1024)     /* maximum pbuf size */
 876 #define L2PBUF_MAX_BUFLISTS     128     /* max number of buflists per pbuf */
 877 #define L2ARC_REBUILD_TIMEOUT   60      /* a rebuild may take at most 60s */
 878 #define L2PBUF_IS_FULL(_pb) \
 879         ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
 880         (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
 881 /*
 882  * These are the flags we allow to persist in L2ARC pbufs. The other flags
 883  * of an ARC buffer pertain to the buffer's runtime behavior.
 884  */
 885 #define L2ARC_PERSIST_FLAGS \
 886         (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
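
/*
 * Illustrative sketch only (editorial; the shipping encode path is
 * l2arc_pbuflist_insert(), and `pbl_buf' is a hypothetical name): when an
 * ARC header is captured into a pbuf entry, its flags word is masked so
 * that only the flags above survive on disk:
 *
 *	pbl_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
 */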
 887 
 888 /*
 889  * Used during L2ARC rebuild after each read operation to check whether we
 890  * haven't exceeded the rebuild timeout value.
 891  */
 892 #define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
 893         do { \
 894                 if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
 895                         __VA_ARGS__; \
 896                         ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
 897                         cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
 898                             "dropping remaining L2ARC metadata."); \
 899                         return; \
 900                 } \
 901                 _NOTE(NOTREACHED) \
 902                 _NOTE(CONSTCOND) \
 903         } while (0)
 904 
 905 /*
 906  * Performance tuning of L2ARC persistency:
 907  *
 908  * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
 909  *              compressing it.
 910  * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
 911  *              referenced from a pbuf. Once a pbuf reaches this size, it is
 912  *              committed to stable storage. Ideally, there should be approx.
 913  *              l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
 914  * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
 915  *              be buffered in a pbuf before it is committed to L2ARC. This
 916  *              puts a soft temporal upper bound on pbuf commit intervals.
 917  * l2arc_rebuild_enabled : Controls whether adding an L2ARC device (either
 918  *              at pool import or manually later) will attempt to rebuild
 919  *              its L2ARC buffer contents. In special circumstances, the
 920  *              administrator may want to set this to B_FALSE, e.g. when
 921  *              having trouble importing a pool or attaching an L2ARC
 922  *              device (the L2ARC device is slow to read in stored pbuf
 923  *              metadata, or the metadata has somehow become
 924  *              fragmented/unusable).
 925  * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to keep
 926  *              a slow L2ARC device from stalling pool import. If we are
 927  *              not done rebuilding an L2ARC device by this time, we stop
 928  *              the rebuild and return immediately.
 929  */
 930 uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
 931 uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
 932 uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
 933 boolean_t l2arc_rebuild_enabled = B_TRUE;
 934 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
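
/*
 * Editorial example (assuming the standard illumos /etc/system tunable
 * mechanism): an administrator could override these before import, e.g.:
 *
 *	set zfs:l2arc_rebuild_enabled = 0
 *	set zfs:l2arc_rebuild_timeout = 120
 */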
 935 
 936 static void l2arc_rebuild_start(l2arc_dev_t *dev);
 937 static void l2arc_rebuild(l2arc_dev_t *dev);
 938 static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
 939 static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
 940     uint64_t guid);
 941 
 942 static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
 943 static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
 944     zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
 945 static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
 946     uint32_t asize);
 947 static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
 948 static void l2arc_pbuf_prefetch_abort(zio_t *zio);
 949 
 950 static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
 951 static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
 952 static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
 953     uint64_t guid);
 954 static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
 955     l2arc_write_callback_t *cb);
 956 
 957 static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
 958 static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
 959     l2pbuf_t *pbuf);
 960 static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
 961     uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
 962 static void l2arc_pbuf_init(l2pbuf_t *pb);
 963 static void l2arc_pbuf_destroy(l2pbuf_t *pb);
 964 static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
 965     l2arc_write_callback_t *cb);
 966 static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
 967 static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
 968     const arc_buf_hdr_t *ab, int index);
 969 static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
 970 
 971 static uint64_t
 972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 973 {
 974         uint8_t *vdva = (uint8_t *)dva;
 975         uint64_t crc = -1ULL;
 976         int i;
 977 
 978         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 979 
 980         for (i = 0; i < sizeof (dva_t); i++)
 981                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 982 
 983         crc ^= (spa>>8) ^ birth;
 984 
 985         return (crc);
 986 }
 987 
 988 #define BUF_EMPTY(buf)                                          \
 989         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 990         (buf)->b_dva.dva_word[1] == 0 &&                     \


1491                         if (use_mutex)
1492                                 mutex_exit(&new_state->arcs_mtx);
1493                 }
1494         }
1495 
1496         ASSERT(!BUF_EMPTY(ab));
1497         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1498                 buf_hash_remove(ab);
1499 
1500         /* adjust state sizes */
1501         if (to_delta)
1502                 atomic_add_64(&new_state->arcs_size, to_delta);
1503         if (from_delta) {
1504                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1505                 atomic_add_64(&old_state->arcs_size, -from_delta);
1506         }
1507         ab->b_state = new_state;
1508 
1509         /* adjust l2arc hdr stats */
1510         if (new_state == arc_l2c_only)
1511                 l2arc_hdr_stat_add(old_state != arc_anon);
1512         else if (old_state == arc_l2c_only)
1513                 l2arc_hdr_stat_remove();
1514 }
1515 
1516 void
1517 arc_space_consume(uint64_t space, arc_space_type_t type)
1518 {
1519         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1520 
1521         switch (type) {
1522         case ARC_SPACE_DATA:
1523                 ARCSTAT_INCR(arcstat_data_size, space);
1524                 break;
1525         case ARC_SPACE_OTHER:
1526                 ARCSTAT_INCR(arcstat_other_size, space);
1527                 break;
1528         case ARC_SPACE_HDRS:
1529                 ARCSTAT_INCR(arcstat_hdr_size, space);
1530                 break;
1531         case ARC_SPACE_L2HDRS:


1595         hdr->b_type = type;
1596         hdr->b_spa = spa_load_guid(spa);
1597         hdr->b_state = arc_anon;
1598         hdr->b_arc_access = 0;
1599         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1600         buf->b_hdr = hdr;
1601         buf->b_data = NULL;
1602         buf->b_efunc = NULL;
1603         buf->b_private = NULL;
1604         buf->b_next = NULL;
1605         hdr->b_buf = buf;
1606         arc_get_data_buf(buf);
1607         hdr->b_datacnt = 1;
1608         hdr->b_flags = 0;
1609         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1610         (void) refcount_add(&hdr->b_refcnt, tag);
1611 
1612         return (buf);
1613 }
1614 
1615 /*
1616  * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1617  * This is used during l2arc reconstruction to make empty ARC buffers
1618  * which circumvent the regular disk->arc->l2arc path and instead come
1619  * into being in the reverse order, i.e. l2arc->arc->(disk).
1620  */
1621 arc_buf_hdr_t *
1622 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1623 {
1624         arc_buf_hdr_t *hdr;
1625 
1626         ASSERT3U(size, >, 0);
1627         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1628         ASSERT(BUF_EMPTY(hdr));
1629         hdr->b_size = size;
1630         hdr->b_type = type;
1631         hdr->b_spa = guid;
1632         hdr->b_state = arc_anon;
1633         hdr->b_arc_access = 0;
1634         hdr->b_buf = NULL;
1635         hdr->b_datacnt = 0;
1636         hdr->b_flags = 0;
1637         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1638 
1639         return (hdr);
1640 }
1641 
1642 static char *arc_onloan_tag = "onloan";
1643 
1644 /*
1645  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1646  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1647  * buffers must be returned to the arc before they can be used by the DMU or
1648  * freed.
1649  */
1650 arc_buf_t *
1651 arc_loan_buf(spa_t *spa, int size)
1652 {
1653         arc_buf_t *buf;
1654 
1655         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1656 
1657         atomic_add_64(&arc_loaned_bytes, size);
1658         return (buf);
1659 }
1660 
1661 /*


4254  *      l2arc_noprefetch        skip caching prefetched buffers
4255  *      l2arc_headroom          max number of device writes to precache
4256  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4257  *                              scanning, we multiply headroom by this
4258  *                              percentage factor for the next scan cycle,
4259  *                              since more compressed buffers are likely to
4260  *                              be present
4261  *      l2arc_feed_secs         seconds between L2ARC writing
4262  *
4263  * Tunables may be removed or added as future performance improvements are
4264  * integrated, and also may become zpool properties.
4265  *
4266  * There are three key functions that control how the L2ARC warms up:
4267  *
4268  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4269  *      l2arc_write_size()      calculate how much to write
4270  *      l2arc_write_interval()  calculate sleep delay between writes
4271  *
4272  * These three functions determine what to write, how much, and how quickly
4273  * to send writes.
4274  *
4275  * L2ARC persistency:
4276  *
4277  * When writing buffers to L2ARC, we periodically add some metadata to
4278  * make sure we can pick them up after reboot, thus dramatically reducing
4279  * the impact that any downtime has on the performance of storage systems
4280  * with large caches.
4281  *
4282  * The implementation works fairly simply by integrating the following two
4283  * modifications:
4284  *
4285  * *) Every now and then, at end of an L2ARC feed cycle, we append a piece
4286  *    of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
4287  *    write. This allows us to understand what's been written, so that
4288  *    we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
4289  *    The pbuf also includes a "back-reference" pointer to the previous
4290  *    pbuf, forming a linked list of pbufs on the L2ARC device.
4291  *
4292  * *) We reserve 4k of space at the start of each L2ARC device for our
4293  *    header bookkeeping purposes. This contains a single 4k uberblock, which
4294  *    contains our top-level reference structures. We update it on each pbuf
4295  *    write. If this write results in an inconsistent uberblock (e.g. due to
4296  *    power failure), we detect this by verifying the uberblock's checksum
4297  *    and simply drop the entries from L2ARC. Once an L2ARC pbuf update
4298  *    completes, we update the uberblock to point to it.
4299  *
4300  * Implementation diagram:
4301  *
4302  * +=== L2ARC device (not to scale) ======================================+
4303  * |       ____________newest pbuf pointer_____________                   |
4304  * |      /                                            \                  |
4305  * |     /                                              V                 |
4306  * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
4307  * |                       ^       / ^       / ^       /                  |
4308  * |                       `-prev-'  `-prev-'  `-prev-'                   |
4309  * |                         pbuf      pbuf      pbuf                     |
4310  * +======================================================================+
4311  *
4312  * On-device data structures:
4313  *
4314  * (L2ARC persistent uberblock)
4315  * struct l2uberblock {
4316  *      (these fields are in network byte order)
4317  *      uint32_t magic = 0x12bab10c;    l2-ber-block
4318  *      uint8_t  version = 0x1;
4319  *      uint8_t  reserved = 0x0;
4320  *      uint16_t ublk_flags;            see l2uberblock_flags_t
4321  *
4322  *      (byte order of fields below determined by `ublk_flags')
4323  *      uint64_t spa_guid;              what pool this l2arc dev belongs to
4324  *      uint64_t birth_txg;             ublk with highest birth_txg is newest
4325  *      uint64_t evict_tail;            current evict pointer on l2arc dev
4326  *      uint64_t alloc_space;           how much space is alloc'd on the dev
4327  *      uint64_t pbuf_daddr;            dev addr of the newest l2pbuf_t
4328  *      uint32_t pbuf_asize;            size of newest pbuf
4329  *      uint64_t pbuf_cksum[4];         fletcher4 of newest pbuf
4330  *
4331  *      uint8_t  reserved[3980] = {0x0, 0x0, ... 0x0};
4332  *
4333  *      uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
4334  * } l2dev_uberblock;
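 *
 * As an editorial sketch (layout per the struct above; the shipping check
 * lives in l2arc_uberblock_verify()), validating a candidate uberblock
 * read into `buf' amounts to:
 *
 *	zio_cksum_t computed;
 *	fletcher_4_native(buf, 4064, &computed);    (the 4064 bytes above)
 *	valid = (magic == L2UBERBLOCK_MAGIC) &&
 *	    ZIO_CHECKSUM_EQUAL(computed, ublk_cksum);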
4335  *
4336  * (L2ARC persistent buffer list)
4337  * typedef struct l2pbuf_t {
4338  *      (these fields are in network byte order)
4339  *      uint32_t magic = 0xdb0faba6;    the-buffer-bag
4340  *      uint8_t  version = 0x1;
4341  *      uint8_t  reserved = 0x0;
4342  *      uint16_t pbuf_flags;            see l2pbuf_flags_t
4343  *
4344  *      (byte order of fields below determined by `pbuf_flags')
4345  *      uint64_t prev_pbuf_daddr;       previous pbuf dev addr
4346  *      uint32_t prev_pbuf_asize;       previous pbuf size
4347  *      uint64_t prev_pbuf_cksum[4];    fletcher4(of previous pbuf)
4348  *
4349  *      uint32_t items_size;            uncompressed size of `items' below
4350  *      (if (pbuf_flags & compress) decompress `items' prior to decoding)
4351  *      struct l2pbuf_buf_item {
4352  *              (these fields mirror [l2]arc_buf_hdr fields)
4353  *              uint64_t dva[2];                buffer's DVA
4354  *              uint64_t birth;                 buffer's birth TXG in ARC
4355  *              uint64_t cksum0;                lower 64-bits of buffer's cksum
4356  *              uint64_t freeze_cksum[4];       buffer's freeze cksum
4357  *              uint32_t size;                  uncompressed buffer data size
4358  *              uint64_t l2daddr;               device address (offset) of buf
4359  *              uint32_t l2asize;               actual space occupied by buf
4360  *              uint8_t  compress;              compress algo used on data
4361  *              uint8_t  contents_type;         buffer's contents type
4362  *              uint16_t reserved = 0x0;        for alignment and future use
4363  *              uint32_t flags;                 buffer's persistent flags
4364  *      } items[];                              continues for remainder of pbuf
4365  * } l2pbuf_t;
4366  *
4367  * L2ARC reconstruction:
4368  *
4369  * When writing data, we simply write in the standard rotary fashion,
4370  * evicting buffers as we go and writing new data over them (appending
4371  * an updated l2pbuf_t every now and then). This obviously means that once we
4372  * loop around the end of the device, we will start cutting into an already
4373  * committed l2pbuf (and its referenced data buffers), like so:
4374  *
4375  *    current write head__       __old tail
4376  *                        \     /
4377  *                        V    V
4378  * <--|bufs|pbuf|bufs|pbuf|    |bufs|pbuf|bufs|pbuf|-->
4379  *                         ^    ^^^^^^^^^_____________________________
4380  *                         |                                          \
4381  *                         <<nextwrite>> - will overwrite this pbuf --/
4382  *
4383  * When importing the pool, we detect this situation and use it to stop
4384  * our scanning process:
4385  * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
4386  *      previous one.
4387  * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
4388  *      then prev_pbuf is invalid; stop scanning (goto step 3 below).
4389  * 3) if (this is the last valid pbuf)
4390  *      discard this pbuf as well (its ARC bufs may have been damaged by a
4391  *      partial overwrite).
4392  * (We could potentially salvage the remaining good arc bufs above in step 3,
4393  * but the cost of doing so probably outweighs the value of the entire pbuf).
4394  *
4395  * There is one significant caveat to consider when rebuilding ARC contents
4396  * from an L2ARC device: what about invalidated buffers? Given the above
4397  * construction, we cannot update pbufs which we've already written to amend
4398  * them to remove buffers which were invalidated. Thus, during reconstruction,
4399  * we might be populating the cache with buffers for data that's not on the
4400  * main pool anymore, or may have been overwritten!
4401  *
4402  * As it turns out, this isn't a problem. Every arc_read request includes
4403  * both the DVA and, crucially, the birth TXG of the BP the caller is
4404  * looking for. So even if the cache were populated by completely rotten
4405  * blocks for data that had been long deleted and/or overwritten, we'll
4406  * never actually return bad data from the cache, since the DVA together
4407  * with the birth TXG uniquely identifies a block in space and time - once
4408  * created, a block is immutable on disk. The worst we can do is waste
4409  * some time and memory during l2arc rebuild reconstructing outdated ARC
4410  * entries that will get dropped from the l2arc as it is being updated
4411  * with new blocks.
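 *
 * Concretely (an editorial sketch; arc_read is the authoritative path),
 * the cache lookup is keyed on both identity and birth:
 *
 *	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
 *	    &hash_lock);
 *
 * so a stale rebuilt entry never matches a live BP and simply ages out
 * of the l2arc as the write hand overtakes it.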
4412  */
4413 
4414 static boolean_t
4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4416 {
4417         /*
4418          * A buffer is *not* eligible for the L2ARC if it:
4419          * 1. belongs to a different spa.
4420          * 2. is already cached on the L2ARC.
4421          * 3. has an I/O in progress (it may be an incomplete read).
4422          * 4. is flagged not eligible (zfs property).
4423          */
4424         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4425             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4426                 return (B_FALSE);
4427 
4428         return (B_TRUE);
4429 }
4430 
4431 static uint64_t


4458         clock_t interval, next, now;
4459 
4460         /*
4461          * If the ARC lists are busy, increase our write rate; if the
4462          * lists are stale, idle back.  This is achieved by checking
4463          * how much we previously wrote - if it was more than half of
4464          * what we wanted, schedule the next write much sooner.
4465          */
4466         if (l2arc_feed_again && wrote > (wanted / 2))
4467                 interval = (hz * l2arc_feed_min_ms) / 1000;
4468         else
4469                 interval = hz * l2arc_feed_secs;
4470 
4471         now = ddi_get_lbolt();
4472         next = MAX(now, MIN(now + interval, began + interval));
4473 
4474         return (next);
4475 }
4476 
4477 static void
4478 l2arc_hdr_stat_add(boolean_t from_arc)
4479 {
4480         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4481         if (from_arc)
4482                 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4483 }
4484 
4485 static void
4486 l2arc_hdr_stat_remove(void)
4487 {
4488         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4489         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4490 }
4491 
4492 /*
4493  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4494  * If a device is returned, this also returns holding the spa config lock.
4495  */
4496 static l2arc_dev_t *
4497 l2arc_dev_get_next(void)
4498 {
4499         l2arc_dev_t *first, *next = NULL;
4500 
4501         /*
4502          * Lock out the removal of spas (spa_namespace_lock), then removal
4503          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4504          * both locks will be dropped and a spa config lock held instead.
4505          */
4506         mutex_enter(&spa_namespace_lock);
4507         mutex_enter(&l2arc_dev_mtx);
4508 
4509         /* if there are no vdevs, there is nothing to do */
4510         if (l2arc_ndev == 0)
4511                 goto out;
4512 
4513         first = NULL;
4514         next = l2arc_dev_last;
4515         do {
4516                 /*
4517                  * Loop around the list looking for a non-faulted vdev
4518                  * and one that isn't currently doing an L2ARC rebuild.
4519                  */
4520                 if (next == NULL) {
4521                         next = list_head(l2arc_dev_list);
4522                 } else {
4523                         next = list_next(l2arc_dev_list, next);
4524                         if (next == NULL)
4525                                 next = list_head(l2arc_dev_list);
4526                 }
4527 
4528                 /* if we have come back to the start, bail out */
4529                 if (first == NULL)
4530                         first = next;
4531                 else if (next == first)
4532                         break;
4533 
4534         } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4535 
4536         /* if we were unable to find any usable vdevs, return NULL */
4537         if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4538                 next = NULL;
4539 
4540         l2arc_dev_last = next;
4541 
4542 out:
4543         mutex_exit(&l2arc_dev_mtx);
4544 
4545         /*
4546          * Grab the config lock to prevent the 'next' device from being
4547          * removed while we are writing to it.
4548          */
4549         if (next != NULL)
4550                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4551         mutex_exit(&spa_namespace_lock);
4552 
4553         return (next);
4554 }
4555 
4556 /*
4557  * Free buffers that were tagged for destruction.


4595         ASSERT(cb != NULL);
4596         dev = cb->l2wcb_dev;
4597         ASSERT(dev != NULL);
4598         head = cb->l2wcb_head;
4599         ASSERT(head != NULL);
4600         buflist = dev->l2ad_buflist;
4601         ASSERT(buflist != NULL);
4602         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4603             l2arc_write_callback_t *, cb);
4604 
4605         if (zio->io_error != 0)
4606                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4607 
4608         mutex_enter(&l2arc_buflist_mtx);
4609 
4610         /*
4611          * All writes completed, or an error was hit.
4612          */
4613         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4614                 ab_prev = list_prev(buflist, ab);
4615                 abl2 = ab->b_l2hdr;
4616 
4617                 /*
4618                  * Release the temporary compressed buffer as soon as possible.
4619                  */
4620                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4621                         l2arc_release_cdata_buf(ab);
4622 
4623                 hash_lock = HDR_LOCK(ab);
4624                 if (!mutex_tryenter(hash_lock)) {
4625                         /*
4626                          * This buffer misses out.  It may be in a stage
4627                          * of eviction.  Its ARC_L2_WRITING flag will be
4628                          * left set, denying reads to this buffer.
4629                          */
4630                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4631                         continue;
4632                 }
4633 








4634                 if (zio->io_error != 0) {
4635                         /*
4636                          * Error - drop L2ARC entry.
4637                          */
4638                         list_remove(buflist, ab);
4639                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4640                         ab->b_l2hdr = NULL;
4641                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4642                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4643                 }
4644 
4645                 /*
4646                  * Allow ARC to begin reads to this L2ARC entry.
4647                  */
4648                 ab->b_flags &= ~ARC_L2_WRITING;
4649 
4650                 mutex_exit(hash_lock);
4651         }
4652 
4653         atomic_inc_64(&l2arc_writes_done);
4654         list_remove(buflist, head);
4655         kmem_cache_free(hdr_cache, head);
4656         mutex_exit(&l2arc_buflist_mtx);
4657 
4658         l2arc_do_free_on_write();
4659 
4660         if (cb->l2wcb_pbuf)
4661                 kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
4662         if (cb->l2wcb_ub_buf)
4663                 kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4664         kmem_free(cb, sizeof (l2arc_write_callback_t));
4665 }
4666 
4667 /*
4668  * A read to a cache device completed.  Validate buffer contents before
4669  * handing over to the regular ARC routines.
4670  */
4671 static void
4672 l2arc_read_done(zio_t *zio)
4673 {
4674         l2arc_read_callback_t *cb;
4675         arc_buf_hdr_t *hdr;
4676         arc_buf_t *buf;
4677         kmutex_t *hash_lock;
4678         int equal;
4679 
4680         ASSERT(zio->io_vd != NULL);
4681         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4682 
4683         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);


4925  *
4926  * Returns the number of bytes actually written (which may be smaller than
4927  * the delta by which the device hand has changed due to alignment).
4928  */
4929 static uint64_t
4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4931     boolean_t *headroom_boost)
4932 {
4933         arc_buf_hdr_t *ab, *ab_prev, *head;
4934         list_t *list;
4935         uint64_t write_asize, write_psize, write_sz, headroom,
4936             buf_compress_minsz;
4937         void *buf_data;
4938         kmutex_t *list_lock;
4939         boolean_t full;
4940         l2arc_write_callback_t *cb;
4941         zio_t *pio, *wzio;
4942         uint64_t guid = spa_load_guid(spa);
4943         const boolean_t do_headroom_boost = *headroom_boost;
4944 
4945         /* persistency-related */
4946         l2pbuf_t *pb;
4947         l2pbuf_buflist_t *pb_buflist;
4948         int num_bufs, buf_index;
4949 
4950         ASSERT(dev->l2ad_vdev != NULL);
4951 
4952         /* Lower the flag now, we might want to raise it again later. */
4953         *headroom_boost = B_FALSE;
4954 
4955         pio = NULL;
4956         cb = NULL;
4957         write_sz = write_asize = write_psize = 0;
4958         full = B_FALSE;
4959         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4960         head->b_flags |= ARC_L2_WRITE_HEAD;
4961 
4962         /*
4963          * We will want to try to compress buffers that are at least 2x the
4964          * device sector size.
4965          */
4966         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4967 
4968         pb = &dev->l2ad_pbuf;
4969         num_bufs = 0;
4970 
4971         /*
4972          * We will want to try to compress buffers that are at least 2x the
4973          * device sector size.
4974          */
4975         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4976 
4977         /*
4978          * Copy buffers for L2ARC writing.
4979          */
4980         mutex_enter(&l2arc_buflist_mtx);
4981         for (int try = 0; try <= 3; try++) {
4982                 uint64_t passed_sz = 0;
4983 
4984                 list = l2arc_list_locked(try, &list_lock);
4985 
4986                 /*
4987                  * L2ARC fast warmup.
4988                  *
4989                  * Until the ARC is warm and starts to evict, read from the
4990                  * head of the ARC lists rather than the tail.
4991                  */
4992                 if (arc_warm == B_FALSE)
4993                         ab = list_head(list);
4994                 else
4995                         ab = list_tail(list);
4996 
4997                 headroom = target_sz * l2arc_headroom;


5027 
5028                         if (!l2arc_write_eligible(guid, ab)) {
5029                                 mutex_exit(hash_lock);
5030                                 continue;
5031                         }
5032 
5033                         if ((write_sz + ab->b_size) > target_sz) {
5034                                 full = B_TRUE;
5035                                 mutex_exit(hash_lock);
5036                                 break;
5037                         }
5038 
5039                         if (pio == NULL) {
5040                                 /*
5041                                  * Insert a dummy header on the buflist so
5042                                  * l2arc_write_done() can find where the
5043                                  * write buffers begin without searching.
5044                                  */
5045                                 list_insert_head(dev->l2ad_buflist, head);
5046 
5047                                 cb = kmem_zalloc(
5048                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5049                                 cb->l2wcb_dev = dev;
5050                                 cb->l2wcb_head = head;
5051                                 pio = zio_root(spa, l2arc_write_done, cb,
5052                                     ZIO_FLAG_CANFAIL);
5053                         }
5054 
5055                         /*
5056                          * Create and add a new L2ARC header.
5057                          */
5058                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5059                         l2hdr->b_dev = dev;
5060                         ab->b_flags |= ARC_L2_WRITING;
5061 
5062                         /*
5063                          * Temporarily stash the data buffer in b_tmp_cdata.
5064                          * The subsequent write step will pick it up from
5065                          * there. This is because we can't access ab->b_buf
5066                          * without holding the hash_lock, which we in turn
5067                          * can't access without holding the ARC list locks


5069                          */
5070                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
5071                         l2hdr->b_asize = ab->b_size;
5072                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5073 
5074                         buf_sz = ab->b_size;
5075                         ab->b_l2hdr = l2hdr;
5076 
5077                         list_insert_head(dev->l2ad_buflist, ab);
5078 
5079                         /*
5080                          * Compute and store the buffer cksum before
5081                          * writing.  On debug the cksum is verified first.
5082                          */
5083                         arc_cksum_verify(ab->b_buf);
5084                         arc_cksum_compute(ab->b_buf, B_TRUE);
5085 
5086                         mutex_exit(hash_lock);
5087 
5088                         write_sz += buf_sz;
5089                         num_bufs++;
5090                 }
5091 
5092                 mutex_exit(list_lock);
5093 
5094                 if (full == B_TRUE)
5095                         break;
5096         }
5097 
5098         /* No buffers selected for writing? */
5099         if (pio == NULL) {
5100                 ASSERT0(write_sz);
5101                 mutex_exit(&l2arc_buflist_mtx);
5102                 kmem_cache_free(hdr_cache, head);
5103                 return (0);
5104         }
5105 
5106         /* expand the pbuf to include a new list */
5107         pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
5108 
5109         /*
5110          * Now start writing the buffers. We start at the write head
5111          * and work backwards, retracing the course of the buffer selector
5112          * loop above.
5113          */
5114         for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
5115             ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
5116                 l2arc_buf_hdr_t *l2hdr;
5117                 uint64_t buf_sz;
5118 
5119                 /*
5120                  * We shouldn't need to lock the buffer here, since we flagged
5121                  * it as ARC_L2_WRITING in the previous step, but we must take
5122                  * care to only access its L2 cache parameters. In particular,
5123                  * ab->b_buf may be invalid by now due to ARC eviction.
5124                  */
5125                 l2hdr = ab->b_l2hdr;
5126                 l2hdr->b_daddr = dev->l2ad_hand;
5127 
5128                 if ((ab->b_flags & ARC_L2COMPRESS) &&
5129                     l2hdr->b_asize >= buf_compress_minsz) {
5130                         if (l2arc_compress_buf(l2hdr)) {
5131                                 /*
5132                                  * If compression succeeded, enable headroom
5133                                  * boost on the next scan cycle.
5134                                  */
5135                                 *headroom_boost = B_TRUE;


5147                 if (buf_sz != 0) {
5148                         uint64_t buf_p_sz;
5149 
5150                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
5151                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5152                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5153                             ZIO_FLAG_CANFAIL, B_FALSE);
5154 
5155                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5156                             zio_t *, wzio);
5157                         (void) zio_nowait(wzio);
5158 
5159                         write_asize += buf_sz;
5160                         /*
5161                          * Keep the clock hand suitably device-aligned.
5162                          */
5163                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5164                         write_psize += buf_p_sz;
5165                         dev->l2ad_hand += buf_p_sz;
5166                 }

5167 
5168                 l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
5169         }
5170         ASSERT(buf_index == num_bufs);
5171         mutex_exit(&l2arc_buflist_mtx);
5172 
5173         ASSERT3U(write_asize, <=, target_sz);
5174         ARCSTAT_BUMP(arcstat_l2_writes_sent);
5175         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5176         ARCSTAT_INCR(arcstat_l2_size, write_sz);
5177         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5178         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5179 
5180         /* Is it time to commit this pbuf? */
5181         if (L2PBUF_IS_FULL(pb) &&
5182             dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
5183                 l2arc_pbuf_commit(dev, pio, cb);
5184                 l2arc_pbuf_destroy(pb);
5185                 l2arc_pbuf_init(pb);
5186         }
5187 
5188         /*
5189          * Bump device hand to the device start if it is approaching the end.
5190          * l2arc_evict() will already have evicted ahead for this case.
5191          */
5192         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5193                 vdev_space_update(dev->l2ad_vdev,
5194                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
5195                 dev->l2ad_hand = dev->l2ad_start;
5196                 dev->l2ad_evict = dev->l2ad_start;
5197                 dev->l2ad_first = B_FALSE;
5198         }
5199 
5200         dev->l2ad_writing = B_TRUE;
5201         (void) zio_wait(pio);
5202         dev->l2ad_writing = B_FALSE;
5203 
5204         return (write_asize);
5205 }
5206 
5207 /*


5449 }
5450 
5451 boolean_t
5452 l2arc_vdev_present(vdev_t *vd)
5453 {
5454         l2arc_dev_t *dev;
5455 
5456         mutex_enter(&l2arc_dev_mtx);
5457         for (dev = list_head(l2arc_dev_list); dev != NULL;
5458             dev = list_next(l2arc_dev_list, dev)) {
5459                 if (dev->l2ad_vdev == vd)
5460                         break;
5461         }
5462         mutex_exit(&l2arc_dev_mtx);
5463 
5464         return (dev != NULL);
5465 }
5466 
5467 /*
5468  * Add a vdev for use by the L2ARC.  By this point the spa has already
5469  * validated the vdev and opened it. The `rebuild' flag indicates whether
5470  * we should attempt an L2ARC persistency rebuild.
5471  */
5472 void
5473 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5474 {
5475         l2arc_dev_t *adddev;
5476 
5477         ASSERT(!l2arc_vdev_present(vd));
5478 
5479         /*
5480          * Create a new l2arc device entry.
5481          */
5482         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5483         adddev->l2ad_spa = spa;
5484         adddev->l2ad_vdev = vd;
5485         adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5486         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5487         adddev->l2ad_hand = adddev->l2ad_start;
5488         adddev->l2ad_evict = adddev->l2ad_start;
5489         adddev->l2ad_first = B_TRUE;
5490         adddev->l2ad_writing = B_FALSE;
5491         l2arc_pbuf_init(&adddev->l2ad_pbuf);
5492 
5493         /*
5494          * This is a list of all ARC buffers that are still valid on the
5495          * device.
5496          */
5497         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5498         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5499             offsetof(arc_buf_hdr_t, b_l2node));
5500 
5501         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5502 
5503         /*
5504          * Add device to global list
5505          */
5506         mutex_enter(&l2arc_dev_mtx);
5507         list_insert_head(l2arc_dev_list, adddev);
5508         atomic_inc_64(&l2arc_ndev);
5509         if (rebuild && l2arc_rebuild_enabled) {
5510                 adddev->l2ad_rebuilding = B_TRUE;
5511                 (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
5512                     0, &p0, TS_RUN, minclsyspri);
5513         }
5514         mutex_exit(&l2arc_dev_mtx);
5515 }
5516 
5517 /*
5518  * Remove a vdev from the L2ARC.
5519  */
5520 void
5521 l2arc_remove_vdev(vdev_t *vd)
5522 {
5523         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5524 
5525         /*
5526          * Find the device by vdev
5527          */
5528         mutex_enter(&l2arc_dev_mtx);
5529         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5530                 nextdev = list_next(l2arc_dev_list, dev);
5531                 if (vd == dev->l2ad_vdev) {
5532                         remdev = dev;
5533                         break;
5534                 }
5535         }
5536         ASSERT(remdev != NULL);
5537 
5538         /*
5539          * Remove device from global list
5540          */
5541         list_remove(l2arc_dev_list, remdev);
5542         l2arc_dev_last = NULL;          /* may have been invalidated */
5543         atomic_dec_64(&l2arc_ndev);
5544         mutex_exit(&l2arc_dev_mtx);
5545 
5546         /*
5547          * Clear all buflists and ARC references.  L2ARC device flush.
5548          */
5549         l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5550         l2arc_evict(remdev, 0, B_TRUE);
5551         list_destroy(remdev->l2ad_buflist);
5552         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5553         kmem_free(remdev, sizeof (l2arc_dev_t));
5554 }
5555 
5556 void
5557 l2arc_init(void)
5558 {
5559         l2arc_thread_exit = 0;
5560         l2arc_ndev = 0;
5561         l2arc_writes_sent = 0;
5562         l2arc_writes_done = 0;
5563 
5564         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5565         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5566         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5567         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5568         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5569 


5601 {
5602         if (!(spa_mode_global & FWRITE))
5603                 return;
5604 
5605         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5606             TS_RUN, minclsyspri);
5607 }
5608 
5609 void
5610 l2arc_stop(void)
5611 {
5612         if (!(spa_mode_global & FWRITE))
5613                 return;
5614 
5615         mutex_enter(&l2arc_feed_thr_lock);
5616         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5617         l2arc_thread_exit = 1;
5618         while (l2arc_thread_exit != 0)
5619                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5620         mutex_exit(&l2arc_feed_thr_lock);
5621 }
5622 
5623 /*
5624  * Main entry point for L2ARC metadata rebuilding. This function must be
5625  * called via thread_create so that the L2ARC metadata rebuild doesn't block
5626  * pool import and may proceed in parallel on all available L2ARC devices.
5627  */
5628 static void
5629 l2arc_rebuild_start(l2arc_dev_t *dev)
5630 {
5631         vdev_t *vd = dev->l2ad_vdev;
5632         spa_t *spa = dev->l2ad_spa;
5633 
5634         /* Lock out device removal. */
5635         spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5636         ASSERT(dev->l2ad_rebuilding == B_TRUE);
5637         l2arc_rebuild(dev);
5638         dev->l2ad_rebuilding = B_FALSE;
5639         spa_config_exit(spa, SCL_L2ARC, vd);
5640         thread_exit();
5641 }
5642 
5643 /*
5644  * This function implements the actual L2ARC metadata rebuild. It:
5645  *
5646  * 1) scans the device for valid l2uberblocks
5647  * 2) if it finds a good uberblock, starts reading the pbuf chain
5648  * 3) restores each pbuf's contents to memory
5649  *
5650  * Operation stops under any of the following conditions:
5651  *
5652  * 1) We reach the end of the pbuf chain (the previous-buffer reference
5653  *    in the pbuf is zero).
5654  * 2) We encounter *any* error condition (cksum errors, io errors, looped
5655  *    pbufs, etc.).
5656  * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
5657  *    severely fragmented L2ARC pbufs or slow L2ARC devices from
5658  *    preventing a machine from importing the pool (letting the
5659  *    administrator take corrective action, e.g. by kicking the misbehaving
5660  *    L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661  *    rebuilding disabled).
5662  */
5663 static void
5664 l2arc_rebuild(l2arc_dev_t *dev)
5665 {
5666         int err;
5667         l2uberblock_t ub;
5668         l2pbuf_t pb;
5669         zio_t *this_io = NULL, *next_io = NULL;
5670         int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671 
5672         if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673                 return;
5674         L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675 
5676         /* set up uberblock update info */
5677         dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678 
5679         /* initial sanity checks */
5680         l2arc_pbuf_init(&pb);
5681         if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682             ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683                 /* root pbuf is bad, we can't do anything about that */
5684                 if (err == EINVAL) {
5685                         ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686                 } else {
5687                         ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688                 }
5689                 l2arc_pbuf_destroy(&pb);
5690                 return;
5691         }
5692         L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693 
5694         dev->l2ad_evict = ub.ub_evict_tail;
5695 
5696         /* keep on chaining in new blocks */
5697         dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698         dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699         dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700         dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701             ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702         dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703 
5704         /* start the rebuild process */
5705         for (;;) {
5706                 l2pbuf_t pb_prev;
5707 
5708                 l2arc_pbuf_init(&pb_prev);
5709                 if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710                     pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711                     &next_io)) != 0) {
5712                         /*
5713                          * We are done reading; discard the last good buffer.
5714                          */
5715                         if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716                             pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717                                 /* this is an error, we stopped too early */
5718                                 if (err == EINVAL) {
5719                                         ARCSTAT_BUMP(
5720                                             arcstat_l2_rebuild_cksum_errors);
5721                                 } else {
5722                                         ARCSTAT_BUMP(
5723                                             arcstat_l2_rebuild_io_errors);
5724                                 }
5725                         }
5726                         l2arc_pbuf_destroy(&pb_prev);
5727                         l2arc_pbuf_destroy(&pb);
5728                         break;
5729                 }
5730 
5731                 /*
5732                  * Protection against infinite loops of pbufs. This is also
5733                  * our primary termination mechanism - once the buffer list
5734                  * loops around our starting pbuf, we can stop.
5735                  */
5736                 if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737                     pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738                         ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739                         l2arc_pbuf_destroy(&pb);
5740                         l2arc_pbuf_destroy(&pb_prev);
5741                         if (next_io)
5742                                 l2arc_pbuf_prefetch_abort(next_io);
5743                         return;
5744                 }
5745 
5746                 /*
5747                  * Our memory pressure valve. If the system is running low
5748                  * on memory, rather than swamping memory with new ARC buf
5749                  * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750                  * however, we have already set up our L2ARC dev to chain in
5751                  * new metadata pbufs, so the user may choose to re-add the
5752                  * L2ARC dev at a later time to reconstruct it (when there's
5753                  * less memory pressure).
5754                  */
5755                 if (arc_reclaim_needed()) {
5756                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5757                         cmn_err(CE_NOTE, "System running low on memory, "
5758                             "aborting L2ARC rebuild.");
5759                         l2arc_pbuf_destroy(&pb);
5760                         l2arc_pbuf_destroy(&pb_prev);
5761                         if (next_io)
5762                                 l2arc_pbuf_prefetch_abort(next_io);
5763                         break;
5764                 }
5765 
5766                 /*
5767                  * Now that we know that the prev_pbuf checks out alright, we
5768                  * can start reconstruction from this pbuf - we can be sure
5769                  * that the L2ARC write hand has not yet reached any of our
5770                  * buffers.
5771                  */
5772                 l2arc_pbuf_restore(dev, &pb);
5773 
5774                 /* pbuf restored, continue with next one in the list */
5775                 l2arc_pbuf_destroy(&pb);
5776                 pb = pb_prev;
5777                 this_io = next_io;
5778                 next_io = NULL;
5779 
5780                 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5781         }
5782 
5783         ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5784 }
5785 
5786 /*
5787  * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
5788  * which only contain an l2arc hdr, essentially restoring the buffers to
5789  * their L2ARC evicted state. This function also updates space usage on the
5790  * L2ARC vdev to make sure it tracks restored buffers.
5791  */
5792 static void
5793 l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
5794 {
5795         spa_t *spa;
5796         uint64_t guid;
5797         list_t *buflists_list;
5798         l2pbuf_buflist_t *buflist;
5799 
5800         mutex_enter(&l2arc_buflist_mtx);
5801         spa = dev->l2ad_vdev->vdev_spa;
5802         guid = spa_load_guid(spa);
5803         buflists_list = pb->pb_buflists_list;
5804         for (buflist = list_head(buflists_list); buflist;
5805             buflist = list_next(buflists_list, buflist)) {
5806                 int i;
5807                 uint64_t size, asize, psize;
5808 
5809                 size = asize = psize = 0;
5810                 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
5811                         l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
5812                             guid);
5813                         size += buflist->l2pbl_bufs[i].b_size;
5814                         asize += buflist->l2pbl_bufs[i].b_l2asize;
5815                         psize += vdev_psize_to_asize(dev->l2ad_vdev,
5816                             buflist->l2pbl_bufs[i].b_l2asize);
5817                 }
5818                 ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
5819                 ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
5820                 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
5821                 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5822         }
5823         mutex_exit(&l2arc_buflist_mtx);
5824         ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
5825         vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
5826             pb->pb_asize), 0, 0);
5827 }
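
The space accounting in l2arc_pbuf_restore() depends on vdev_psize_to_asize() rounding each restored buffer's physical size up to the device's allocation unit before the totals are fed to vdev_space_update(). A minimal user-space sketch of that accumulation, assuming a hypothetical 512-byte allocation unit (psize_roundup is a stand-in for illustration, not the real vdev routine):

#include <stdio.h>
#include <stdint.h>

/* Stand-in for vdev_psize_to_asize(): round up to the device ashift. */
static uint64_t
psize_roundup(uint64_t psize, uint64_t ashift)
{
	uint64_t align = 1ULL << ashift;

	return ((psize + align - 1) & ~(align - 1));
}

int
main(void)
{
	/* hypothetical restored-buffer physical sizes, in bytes */
	uint64_t l2asizes[] = { 512, 1300, 4096, 9000 };
	uint64_t ashift = 9;		/* 512-byte allocation unit */
	uint64_t alloc = 0;
	size_t i;

	for (i = 0; i < sizeof (l2asizes) / sizeof (l2asizes[0]); i++)
		alloc += psize_roundup(l2asizes[i], ashift);

	/* this total is what vdev_space_update() would be fed */
	printf("allocated on L2ARC vdev: %llu bytes\n",
	    (unsigned long long)alloc);
	return (0);
}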
5828 
5829 /*
5830  * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
5831  * a state indicating that it has been evicted to L2ARC.
5832  * The `guid' here is the ARC-load-guid from spa_load_guid.
5833  */
5834 static void
5835 l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
5836 {
5837         arc_buf_hdr_t *hdr;
5838         kmutex_t *hash_lock;
5839         dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
5840 
5841         hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
5842         if (hdr == NULL) {
5843                 /* not in cache, try to insert */
5844                 arc_buf_hdr_t *exists;
5845                 arc_buf_contents_t type = buf->b_contents_type;
5846                 l2arc_buf_hdr_t *l2hdr;
5847 
5848                 hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
5849                 hdr->b_dva = buf->b_dva;
5850                 hdr->b_birth = buf->b_birth;
5851                 hdr->b_cksum0 = buf->b_cksum0;
5852                 hdr->b_size = buf->b_size;
5853                 exists = buf_hash_insert(hdr, &hash_lock);
5854                 if (exists) {
5855                         /* somebody beat us to the hash insert */
5856                         mutex_exit(hash_lock);
5857                         arc_hdr_destroy(hdr);
5858                         ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5859                         return;
5860                 }
5861                 hdr->b_flags = buf->b_flags;
5862                 mutex_enter(&hdr->b_freeze_lock);
5863                 ASSERT(hdr->b_freeze_cksum == NULL);
5864                 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
5865                     KM_SLEEP);
5866                 *hdr->b_freeze_cksum = buf->b_freeze_cksum;
5867                 mutex_exit(&hdr->b_freeze_lock);
5868 
5869                 /* now rebuild the l2arc entry */
5870                 ASSERT(hdr->b_l2hdr == NULL);
5871                 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5872                 l2hdr->b_dev = dev;
5873                 l2hdr->b_daddr = buf->b_l2daddr;
5874                 l2hdr->b_asize = buf->b_l2asize;
5875                 l2hdr->b_compress = buf->b_l2compress;
5876                 hdr->b_l2hdr = l2hdr;
5877                 list_insert_head(dev->l2ad_buflist, hdr);
5878                 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5879                 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5880 
5881                 arc_change_state(arc_l2c_only, hdr, hash_lock);
5882         }
5883         mutex_exit(hash_lock);
5884 }
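
l2arc_hdr_restore() follows the classic allocate-then-insert pattern: look the header up, allocate a candidate if absent, attempt the insert, and destroy the candidate if another thread won the race. A single-threaded toy model of that control flow (the chained hash table and the uint64_t key are invented for illustration; the real code keys on the dva/birth tuple under a hash lock):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define	TBL_SZ	16

typedef struct hdr {
	uint64_t h_key;			/* stands in for (dva, birth) */
	struct hdr *h_next;
} hdr_t;

static hdr_t *table[TBL_SZ];

static hdr_t *
hash_find(uint64_t key)
{
	hdr_t *h;

	for (h = table[key % TBL_SZ]; h != NULL; h = h->h_next)
		if (h->h_key == key)
			return (h);
	return (NULL);
}

/* Returns the pre-existing entry if one beat us in, else inserts `h'. */
static hdr_t *
hash_insert(hdr_t *h)
{
	hdr_t *exists = hash_find(h->h_key);

	if (exists != NULL)
		return (exists);
	h->h_next = table[h->h_key % TBL_SZ];
	table[h->h_key % TBL_SZ] = h;
	return (NULL);
}

static void
restore(uint64_t key)
{
	hdr_t *h, *exists;

	if (hash_find(key) != NULL)
		return;			/* already cached, nothing to do */
	h = calloc(1, sizeof (hdr_t));
	h->h_key = key;
	exists = hash_insert(h);
	if (exists != NULL) {
		free(h);		/* somebody beat us to the insert */
		return;
	}
	printf("restored hdr %llu\n", (unsigned long long)key);
}

int
main(void)
{
	restore(42);
	restore(42);			/* second restore is a no-op */
	return (0);
}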
5885 
5886 /*
5887  * Attempts to locate and read the newest valid uberblock on the provided
5888  * L2ARC device and writes it to `ub'. On success, this function returns 0,
5889  * otherwise the appropriate error code is returned.
5890  */
5891 static int
5892 l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
5893 {
5894         int err = 0;
5895         uint8_t *ub_buf;
5896         uint64_t guid;
5897 
5898         ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
5899         ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
5900         guid = spa_guid(dev->l2ad_vdev->vdev_spa);
5901 
5902         if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
5903             VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
5904             ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
5905             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
5906             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
5907                 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5908                 goto cleanup;
5909         }
5910 
5911         /*
5912          * Initial peek - does the device even have any usable uberblocks?
5913          * If not, don't bother continuing.
5914          */
5915         l2arc_uberblock_decode(ub_buf, ub);
5916         if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
5917             ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
5918             ub->ub_spa_guid != guid) {
5919                 err = ENOTSUP;
5920                 ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
5921                 goto cleanup;
5922         }
5923 
5924         /* now check to make sure that what we selected is okay */
5925         if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
5926                 if (err == EINVAL) {
5927                         ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5928                 } else {
5929                         ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
5930                 }
5931                 goto cleanup;
5932         }
5933 
5934         /* this uberblock is valid */
5935 
5936 cleanup:
5937         kmem_free(ub_buf, L2UBERBLOCK_SIZE);
5938         return (err);
5939 }
5940 
5941 /*
5942  * Reads a pbuf from storage, decodes it and validates its contents against
5943  * the provided checksum. The result is placed in `pb'.
5944  *
5945  * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
5946  * When issuing the first pbuf IO during rebuild, you should pass NULL for
5947  * `this_io'. This function will then issue a sync IO to read the pbuf and
5948  * also issue an async IO to fetch the next pbuf in the pbuf chain. The
5949  * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
5950  * function, pass the value returned in `prefetch_io' from the previous
5951  * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
5952  * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
5953  * pointer to be NULL. If no prefetch IO was issued, the pointer is left
5954  * set at NULL.
5955  *
5956  * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
5957  * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
5958  * IO is used internally in this function to `peek' at the next
5959  * buffer's header before the main IO that reads it in completely has finished.
5960  * We can then begin to issue the IO for the next buffer in the chain before
5961  * we are done reading, keeping the L2ARC device's pipeline saturated with
5962  * reads (rather than issuing an IO, waiting for it to complete, validating
5963  * the returned buffer and issuing the next one). This will make sure that
5964  * the rebuild proceeds at maximum read throughput.
5965  *
5966  * On success, this function returns 0, otherwise it returns an appropriate
5967  * error code. On error the prefetching IO is aborted and cleared before
5968  * returning from this function. Therefore, if this function returns an
5969  * error, the caller need not perform any cleanup of prefetch IOs.
5970  */
5971 static int
5972 l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
5973     zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
5974 {
5975         int err = 0;
5976         uint64_t prev_pb_start;
5977         uint32_t prev_pb_asize;
5978         zio_cksum_t calc_cksum, prev_pb_cksum;
5979         l2arc_prefetch_info_t *pi = NULL;
5980 
5981         ASSERT(dev != NULL);
5982         ASSERT(pb != NULL);
5983         ASSERT(*prefetch_io == NULL);
5984 
5985         if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
5986                 /* We could not have issued a prefetch IO for this */
5987                 ASSERT(this_io == NULL);
5988                 return (EINVAL);
5989         }
5990 
5991         /*
5992          * Check to see if we have issued the IO for this pbuf in a previous
5993          * run. If not, issue it now.
5994          */
5995         if (this_io == NULL)
5996                 this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
5997 
5998         /* Pick up the prefetch info buffer and read its contents */
5999         pi = this_io->io_private;
6000         ASSERT(pi != NULL);
6001         ASSERT(asize <= pi->pi_buflen);
6002 
6003         /* Wait for the IO to read this pbuf's header to complete */
6004         if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
6005                 (void) zio_wait(this_io);
6006                 goto cleanup;
6007         }
6008 
6009         /*
6010          * Peek to see if we can start issuing the next pbuf IO immediately.
6011          * At this point, only the current pbuf's header has been read.
6012          */
6013         if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
6014             &prev_pb_asize, &prev_pb_cksum) == 0) {
6015                 uint64_t this_pb_start, this_pb_end, prev_pb_end;
6016                 /* Detect malformed pbuf references and loops */
6017                 this_pb_start = daddr;
6018                 this_pb_end = daddr + asize;
6019                 prev_pb_end = prev_pb_start + prev_pb_asize;
6020                 if ((prev_pb_start >= this_pb_start && prev_pb_start <
6021                     this_pb_end) ||
6022                     (prev_pb_end >= this_pb_start && prev_pb_end <
6023                     this_pb_end)) {
6024                         ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
6025                         cmn_err(CE_WARN, "Looping L2ARC metadata reference "
6026                             "detected, aborting rebuild.");
6027                         err = EINVAL;
6028                         goto cleanup;
6029                 }
6030                 /*
6031                  * Start issuing IO for the next pbuf early - this should
6032                  * help keep the L2ARC device busy while we read, decode
6033                  * and restore this pbuf.
6034                  */
6035                 if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
6036                         *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
6037                             prev_pb_start, prev_pb_asize);
6038         }
6039 
6040         /* Wait for the main pbuf IO to complete */
6041         if ((err = zio_wait(this_io)) != 0)
6042                 goto cleanup;
6043 
6044         /* Make sure the buffer checks out ok */
6045         fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
6046         if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
6047                 err = EINVAL;
6048                 goto cleanup;
6049         }
6050 
6051         /* Now we can take our time decoding this buffer */
6052         if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
6053                 goto cleanup;
6054 
6055         /* This will be used in l2arc_pbuf_restore for space accounting */
6056         pb->pb_asize = asize;
6057 
6058         ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
6059         ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
6060         ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6061             pb->pb_payload_asz / asize);
6062 
6063 cleanup:
6064         kmem_free(pi->pi_buf, pi->pi_buflen);
6065         pi->pi_buf = NULL;
6066         kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6067         /* Abort an in-flight prefetch in case of error */
6068         if (err != 0 && *prefetch_io != NULL) {
6069                 l2arc_pbuf_prefetch_abort(*prefetch_io);
6070                 *prefetch_io = NULL;
6071         }
6072         return (err);
6073 }
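
The `this_io'/`prefetch_io' hand-off documented above is easiest to see from the caller's side. Below is a minimal user-space model under the assumption of an array-backed pbuf chain; read_pbuf() and prefetch() are hypothetical stand-ins for l2arc_pbuf_read() and l2arc_pbuf_prefetch(), not the real interfaces:

#include <stdio.h>

/* fake on-disk pbuf chain: each element names its predecessor's index */
static int chain[] = { -1, 0, 1, 2 };	/* index 3 is the newest pbuf */

typedef struct io { int io_target; } io_t;
static io_t ios[8];

static io_t *
prefetch(int target)		/* stands in for l2arc_pbuf_prefetch */
{
	ios[target].io_target = target;
	return (&ios[target]);
}

/* stands in for l2arc_pbuf_read: consumes this_io, hands back prefetch_io */
static int
read_pbuf(int idx, io_t *this_io, io_t **prefetch_io)
{
	if (this_io == NULL)		/* first call: no prefetch yet */
		this_io = prefetch(idx);
	printf("read pbuf %d (io target %d)\n", idx, this_io->io_target);
	if (chain[idx] >= 0)		/* peek: start the next read early */
		*prefetch_io = prefetch(chain[idx]);
	return (0);
}

int
main(void)
{
	io_t *this_io = NULL, *next_io = NULL;
	int idx = 3;			/* start from the newest pbuf */

	while (idx >= 0) {
		if (read_pbuf(idx, this_io, &next_io) != 0)
			break;
		idx = chain[idx];	/* walk to the previous pbuf */
		this_io = next_io;	/* prefetch becomes current */
		next_io = NULL;
	}
	return (0);
}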
6074 
6075 /*
6076  * Validates a pbuf device address to make sure that it can be read
6077  * from the provided L2ARC device. Returns 1 if the address is within
6078  * the device's bounds, or 0 if not.
6079  */
6080 static int
6081 l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
6082 {
6083         uint32_t psize;
6084         uint64_t end;
6085 
6086         psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
6087         end = daddr + psize;
6088 
6089         if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
6090             asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
6091             /* check that the buffer address is correctly aligned */
6092             (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
6093             SPA_MINBLOCKSIZE) - 1)) != 0)
6094                 return (0);
6095         else
6096                 return (1);
6097 }
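
The validity test above combines three cheap rejections: device bounds, sane size limits, and power-of-two alignment of the start address. A standalone sketch of the same gates (all bounds and limits here are made-up placeholder values, not the real L2ARC constants):

#include <stdio.h>
#include <stdint.h>

#define	DEV_START	4096ULL		/* hypothetical usable area */
#define	DEV_END		(1024ULL * 1024)
#define	MIN_ASIZE	88		/* hypothetical header size */
#define	MAX_ASIZE	(100 * 1024)	/* hypothetical payload cap */
#define	ALIGN		512ULL		/* minimum device block */

static int
ptr_valid(uint64_t daddr, uint32_t asize)
{
	if (daddr + asize > DEV_END || daddr < DEV_START)
		return (0);		/* outside the device's bounds */
	if (asize < MIN_ASIZE || asize > MAX_ASIZE)
		return (0);		/* nonsensical buffer size */
	if ((daddr & (ALIGN - 1)) != 0)
		return (0);		/* misaligned start address */
	return (1);
}

int
main(void)
{
	printf("%d\n", ptr_valid(8192, 1024));	/* 1: valid */
	printf("%d\n", ptr_valid(8193, 1024));	/* 0: misaligned */
	printf("%d\n", ptr_valid(8192, 8));	/* 0: too small */
	return (0);
}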
6098 
6099 /*
6100  * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
6101  * reconstruction to start reading the next pbuf before we are done
6102  * decoding and reconstructing the current pbuf, to keep the l2arc device
6103  * nice and hot with read IO to process.
6104  * The returned zio contains newly allocated memory buffers for the IO
6105  * data, which should then be freed by the caller once the zio is no longer
6106  * needed (i.e. because it has completed). If you wish to abort this
6107  * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
6108  * of disposing of the allocated buffers correctly.
6109  */
6110 static zio_t *
6111 l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
6112 {
6113         uint32_t i, psize;
6114         zio_t *pio, *hdr_io;
6115         uint64_t hdr_rsize;
6116         uint8_t *buf;
6117         l2arc_prefetch_info_t *pinfo;
6118 
6119         psize = vdev_psize_to_asize(vd, asize);
6120         buf = kmem_alloc(psize, KM_SLEEP);
6121         pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
6122         pinfo->pi_buf = buf;
6123         pinfo->pi_buflen = psize;
6124 
6125         /*
6126          * We start issuing the IO for the pbuf header early. This
6127          * allows l2arc_pbuf_read to start issuing IO for the next
6128          * buffer before the current pbuf is read in completely.
6129          */
6130 
6131         hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
6132         ASSERT(hdr_rsize <= psize);
6133         pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
6134             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6135             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
6136         hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
6137             ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
6138             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6139             ZIO_FLAG_DONT_RETRY, B_FALSE);
6140         (void) zio_nowait(hdr_io);
6141 
6142         /*
6143          * Read in the rest of the pbuf - this can take longer than just
6144          * having a peek at the header.
6145          */
6146         pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
6147             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6148             ZIO_FLAG_DONT_RETRY);
6149         for (i = hdr_rsize; i < psize; ) {
6150                 uint64_t rsize = psize - i;
6151                 zio_t *rzio;
6152 
6153                 if (psize - i > SPA_MAXBLOCKSIZE)
6154                         rsize = SPA_MAXBLOCKSIZE;
6155                 ASSERT(rsize >= SPA_MINBLOCKSIZE);
6156                 rzio = zio_read_phys(pio, vd, daddr + i,
6157                     rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6158                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
6159                     ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6160                     ZIO_FLAG_DONT_RETRY, B_FALSE);
6161                 (void) zio_nowait(rzio);
6162                 i += rsize;
6163         }
6164 
6165         return (pio);
6166 }
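
Both this prefetch loop and the commit path in l2arc_pbuf_commit() split one logical transfer into chunks no larger than the maximum block size. The chunking arithmetic in isolation (MAXBLK is a placeholder for SPA_MAXBLOCKSIZE; each printed chunk would correspond to one zio_read_phys or zio_write_phys):

#include <stdio.h>
#include <stdint.h>

#define	MAXBLK	(128 * 1024)	/* placeholder for SPA_MAXBLOCKSIZE */

int
main(void)
{
	uint64_t psize = 300 * 1024;	/* total bytes to transfer */
	uint64_t daddr = 0x10000;	/* starting device offset */
	uint64_t i;

	for (i = 0; i < psize; ) {
		uint64_t rsize = psize - i;

		if (rsize > MAXBLK)
			rsize = MAXBLK;
		printf("io at 0x%llx, %llu bytes\n",
		    (unsigned long long)(daddr + i),
		    (unsigned long long)rsize);
		i += rsize;
	}
	return (0);
}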
6167 
6168 /*
6169  * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
6170  * buffers allocated for it.
6171  */
6172 static void
6173 l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 {
6175         l2arc_prefetch_info_t *pi;
6176 
6177         pi = zio->io_private;
6178         ASSERT(pi != NULL);
6179         if (pi->pi_hdr_io != NULL)
6180                 (void) zio_wait(pi->pi_hdr_io);
6181         (void) zio_wait(zio);
6182         kmem_free(pi->pi_buf, pi->pi_buflen);
6183         pi->pi_buf = NULL;
6184         kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 }
6186 
6187 /*
6188  * Encodes an l2uberblock_t structure into a destination buffer. This
6189  * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190  * uberblock is always of this constant size.
6191  */
6192 static void
6193 l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 {
6195         zio_cksum_t cksum;
6196 
6197         bzero(buf, L2UBERBLOCK_SIZE);
6198 
6199 #if defined(_BIG_ENDIAN)
6200         *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
6201         *(uint16_t *)(buf + 6) = L2UBLK_BIG_ENDIAN;
6202 #else   /* !defined(_BIG_ENDIAN) */
6203         *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
6204         /* zero flags is ok */
6205 #endif  /* !defined(_BIG_ENDIAN) */
6206         buf[4] = L2UBERBLOCK_MAX_VERSION;
6207 
6208         /* rest in native byte order */
6209         *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210         *(uint64_t *)(buf + 16) = ub->ub_birth;
6211         *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212         *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213         *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214         *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215         bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216 
6217         fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218         bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 }
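
The fixed on-disk offsets used by the encoder (guid at byte 8, birth at 16, evict tail at 24, and so on) can be exercised with a user-space round trip. This sketch mirrors a few of those offsets but deliberately uses memcpy instead of unaligned pointer casts, and omits the endianness tag and checksum trailer; UB_SIZE is a stand-in, not the real L2UBERBLOCK_SIZE:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define	UB_SIZE	120		/* stand-in for L2UBERBLOCK_SIZE */

typedef struct ub {
	uint64_t ub_spa_guid;
	uint64_t ub_birth;
	uint64_t ub_evict_tail;
} ub_t;

static void
ub_encode(const ub_t *ub, uint8_t *buf)
{
	memset(buf, 0, UB_SIZE);
	memcpy(buf + 8, &ub->ub_spa_guid, 8);	/* same offsets as above */
	memcpy(buf + 16, &ub->ub_birth, 8);
	memcpy(buf + 24, &ub->ub_evict_tail, 8);
}

static void
ub_decode(const uint8_t *buf, ub_t *ub)
{
	memcpy(&ub->ub_spa_guid, buf + 8, 8);
	memcpy(&ub->ub_birth, buf + 16, 8);
	memcpy(&ub->ub_evict_tail, buf + 24, 8);
}

int
main(void)
{
	uint8_t buf[UB_SIZE];
	ub_t in = { 0xdeadbeef, 7, 0x8000 }, out;

	ub_encode(&in, buf);
	ub_decode(buf, &out);
	printf("guid match: %d\n", in.ub_spa_guid == out.ub_spa_guid);
	return (0);
}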
6220 
6221 /*
6222  * Decodes an l2uberblock_t from an on-disk representation. Please note
6223  * that this function does not perform any uberblock validation and
6224  * checksumming - call l2arc_uberblock_verify() for that.
6225  */
6226 static void
6227 l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 {
6229         boolean_t bswap_needed;
6230 
6231         /* these always come in big endian */
6232 #if defined(_BIG_ENDIAN)
6233         ub->ub_magic = *(uint32_t *)buf;
6234         ub->ub_flags = *(uint16_t *)(buf + 6);
6235         bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) == 0);
6236 #else   /* !defined(_BIG_ENDIAN) */
6237         ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238         ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239         bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 #endif  /* !defined(_BIG_ENDIAN) */
6241         ub->ub_version = buf[4];
6242 
6243         ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244         ub->ub_birth = *(uint64_t *)(buf + 16);
6245         ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246         ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247         ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248         ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
6249         bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250         bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251 
6252         /* swap the rest if endianness doesn't match us */
6253         if (bswap_needed) {
6254                 ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255                 ub->ub_birth = BSWAP_64(ub->ub_birth);
6256                 ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257                 ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258                 ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259                 ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260                 ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261                 ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262         }
6263 }
6264 
6265 /*
6266  * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267  * valid and matches its checksum.
6268  */
6269 static int
6270 l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
6271     uint64_t guid)
6272 {
6273         zio_cksum_t cksum;
6274 
6275         if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
6276             ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
6277                 /*
6278                  * bad magic or invalid version => persistent l2arc not
6279                  * supported
6280                  */
6281                 return (ENOTSUP);
6282 
6283         if (ub->ub_spa_guid != guid)
6284                 /* this l2arc dev isn't ours */
6285                 return (EINVAL);
6286 
6287         fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6288         if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
6289                 /* bad checksum, corrupt uberblock */
6290                 return (EINVAL);
6291 
6292         return (0);
6293 }
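
The verification step relies on a self-checksum convention: the checksum covers every byte of the block except the final 32, where the checksum itself is stored. A sketch of the seal-and-verify pattern with a toy 256-bit checksum standing in for fletcher_4_native (not the real algorithm):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define	BLK_SIZE	120
#define	CKSUM_SIZE	32

/* toy stand-in for fletcher_4_native: NOT the real algorithm */
static void
toy_cksum(const uint8_t *buf, size_t len, uint8_t out[CKSUM_SIZE])
{
	uint64_t acc[4] = { 0, 0, 0, 0 };
	size_t i;

	for (i = 0; i < len; i++)
		acc[i & 3] = acc[i & 3] * 31 + buf[i];
	memcpy(out, acc, CKSUM_SIZE);
}

static void
seal(uint8_t *blk)	/* compute and append the trailer checksum */
{
	toy_cksum(blk, BLK_SIZE - CKSUM_SIZE, blk + BLK_SIZE - CKSUM_SIZE);
}

static int
verify(const uint8_t *blk)	/* 0 on success, like the code above */
{
	uint8_t ck[CKSUM_SIZE];

	toy_cksum(blk, BLK_SIZE - CKSUM_SIZE, ck);
	return (memcmp(ck, blk + BLK_SIZE - CKSUM_SIZE, CKSUM_SIZE) ?
	    -1 : 0);
}

int
main(void)
{
	uint8_t blk[BLK_SIZE] = { 1, 2, 3 };

	seal(blk);
	printf("intact: %d\n", verify(blk));	/* 0 */
	blk[5] ^= 1;				/* corrupt one byte */
	printf("corrupt: %d\n", verify(blk));	/* -1 */
	return (0);
}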
6294 
6295 /*
6296  * Schedules a zio to update the uberblock on an l2arc device. The zio is
6297  * initiated as a child of `pio' and `cb' is filled with the information
6298  * needed to free the uberblock data buffer after writing.
6299  */
6300 static void
6301 l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6302 {
6303         uint8_t *ub_buf;
6304         l2uberblock_t ub;
6305         zio_t *wzio;
6306         vdev_stat_t st;
6307 
6308         ASSERT(cb->l2wcb_ub_buf == NULL);
6309         vdev_get_stats(dev->l2ad_vdev, &st);
6310 
6311         bzero(&ub, sizeof (ub));
6312         ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6313         ub.ub_birth = dev->l2ad_uberblock_birth++;
6314         ub.ub_evict_tail = dev->l2ad_evict;
6315         ub.ub_alloc_space = st.vs_alloc;
6316         ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
6317         ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
6318         ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
6319         if (dev->l2ad_first)
6320                 ub.ub_flags |= L2UBLK_EVICT_FIRST;
6321 
6322         ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
6323         cb->l2wcb_ub_buf = ub_buf;
6324         l2arc_uberblock_encode(&ub, ub_buf);
6325         wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6326             L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
6327             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6328         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6329             zio_t *, wzio);
6330         (void) zio_nowait(wzio);
6331 }
6332 
6333 /*
6334  * Encodes a l2pbuf_t structure into the portable on-disk format. The
6335  * `buf' buffer must be suitably sized to hold the entire uncompressed
6336  * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
6337  * also compresses the buffer.
6338  *
6339  * The return value is the length of the resulting encoded pbuf structure.
6340  * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
6341  * was applied, or smaller if compression was applied. In either case,
6342  * prior to writing to disk, the caller must suitably pad the output
6343  * buffer so that it is aligned on a multiple of the underlying storage
6344  * system's block size.
6345  */
6346 static uint32_t
6347 l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
6348 {
6349         uint16_t flags = 0;
6350         uint8_t *dst_buf;
6351         uint32_t enclen;
6352         l2pbuf_buflist_t *buflist;
6353 
6354         enclen = L2PBUF_ENCODED_SIZE(pb);
6355         ASSERT(buflen >= enclen);
6356         bzero(buf, enclen);
6357 
6358         /* non-header portions of pbufs are in native byte order */
6359         *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
6360         *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
6361         bcopy(&pb->pb_prev_cksum, buf + 20, 32);
6362         *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
6363 
6364         /* first we encode the buflists uncompressed */
6365         dst_buf = buf + L2PBUF_HDR_SIZE;
6366         for (buflist = list_head(pb->pb_buflists_list); buflist;
6367             buflist = list_next(pb->pb_buflists_list, buflist)) {
6368                 int i;
6369 
6370                 ASSERT(buflist->l2pbl_nbufs != 0);
6371                 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
6372                         l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6373 
6374                         ASSERT(pbl_buf->b_size != 0);
6375                         *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
6376                         *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
6377                         *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
6378                         *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
6379                         bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
6380                         *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
6381                         *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
6382                         *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
6383                         dst_buf[80] = pbl_buf->b_l2compress;
6384                         dst_buf[81] = pbl_buf->b_contents_type;
6385                         *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
6386                         dst_buf += L2PBUF_BUF_SIZE;
6387                 }
6388         }
6389         ASSERT((uint32_t)(dst_buf - buf) == enclen);
6390 
6391         /* and then compress them if necessary */
6392         if (enclen >= l2arc_pbuf_compress_minsz) {
6393                 uint8_t *cbuf;
6394                 size_t slen, clen;
6395 
6396                 slen = l2arc_pbuf_items_encoded_size(pb);
6397                 cbuf = kmem_alloc(slen, KM_SLEEP);
6398                 clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
6399                 ASSERT(clen != 0);
6400                 if (clen < slen) {
6401                         bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
6402                         flags |= L2PBUF_COMPRESSED;
6403                         /* zero out the rest of the input buffer */
6404                         bzero(buf + L2PBUF_HDR_SIZE + clen,
6405                             buflen - (L2PBUF_HDR_SIZE + clen));
6406                         /* adjust our buffer length now that it's shortened */
6407                         enclen = L2PBUF_HDR_SIZE + clen;
6408                 }
6409                 kmem_free(cbuf, slen);
6410         }
6411 
6412         /* the header goes last since `flags' may change due to compression */
6413 #if defined(_BIG_ENDIAN)
6414         *(uint32_t *)buf = L2PBUF_MAGIC;
6415         flags |= L2PBUF_BIG_ENDIAN;
6416         *(uint16_t *)(buf + 6) = flags;
6417 #else   /* !defined(_BIG_ENDIAN) */
6418         *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
6419         *(uint16_t *)(buf + 6) = BSWAP_16(flags);
6420 #endif  /* !defined(_BIG_ENDIAN) */
6421         buf[4] = L2PBUF_MAX_VERSION;
6422 
6423         return (enclen);
6424 }
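
The compression step in the encoder is opportunistic: compress into a scratch buffer and keep the result only if it actually shrank, otherwise leave the payload uncompressed. A sketch of that decision with a toy run-length packer standing in for lz4_compress (real LZ4 would be dropped into the same slot):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/*
 * Toy compressor standing in for lz4_compress: collapses runs of a
 * repeated byte into (count, byte) pairs. Returns the output length,
 * or slen if the result would not shrink (i.e. no gain).
 */
static size_t
toy_compress(const uint8_t *src, uint8_t *dst, size_t slen)
{
	size_t si = 0, di = 0;

	while (si < slen) {
		size_t run = 1;

		while (si + run < slen && run < 255 &&
		    src[si + run] == src[si])
			run++;
		if (di + 2 > slen)
			return (slen);		/* would not shrink */
		dst[di++] = (uint8_t)run;
		dst[di++] = src[si];
		si += run;
	}
	return (di);
}

int
main(void)
{
	uint8_t buf[64] = { 0 };	/* highly compressible payload */
	uint8_t *cbuf = malloc(sizeof (buf));
	size_t clen = toy_compress(buf, cbuf, sizeof (buf));

	if (clen < sizeof (buf)) {
		/* keep the compressed form; a COMPRESSED flag would be set */
		memcpy(buf, cbuf, clen);
		memset(buf + clen, 0, sizeof (buf) - clen);
		printf("compressed to %zu bytes\n", clen);
	} else {
		printf("kept uncompressed\n");
	}
	free(cbuf);
	return (0);
}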
6425 
6426 /*
6427  * Decodes a stored l2pbuf_t structure previously encoded using
6428  * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
6429  * must be initialized by l2arc_pbuf_init by the caller beforehand, but
6430  * must not have been used to store any buffers yet.
6431  *
6432  * Please note that we don't do checksum verification here, as we don't
6433  * know our own checksum (that's known by the previous block in the linked
6434  * list, or by the uberblock). This should be performed by the caller
6435  * prior to calling l2arc_pbuf_decode.
6436  */
6437 static int
6438 l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 {
6440         boolean_t bswap_needed;
6441         uint32_t payload_sz, payload_asz;
6442         uint8_t *src_bufs;
6443         l2pbuf_buflist_t *buflist;
6444         int i, nbufs;
6445 
6446         ASSERT(input_buf != NULL);
6447         ASSERT(pb != NULL);
6448         ASSERT(pb->pb_version != 0);
6449         ASSERT(pb->pb_nbuflists == 0);
6450 
6451         /* no valid buffer can be this small */
6452         if (buflen < L2PBUF_HDR_SIZE)
6453                 return (EINVAL);
6454 
6455         /* these always come in big endian */
6456 #if defined(_BIG_ENDIAN)
6457         pb->pb_magic = *(uint32_t *)input_buf;
6458         pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459         bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
6460 #else   /* !defined(_BIG_ENDIAN) */
6461         pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462         pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463         bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 #endif  /* !defined(_BIG_ENDIAN) */
6465         pb->pb_version = input_buf[4];
6466 
6467         if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468                 return (EINVAL);
6469         if (pb->pb_version > L2PBUF_MAX_VERSION)
6470                 return (ENOTSUP);
6471 
6472         /* remainder of pbuf may need bswap'ping */
6473         pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
6474         pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475         bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476         payload_sz = *(uint32_t *)(input_buf + 52);
6477         payload_asz = buflen - L2PBUF_HDR_SIZE;
6478 
6479         if (bswap_needed) {
6480                 pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
6481                 pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482                 ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483                 payload_sz = BSWAP_32(payload_sz);
6484         }
6485 
6486         /* check for sensible buffer allocation limits */
6487         if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488             (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489             (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490                 return (EINVAL);
6491         nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492 
6493         /* decompression might be needed */
6494         if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495                 src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496                 if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497                     payload_asz, payload_sz, 0) != 0) {
6498                         kmem_free(src_bufs, payload_sz);
6499                         return (EINVAL);
6500                 }
6501         } else {
6502                 src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503         }
6504 
6505         /* Decode individual pbuf items from our source buffer. */
6506         buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507         for (i = 0; i < nbufs; i++) {
6508                 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509                 const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510 
6511                 pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512                 pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513                 pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514                 pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515                 bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516                 pbl_buf->b_size = *(uint32_t *)(src + 64);
6517                 pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518                 pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519                 pbl_buf->b_l2compress = src[80];
6520                 pbl_buf->b_contents_type = src[81];
6521                 pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522 
6523                 if (bswap_needed) {
6524                         pbl_buf->b_dva.dva_word[0] =
6525                             BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526                         pbl_buf->b_dva.dva_word[1] =
6527                             BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528                         pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529                         pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530                         ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531                         pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532                         pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533                         pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534                         pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535                 }
6536 
6537                 pb->pb_payload_asz += pbl_buf->b_l2asize;
6538         }
6539 
6540         if (pb->pb_flags & L2PBUF_COMPRESSED)
6541                 kmem_free(src_bufs, payload_sz);
6542 
6543         return (0);
6544 }
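
The decoder's byte-swap handling is worth isolating: the header records the writer's endianness in a flag bit, and the reader swaps multi-byte fields only when that differs from its own byte order. A standalone sketch of the same decision (BE_FLAG and the field layout are illustrative only):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define	BE_FLAG	0x0001		/* "written on a big-endian host" */

static uint64_t
bswap64(uint64_t v)
{
	return (((v & 0xffULL) << 56) | ((v & 0xff00ULL) << 40) |
	    ((v & 0xff0000ULL) << 24) | ((v & 0xff000000ULL) << 8) |
	    ((v >> 8) & 0xff000000ULL) | ((v >> 24) & 0xff0000ULL) |
	    ((v >> 40) & 0xff00ULL) | (v >> 56));
}

static int
host_is_big_endian(void)
{
	uint16_t one = 1;
	uint8_t b;

	memcpy(&b, &one, 1);
	return (b == 0);
}

int
main(void)
{
	uint16_t flags = 0;		/* pretend: little-endian writer */
	uint64_t field = 0x1122334455667788ULL;
	int bswap_needed;

	/* swap only if the writer's endianness differs from ours */
	if (host_is_big_endian())
		bswap_needed = ((flags & BE_FLAG) == 0);
	else
		bswap_needed = ((flags & BE_FLAG) != 0);
	if (bswap_needed)
		field = bswap64(field);
	printf("bswap_needed: %d, field: 0x%llx\n", bswap_needed,
	    (unsigned long long)field);
	return (0);
}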
6545 
6546 /*
6547  * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548  * during L2ARC reconstruction to "peek" at the next buffer and start
6549  * issuing IO to fetch it early, before decoding of the current buffer
6550  * is done (which can take time due to decompression).
6551  * Returns 0 on success (and fills in the return parameters `daddr',
6552  * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553  * on error.
6554  */
6555 static int
6556 l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557     uint32_t *asize, zio_cksum_t *cksum)
6558 {
6559         boolean_t bswap_needed;
6560         uint16_t version, flags;
6561         uint32_t magic;
6562 
6563         ASSERT(buf != NULL);
6564 
6565         /* no valid buffer can be this small */
6566         if (buflen <= L2PBUF_HDR_SIZE)
6567                 return (EINVAL);
6568 
6569         /* these always come in big endian */
6570 #if defined(_BIG_ENDIAN)
6571         magic = *(uint32_t *)buf;
6572         flags = *(uint16_t *)(buf + 6);
6573         bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
6574 #else   /* !defined(_BIG_ENDIAN) */
6575         magic = BSWAP_32(*(uint32_t *)buf);
6576         flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577         bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 #endif  /* !defined(_BIG_ENDIAN) */
6579         version = buf[4];
6580 
6581         if (magic != L2PBUF_MAGIC || version == 0)
6582                 return (EINVAL);
6583         if (version > L2PBUF_MAX_VERSION)
6584                 return (ENOTSUP);
6585 
6586         *daddr = *(uint64_t *)(buf + 8);
6587         *asize = *(uint32_t *)(buf + 16);
6588         bcopy(buf + 20, cksum, 32);
6589 
6590         if (bswap_needed) {
6591                 *daddr = BSWAP_64(*daddr);
6592                 *asize = BSWAP_32(*asize);
6593                 ZIO_CHECKSUM_BSWAP(cksum);
6594         }
6595 
6596         return (0);
6597 }
6598 
6599 /*
6600  * Initializes a pbuf structure into a clean state. All version and flags
6601  * fields are filled in as appropriate for this architecture.
6602  * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603  * as this function assumes the structure is uninitialized.
6604  */
6605 static void
6606 l2arc_pbuf_init(l2pbuf_t *pb)
6607 {
6608         bzero(pb, sizeof (l2pbuf_t));
6609         pb->pb_version = L2PBUF_MAX_VERSION;
6610 #if defined(_BIG_ENDIAN)
6611         pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 #endif
6613         pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614         list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615             offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 }
6617 
6618 /*
6619  * Destroys a pbuf structure and puts it into a clean state ready to be
6620  * initialized by l2arc_pbuf_init. All buflists created by
6621  * l2arc_pbuf_buflist_alloc are released as well.
6622  */
6623 static void
6624 l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 {
6626         list_t *buflist_list = pb->pb_buflists_list;
6627         l2pbuf_buflist_t *buflist;
6628 
6629         while ((buflist = list_head(buflist_list)) != NULL) {
6630                 ASSERT(buflist->l2pbl_nbufs > 0);
6631                 kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632                     buflist->l2pbl_nbufs);
6633                 list_remove(buflist_list, buflist);
6634                 kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635         }
6636         pb->pb_nbuflists = 0;
6637         list_destroy(pb->pb_buflists_list);
6638         kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639         bzero(pb, sizeof (l2pbuf_t));
6640 }
6641 
6642 /*
6643  * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644  * buffers. This is used during the buffer write cycle - each cycle allocates
6645  * a new buflist and fills it with buffers it writes. Then, when the pbuf
6646  * reaches its buflist limit, it is committed to stable storage.
6647  */
6648 static l2pbuf_buflist_t *
6649 l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 {
6651         l2pbuf_buflist_t *buflist;
6652 
6653         ASSERT(pb->pb_buflists_list != NULL);
6654         buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655         buflist->l2pbl_nbufs = nbufs;
6656         buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657             KM_SLEEP);
6658         list_insert_tail(pb->pb_buflists_list, buflist);
6659         pb->pb_nbuflists++;
6660 
6661         return (buflist);
6662 }
6663 
6664 /*
6665  * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at the given `index'.
6666  * The buffer being inserted must be present in L2ARC.
6667  */
6668 static void
6669 l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670     const arc_buf_hdr_t *ab, int index)
6671 {
6672         l2pbuf_buf_t *pb_buf;
6673         const l2arc_buf_hdr_t *l2hdr;
6674 
6675         l2hdr = ab->b_l2hdr;
6676         ASSERT(l2hdr != NULL);
6677         ASSERT(pbl->l2pbl_nbufs > index);
6678 
6679         pb_buf = &pbl->l2pbl_bufs[index];
6680         pb_buf->b_dva = ab->b_dva;
6681         pb_buf->b_birth = ab->b_birth;
6682         pb_buf->b_cksum0 = ab->b_cksum0;
6683         pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684         pb_buf->b_size = ab->b_size;
6685         pb_buf->b_l2daddr = l2hdr->b_daddr;
6686         pb_buf->b_l2asize = l2hdr->b_asize;
6687         pb_buf->b_l2compress = l2hdr->b_compress;
6688         pb_buf->b_contents_type = ab->b_type;
6689         pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690         pb->pb_payload_asz += l2hdr->b_asize;
6691 }
6692 
6693 /*
6694  * Commits a pbuf to stable storage. This routine is invoked when writing
6695  * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696  * has reached its limits (either in size or in number of writes), it is
6697  * scheduled here for writing.
6698  * This function allocates some memory to temporarily hold the serialized
6699  * buffer to be written. This is then released in l2arc_write_done.
6700  */
6701 static void
6702 l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6703 {
6704         l2pbuf_t *pb = &dev->l2ad_pbuf;
6705         uint64_t i, est_encsize, bufsize, encsize, io_size;
6706         uint8_t *pb_buf;
6707 
6708         pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
6709         pb->pb_prev_asize = dev->l2ad_pbuf_asize;
6710         pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
6711 
6712         est_encsize = L2PBUF_ENCODED_SIZE(pb);
6713         bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
6714         pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
6715         encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
6716         cb->l2wcb_pbuf = pb_buf;
6717         cb->l2wcb_pbuf_size = bufsize;
6718 
6719         dev->l2ad_pbuf_daddr = dev->l2ad_hand;
6720         dev->l2ad_pbuf_asize = encsize;
6721         fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
6722 
6723         io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
6724         for (i = 0; i < io_size; ) {
6725                 zio_t *wzio;
6726                 uint64_t wsize = io_size - i;
6727 
6728                 if (wsize > SPA_MAXBLOCKSIZE)
6729                         wsize = SPA_MAXBLOCKSIZE;
6730                 ASSERT(wsize >= SPA_MINBLOCKSIZE);
6731                 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
6732                     wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6733                     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6734                 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6735                     zio_t *, wzio);
6736                 (void) zio_nowait(wzio);
6737                 i += wsize;
6738         }
6739 
6740         dev->l2ad_hand += io_size;
6741         vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
6742         l2arc_uberblock_update(dev, pio, cb);
6743 
6744         ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
6745         ARCSTAT_BUMP(arcstat_l2_meta_writes);
6746         ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
6747         ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
6748         ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6749             pb->pb_payload_asz / encsize);
6750 }
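
The commit sequence maintains the on-disk linked list with a strict ordering: the new pbuf first records its predecessor's address, size, and checksum, then it is written out, and only afterwards do the device's latest-pbuf fields advance and the uberblock get rewritten to point at the new head. A tiny model of that ordering (every structure here is invented for illustration):

#include <stdio.h>
#include <stdint.h>

typedef struct pbuf {
	uint64_t pb_prev_daddr;		/* back-pointer to predecessor */
	uint64_t pb_daddr;		/* where this pbuf was written */
} pbuf_t;

typedef struct dev {
	uint64_t d_hand;		/* next write offset */
	uint64_t d_pbuf_daddr;		/* latest committed pbuf */
} dev_t_;

static pbuf_t log_[8];
static int log_len;

static void
commit(dev_t_ *dev, pbuf_t *pb)
{
	pb->pb_prev_daddr = dev->d_pbuf_daddr;	/* 1. chain to predecessor */
	pb->pb_daddr = dev->d_hand;		/* 2. "write" at the hand... */
	log_[log_len++] = *pb;
	dev->d_hand += 64;			/*    ...and advance it */
	dev->d_pbuf_daddr = pb->pb_daddr;	/* 3. this is now the head */
	/* 4. the uberblock would be rewritten here to point at the head */
}

int
main(void)
{
	dev_t_ dev = { 1024, 0 };
	pbuf_t pb = { 0, 0 };
	int i;

	for (i = 0; i < 3; i++)
		commit(&dev, &pb);
	/* walk the chain backward, newest first, like the rebuild does */
	for (i = log_len - 1; i >= 0; i--)
		printf("pbuf at %llu, prev at %llu\n",
		    (unsigned long long)log_[i].pb_daddr,
		    (unsigned long long)log_[i].pb_prev_daddr);
	return (0);
}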
6751 
6752 /*
6753  * Returns the number of bytes occupied by the payload buffer items of
6754  * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
6755  * L2PBUF_HDR_SIZE.
6756  */
6757 static uint32_t
6758 l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
6759 {
6760         uint32_t size = 0;
6761         l2pbuf_buflist_t *buflist;
6762 
6763         for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
6764             buflist = list_next(pb->pb_buflists_list, buflist))
6765                 size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
6766 
6767         return (size);
6768 }