| 
 
 
 121  */
 122 
 123 #include <sys/spa.h>
 124 #include <sys/zio.h>
 125 #include <sys/zio_compress.h>
 126 #include <sys/zfs_context.h>
 127 #include <sys/arc.h>
 128 #include <sys/refcount.h>
 129 #include <sys/vdev.h>
 130 #include <sys/vdev_impl.h>
 131 #include <sys/dsl_pool.h>
 132 #ifdef _KERNEL
 133 #include <sys/vmsystm.h>
 134 #include <vm/anon.h>
 135 #include <sys/fs/swapnode.h>
 136 #include <sys/dnlc.h>
 137 #endif
 138 #include <sys/callb.h>
 139 #include <sys/kstat.h>
 140 #include <zfs_fletcher.h>
 141 
 142 #ifndef _KERNEL
 143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 144 boolean_t arc_watch = B_FALSE;
     /*
      * NOTE(review): presumably the file descriptor through which the
      * userland watchpoints are installed when arc_watch is set -- confirm.
      */
 145 int arc_procfd;
 146 #endif
 147 
     /*
      * Lock, condition variable and exit flag for the ARC reclaim thread.
      * NOTE(review): presumably arc_thread_exit is set to ask that thread to
      * terminate and arc_reclaim_thr_lock protects the handshake -- confirm.
      */
 148 static kmutex_t         arc_reclaim_thr_lock;
 149 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 150 static uint8_t          arc_thread_exit;
 151 
     /*
      * Tunable, defaulting to ARC_REDUCE_DNLC_PERCENT (3).
      * NOTE(review): presumably the percentage of the DNLC to trim when
      * memory is being reclaimed -- confirm against its user.
      */
 152 #define ARC_REDUCE_DNLC_PERCENT 3
 153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154 
     /*
      * How hard a reclaim pass should try to free memory: aggressively or
      * conservatively.
      */
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159 
 160 /*
 
 299         kstat_named_t arcstat_l2_feeds;
 300         kstat_named_t arcstat_l2_rw_clash;
 301         kstat_named_t arcstat_l2_read_bytes;
 302         kstat_named_t arcstat_l2_write_bytes;
 303         kstat_named_t arcstat_l2_writes_sent;
 304         kstat_named_t arcstat_l2_writes_done;
 305         kstat_named_t arcstat_l2_writes_error;
 306         kstat_named_t arcstat_l2_writes_hdr_miss;
 307         kstat_named_t arcstat_l2_evict_lock_retry;
 308         kstat_named_t arcstat_l2_evict_reading;
 309         kstat_named_t arcstat_l2_free_on_write;
 310         kstat_named_t arcstat_l2_abort_lowmem;
 311         kstat_named_t arcstat_l2_cksum_bad;
 312         kstat_named_t arcstat_l2_io_error;
 313         kstat_named_t arcstat_l2_size;
 314         kstat_named_t arcstat_l2_asize;
 315         kstat_named_t arcstat_l2_hdr_size;
 316         kstat_named_t arcstat_l2_compress_successes;
 317         kstat_named_t arcstat_l2_compress_zeros;
 318         kstat_named_t arcstat_l2_compress_failures;
 319         kstat_named_t arcstat_memory_throttle_count;
 320         kstat_named_t arcstat_duplicate_buffers;
 321         kstat_named_t arcstat_duplicate_buffers_size;
 322         kstat_named_t arcstat_duplicate_reads;
 323         kstat_named_t arcstat_meta_used;
 324         kstat_named_t arcstat_meta_limit;
 325         kstat_named_t arcstat_meta_max;
 326 } arc_stats_t;
 327 
 328 static arc_stats_t arc_stats = {
 329         { "hits",                       KSTAT_DATA_UINT64 },
 330         { "misses",                     KSTAT_DATA_UINT64 },
 331         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 332         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 333         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 334         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 335         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 336         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 337         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 338         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 
 365         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 366         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 367         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 368         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 369         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 370         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 371         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 372         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 373         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 374         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 375         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 376         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 377         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 378         { "l2_io_error",                KSTAT_DATA_UINT64 },
 379         { "l2_size",                    KSTAT_DATA_UINT64 },
 380         { "l2_asize",                   KSTAT_DATA_UINT64 },
 381         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 382         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 383         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 384         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 385         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 386         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 387         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 388         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 389         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 390         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 391         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 392 };
 393 
     /*
      * Accessors for the named kstats held in arc_stats.
      *
      * ARCSTAT(stat) expands to the 64-bit value of a statistic and can be
      * used as an lvalue or have its address taken.  ARCSTAT_INCR(stat, val)
      * adds 'val' to the statistic with atomic_add_64(); pass a negative
      * 'val' to decrement.  ARCSTAT_BUMP() and ARCSTAT_BUMPDOWN() are the
      * +1 and -1 special cases.
      */
 394 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 395 
 396 #define ARCSTAT_INCR(stat, val) \
 397         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 398 
 399 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 400 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 401 
 402 #define ARCSTAT_MAX(stat, val) {                                        \
 403         uint64_t m;                                                     \
 404         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 
 412 /*
 413  * We define a macro to allow ARC hits/misses to be easily broken down by
 414  * two separate conditions, giving a total of four different subtypes for
 415  * each of hits and misses (so eight statistics total).
      *
      * The statistic that gets bumped is named by token pasting:
      *
      *      arcstat_<stat1 or notstat1>_<stat2 or notstat2>_<stat>
      *
      * where stat1 is chosen when cond1 is true (notstat1 otherwise) and
      * stat2 is chosen when cond2 is true (notstat2 otherwise).  Each
      * condition is evaluated exactly once.
      *
      * NOTE: the expansion is a bare if/else statement, not a
      * do { } while (0) block, so take care when using it as the body of an
      * unbraced if/else.
 416  */
 417 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 418         if (cond1) {                                                    \
 419                 if (cond2) {                                            \
 420                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 421                 } else {                                                \
 422                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 423                 }                                                       \
 424         } else {                                                        \
 425                 if (cond2) {                                            \
 426                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 427                 } else {                                                \
 428                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 429                 }                                                       \
 430         }
 431 
     /*
      * arc_ksp is the kstat handle (NOTE(review): presumably the one that
      * exports arc_stats -- confirm).
      *
      * The remaining pointers are the ARC states; a header's b_state points
      * at one of them.  A freshly allocated buffer's header starts out in
      * arc_anon, and headers entering or leaving arc_l2c_only have their size
      * moved between the hdr_size and l2_hdr_size statistics.
      */
 432 kstat_t                 *arc_ksp;
 433 static arc_state_t      *arc_anon;
 434 static arc_state_t      *arc_mru;
 435 static arc_state_t      *arc_mru_ghost;
 436 static arc_state_t      *arc_mfu;
 437 static arc_state_t      *arc_mfu_ghost;
 438 static arc_state_t      *arc_l2c_only;
 439 
 440 /*
 441  * There are several ARC variables that are critical to export as kstats --
 442  * but we don't want to have to grovel around in the kstat whenever we wish to
 443  * manipulate them.  For these variables, we therefore define them to be in
 444  * terms of the statistic variable.  This assures that we are not introducing
 445  * the possibility of inconsistency by having shadow copies of the variables,
 446  * while still allowing the code to be readable.
 447  */
 448 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 449 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 450 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 451 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 
 620 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 621 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 622 
 623 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 624 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 625 
 626 /* L2ARC Performance Tunables */
 627 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 628 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 629 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
     /*
      * Percentage factor by which headroom is multiplied for the next scan
      * cycle when compressed buffers were found during ARC scanning.
      */
 630 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 631 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 632 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 633 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 634 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 635 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 636 
 637 /*
 638  * L2ARC Internals
 639  */

     /*
      * State kept for one L2ARC cache device.  The device is written at
      * l2ad_hand, which moves between l2ad_start and l2ad_end; l2arc_evict()
      * clears the region ahead of the hand (all the way to l2ad_end when the
      * hand is close to it) and does nothing during the first sweep unless
      * asked to evict everything.  Headers cached on the device are linked on
      * l2ad_buflist, and the device itself is linked on the global device
      * list through l2ad_node.
      */
 640 typedef struct l2arc_dev {
 641         vdev_t                  *l2ad_vdev;     /* vdev */
 642         spa_t                   *l2ad_spa;      /* spa */
 643         uint64_t                l2ad_hand;      /* next write location */
 644         uint64_t                l2ad_start;     /* first addr on device */
 645         uint64_t                l2ad_end;       /* last addr on device */
 646         uint64_t                l2ad_evict;     /* last addr eviction reached */
 647         boolean_t               l2ad_first;     /* first sweep through */
 648         boolean_t               l2ad_writing;   /* currently writing */
 649         list_t                  *l2ad_buflist;  /* buffer list */
 650         list_node_t             l2ad_node;      /* device list node */
 651 } l2arc_dev_t;
 652 
     /*
      * Global L2ARC bookkeeping.  l2arc_dev_get_next() walks l2arc_dev_list
      * and updates l2arc_dev_last while holding l2arc_dev_mtx, and returns
      * early when l2arc_ndev is zero.  l2arc_buflist_mtx is held by
      * l2arc_write_done() and l2arc_evict() while they walk a device's
      * buflist.
      */
 653 static list_t L2ARC_dev_list;                   /* device list */
 654 static list_t *l2arc_dev_list;                  /* device list pointer */
 655 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 656 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 657 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 658 static list_t L2ARC_free_on_write;              /* free after write buf list */
 659 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 660 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 661 static uint64_t l2arc_ndev;                     /* number of devices */
 662 
     /*
      * Callback state for a read issued to an L2ARC device;
      * l2arc_read_done() operates on one of these.  It records the
      * buffer being filled together with the original request (spa, block
      * pointer, bookmark, flags) and the compression applied to the data.
      */
 663 typedef struct l2arc_read_callback {
 664         arc_buf_t               *l2rcb_buf;             /* read buffer */
 665         spa_t                   *l2rcb_spa;             /* spa */
 666         blkptr_t                l2rcb_bp;               /* original blkptr */
 667         zbookmark_t             l2rcb_zb;               /* original bookmark */
 668         int                     l2rcb_flags;            /* original flags */
 669         enum zio_compress       l2rcb_compress;         /* applied compress */
 670 } l2arc_read_callback_t;
 671 
     /*
      * Callback state for a write issued to an L2ARC device.
      * l2arc_write_done() receives it through zio->io_private, walks the
      * device's buflist backwards starting from l2wcb_head, and frees the
      * structure when it is finished.
      */
 672 typedef struct l2arc_write_callback {
 673         l2arc_dev_t     *l2wcb_dev;             /* device info */
 674         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 675 } l2arc_write_callback_t;
 676 
     /*
      * L2ARC-specific part of a buffer header, reached through the b_l2hdr
      * pointer of an arc_buf_hdr_t.  It is released with
      * kmem_free(..., sizeof (l2arc_buf_hdr_t)), and b_asize is the amount
      * charged to the l2_asize statistic for the buffer.
      */
 677 struct l2arc_buf_hdr {
 678         /* protected by arc_buf_hdr mutex */
 679         l2arc_dev_t             *b_dev;         /* L2ARC device */
 680         uint64_t                b_daddr;        /* disk address, offset byte */
 681         /* compression applied to buffer data */
 682         enum zio_compress       b_compress;
 683         /* real alloc'd buffer size depending on b_compress applied */
 684         int                     b_asize;
 685         /* temporary buffer holder for in-flight compressed data */
 686         void                    *b_tmp_cdata;
 687 };
 688 
     /*
      * One entry on the free-on-write list: a data buffer whose release was
      * deferred.  It is released by calling l2df_func(l2df_data, l2df_size),
      * after which the entry is unlinked and freed.
      */
 689 typedef struct l2arc_data_free {
 690         /* protected by l2arc_free_on_write_mtx */
 691         void            *l2df_data;
 692         size_t          l2df_size;
 693         void            (*l2df_func)(void *, size_t);
 694         list_node_t     l2df_list_node;
 695 } l2arc_data_free_t;
 696 
     /*
      * Lock, condition variable and exit flag for the L2ARC feed thread.
      * NOTE(review): their use is not visible in this part of the file;
      * presumably they mirror the arc_reclaim_thr_* trio -- confirm.
      */
 697 static kmutex_t l2arc_feed_thr_lock;
 698 static kcondvar_t l2arc_feed_thr_cv;
 699 static uint8_t l2arc_thread_exit;
 700 
     /*
      * Forward declarations for L2ARC helpers that are referenced before
      * their definitions.
      */
 701 static void l2arc_read_done(zio_t *zio);
 702 static void l2arc_hdr_stat_add(void);
 703 static void l2arc_hdr_stat_remove(void);
 704 
 705 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 706 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 707     enum zio_compress c);
 708 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 709 
     /*
      * Hash a buffer identity (spa, dva, birth) to a 64-bit value.
      *
      * A CRC-64 is run over the raw bytes of the DVA using zfs_crc64_table,
      * and the result is then XORed with the spa value shifted right by
      * eight bits and with the birth value.
      *
      * The DVA is only read, so it is viewed through a pointer to const
      * bytes, and the loop index is unsigned to match the sizeof bound.
      */
 710 static uint64_t
 711 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 712 {
 713         const uint8_t *vdva = (const uint8_t *)dva;
 714         uint64_t crc = -1ULL;
 715         size_t i;
 716 
         /* sanity check on the contents of the CRC table */
 717         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 718 
 719         for (i = 0; i < sizeof (dva_t); i++)
 720                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 721 
 722         crc ^= (spa>>8) ^ birth;
 723 
 724         return (crc);
 725 }
 726 
     /*
      * True when a header carries no identity: both words of its DVA and its
      * birth value are zero.  The argument is evaluated up to three times, so
      * it must not have side effects.
      */
 727 #define BUF_EMPTY(buf)                                          \
 728         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 729         (buf)->b_dva.dva_word[1] == 0 &&                     \
 730         (buf)->b_birth == 0)
 
1230                         if (use_mutex)
1231                                 mutex_exit(&new_state->arcs_mtx);
1232                 }
1233         }
1234 
1235         ASSERT(!BUF_EMPTY(ab));
1236         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1237                 buf_hash_remove(ab);
1238 
1239         /* adjust state sizes */
1240         if (to_delta)
1241                 atomic_add_64(&new_state->arcs_size, to_delta);
1242         if (from_delta) {
1243                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1244                 atomic_add_64(&old_state->arcs_size, -from_delta);
1245         }
1246         ab->b_state = new_state;
1247 
1248         /* adjust l2arc hdr stats */
1249         if (new_state == arc_l2c_only)
1250                 l2arc_hdr_stat_add();
1251         else if (old_state == arc_l2c_only)
1252                 l2arc_hdr_stat_remove();
1253 }
1254 
1255 void
1256 arc_space_consume(uint64_t space, arc_space_type_t type)
1257 {
1258         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259 
1260         switch (type) {
1261         case ARC_SPACE_DATA:
1262                 ARCSTAT_INCR(arcstat_data_size, space);
1263                 break;
1264         case ARC_SPACE_OTHER:
1265                 ARCSTAT_INCR(arcstat_other_size, space);
1266                 break;
1267         case ARC_SPACE_HDRS:
1268                 ARCSTAT_INCR(arcstat_hdr_size, space);
1269                 break;
1270         case ARC_SPACE_L2HDRS:
 
1334         hdr->b_type = type;
1335         hdr->b_spa = spa_load_guid(spa);
1336         hdr->b_state = arc_anon;
1337         hdr->b_arc_access = 0;
1338         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1339         buf->b_hdr = hdr;
1340         buf->b_data = NULL;
1341         buf->b_efunc = NULL;
1342         buf->b_private = NULL;
1343         buf->b_next = NULL;
1344         hdr->b_buf = buf;
1345         arc_get_data_buf(buf);
1346         hdr->b_datacnt = 1;
1347         hdr->b_flags = 0;
1348         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349         (void) refcount_add(&hdr->b_refcnt, tag);
1350 
1351         return (buf);
1352 }
1353 
1354 static char *arc_onloan_tag = "onloan";
1355 
1356 /*
1357  * Lend out an anonymous ARC buffer of 'size' bytes.  While a buffer is on
1358  * loan, arc_tempreserve_space() does not count it as in-flight data; that
1359  * only happens once the buffer has been "returned".  A loaned buffer has
1360  * to be returned to the arc before the DMU can use it or it can be freed.
1361  */
1362 arc_buf_t *
1363 arc_loan_buf(spa_t *spa, int size)
1364 {
1365         arc_buf_t *loaned = arc_buf_alloc(spa, size, arc_onloan_tag,
1366             ARC_BUFC_DATA);
1367 
1368         /* keep track of how many bytes are currently out on loan */
1369         atomic_add_64(&arc_loaned_bytes, size);
1370         return (loaned);
1371 }
1372 
1373 /*
 
1571                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1572                 /*
1573                  * To prevent arc_free() and l2arc_evict() from
1574                  * attempting to free the same buffer at the same time,
1575                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1576                  * give it priority.  l2arc_evict() can't destroy this
1577                  * header while we are waiting on l2arc_buflist_mtx.
1578                  *
1579                  * The hdr may be removed from l2ad_buflist before we
1580                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1581                  */
1582                 if (!buflist_held) {
1583                         mutex_enter(&l2arc_buflist_mtx);
1584                         l2hdr = hdr->b_l2hdr;
1585                 }
1586 
1587                 if (l2hdr != NULL) {
1588                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1589                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1590                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1591                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1592                         if (hdr->b_state == arc_l2c_only)
1593                                 l2arc_hdr_stat_remove();
1594                         hdr->b_l2hdr = NULL;
1595                 }
1596 
1597                 if (!buflist_held)
1598                         mutex_exit(&l2arc_buflist_mtx);
1599         }
1600 
1601         if (!BUF_EMPTY(hdr)) {
1602                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1603                 buf_discard_identity(hdr);
1604         }
1605         while (hdr->b_buf) {
1606                 arc_buf_t *buf = hdr->b_buf;
1607 
1608                 if (buf->b_efunc) {
1609                         mutex_enter(&arc_eviction_mtx);
1610                         mutex_enter(&buf->b_evict_lock);
1611                         ASSERT(buf->b_hdr != NULL);
 
3028                         buf->b_next = NULL;
3029                         hdr->b_buf = buf;
3030                         ASSERT(hdr->b_datacnt == 0);
3031                         hdr->b_datacnt = 1;
3032                         arc_get_data_buf(buf);
3033                         arc_access(hdr, hash_lock);
3034                 }
3035 
3036                 ASSERT(!GHOST_STATE(hdr->b_state));
3037 
3038                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3039                 acb->acb_done = done;
3040                 acb->acb_private = private;
3041 
3042                 ASSERT(hdr->b_acb == NULL);
3043                 hdr->b_acb = acb;
3044                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3045 
3046                 if (hdr->b_l2hdr != NULL &&
3047                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3048                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3049                         addr = hdr->b_l2hdr->b_daddr;
3050                         b_compress = hdr->b_l2hdr->b_compress;
3051                         b_asize = hdr->b_l2hdr->b_asize;
3052                         /*
3053                          * Lock out device removal.
3054                          */
3055                         if (vdev_is_dead(vd) ||
3056                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3057                                 vd = NULL;
3058                 }
3059 
3060                 mutex_exit(hash_lock);
3061 
3062                 /*
3063                  * At this point, we have a level 1 cache miss.  Try again in
3064                  * L2ARC if possible.
3065                  */
3066                 ASSERT3U(hdr->b_size, ==, size);
3067                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 
3401                 atomic_add_64(&arc_anon->arcs_size, blksz);
3402         } else {
3403                 mutex_exit(&buf->b_evict_lock);
3404                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3405                 ASSERT(!list_link_active(&hdr->b_arc_node));
3406                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3407                 if (hdr->b_state != arc_anon)
3408                         arc_change_state(arc_anon, hdr, hash_lock);
3409                 hdr->b_arc_access = 0;
3410                 if (hash_lock)
3411                         mutex_exit(hash_lock);
3412 
3413                 buf_discard_identity(hdr);
3414                 arc_buf_thaw(buf);
3415         }
3416         buf->b_efunc = NULL;
3417         buf->b_private = NULL;
3418 
3419         if (l2hdr) {
3420                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3421                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3422                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3423                 mutex_exit(&l2arc_buflist_mtx);
3424         }
3425 }
3426 
     /*
      * Report whether 'buf' has been released.  The result is 1 when the
      * buffer still has data attached and its header is in the arc_anon
      * state, and 0 otherwise.  The check is made under the buffer's
      * b_evict_lock.
      */
3427 int
3428 arc_released(arc_buf_t *buf)
3429 {
3430         int anon;
3431 
3432         mutex_enter(&buf->b_evict_lock);
3433         anon = !(buf->b_data == NULL || buf->b_hdr->b_state != arc_anon);
3434         mutex_exit(&buf->b_evict_lock);
3435         return (anon);
3436 }
3437 
3438 int
3439 arc_has_callback(arc_buf_t *buf)
3440 {
3441         int callback;
 
4016  *      l2arc_noprefetch        skip caching prefetched buffers
4017  *      l2arc_headroom          number of max device writes to precache
4018  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4019  *                              scanning, we multiply headroom by this
4020  *                              percentage factor for the next scan cycle,
4021  *                              since more compressed buffers are likely to
4022  *                              be present
4023  *      l2arc_feed_secs         seconds between L2ARC writing
4024  *
4025  * Tunables may be removed or added as future performance improvements are
4026  * integrated, and also may become zpool properties.
4027  *
4028  * There are three key functions that control how the L2ARC warms up:
4029  *
4030  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4031  *      l2arc_write_size()      calculate how much to write
4032  *      l2arc_write_interval()  calculate sleep delay between writes
4033  *
4034  * These three functions determine what to write, how much, and how quickly
4035  * to send writes.
4036  */
4037 
     /*
      * Decide whether the buffer described by 'ab' may be written to the
      * L2ARC on behalf of the pool identified by 'spa_guid'.  Returns B_TRUE
      * if it may, B_FALSE as soon as one of the disqualifying conditions
      * below is met.
      */
4038 static boolean_t
4039 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4040 {
4041         /* 1. The buffer belongs to a different spa. */
4042         if (ab->b_spa != spa_guid)
4043                 return (B_FALSE);
4044 
4045         /* 2. The buffer is already cached on the L2ARC. */
4046         if (ab->b_l2hdr != NULL)
4047                 return (B_FALSE);
4048 
4049         /* 3. I/O in progress (maybe an incomplete read); 4. flagged out. */
4050         if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4051                 return (B_FALSE);
4052         return (B_TRUE);
4053 }
4054 
4055 static uint64_t
 
4082         clock_t interval, next, now;
4083 
4084         /*
4085          * If the ARC lists are busy, increase our write rate; if the
4086          * lists are stale, idle back.  This is achieved by checking
4087          * how much we previously wrote - if it was more than half of
4088          * what we wanted, schedule the next write much sooner.
4089          */
4090         if (l2arc_feed_again && wrote > (wanted / 2))
4091                 interval = (hz * l2arc_feed_min_ms) / 1000;
4092         else
4093                 interval = hz * l2arc_feed_secs;
4094 
4095         now = ddi_get_lbolt();
4096         next = MAX(now, MIN(now + interval, began + interval));
4097 
4098         return (next);
4099 }
4100 
     /*
      * Statistics adjustment made when a header's new state is arc_l2c_only:
      * HDR_SIZE + L2HDR_SIZE is added to arcstat_l2_hdr_size and HDR_SIZE is
      * taken back out of arcstat_hdr_size.  l2arc_hdr_stat_remove() is the
      * exact inverse.
      */
4101 static void
4102 l2arc_hdr_stat_add(void)
4103 {
4104         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4105         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4106 }
4107 
     /*
      * Statistics adjustment made when a header leaves the arc_l2c_only
      * state, or when its L2ARC header is freed while it is in that state:
      * HDR_SIZE + L2HDR_SIZE is subtracted from arcstat_l2_hdr_size and
      * HDR_SIZE is added back to arcstat_hdr_size.  This undoes
      * l2arc_hdr_stat_add().
      */
4108 static void
4109 l2arc_hdr_stat_remove(void)
4110 {
4111         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4112         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4113 }
4114 
4115 /*
4116  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4117  * If a device is returned, this also returns holding the spa config lock.
      *
      * The search starts at the device after l2arc_dev_last and wraps around
      * the device list, skipping devices whose vdev is dead.  NULL is
      * returned when there are no devices or when every device is dead.
      * l2arc_dev_last is updated to the returned value (including NULL)
      * whenever a search is made.
      *
      * On a non-NULL return the caller holds SCL_L2ARC of the device's spa
      * as reader, taken with the returned device pointer as the tag.
4118  */
4119 static l2arc_dev_t *
4120 l2arc_dev_get_next(void)
4121 {
4122         l2arc_dev_t *first, *next = NULL;
4123 
4124         /*
4125          * Lock out the removal of spas (spa_namespace_lock), then removal
4126          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4127          * both locks will be dropped and a spa config lock held instead.
4128          */
4129         mutex_enter(&spa_namespace_lock);
4130         mutex_enter(&l2arc_dev_mtx);
4131 
4132         /* if there are no vdevs, there is nothing to do */
4133         if (l2arc_ndev == 0)
4134                 goto out;
4135 
         /* 'first' remembers the first candidate so a full lap can be detected */
4136         first = NULL;
4137         next = l2arc_dev_last;
4138         do {
4139                 /* loop around the list looking for a non-faulted vdev */
4140                 if (next == NULL) {
4141                         next = list_head(l2arc_dev_list);
4142                 } else {
4143                         next = list_next(l2arc_dev_list, next);
4144                         if (next == NULL)
4145                                 next = list_head(l2arc_dev_list);
4146                 }
4147 
4148                 /* if we have come back to the start, bail out */
4149                 if (first == NULL)
4150                         first = next;
4151                 else if (next == first)
4152                         break;
4153 
4154         } while (vdev_is_dead(next->l2ad_vdev));
4155 
4156         /* if we were unable to find any usable vdevs, return NULL */
4157         if (vdev_is_dead(next->l2ad_vdev))
4158                 next = NULL;
4159 
4160         l2arc_dev_last = next;
4161 
4162 out:
4163         mutex_exit(&l2arc_dev_mtx);
4164 
4165         /*
4166          * Grab the config lock to prevent the 'next' device from being
4167          * removed while we are writing to it.
          * This is done before spa_namespace_lock is dropped, so the spa
          * cannot go away in between.
4168          */
4169         if (next != NULL)
4170                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4171         mutex_exit(&spa_namespace_lock);
4172 
4173         return (next);
4174 }
4175 
4176 /*
4177  * Free buffers that were tagged for destruction.
 
4191                 ASSERT(df->l2df_func != NULL);
4192                 df->l2df_func(df->l2df_data, df->l2df_size);
4193                 list_remove(buflist, df);
4194                 kmem_free(df, sizeof (l2arc_data_free_t));
4195         }
4196 
4197         mutex_exit(&l2arc_free_on_write_mtx);
4198 }
4199 
4200 /*
4201  * A write to a cache device has completed.  Update all headers to allow
4202  * reads from these buffers to begin.
      *
      * zio->io_private carries the l2arc_write_callback_t for the write: the
      * device that was written and the marker header ("head") linked into
      * that device's buflist.  The headers to update are found by walking
      * the buflist backwards from the marker while holding
      * l2arc_buflist_mtx.  For every header whose hash lock can be taken
      * without blocking, any temporary compressed buffer is released, the
      * L2ARC entry is dropped if the write failed, and ARC_L2_WRITING is
      * cleared.  Afterwards the marker is unlinked and freed, buffers queued
      * for free-on-write are released, and the callback structure is freed.
4203  */
4204 static void
4205 l2arc_write_done(zio_t *zio)
4206 {
4207         l2arc_write_callback_t *cb;
4208         l2arc_dev_t *dev;
4209         list_t *buflist;
4210         arc_buf_hdr_t *head, *ab, *ab_prev;
4211         l2arc_buf_hdr_t *abl2;
4212         kmutex_t *hash_lock;
4213 
4214         cb = zio->io_private;
4215         ASSERT(cb != NULL);
4216         dev = cb->l2wcb_dev;
4217         ASSERT(dev != NULL);
4218         head = cb->l2wcb_head;
4219         ASSERT(head != NULL);
4220         buflist = dev->l2ad_buflist;
4221         ASSERT(buflist != NULL);
4222         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4223             l2arc_write_callback_t *, cb);
4224 
         /* a failed write is counted once, however many buffers it covered */
4225         if (zio->io_error != 0)
4226                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4227 
4228         mutex_enter(&l2arc_buflist_mtx);
4229 
4230         /*
4231          * All writes completed, or an error was hit.
4232          */
4233         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
                 /* fetch the predecessor first: 'ab' may be unlinked below */
4234                 ab_prev = list_prev(buflist, ab);
4235 
4236                 hash_lock = HDR_LOCK(ab);
4237                 if (!mutex_tryenter(hash_lock)) {
4238                         /*
4239                          * This buffer misses out.  It may be in a stage
4240                          * of eviction.  Its ARC_L2_WRITING flag will be
4241                          * left set, denying reads to this buffer.
                          *
                          * NOTE(review): because of the continue, a
                          * skipped buffer also keeps its temporary
                          * compressed data and, when the write failed,
                          * its L2ARC entry is not dropped here.
4242                          */
4243                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4244                         continue;
4245                 }
4246 
4247                 abl2 = ab->b_l2hdr;
4248 
4249                 /*
4250                  * Release the temporary compressed buffer as soon as possible.
4251                  */
4252                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4253                         l2arc_release_cdata_buf(ab);
4254 
4255                 if (zio->io_error != 0) {
4256                         /*
4257                          * Error - drop L2ARC entry.
4258                          */
4259                         list_remove(buflist, ab);
4260                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4261                         ab->b_l2hdr = NULL;
4262                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4263                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4264                 }
4265 
4266                 /*
4267                  * Allow ARC to begin reads to this L2ARC entry.
4268                  */
4269                 ab->b_flags &= ~ARC_L2_WRITING;
4270 
4271                 mutex_exit(hash_lock);
4272         }
4273 
4274         atomic_inc_64(&l2arc_writes_done);
         /* the marker header has served its purpose; unlink and free it */
4275         list_remove(buflist, head);
4276         kmem_cache_free(hdr_cache, head);
4277         mutex_exit(&l2arc_buflist_mtx);
4278 
4279         l2arc_do_free_on_write();
4280 
4281         kmem_free(cb, sizeof (l2arc_write_callback_t));
4282 }
4283 
4284 /*
4285  * A read to a cache device completed.  Validate buffer contents before
4286  * handing over to the regular ARC routines.
4287  */
4288 static void
4289 l2arc_read_done(zio_t *zio)
4290 {
4291         l2arc_read_callback_t *cb;
4292         arc_buf_hdr_t *hdr;
4293         arc_buf_t *buf;
4294         kmutex_t *hash_lock;
4295         int equal;
4296 
4297         ASSERT(zio->io_vd != NULL);
4298         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4299 
4300         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
4384         case 1:
4385                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4386                 *lock = &arc_mru->arcs_mtx;
4387                 break;
4388         case 2:
4389                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4390                 *lock = &arc_mfu->arcs_mtx;
4391                 break;
4392         case 3:
4393                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4394                 *lock = &arc_mru->arcs_mtx;
4395                 break;
4396         }
4397 
4398         ASSERT(!(MUTEX_HELD(*lock)));
4399         mutex_enter(*lock);
4400         return (list);
4401 }
4402 
4403 /*
4404  * Evict buffers from the device write hand to the distance specified in
4405  * bytes.  This distance may span populated buffers, it may span nothing.
4406  * This is clearing a region on the L2ARC device ready for writing.
4407  * If the 'all' boolean is set, every buffer is evicted.
4408  */
4409 static void
4410 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4411 {
4412         list_t *buflist;
4413         l2arc_buf_hdr_t *abl2;
4414         arc_buf_hdr_t *ab, *ab_prev;
4415         kmutex_t *hash_lock;
4416         uint64_t taddr;
4417 
4418         buflist = dev->l2ad_buflist;
4419 
4420         if (buflist == NULL)
4421                 return;
4422 
4423         if (!all && dev->l2ad_first) {
4424                 /*
4425                  * This is the first sweep through the device.  There is
4426                  * nothing to evict.
4427                  */
4428                 return;
4429         }
4430 
4431         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4432                 /*
4433                  * When nearing the end of the device, evict to the end
4434                  * before the device write hand jumps to the start.
4435                  */
4436                 taddr = dev->l2ad_end;
4437         } else {
4438                 taddr = dev->l2ad_hand + distance;
4439         }
4440         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4441             uint64_t, taddr, boolean_t, all);
4442 
4443 top:
4444         mutex_enter(&l2arc_buflist_mtx);
4445         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4446                 ab_prev = list_prev(buflist, ab);
4447 
4448                 hash_lock = HDR_LOCK(ab);
4449                 if (!mutex_tryenter(hash_lock)) {
4450                         /*
 
4493                          * arc_hdr_destroy() will call list_remove()
4494                          * and decrement arcstat_l2_size.
4495                          */
4496                         arc_change_state(arc_anon, ab, hash_lock);
4497                         arc_hdr_destroy(ab);
4498                 } else {
4499                         /*
4500                          * Invalidate issued or about to be issued
4501                          * reads, since we may be about to write
4502                          * over this location.
4503                          */
4504                         if (HDR_L2_READING(ab)) {
4505                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4506                                 ab->b_flags |= ARC_L2_EVICTED;
4507                         }
4508 
4509                         /*
4510                          * Tell ARC this no longer exists in L2ARC.
4511                          */
4512                         if (ab->b_l2hdr != NULL) {
4513                                 abl2 = ab->b_l2hdr;
4514                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4515                                 ab->b_l2hdr = NULL;
4516                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4517                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4518                         }
4519                         list_remove(buflist, ab);
4520 
4521                         /*
4522                          * This may have been leftover after a
4523                          * failed write.
4524                          */
4525                         ab->b_flags &= ~ARC_L2_WRITING;
4526                 }
4527                 mutex_exit(hash_lock);
4528         }
4529         mutex_exit(&l2arc_buflist_mtx);
4530 
4531         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4532         dev->l2ad_evict = taddr;
4533 }
4534 
4535 /*
4536  * Find and write ARC buffers to the L2ARC device.
4537  *
4538  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4539  * for reading until they have completed writing.
4540  * The headroom_boost is an in-out parameter used to maintain headroom boost
4541  * state between calls to this function.
4542  *
4543  * Returns the number of bytes actually written (which may be smaller than
4544  * the delta by which the device hand has changed due to alignment).
4545  */
4546 static uint64_t
4547 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4548     boolean_t *headroom_boost)
4549 {
4550         arc_buf_hdr_t *ab, *ab_prev, *head;
4551         list_t *list;
4552         uint64_t write_asize, write_psize, write_sz, headroom,
4553             buf_compress_minsz;
4554         void *buf_data;
4555         kmutex_t *list_lock;
4556         boolean_t full;
4557         l2arc_write_callback_t *cb;
4558         zio_t *pio, *wzio;
4559         uint64_t guid = spa_load_guid(spa);
4560         const boolean_t do_headroom_boost = *headroom_boost;
4561 
4562         ASSERT(dev->l2ad_vdev != NULL);
4563 
4564         /* Lower the flag now, we might want to raise it again later. */
4565         *headroom_boost = B_FALSE;
4566 
4567         pio = NULL;
4568         write_sz = write_asize = write_psize = 0;
4569         full = B_FALSE;
4570         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4571         head->b_flags |= ARC_L2_WRITE_HEAD;
4572 
4573         /*
4574          * We will want to try to compress buffers that are at least 2x the
4575          * device sector size.
4576          */
4577         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4578 
4579         /*
4580          * Copy buffers for L2ARC writing.
4581          */
4582         mutex_enter(&l2arc_buflist_mtx);
4583         for (int try = 0; try <= 3; try++) {
4584                 uint64_t passed_sz = 0;
4585 
4586                 list = l2arc_list_locked(try, &list_lock);
4587 
4588                 /*
4589                  * L2ARC fast warmup.
4590                  *
4591                  * Until the ARC is warm and starts to evict, read from the
4592                  * head of the ARC lists rather than the tail.
4593                  */
4594                 if (arc_warm == B_FALSE)
4595                         ab = list_head(list);
4596                 else
4597                         ab = list_tail(list);
4598 
4599                 headroom = target_sz * l2arc_headroom;
4600                 if (do_headroom_boost)
4601                         headroom = (headroom * l2arc_headroom_boost) / 100;
4602 
4603                 for (; ab; ab = ab_prev) {
4604                         l2arc_buf_hdr_t *l2hdr;
4605                         kmutex_t *hash_lock;
4606                         uint64_t buf_sz;
4607 
4608                         if (arc_warm == B_FALSE)
4609                                 ab_prev = list_next(list, ab);
4610                         else
4611                                 ab_prev = list_prev(list, ab);
4612 
4613                         hash_lock = HDR_LOCK(ab);
4614                         if (!mutex_tryenter(hash_lock)) {
4615                                 /*
4616                                  * Skip this buffer rather than waiting.
4617                                  */
4618                                 continue;
4619                         }
4620 
4621                         passed_sz += ab->b_size;
4622                         if (passed_sz > headroom) {
4623                                 /*
4624                                  * Searched too far.
4625                                  */
4626                                 mutex_exit(hash_lock);
4627                                 break;
4628                         }
4629 
4630                         if (!l2arc_write_eligible(guid, ab)) {
4631                                 mutex_exit(hash_lock);
4632                                 continue;
4633                         }
4634 
4635                         if ((write_sz + ab->b_size) > target_sz) {
4636                                 full = B_TRUE;
4637                                 mutex_exit(hash_lock);
4638                                 break;
4639                         }
4640 
4641                         if (pio == NULL) {
4642                                 /*
4643                                  * Insert a dummy header on the buflist so
4644                                  * l2arc_write_done() can find where the
4645                                  * write buffers begin without searching.
4646                                  */
4647                                 list_insert_head(dev->l2ad_buflist, head);
4648 
4649                                 cb = kmem_alloc(
4650                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4651                                 cb->l2wcb_dev = dev;
4652                                 cb->l2wcb_head = head;
4653                                 pio = zio_root(spa, l2arc_write_done, cb,
4654                                     ZIO_FLAG_CANFAIL);
4655                         }
4656 
4657                         /*
4658                          * Create and add a new L2ARC header.
4659                          */
4660                         l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4661                         l2hdr->b_dev = dev;
4662                         ab->b_flags |= ARC_L2_WRITING;
4663 
4664                         /*
4665                          * Temporarily stash the data buffer in b_tmp_cdata.
4666                          * The subsequent write step will pick it up from
4667                          * there. This is because can't access ab->b_buf
4668                          * without holding the hash_lock, which we in turn
4669                          * can't access without holding the ARC list locks
4670                          * (which we want to avoid during compression/writing).
4671                          */
4672                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
4673                         l2hdr->b_asize = ab->b_size;
4674                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4675 
4676                         buf_sz = ab->b_size;
4677                         ab->b_l2hdr = l2hdr;
4678 
4679                         list_insert_head(dev->l2ad_buflist, ab);
4680 
4681                         /*
4682                          * Compute and store the buffer cksum before
4683                          * writing.  On debug the cksum is verified first.
4684                          */
4685                         arc_cksum_verify(ab->b_buf);
4686                         arc_cksum_compute(ab->b_buf, B_TRUE);
4687 
4688                         mutex_exit(hash_lock);
4689 
4690                         write_sz += buf_sz;
4691                 }
4692 
4693                 mutex_exit(list_lock);
4694 
4695                 if (full == B_TRUE)
4696                         break;
4697         }
4698 
4699         /* No buffers selected for writing? */
4700         if (pio == NULL) {
4701                 ASSERT0(write_sz);
4702                 mutex_exit(&l2arc_buflist_mtx);
4703                 kmem_cache_free(hdr_cache, head);
4704                 return (0);
4705         }
4706 
4707         /*
4708          * Now start writing the buffers. We're starting at the write head
4709          * and work backwards, retracing the course of the buffer selector
4710          * loop above.
4711          */
4712         for (ab = list_prev(dev->l2ad_buflist, head); ab;
4713             ab = list_prev(dev->l2ad_buflist, ab)) {
4714                 l2arc_buf_hdr_t *l2hdr;
4715                 uint64_t buf_sz;
4716 
4717                 /*
4718                  * We shouldn't need to lock the buffer here, since we flagged
4719                  * it as ARC_L2_WRITING in the previous step, but we must take
4720                  * care to only access its L2 cache parameters. In particular,
4721                  * ab->b_buf may be invalid by now due to ARC eviction.
 
4726                 if ((ab->b_flags & ARC_L2COMPRESS) &&
4727                     l2hdr->b_asize >= buf_compress_minsz) {
4728                         if (l2arc_compress_buf(l2hdr)) {
4729                                 /*
4730                                  * If compression succeeded, enable headroom
4731                                  * boost on the next scan cycle.
4732                                  */
4733                                 *headroom_boost = B_TRUE;
4734                         }
4735                 }
4736 
4737                 /*
4738                  * Pick up the buffer data we had previously stashed away
4739                  * (and now potentially also compressed).
4740                  */
4741                 buf_data = l2hdr->b_tmp_cdata;
4742                 buf_sz = l2hdr->b_asize;
4743 
4744                 /* Compression may have squashed the buffer to zero length. */
4745                 if (buf_sz != 0) {
4746                         uint64_t buf_p_sz;
4747 
4748                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4749                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4750                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4751                             ZIO_FLAG_CANFAIL, B_FALSE);
4752 
4753                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4754                             zio_t *, wzio);
4755                         (void) zio_nowait(wzio);
4756 
4757                         write_asize += buf_sz;
4758                         /*
4759                          * Keep the clock hand suitably device-aligned.
4760                          */
4761                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4762                         write_psize += buf_p_sz;
4763                         dev->l2ad_hand += buf_p_sz;
4764                 }
4765         }
4766 
4767         mutex_exit(&l2arc_buflist_mtx);
4768 
4769         ASSERT3U(write_asize, <=, target_sz);
4770         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4771         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4772         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4773         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4774         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4775 
4776         /*
4777          * Bump device hand to the device start if it is approaching the end.
4778          * l2arc_evict() will already have evicted ahead for this case.
4779          */
4780         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4781                 vdev_space_update(dev->l2ad_vdev,
4782                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4783                 dev->l2ad_hand = dev->l2ad_start;
4784                 dev->l2ad_evict = dev->l2ad_start;
4785                 dev->l2ad_first = B_FALSE;
4786         }
4787 
4788         dev->l2ad_writing = B_TRUE;
4789         (void) zio_wait(pio);
4790         dev->l2ad_writing = B_FALSE;
4791 
4792         return (write_asize);
4793 }
4794 
4795 /*
4796  * Compresses an L2ARC buffer.
4797  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4798  * size in l2hdr->b_asize. This routine tries to compress the data and
4799  * depending on the compression result there are three possible outcomes:
4800  * *) The buffer was incompressible. The original l2hdr contents were left
 
5022                  * Write ARC buffers.
5023                  */
5024                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5025 
5026                 /*
5027                  * Calculate interval between writes.
5028                  */
5029                 next = l2arc_write_interval(begin, size, wrote);
5030                 spa_config_exit(spa, SCL_L2ARC, dev);
5031         }
5032 
5033         l2arc_thread_exit = 0;
5034         cv_broadcast(&l2arc_feed_thr_cv);
5035         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
5036         thread_exit();
5037 }
5038 
/*
 * Report whether 'vd' is currently registered as an L2ARC device.
 *
 * Walks l2arc_dev_list while holding l2arc_dev_mtx, looking for an entry
 * whose l2ad_vdev is 'vd'.  Returns non-zero if such an entry exists and
 * zero otherwise.  The mutex is dropped before returning, so the result is
 * only a snapshot of the list at the time of the walk.
 */
5039 boolean_t
5040 l2arc_vdev_present(vdev_t *vd)
5041 {
5042         l2arc_dev_t *dev;
5043 
5044         mutex_enter(&l2arc_dev_mtx);
5045         for (dev = list_head(l2arc_dev_list); dev != NULL;
5046             dev = list_next(l2arc_dev_list, dev)) {
5047                 if (dev->l2ad_vdev == vd)
5048                         break;
5049         }
             /* dev is NULL here only if the walk ran off the end of the list. */
5050         mutex_exit(&l2arc_dev_mtx);
5051 
5052         return (dev != NULL);
5053 }
5054 
5055 /*
5056  * Add a vdev for use by the L2ARC.  By this point the spa has already
5057  * validated the vdev and opened it.
      *
      * Allocates and initializes an l2arc_dev_t for 'vd', creates its (empty)
      * buffer list, reports the device's range through vdev_space_update()
      * and finally publishes the entry on l2arc_dev_list.  The vdev must not
      * already be present on that list (asserted below).
5058  */
5059 void
5060 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5061 {
5062         l2arc_dev_t *adddev;
5063 
5064         ASSERT(!l2arc_vdev_present(vd));
5065 
5066         /*
5067          * Create a new l2arc device entry.
              *
              * The range used on the device starts VDEV_LABEL_START_SIZE bytes
              * in and spans vdev_get_min_asize(vd) bytes.  Both the write hand
              * and the evict hand begin at the start of that range, and
              * l2ad_first records that the first sweep over the device has
              * not completed yet.
5068          */
5069         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5070         adddev->l2ad_spa = spa;
5071         adddev->l2ad_vdev = vd;
5072         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5073         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5074         adddev->l2ad_hand = adddev->l2ad_start;
5075         adddev->l2ad_evict = adddev->l2ad_start;
5076         adddev->l2ad_first = B_TRUE;
5077         adddev->l2ad_writing = B_FALSE;
5078 
5079         /*
5080          * This is a list of all ARC buffers that are still valid on the
5081          * device.
              * Headers are linked through their b_l2node member.
5082          */
5083         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5084         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5085             offsetof(arc_buf_hdr_t, b_l2node));
5086 
             /*
              * Report the size of the whole range (hand == start here) in the
              * last argument.  NOTE(review): presumably that argument is the
              * capacity delta -- confirm against vdev_space_update().
              */
5087         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5088 
5089         /*
5090          * Add device to global list
              * (the list and the device count are updated under l2arc_dev_mtx).
5091          */
5092         mutex_enter(&l2arc_dev_mtx);
5093         list_insert_head(l2arc_dev_list, adddev);
5094         atomic_inc_64(&l2arc_ndev);
5095         mutex_exit(&l2arc_dev_mtx);
5096 }
5097 
5098 /*
5099  * Remove a vdev from the L2ARC.
5100  */
5101 void
5102 l2arc_remove_vdev(vdev_t *vd)
5103 {
5104         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5105 
5106         /*
5107          * Find the device by vdev
5108          */
5109         mutex_enter(&l2arc_dev_mtx);
5110         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5111                 nextdev = list_next(l2arc_dev_list, dev);
5112                 if (vd == dev->l2ad_vdev) {
5113                         remdev = dev;
5114                         break;
 
5181 {
5182         if (!(spa_mode_global & FWRITE))
5183                 return;
5184 
5185         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5186             TS_RUN, minclsyspri);
5187 }
5188 
/*
 * Ask the L2ARC feed thread to exit and wait until it has acknowledged.
 *
 * Does nothing unless spa_mode_global includes FWRITE; the same test guards
 * the thread_create() of l2arc_feed_thread earlier in this file, so in that
 * case there is no thread to stop.
 *
 * The handshake uses l2arc_thread_exit under l2arc_feed_thr_lock: this
 * function sets the flag to 1 and sleeps on l2arc_feed_thr_cv until the
 * feed thread resets it to 0 and broadcasts on the same condvar just before
 * exiting.
 */
5189 void
5190 l2arc_stop(void)
5191 {
5192         if (!(spa_mode_global & FWRITE))
5193                 return;
5194 
5195         mutex_enter(&l2arc_feed_thr_lock);
5196         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5197         l2arc_thread_exit = 1;
             /* cv_wait() drops the lock while asleep; re-check the flag on wakeup. */
5198         while (l2arc_thread_exit != 0)
5199                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5200         mutex_exit(&l2arc_feed_thr_lock);
5201 }
 | 
 
 
 121  */
 122 
 123 #include <sys/spa.h>
 124 #include <sys/zio.h>
 125 #include <sys/zio_compress.h>
 126 #include <sys/zfs_context.h>
 127 #include <sys/arc.h>
 128 #include <sys/refcount.h>
 129 #include <sys/vdev.h>
 130 #include <sys/vdev_impl.h>
 131 #include <sys/dsl_pool.h>
 132 #ifdef _KERNEL
 133 #include <sys/vmsystm.h>
 134 #include <vm/anon.h>
 135 #include <sys/fs/swapnode.h>
 136 #include <sys/dnlc.h>
 137 #endif
 138 #include <sys/callb.h>
 139 #include <sys/kstat.h>
 140 #include <zfs_fletcher.h>
 141 #include <sys/byteorder.h>
 142 #include <sys/spa_impl.h>
 143 
 144 #ifndef _KERNEL
     /* The two variables below exist only in non-_KERNEL (userland) builds. */
 145 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 146 boolean_t arc_watch = B_FALSE;
     /* NOTE(review): presumably the /proc fd used to set watchpoints -- confirm */
 147 int arc_procfd;
 148 #endif
 149 
 150 static kmutex_t         arc_reclaim_thr_lock;
 151 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
     /* NOTE(review): presumably the reclaim thread's exit-request flag -- confirm */
 152 static uint8_t          arc_thread_exit;
 153 
     /*
      * arc_reduce_dnlc_percent is a non-static variable whose initial value is
      * ARC_REDUCE_DNLC_PERCENT.
      */
 154 #define ARC_REDUCE_DNLC_PERCENT 3
 155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 156 
     /* The two reclaim strategies. */
 157 typedef enum arc_reclaim_strategy {
 158         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 159         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 160 } arc_reclaim_strategy_t;
 161 
 162 /*
 
 301         kstat_named_t arcstat_l2_feeds;
 302         kstat_named_t arcstat_l2_rw_clash;
 303         kstat_named_t arcstat_l2_read_bytes;
 304         kstat_named_t arcstat_l2_write_bytes;
 305         kstat_named_t arcstat_l2_writes_sent;
 306         kstat_named_t arcstat_l2_writes_done;
 307         kstat_named_t arcstat_l2_writes_error;
 308         kstat_named_t arcstat_l2_writes_hdr_miss;
 309         kstat_named_t arcstat_l2_evict_lock_retry;
 310         kstat_named_t arcstat_l2_evict_reading;
 311         kstat_named_t arcstat_l2_free_on_write;
 312         kstat_named_t arcstat_l2_abort_lowmem;
 313         kstat_named_t arcstat_l2_cksum_bad;
 314         kstat_named_t arcstat_l2_io_error;
 315         kstat_named_t arcstat_l2_size;
 316         kstat_named_t arcstat_l2_asize;
 317         kstat_named_t arcstat_l2_hdr_size;
 318         kstat_named_t arcstat_l2_compress_successes;
 319         kstat_named_t arcstat_l2_compress_zeros;
 320         kstat_named_t arcstat_l2_compress_failures;
 321         kstat_named_t arcstat_l2_log_blk_writes;
 322         kstat_named_t arcstat_l2_log_blk_avg_size;
 323         kstat_named_t arcstat_l2_data_to_meta_ratio;
 324         kstat_named_t arcstat_l2_rebuild_successes;
 325         kstat_named_t arcstat_l2_rebuild_abort_unsupported;
 326         kstat_named_t arcstat_l2_rebuild_abort_timeout;
 327         kstat_named_t arcstat_l2_rebuild_abort_io_errors;
 328         kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
 329         kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
 330         kstat_named_t arcstat_l2_rebuild_abort_lowmem;
 331         kstat_named_t arcstat_l2_rebuild_size;
 332         kstat_named_t arcstat_l2_rebuild_bufs;
 333         kstat_named_t arcstat_l2_rebuild_bufs_precached;
 334         kstat_named_t arcstat_l2_rebuild_psize;
 335         kstat_named_t arcstat_l2_rebuild_log_blks;
 336         kstat_named_t arcstat_memory_throttle_count;
 337         kstat_named_t arcstat_duplicate_buffers;
 338         kstat_named_t arcstat_duplicate_buffers_size;
 339         kstat_named_t arcstat_duplicate_reads;
 340         kstat_named_t arcstat_meta_used;
 341         kstat_named_t arcstat_meta_limit;
 342         kstat_named_t arcstat_meta_max;
 343 } arc_stats_t;
 344 
 345 static arc_stats_t arc_stats = {
             /*
              * Names and data types of the exported ARC kstats.
              *
              * The initializer is positional: the Nth entry names the Nth
              * kstat_named_t member of arc_stats_t, so the strings below
              * must stay in exactly the same order as the struct members.
              */
 346         { "hits",                       KSTAT_DATA_UINT64 },
 347         { "misses",                     KSTAT_DATA_UINT64 },
 348         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 349         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 350         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 351         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 352         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 353         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 354         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 355         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 
 382         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 383         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 384         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 385         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 386         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 387         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 388         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 389         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 390         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 391         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 392         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 393         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 394         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 395         { "l2_io_error",                KSTAT_DATA_UINT64 },
 396         { "l2_size",                    KSTAT_DATA_UINT64 },
 397         { "l2_asize",                   KSTAT_DATA_UINT64 },
 398         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 399         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 400         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 401         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 402         { "l2_log_blk_writes",          KSTAT_DATA_UINT64 },
 403         { "l2_log_blk_avg_size",        KSTAT_DATA_UINT64 },
 404         { "l2_data_to_meta_ratio",      KSTAT_DATA_UINT64 },
 405         { "l2_rebuild_successes",       KSTAT_DATA_UINT64 },
 406         { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
 407         { "l2_rebuild_timeout",         KSTAT_DATA_UINT64 },
 408         { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
 409         { "l2_rebuild_cksum_errors",    KSTAT_DATA_UINT64 },
 410         { "l2_rebuild_loop_errors",     KSTAT_DATA_UINT64 },
 411         { "l2_rebuild_lowmem",          KSTAT_DATA_UINT64 },
             /*
              * These five follow the struct member order:
              * arcstat_l2_rebuild_size, _bufs, _bufs_precached, _psize,
              * _log_blks.  "size" therefore comes first and "psize" fourth;
              * with the two names swapped each counter would be exported
              * under the other's name.
              */
 412         { "l2_rebuild_size",            KSTAT_DATA_UINT64 },
 413         { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
 414         { "l2_rebuild_bufs_precached",  KSTAT_DATA_UINT64 },
 415         { "l2_rebuild_psize",           KSTAT_DATA_UINT64 },
 416         { "l2_rebuild_log_blks",        KSTAT_DATA_UINT64 },
 417         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 418         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 419         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 420         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 421         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 422         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 423         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 424 };
 425 
 426 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
     /*
      * ARCSTAT() above expands to the 64-bit value field of the named member
      * of arc_stats; the expansion is an lvalue, so it can be read or assigned.
      */
 427 
     /* Atomically add 'val' (which may be negative) to the named statistic. */
 428 #define ARCSTAT_INCR(stat, val) \
 429         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 430 
     /* Atomically add one to / subtract one from the named statistic. */
 431 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 432 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 433 
 434 #define ARCSTAT_MAX(stat, val) {                                        \
 435         uint64_t m;                                                     \
 436         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 
 444 /*
 445  * We define a macro to allow ARC hits/misses to be easily broken down by
 446  * two separate conditions, giving a total of four different subtypes for
 447  * each of hits and misses (so eight statistics total).
      *
      * The statistic that gets bumped is named by token pasting:
      *     arcstat_<stat1 or notstat1>_<stat2 or notstat2>_<stat>
      * where stat1 (stat2) is used when cond1 (cond2) is true and notstat1
      * (notstat2) otherwise.  Exactly one ARCSTAT_BUMP() is executed per use.
      *
      * The expansion is a bare if/else statement rather than a single
      * do/while(0) statement, so take care when using it as the unbraced
      * body of another if/else.
 448  */
 449 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 450         if (cond1) {                                                    \
 451                 if (cond2) {                                            \
 452                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 453                 } else {                                                \
 454                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 455                 }                                                       \
 456         } else {                                                        \
 457                 if (cond2) {                                            \
 458                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 459                 } else {                                                \
 460                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 461                 }                                                       \
 462         }
 463 
 464 /*
 465  * This macro allows us to use kstats as floating averages. Each time we
 466  * update this kstat, we first factor it and the update value by
 467  * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
 468  * average. This macro assumes that integer loads and stores are atomic, but
 469  * is not safe for multiple writers updating the kstat in parallel (only the
 470  * last writer's update will remain).
      *
      * In effect:
      *     new = old - old / ARCSTAT_F_AVG_FACTOR + value / ARCSTAT_F_AVG_FACTOR
      * using integer division.  'stat' is the arc_stats member to update and
      * 'value' the new sample; 'value' is evaluated exactly once.
      *
      * The scratch variable has a macro-specific name so that a 'value'
      * expression which mentions a caller variable (for instance one called
      * 'x') is not silently captured by the macro's own local.
 471  */
 472 #define ARCSTAT_F_AVG_FACTOR    3
 473 #define ARCSTAT_F_AVG(stat, value) \
 474         do { \
 475                 uint64_t f_avg_cur_ = ARCSTAT(stat); \
 476                 f_avg_cur_ = f_avg_cur_ - f_avg_cur_ / ARCSTAT_F_AVG_FACTOR + \
 477                     (value) / ARCSTAT_F_AVG_FACTOR; \
 478                 ARCSTAT(stat) = f_avg_cur_; \
 479                 _NOTE(NOTREACHED) \
 480                 _NOTE(CONSTCOND) \
 481         } while (0)
 482 
 483 kstat_t                 *arc_ksp;
     /* NOTE(review): arc_ksp is presumably the kstat exporting arc_stats -- confirm */
     /*
      * Pointers to the ARC state structures.  Elsewhere in this file the MRU
      * and MFU states are accessed through their per-type arcs_list[] lists
      * and their arcs_mtx mutex, and arc_anon is the state a header is moved
      * to (via arc_change_state()) right before arc_hdr_destroy().
      */
 484 static arc_state_t      *arc_anon;
 485 static arc_state_t      *arc_mru;
 486 static arc_state_t      *arc_mru_ghost;
 487 static arc_state_t      *arc_mfu;
 488 static arc_state_t      *arc_mfu_ghost;
 489 static arc_state_t      *arc_l2c_only;
 490 
 491 /*
 492  * There are several ARC variables that are critical to export as kstats --
 493  * but we don't want to have to grovel around in the kstat whenever we wish to
 494  * manipulate them.  For these variables, we therefore define them to be in
 495  * terms of the statistic variable.  This assures that we are not introducing
 496  * the possibility of inconsistency by having shadow copies of the variables,
 497  * while still allowing the code to be readable.
      *
      * Each name below expands through ARCSTAT() to the 64-bit value field of
      * the corresponding arc_stats member, so it can be used like an ordinary
      * uint64_t variable (read or assigned).  Plain assignments made through
      * these names are not atomic read-modify-write operations.
 498  */
 499 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 500 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 501 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 502 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 
 671 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 672 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 673 
     /*
      * Shorthands for the two L2ARC write counters kept in arc_stats; both
      * expand to lvalues through ARCSTAT().
      */
 674 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 675 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 676 
 677 /* L2ARC Performance Tunables */
     /*
      * All of these are non-static variables initialized from the L2ARC_*
      * defaults.  In l2arc_write_buffers() the scan headroom is computed as
      * target_sz * l2arc_headroom and, when the headroom boost is active,
      * additionally scaled by l2arc_headroom_boost / 100 (i.e. the boost is a
      * percentage).
      */
 678 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 679 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 680 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 681 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 682 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 683 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 684 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 685 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 686 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 687 
 688 /*
 689  * L2ARC Internals
      *
      * Each list below is declared twice: the upper-case name is the list_t
      * storage and the lower-case name is the pointer the code works with
      * (for example, l2arc_vdev_present() walks l2arc_dev_list while holding
      * l2arc_dev_mtx).  A single mutex, l2arc_buflist_mtx, covers the buffer
      * lists of all devices.
 690  */
 691 typedef struct l2arc_dev l2arc_dev_t;
 692 static list_t L2ARC_dev_list;                   /* device list */
 693 static list_t *l2arc_dev_list;                  /* device list pointer */
 694 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 695 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 696 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 697 static list_t L2ARC_free_on_write;              /* free after write buf list */
 698 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 699 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 700 static uint64_t l2arc_ndev;                     /* number of devices */
 701 
 702 typedef struct l2arc_read_callback {
             /*
              * State kept for one read issued to a cache device: the target
              * buffer, a copy of the original request (blkptr, bookmark,
              * flags) and the compression that was applied to the data.
              * l2arc_read_done() works with a pointer to this type.
              */
 703         arc_buf_t               *l2rcb_buf;             /* read buffer */
 704         spa_t                   *l2rcb_spa;             /* spa */
 705         blkptr_t                l2rcb_bp;               /* original blkptr */
 706         zbookmark_t             l2rcb_zb;               /* original bookmark */
 707         int                     l2rcb_flags;            /* original flags */
 708         enum zio_compress       l2rcb_compress;         /* applied compress */
 709 } l2arc_read_callback_t;
 710 
/*
 * State carried with a write to an L2ARC device: the target device, the
 * head of the buffer list being written, and the list of log block
 * buffers (l2arc_log_blk_buf_t) that are in flight with that write.
 */
typedef struct l2arc_write_callback {
        l2arc_dev_t     *l2wcb_dev;             /* device info */
        arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
        /* list of in-flight l2arc_log_blk_buf_t's */
        list_t          l2wcb_log_blk_buf_list;
} l2arc_write_callback_t;
 717 
/*
 * Per-buffer L2ARC state.  An arc_buf_hdr_t refers to one of these through
 * its b_l2hdr pointer, which is NULL when the buffer is not on an L2ARC
 * device.
 */
struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr mutex */
        l2arc_dev_t             *b_dev;         /* L2ARC device */
        uint64_t                b_daddr;        /* disk address, offset byte */
        /* compression applied to buffer data */
        enum zio_compress       b_compress;
        /* real alloc'd buffer size depending on b_compress applied */
        int                     b_asize;
        /* temporary buffer holder for in-flight compressed data */
        void                    *b_tmp_cdata;
};
 729 
/*
 * Deferred-free record: a data pointer, its size, and the function to call
 * with those two values to release it.
 * NOTE(review): presumably queued on l2arc_free_on_write and released after
 * the in-flight L2ARC write completes -- confirm at the users.
 */
typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
        void            (*l2df_func)(void *, size_t);
        list_node_t     l2df_list_node;
} l2arc_data_free_t;
 737 
/*
 * NOTE(review): these look like the feed thread's lock, condition variable
 * and exit-request flag, parallel to the arc_reclaim_thr_* trio near the
 * top of the file -- confirm at the feed thread itself.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
 741 
 742 static void l2arc_read_done(zio_t *zio);
 743 static void l2arc_hdr_stat_add(boolean_t from_arc);
 744 static void l2arc_hdr_stat_remove(void);
 745 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
 746 
 747 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 748 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 749     enum zio_compress c);
 750 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 751 
/*
 * Flag bits for a persistent device header.
 * NOTE(review): presumably stored in l2arc_dev_hdr_phys_t's l2dh_flags
 * field -- confirm at the header update/read routines.
 */
enum {
        L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)      /* mirror of l2ad_first */
};
 755 
/*
 * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
 * It holds the device address of a log block, a packed property word and a
 * checksum.  The l2lbp_prop word is read and written with the LBP_GET_* and
 * LBP_SET_* macros defined later in this file.
 */
typedef struct l2arc_log_blk_ptr {
        uint64_t        l2lbp_daddr;    /* device address of log */
        /*
         * l2lbp_prop is the same format as the blk_prop in blkptr_t:
         *      * logical size (in sectors)
         *      * physical (compressed) size (in sectors)
         *      * compression algorithm (we always LZ4-compress l2arc logs)
         *      * checksum algorithm (used for l2lbp_cksum)
         *      * object type & level (unused for now)
         */
        uint64_t        l2lbp_prop;
        zio_cksum_t     l2lbp_cksum;    /* fletcher4 of log */
} l2arc_log_blk_ptr_t;
 772 
/*
 * The persistent L2ARC device header.
 *
 * This is an on-device layout: the CTASSERT that follows the structure
 * fails the build unless it is exactly SPA_MINBLOCKSIZE bytes, so any
 * field added here must be paid for by shrinking l2dh_pad.
 */
typedef struct l2arc_dev_hdr_phys {
        uint64_t        l2dh_magic;
        zio_cksum_t     l2dh_self_cksum;        /* fletcher4 of fields below */

        /*
         * Global L2ARC device state and metadata.
         */
        uint64_t        l2dh_spa_guid;
        uint64_t        l2dh_evict_tail;        /* current evict pointer */
        uint64_t        l2dh_alloc_space;       /* vdev space alloc status */
        uint64_t        l2dh_flags;             /* L2ARC_DEV_HDR_* flags */

        /*
         * Start of log block chain. [0] -> newest log, [1] -> one older (used
         * for initiating prefetch).
         */
        l2arc_log_blk_ptr_t     l2dh_start_lbps[2];

        const uint64_t  l2dh_pad[43];           /* pad to 512 bytes */
} l2arc_dev_hdr_phys_t;
CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
 797 
/*
 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
 *
 * L2ARC_LOG_BLK_ENTRIES is derived from sizeof this structure, and the
 * CTASSERT on l2arc_log_blk_phys_t requires the entries plus the 128-byte
 * header to fill L2ARC_LOG_BLK_SIZE exactly.  The l2le_prop word is read
 * and written with the LE_GET_* and LE_SET_* macros.
 */
typedef struct l2arc_log_ent_phys {
        dva_t                   l2le_dva;       /* dva of buffer */
        uint64_t                l2le_birth;     /* birth txg of buffer */
        uint64_t                l2le_cksum0;
        zio_cksum_t             l2le_freeze_cksum;
        /*
         * l2le_prop is the same format as the blk_prop in blkptr_t:
         *      * logical size (in sectors)
         *      * physical (compressed) size (in sectors)
         *      * compression algorithm
         *      * checksum algorithm (used for cksum0)
         *      * object type & level (used to restore arc_buf_contents_t)
         */
        uint64_t                l2le_prop;
        uint64_t                l2le_daddr;     /* buf location on l2dev */
        const uint64_t          l2le_pad[6];    /* resv'd for future use */
} l2arc_log_ent_phys_t;
 818 
 819 /*
 820  * These design limits give us the following overhead (before compression):
 821  *      avg_blk_sz      overhead
 822  *      1k              12.51 %
 823  *      2k               6.26 %
 824  *      4k               3.13 %
 825  *      8k               1.56 %
 826  *      16k              0.78 %
 827  *      32k              0.39 %
 828  *      64k              0.20 %
 829  *      128k             0.10 %
 * Compression should be able to squeeze these down by about a factor of 2x.
 831  */
/*
 * Size of one log block, size of its fixed header, and the number of
 * l2arc_log_ent_phys_t entries that fit in the remainder.
 */
#define L2ARC_LOG_BLK_SIZE                      (128 * 1024)    /* 128k */
#define L2ARC_LOG_BLK_HEADER_LEN                (128)
#define L2ARC_LOG_BLK_ENTRIES                   /* 1023 entries */      \
        ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) /              \
        sizeof (l2arc_log_ent_phys_t))
/*
 * Maximum amount of buffer data one l2arc log block can describe, i.e. every
 * entry referring to a SPA_MAXBLOCKSIZE buffer (used to terminate rebuilding
 * before we hit the write head and restore potentially corrupted blocks).
 */
#define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE  \
        (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
/*
 * For the persistency and rebuild algorithms to operate reliably we need
 * the L2ARC device to at least be able to hold the maximum payload of 3 log
 * blocks (otherwise excessive log block looping might confuse the log chain
 * end detection).  Under normal circumstances this is not a problem, since
 * this is somewhere around only 400 MB.
 */
#define L2ARC_PERSIST_MIN_SIZE  (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
 851 
/*
 * A log block of up to 1023 ARC buffer log entries, chained into the
 * persistent L2ARC metadata linked list.
 *
 * The two CTASSERTs after the structure pin its layout: the whole block
 * must be L2ARC_LOG_BLK_SIZE bytes, and everything ahead of l2lb_entries
 * must be exactly L2ARC_LOG_BLK_HEADER_LEN bytes.  A new header field
 * therefore has to be taken out of l2lb_pad.
 */
typedef struct l2arc_log_blk_phys {
        /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
        uint64_t                l2lb_magic;
        l2arc_log_blk_ptr_t     l2lb_back2_lbp; /* back 2 steps in chain */
        uint64_t                l2lb_pad[9];    /* resv'd for future use */
        /* Payload */
        l2arc_log_ent_phys_t    l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
} l2arc_log_blk_phys_t;

CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
    offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
 868 
/*
 * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
 * written to the L2ARC device. They may be compressed, hence the uint8_t[].
 * The byte array is sized for a full, uncompressed log block.  l2lbb_node
 * is the linkage used by l2arc_write_callback_t's l2wcb_log_blk_buf_list.
 */
typedef struct l2arc_log_blk_buf {
        uint8_t         l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
        list_node_t     l2lbb_node;
} l2arc_log_blk_buf_t;
 877 
/*
 * Macros for manipulating the fields of a 64-bit word laid out like the
 * blk_prop of blkptr_t.  _obj is a pointer to the containing structure and
 * _field names the uint64_t member inside it.
 *
 * Bit offset and width handed to the BF64 helpers:
 *      LSIZE           offset  0, 16 bits
 *      PSIZE           offset 16, 16 bits
 *      COMPRESS        offset 32,  8 bits
 *      CHECKSUM        offset 40,  8 bits
 *      TYPE            offset 48,  8 bits
 *
 * The two size fields use the _SB helper variants with SPA_MINBLOCKSHIFT
 * and 1 as extra arguments.
 * NOTE(review): presumably those are a shift and a bias, so that sizes are
 * stored in sectors minus one -- confirm against BF64_GET_SB/BF64_SET_SB.
 */
#define BLKPROP_GET_LSIZE(_obj, _field)         \
        BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define BLKPROP_SET_LSIZE(_obj, _field, x)      \
        BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
#define BLKPROP_GET_PSIZE(_obj, _field)         \
        BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
#define BLKPROP_SET_PSIZE(_obj, _field, x)      \
        BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
#define BLKPROP_GET_COMPRESS(_obj, _field)      \
        BF64_GET((_obj)->_field, 32, 8)
#define BLKPROP_SET_COMPRESS(_obj, _field, x)   \
        BF64_SET((_obj)->_field, 32, 8, x)
#define BLKPROP_GET_CHECKSUM(_obj, _field)      \
        BF64_GET((_obj)->_field, 40, 8)
#define BLKPROP_SET_CHECKSUM(_obj, _field, x)   \
        BF64_SET((_obj)->_field, 40, 8, x)
#define BLKPROP_GET_TYPE(_obj, _field)          \
        BF64_GET((_obj)->_field, 48, 8)
#define BLKPROP_SET_TYPE(_obj, _field, x)       \
        BF64_SET((_obj)->_field, 48, 8, x)
 899 
/*
 * Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field.
 * _add is a pointer to the l2arc_log_blk_ptr_t; each macro simply forwards
 * to the matching BLKPROP_* macro with l2lbp_prop as the field.
 */
#define LBP_GET_LSIZE(_add)             BLKPROP_GET_LSIZE(_add, l2lbp_prop)
#define LBP_SET_LSIZE(_add, x)          BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
#define LBP_GET_PSIZE(_add)             BLKPROP_GET_PSIZE(_add, l2lbp_prop)
#define LBP_SET_PSIZE(_add, x)          BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
#define LBP_GET_COMPRESS(_add)          BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
#define LBP_SET_COMPRESS(_add, x)       BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
    x)
#define LBP_GET_CHECKSUM(_add)          BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
#define LBP_SET_CHECKSUM(_add, x)       BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
    x)
#define LBP_GET_TYPE(_add)              BLKPROP_GET_TYPE(_add, l2lbp_prop)
#define LBP_SET_TYPE(_add, x)           BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
 913 
/*
 * Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field.
 * _le is a pointer to the l2arc_log_ent_phys_t; each macro simply forwards
 * to the matching BLKPROP_* macro with l2le_prop as the field.
 */
#define LE_GET_LSIZE(_le)       BLKPROP_GET_LSIZE(_le, l2le_prop)
#define LE_SET_LSIZE(_le, x)    BLKPROP_SET_LSIZE(_le, l2le_prop, x)
#define LE_GET_PSIZE(_le)       BLKPROP_GET_PSIZE(_le, l2le_prop)
#define LE_SET_PSIZE(_le, x)    BLKPROP_SET_PSIZE(_le, l2le_prop, x)
#define LE_GET_COMPRESS(_le)    BLKPROP_GET_COMPRESS(_le, l2le_prop)
#define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
#define LE_GET_CHECKSUM(_le)    BLKPROP_GET_CHECKSUM(_le, l2le_prop)
#define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
#define LE_GET_TYPE(_le)        BLKPROP_GET_TYPE(_le, l2le_prop)
#define LE_SET_TYPE(_le, x)     BLKPROP_SET_TYPE(_le, l2le_prop, x)
 925 
 926 #define PTR_SWAP(x, y)          \
 927         do {                    \
 928                 void *tmp = (x);\
 929                 x = y;          \
 930                 y = tmp;        \
 931                 _NOTE(CONSTCOND)\
 932         } while (0)
 933 
/*
 * Magic numbers for the persistent structures and the default rebuild
 * timeout (the initial value of l2arc_rebuild_timeout).
 * NOTE(review): presumably the two magics are what l2dh_magic and
 * l2lb_magic are compared against -- confirm at the read routines.
 */
#define L2ARC_DEV_HDR_MAGIC     0x12bab10c00000001LLU
#define L2ARC_LOG_BLK_MAGIC     0x120103b10c000001LLU
#define L2ARC_REBUILD_TIMEOUT   300     /* a rebuild may take at most 300s */
 937 
/*
 * In-core state for one L2ARC cache device.
 *
 * The persistent device header and the currently open log block are
 * embedded by value.  Since l2arc_log_blk_phys_t is asserted to be
 * L2ARC_LOG_BLK_SIZE (128k) bytes, every l2arc_dev_t is larger than that.
 */
struct l2arc_dev {
        vdev_t                  *l2ad_vdev;     /* vdev */
        spa_t                   *l2ad_spa;      /* spa */
        uint64_t                l2ad_hand;      /* next write location */
        uint64_t                l2ad_start;     /* first addr on device */
        uint64_t                l2ad_end;       /* last addr on device */
        uint64_t                l2ad_evict;     /* last addr eviction reached */
        boolean_t               l2ad_first;     /* first sweep through */
        boolean_t               l2ad_writing;   /* currently writing */
        list_t                  *l2ad_buflist;  /* buffer list */
        list_node_t             l2ad_node;      /* device list node */
        l2arc_dev_hdr_phys_t    l2ad_dev_hdr;   /* persistent device header */
        l2arc_log_blk_phys_t    l2ad_log_blk;   /* currently open log block */
        int                     l2ad_log_ent_idx; /* index into cur log blk */
        /* number of bytes in current log block's payload */
        uint64_t                l2ad_log_blk_payload_asize;
        /* flag indicating whether a rebuild is scheduled or is going on */
        boolean_t               l2ad_rebuild;
};
 957 
/*
 * Performance tuning of L2ARC persistency:
 *
 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
 *              pool import or when adding one manually later) will attempt
 *              to rebuild L2ARC buffer contents. In special circumstances,
 *              the administrator may want to set this to B_FALSE, if they
 *              are having trouble importing a pool or attaching an L2ARC
 *              device (e.g. the L2ARC device is slow to read in stored log
 *              metadata, or the metadata has become somehow
 *              fragmented/unusable).
 * l2arc_rebuild_timeout : A hard timeout value, in seconds, on L2ARC
 *              rebuilding to help avoid a slow L2ARC device from preventing
 *              pool import. If we are not done rebuilding an L2ARC device
 *              by this time, we stop the rebuild and return immediately.
 *              Defaults to L2ARC_REBUILD_TIMEOUT.
 */
boolean_t l2arc_rebuild_enabled = B_TRUE;
uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
 976 
 977 /*
 978  * L2ARC persistency rebuild routines.
 979  */
 980 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
 981 static int l2arc_rebuild(l2arc_dev_t *dev);
 982 static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
 983     l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
 984 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
 985     l2arc_dev_t *dev, uint64_t guid);
 986 
 987 /*
 988  * L2ARC persistency read I/O routines.
 989  */
 990 static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
 991 static int l2arc_log_blk_read(l2arc_dev_t *dev,
 992     const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
 993     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
 994     uint8_t *this_lb_buf, uint8_t *next_lb_buf,
 995     zio_t *this_io, zio_t **next_io);
 996 static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
 997     const l2arc_log_blk_ptr_t *lp);
 998 static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
 999     const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
1000 static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1001 
1002 /*
1003  * L2ARC persistency write I/O routines.
1004  */
1005 static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1006 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1007     l2arc_write_callback_t *cb);
1008 
1009 /*
 * L2ARC persistency auxiliary routines.
1011  */
1012 static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1013     zio_cksum_t *cksum);
1014 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1015     const arc_buf_hdr_t *ab);
1016 static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1017     uint64_t top, uint64_t check);
1018 static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
1019 
1020 static inline uint64_t
1021 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1022 {
1023         uint8_t *vdva = (uint8_t *)dva;
1024         uint64_t crc = -1ULL;
1025         int i;
1026 
1027         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1028 
1029         for (i = 0; i < sizeof (dva_t); i++)
1030                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1031 
1032         crc ^= (spa>>8) ^ birth;
1033 
1034         return (crc);
1035 }
1036 
/*
 * True when the header carries no identity, i.e. both words of its DVA and
 * its birth txg are all zero.  The argument is evaluated up to three times,
 * so it must be free of side effects.
 */
#define BUF_EMPTY(buf)                                          \
        ((buf)->b_dva.dva_word[0] == 0 &&                    \
        (buf)->b_dva.dva_word[1] == 0 &&                     \
        (buf)->b_birth == 0)
 
1540                         if (use_mutex)
1541                                 mutex_exit(&new_state->arcs_mtx);
1542                 }
1543         }
1544 
1545         ASSERT(!BUF_EMPTY(ab));
1546         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1547                 buf_hash_remove(ab);
1548 
1549         /* adjust state sizes */
1550         if (to_delta)
1551                 atomic_add_64(&new_state->arcs_size, to_delta);
1552         if (from_delta) {
1553                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1554                 atomic_add_64(&old_state->arcs_size, -from_delta);
1555         }
1556         ab->b_state = new_state;
1557 
1558         /* adjust l2arc hdr stats */
1559         if (new_state == arc_l2c_only)
1560                 l2arc_hdr_stat_add(old_state != arc_anon);
1561         else if (old_state == arc_l2c_only)
1562                 l2arc_hdr_stat_remove();
1563 }
1564 
1565 void
1566 arc_space_consume(uint64_t space, arc_space_type_t type)
1567 {
1568         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1569 
1570         switch (type) {
1571         case ARC_SPACE_DATA:
1572                 ARCSTAT_INCR(arcstat_data_size, space);
1573                 break;
1574         case ARC_SPACE_OTHER:
1575                 ARCSTAT_INCR(arcstat_other_size, space);
1576                 break;
1577         case ARC_SPACE_HDRS:
1578                 ARCSTAT_INCR(arcstat_hdr_size, space);
1579                 break;
1580         case ARC_SPACE_L2HDRS:
 
1644         hdr->b_type = type;
1645         hdr->b_spa = spa_load_guid(spa);
1646         hdr->b_state = arc_anon;
1647         hdr->b_arc_access = 0;
1648         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1649         buf->b_hdr = hdr;
1650         buf->b_data = NULL;
1651         buf->b_efunc = NULL;
1652         buf->b_private = NULL;
1653         buf->b_next = NULL;
1654         hdr->b_buf = buf;
1655         arc_get_data_buf(buf);
1656         hdr->b_datacnt = 1;
1657         hdr->b_flags = 0;
1658         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1659         (void) refcount_add(&hdr->b_refcnt, tag);
1660 
1661         return (buf);
1662 }
1663 
1664 /*
1665  * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1666  * This is used during l2arc reconstruction to make empty ARC buffers
1667  * which circumvent the regular disk->arc->l2arc path and instead come
1668  * into being in the reverse order, i.e. l2arc->arc->(disk).
1669  */
1670 arc_buf_hdr_t *
1671 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1672 {
1673         arc_buf_hdr_t *hdr;
1674 
1675         ASSERT3U(size, >, 0);
1676         hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1677         ASSERT(BUF_EMPTY(hdr));
1678         hdr->b_size = size;
1679         hdr->b_type = type;
1680         hdr->b_spa = guid;
1681         hdr->b_state = arc_anon;
1682         hdr->b_arc_access = 0;
1683         hdr->b_buf = NULL;
1684         hdr->b_datacnt = 0;
1685         hdr->b_flags = 0;
1686         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1687 
1688         return (hdr);
1689 }
1690 
1691 static char *arc_onloan_tag = "onloan";
1692 
1693 /*
1694  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1695  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1696  * buffers must be returned to the arc before they can be used by the DMU or
1697  * freed.
1698  */
1699 arc_buf_t *
1700 arc_loan_buf(spa_t *spa, int size)
1701 {
1702         arc_buf_t *buf;
1703 
1704         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1705 
1706         atomic_add_64(&arc_loaned_bytes, size);
1707         return (buf);
1708 }
1709 
1710 /*
 
1908                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1909                 /*
1910                  * To prevent arc_free() and l2arc_evict() from
1911                  * attempting to free the same buffer at the same time,
1912                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1913                  * give it priority.  l2arc_evict() can't destroy this
1914                  * header while we are waiting on l2arc_buflist_mtx.
1915                  *
1916                  * The hdr may be removed from l2ad_buflist before we
1917                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1918                  */
1919                 if (!buflist_held) {
1920                         mutex_enter(&l2arc_buflist_mtx);
1921                         l2hdr = hdr->b_l2hdr;
1922                 }
1923 
1924                 if (l2hdr != NULL) {
1925                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1926                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1927                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1928                         kmem_free(l2hdr, sizeof (*l2hdr));
1929                         if (hdr->b_state == arc_l2c_only)
1930                                 l2arc_hdr_stat_remove();
1931                         hdr->b_l2hdr = NULL;
1932                 }
1933 
1934                 if (!buflist_held)
1935                         mutex_exit(&l2arc_buflist_mtx);
1936         }
1937 
1938         if (!BUF_EMPTY(hdr)) {
1939                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1940                 buf_discard_identity(hdr);
1941         }
1942         while (hdr->b_buf) {
1943                 arc_buf_t *buf = hdr->b_buf;
1944 
1945                 if (buf->b_efunc) {
1946                         mutex_enter(&arc_eviction_mtx);
1947                         mutex_enter(&buf->b_evict_lock);
1948                         ASSERT(buf->b_hdr != NULL);
 
3365                         buf->b_next = NULL;
3366                         hdr->b_buf = buf;
3367                         ASSERT(hdr->b_datacnt == 0);
3368                         hdr->b_datacnt = 1;
3369                         arc_get_data_buf(buf);
3370                         arc_access(hdr, hash_lock);
3371                 }
3372 
3373                 ASSERT(!GHOST_STATE(hdr->b_state));
3374 
3375                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3376                 acb->acb_done = done;
3377                 acb->acb_private = private;
3378 
3379                 ASSERT(hdr->b_acb == NULL);
3380                 hdr->b_acb = acb;
3381                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3382 
3383                 if (hdr->b_l2hdr != NULL &&
3384                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3385                         /*
3386                          * Need to stash these before letting go of hash_lock
3387                          */
3388                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3389                         addr = hdr->b_l2hdr->b_daddr;
3390                         b_compress = hdr->b_l2hdr->b_compress;
3391                         b_asize = hdr->b_l2hdr->b_asize;
3392                         /*
3393                          * Lock out device removal.
3394                          */
3395                         if (vdev_is_dead(vd) ||
3396                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3397                                 vd = NULL;
3398                 }
3399 
3400                 mutex_exit(hash_lock);
3401 
3402                 /*
3403                  * At this point, we have a level 1 cache miss.  Try again in
3404                  * L2ARC if possible.
3405                  */
3406                 ASSERT3U(hdr->b_size, ==, size);
3407                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 
3741                 atomic_add_64(&arc_anon->arcs_size, blksz);
3742         } else {
3743                 mutex_exit(&buf->b_evict_lock);
3744                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3745                 ASSERT(!list_link_active(&hdr->b_arc_node));
3746                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3747                 if (hdr->b_state != arc_anon)
3748                         arc_change_state(arc_anon, hdr, hash_lock);
3749                 hdr->b_arc_access = 0;
3750                 if (hash_lock)
3751                         mutex_exit(hash_lock);
3752 
3753                 buf_discard_identity(hdr);
3754                 arc_buf_thaw(buf);
3755         }
3756         buf->b_efunc = NULL;
3757         buf->b_private = NULL;
3758 
3759         if (l2hdr) {
3760                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3761                 kmem_free(l2hdr, sizeof (*l2hdr));
3762                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3763                 mutex_exit(&l2arc_buflist_mtx);
3764         }
3765 }
3766 
3767 int
3768 arc_released(arc_buf_t *buf)
3769 {
3770         int released;
3771 
3772         mutex_enter(&buf->b_evict_lock);
3773         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3774         mutex_exit(&buf->b_evict_lock);
3775         return (released);
3776 }
3777 
3778 int
3779 arc_has_callback(arc_buf_t *buf)
3780 {
3781         int callback;
 
4356  *      l2arc_noprefetch        skip caching prefetched buffers
4357  *      l2arc_headroom          number of max device writes to precache
4358  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4359  *                              scanning, we multiply headroom by this
4360  *                              percentage factor for the next scan cycle,
4361  *                              since more compressed buffers are likely to
4362  *                              be present
4363  *      l2arc_feed_secs         seconds between L2ARC writing
4364  *
4365  * Tunables may be removed or added as future performance improvements are
4366  * integrated, and also may become zpool properties.
4367  *
4368  * There are three key functions that control how the L2ARC warms up:
4369  *
4370  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4371  *      l2arc_write_size()      calculate how much to write
4372  *      l2arc_write_interval()  calculate sleep delay between writes
4373  *
4374  * These three functions determine what to write, how much, and how quickly
4375  * to send writes.
4376  *
4377  * L2ARC persistency:
4378  *
4379  * When writing buffers to L2ARC, we periodically add some metadata to
4380  * make sure we can pick them up after reboot, thus dramatically reducing
4381  * the impact that any downtime has on the performance of storage systems
4382  * with large caches.
4383  *
4384  * The implementation works fairly simply by integrating the following two
4385  * modifications:
4386  *
4387  * *) Every now and then we mix in a piece of metadata (called a log block)
4388  *    into the L2ARC write. This allows us to understand what's been written,
4389  *    so that we can rebuild the arc_buf_hdr_t structures of the main ARC
4390  *    buffers. The log block also includes a "back-reference" pointer to the
4391  *    previous block, forming a back-linked list of blocks on the L2ARC device.
4392  *
4393  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
4394  *    for our header bookkeeping purposes. This contains a device header, which
4395  *    contains our top-level reference structures. We update it each time we
4396  *    write a new log block, so that we're able to locate it in the L2ARC
4397  *    device. If this write results in an inconsistent device header (e.g. due
4398  *    to power failure), we detect this by verifying the header's checksum
4399  *    and simply drop the entries from L2ARC.
4400  *
4401  * Implementation diagram:
4402  *
4403  * +=== L2ARC device (not to scale) ======================================+
4404  * |       __________newest log block pointers_________                   |
4405  * |      /                                  \1 back   \latest            |
4406  * |     /                                    V         V                 |
4407  * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
4408  * |                       ^       / ^       / ^       /                  |
4409  * |                       `-prev-'  `-prev-'  `-prev-'                   |
4410  * |                         lb        lb        lb                       |
4411  * +======================================================================+
4412  *
4413  * On-device data structures:
4414  *
4415  * L2ARC device header: l2arc_dev_hdr_phys_t
4416  * L2ARC log block:     l2arc_log_blk_phys_t
4417  *
4418  * L2ARC reconstruction:
4419  *
4420  * When writing data, we simply write in the standard rotary fashion,
4421  * evicting buffers as we go and simply writing new data over them (writing
4422  * a new log block every now and then). This obviously means that once we
4423  * loop around the end of the device, we will start cutting into an already
4424  * committed log block (and its referenced data buffers), like so:
4425  *
4426  *    current write head__       __old tail
4427  *                        \     /
4428  *                        V    V
4429  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
4430  *                         ^    ^^^^^^^^^___________________________________
4431  *                         |                                                \
4432  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
4433  *
4434  * When importing the pool, we detect this situation and use it to stop
4435  * our scanning process (see l2arc_rebuild).
4436  *
4437  * There is one significant caveat to consider when rebuilding ARC contents
4438  * from an L2ARC device: what about invalidated buffers? Given the above
4439  * construction, we cannot update blocks which we've already written to amend
4440  * them to remove buffers which were invalidated. Thus, during reconstruction,
4441  * we might be populating the cache with buffers for data that's not on the
4442  * main pool anymore, or may have been overwritten!
4443  *
4444  * As it turns out, this isn't a problem. Every arc_read request includes
4445  * both the DVA and, crucially, the birth TXG of the BP the caller is
4446  * looking for. So even if the cache were populated by completely rotten
4447  * blocks for data that had been long deleted and/or overwritten, we'll
4448  * never actually return bad data from the cache, since the DVA with the
4449  * birth TXG uniquely identify a block in space and time - once created,
4450  * a block is immutable on disk. The worst thing we have done is wasted
4451  * some time and memory at l2arc rebuild to reconstruct outdated ARC
4452  * entries that will get dropped from the l2arc as it is being updated
4453  * with new blocks.
4454  */
4455 
4456 static boolean_t
4457 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4458 {
4459         /*
4460          * A buffer is *not* eligible for the L2ARC if it:
4461          * 1. belongs to a different spa.
4462          * 2. is already cached on the L2ARC.
4463          * 3. has an I/O in progress (it may be an incomplete read).
4464          * 4. is flagged not eligible (zfs property).
4465          */
4466         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4467             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4468                 return (B_FALSE);
4469 
4470         return (B_TRUE);
4471 }
4472 
4473 static uint64_t
 
4500         clock_t interval, next, now;
4501 
4502         /*
4503          * If the ARC lists are busy, increase our write rate; if the
4504          * lists are stale, idle back.  This is achieved by checking
4505          * how much we previously wrote - if it was more than half of
4506          * what we wanted, schedule the next write much sooner.
4507          */
4508         if (l2arc_feed_again && wrote > (wanted / 2))
4509                 interval = (hz * l2arc_feed_min_ms) / 1000;
4510         else
4511                 interval = hz * l2arc_feed_secs;
4512 
4513         now = ddi_get_lbolt();
4514         next = MAX(now, MIN(now + interval, began + interval));
4515 
4516         return (next);
4517 }
4518 
4519 static void
4520 l2arc_hdr_stat_add(boolean_t from_arc)
4521 {
4522         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4523         if (from_arc)
4524                 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4525 }
4526 
4527 static void
4528 l2arc_hdr_stat_remove(void)
4529 {
4530         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4531         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4532 }
4533 
4534 /*
4535  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4536  * If a device is returned, this also returns holding the spa config lock.
4537  */
4538 static l2arc_dev_t *
4539 l2arc_dev_get_next(void)
4540 {
4541         l2arc_dev_t *first, *next = NULL;
4542 
4543         /*
4544          * Lock out the removal of spas (spa_namespace_lock), then removal
4545          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4546          * both locks will be dropped and a spa config lock held instead.
4547          */
4548         mutex_enter(&spa_namespace_lock);
4549         mutex_enter(&l2arc_dev_mtx);
4550 
4551         /* if there are no vdevs, there is nothing to do */
4552         if (l2arc_ndev == 0)
4553                 goto out;
4554 
4555         first = NULL;
4556         next = l2arc_dev_last;
4557         do {
4558                 /*
4559                  * Loop around the list looking for a non-faulted vdev
4560                  * and one that isn't currently doing an L2ARC rebuild.
4561                  */
4562                 if (next == NULL) {
4563                         next = list_head(l2arc_dev_list);
4564                 } else {
4565                         next = list_next(l2arc_dev_list, next);
4566                         if (next == NULL)
4567                                 next = list_head(l2arc_dev_list);
4568                 }
4569 
4570                 /* if we have come back to the start, bail out */
4571                 if (first == NULL)
4572                         first = next;
4573                 else if (next == first)
4574                         break;
4575 
4576         } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
4577 
4578         /* if we were unable to find any usable vdevs, return NULL */
4579         if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
4580                 next = NULL;
4581 
4582         l2arc_dev_last = next;
4583 
4584 out:
4585         mutex_exit(&l2arc_dev_mtx);
4586 
4587         /*
4588          * Grab the config lock to prevent the 'next' device from being
4589          * removed while we are writing to it.
4590          */
4591         if (next != NULL)
4592                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4593         mutex_exit(&spa_namespace_lock);
4594 
4595         return (next);
4596 }
4597 
4598 /*
4599  * Free buffers that were tagged for destruction.
 
4613                 ASSERT(df->l2df_func != NULL);
4614                 df->l2df_func(df->l2df_data, df->l2df_size);
4615                 list_remove(buflist, df);
4616                 kmem_free(df, sizeof (l2arc_data_free_t));
4617         }
4618 
4619         mutex_exit(&l2arc_free_on_write_mtx);
4620 }
4621 
4622 /*
4623  * A write to a cache device has completed.  Update all headers to allow
4624  * reads from these buffers to begin.
4625  */
4626 static void
4627 l2arc_write_done(zio_t *zio)
4628 {
4629         l2arc_write_callback_t *cb;
4630         l2arc_dev_t *dev;
4631         list_t *buflist;
4632         arc_buf_hdr_t *head, *ab, *ab_prev;
4633         l2arc_buf_hdr_t *l2hdr;
4634         kmutex_t *hash_lock;
4635         l2arc_log_blk_buf_t *lb_buf;
4636 
4637         cb = zio->io_private;
4638         ASSERT(cb != NULL);
4639         dev = cb->l2wcb_dev;
4640         ASSERT(dev != NULL);
4641         head = cb->l2wcb_head;
4642         ASSERT(head != NULL);
4643         buflist = dev->l2ad_buflist;
4644         ASSERT(buflist != NULL);
4645         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4646             l2arc_write_callback_t *, cb);
4647 
4648         if (zio->io_error != 0)
4649                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4650 
4651         mutex_enter(&l2arc_buflist_mtx);
4652 
4653         /*
4654          * All writes completed, or an error was hit.
4655          */
4656         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4657                 ab_prev = list_prev(buflist, ab);
4658                 l2hdr = ab->b_l2hdr;
4659 
4660                 /*
4661                  * Release the temporary compressed buffer as soon as possible.
4662                  */
4663                 if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
4664                         l2arc_release_cdata_buf(ab);
4665 
4666                 hash_lock = HDR_LOCK(ab);
4667                 if (!mutex_tryenter(hash_lock)) {
4668                         /*
4669                          * This buffer misses out.  It may be in a stage
4670                          * of eviction.  Its ARC_L2_WRITING flag will be
4671                          * left set, denying reads to this buffer.
4672                          */
4673                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4674                         continue;
4675                 }
4676 
4677                 if (zio->io_error != 0) {
4678                         /*
4679                          * Error - drop L2ARC entry.
4680                          */
4681                         list_remove(buflist, ab);
4682                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4683                         ab->b_l2hdr = NULL;
4684                         kmem_free(l2hdr, sizeof (*l2hdr));
4685                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4686                 }
4687 
4688                 /*
4689                  * Allow ARC to begin reads to this L2ARC entry.
4690                  */
4691                 ab->b_flags &= ~ARC_L2_WRITING;
4692 
4693                 mutex_exit(hash_lock);
4694         }
4695 
4696         atomic_inc_64(&l2arc_writes_done);
4697         list_remove(buflist, head);
4698         kmem_cache_free(hdr_cache, head);
4699         mutex_exit(&l2arc_buflist_mtx);
4700 
4701         l2arc_do_free_on_write();
4702 
4703         for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
4704             lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
4705                 (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
4706                 kmem_free(lb_buf, sizeof (*lb_buf));
4707         }
4708         list_destroy(&cb->l2wcb_log_blk_buf_list);
4709         kmem_free(cb, sizeof (l2arc_write_callback_t));
4710 }
4711 
4712 /*
4713  * A read to a cache device completed.  Validate buffer contents before
4714  * handing over to the regular ARC routines.
4715  */
4716 static void
4717 l2arc_read_done(zio_t *zio)
4718 {
4719         l2arc_read_callback_t *cb;
4720         arc_buf_hdr_t *hdr;
4721         arc_buf_t *buf;
4722         kmutex_t *hash_lock;
4723         int equal;
4724 
4725         ASSERT(zio->io_vd != NULL);
4726         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4727 
4728         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
4812         case 1:
4813                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4814                 *lock = &arc_mru->arcs_mtx;
4815                 break;
4816         case 2:
4817                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4818                 *lock = &arc_mfu->arcs_mtx;
4819                 break;
4820         case 3:
4821                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4822                 *lock = &arc_mru->arcs_mtx;
4823                 break;
4824         }
4825 
4826         ASSERT(!(MUTEX_HELD(*lock)));
4827         mutex_enter(*lock);
4828         return (list);
4829 }
4830 
4831 /*
4832  * Calculates the maximum overhead of L2ARC metadata log blocks for a given
4833  * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
4834  * overhead in processing to make sure there is enough headroom available
4835  * when writing buffers.
4836  */
4837 static inline uint64_t
4838 l2arc_log_blk_overhead(uint64_t write_sz)
4839 {
4840         return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
4841             L2ARC_LOG_BLK_SIZE;
4842 }
4843 
4844 /*
4845  * Evict buffers from the device write hand to the distance specified in
4846  * bytes.  This distance may span populated buffers, it may span nothing.
4847  * This is clearing a region on the L2ARC device ready for writing.
4848  * If the 'all' boolean is set, every buffer is evicted.
4849  */
4850 static void
4851 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4852 {
4853         list_t *buflist;
4854         l2arc_buf_hdr_t *l2hdr;
4855         arc_buf_hdr_t *ab, *ab_prev;
4856         kmutex_t *hash_lock;
4857         uint64_t taddr;
4858 
4859         buflist = dev->l2ad_buflist;
4860 
4861         if (buflist == NULL)
4862                 return;
4863 
4864         if (!all && dev->l2ad_first) {
4865                 /*
4866                  * This is the first sweep through the device.  There is
4867                  * nothing to evict.
4868                  */
4869                 return;
4870         }
4871 
4872         /*
4873          * We need to add in the worst case scenario of log block overhead.
4874          */
4875         distance += l2arc_log_blk_overhead(distance);
4876         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4877                 /*
4878                  * When nearing the end of the device, evict to the end
4879                  * before the device write hand jumps to the start.
4880                  */
4881                 taddr = dev->l2ad_end;
4882         } else {
4883                 taddr = dev->l2ad_hand + distance;
4884         }
4885         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4886             uint64_t, taddr, boolean_t, all);
4887 
4888 top:
4889         mutex_enter(&l2arc_buflist_mtx);
4890         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4891                 ab_prev = list_prev(buflist, ab);
4892 
4893                 hash_lock = HDR_LOCK(ab);
4894                 if (!mutex_tryenter(hash_lock)) {
4895                         /*
 
4938                          * arc_hdr_destroy() will call list_remove()
4939                          * and decrement arcstat_l2_size.
4940                          */
4941                         arc_change_state(arc_anon, ab, hash_lock);
4942                         arc_hdr_destroy(ab);
4943                 } else {
4944                         /*
4945                          * Invalidate issued or about to be issued
4946                          * reads, since we may be about to write
4947                          * over this location.
4948                          */
4949                         if (HDR_L2_READING(ab)) {
4950                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4951                                 ab->b_flags |= ARC_L2_EVICTED;
4952                         }
4953 
4954                         /*
4955                          * Tell ARC this no longer exists in L2ARC.
4956                          */
4957                         if (ab->b_l2hdr != NULL) {
4958                                 l2hdr = ab->b_l2hdr;
4959                                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4960                                 ab->b_l2hdr = NULL;
4961                                 kmem_free(l2hdr, sizeof (*l2hdr));
4962                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4963                         }
4964                         list_remove(buflist, ab);
4965 
4966                         /*
4967                          * This may have been leftover after a
4968                          * failed write.
4969                          */
4970                         ab->b_flags &= ~ARC_L2_WRITING;
4971                 }
4972                 mutex_exit(hash_lock);
4973         }
4974         mutex_exit(&l2arc_buflist_mtx);
4975 
4976         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4977         dev->l2ad_evict = taddr;
4978 }
4979 
4980 /*
4981  * Find and write ARC buffers to the L2ARC device.
4982  *
4983  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4984  * for reading until they have completed writing.
4985  * The headroom_boost is an in-out parameter used to maintain headroom boost
4986  * state between calls to this function.
4987  *
4988  * Returns the number of bytes actually written (which may be smaller than
4989  * the delta by which the device hand has changed due to alignment).
4990  */
4991 static uint64_t
4992 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4993     boolean_t *headroom_boost)
4994 {
4995         arc_buf_hdr_t *ab, *ab_prev, *head;
4996         list_t *list;
4997         /*
4998          * These variables mean:
4999          * - write_size: in-memory size of ARC buffers we've written (before
5000          *      compression).
5001          * - write_asize: actual on-disk size of ARC buffers we've written
5002          *      (after compression).
5003          * - write_aligned_asize: actual sum of space taken by ARC buffers
5004          *      on the device (after compression and alignment, so that
5005          *      every buffer starts on a multiple of the device block size).
5006          * - headroom: L2ARC scanning headroom (we won't scan beyond this
5007          *      distance from the list tail).
5008          * - buf_compress_minsz: minimum in-memory ARC buffer size for us
5009          *      to try compressing it.
5010          */
5011         uint64_t write_size, write_asize, write_aligned_asize, headroom,
5012             buf_compress_minsz;
5013         void *buf_data;
5014         kmutex_t *list_lock;
5015         boolean_t full;
5016         l2arc_write_callback_t *cb;
5017         zio_t *pio, *wzio;
5018         uint64_t guid = spa_load_guid(spa);
5019         const boolean_t do_headroom_boost = *headroom_boost;
5020         boolean_t dev_hdr_update = B_FALSE;
5021 
5022         ASSERT(dev->l2ad_vdev != NULL);
5023 
5024         /* Lower the flag now, we might want to raise it again later. */
5025         *headroom_boost = B_FALSE;
5026 
5027         pio = NULL;
5028         cb = NULL;
5029         write_size = write_asize = write_aligned_asize = 0;
5030         full = B_FALSE;
5031         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5032         head->b_flags |= ARC_L2_WRITE_HEAD;
5033 
5034         /*
5035          * We will want to try to compress buffers that are at least 2x the
5036          * device sector size.
5037          */
5038         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5039 
5040         /*
5041          * Copy buffers for L2ARC writing.
5042          */
5043         mutex_enter(&l2arc_buflist_mtx);
5044         for (int try = 0; try <= 3; try++) {
5045                 uint64_t passed_sz = 0;
5046 
5047                 list = l2arc_list_locked(try, &list_lock);
5048 
5049                 /*
5050                  * L2ARC fast warmup.
5051                  *
5052                  * Until the ARC is warm and starts to evict, read from the
5053                  * head of the ARC lists rather than the tail.
5054                  */
5055                 if (arc_warm == B_FALSE)
5056                         ab = list_head(list);
5057                 else
5058                         ab = list_tail(list);
5059 
5060                 headroom = target_sz * l2arc_headroom;
5061                 if (do_headroom_boost)
5062                         headroom = (headroom * l2arc_headroom_boost) / 100;
5063 
5064                 for (; ab; ab = ab_prev) {
5065                         l2arc_buf_hdr_t *l2hdr;
5066                         kmutex_t *hash_lock;
5067                         uint64_t buf_aligned_size;
5068 
5069                         if (arc_warm == B_FALSE)
5070                                 ab_prev = list_next(list, ab);
5071                         else
5072                                 ab_prev = list_prev(list, ab);
5073 
5074                         hash_lock = HDR_LOCK(ab);
5075                         if (!mutex_tryenter(hash_lock)) {
5076                                 /*
5077                                  * Skip this buffer rather than waiting.
5078                                  */
5079                                 continue;
5080                         }
5081 
5082                         /*
5083                          * When examining whether we've met our write target,
5084                          * we must always use the aligned size of the buffer,
5085                          * since that's the maximum amount of space a buffer
5086                          * can take up on the L2ARC device.
5087                          */
5088                         buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
5089                             ab->b_size);
5090                         passed_sz += buf_aligned_size;
5091                         if (passed_sz > headroom) {
5092                                 /*
5093                                  * Searched too far.
5094                                  */
5095                                 mutex_exit(hash_lock);
5096                                 break;
5097                         }
5098 
5099                         if (!l2arc_write_eligible(guid, ab)) {
5100                                 mutex_exit(hash_lock);
5101                                 continue;
5102                         }
5103 
5104                         if ((write_size + buf_aligned_size) > target_sz) {
5105                                 full = B_TRUE;
5106                                 mutex_exit(hash_lock);
5107                                 break;
5108                         }
5109 
5110                         if (pio == NULL) {
5111                                 /*
5112                                  * Insert a dummy header on the buflist so
5113                                  * l2arc_write_done() can find where the
5114                                  * write buffers begin without searching.
5115                                  */
5116                                 list_insert_head(dev->l2ad_buflist, head);
5117 
5118                                 cb = kmem_zalloc(
5119                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5120                                 cb->l2wcb_dev = dev;
5121                                 cb->l2wcb_head = head;
5122                                 list_create(&cb->l2wcb_log_blk_buf_list,
5123                                     sizeof (l2arc_log_blk_buf_t),
5124                                     offsetof(l2arc_log_blk_buf_t, l2lbb_node));
5125                                 pio = zio_root(spa, l2arc_write_done, cb,
5126                                     ZIO_FLAG_CANFAIL);
5127                         }
5128 
5129                         /*
5130                          * Create and add a new L2ARC header.
5131                          */
5132                         l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5133                         l2hdr->b_dev = dev;
5134                         ab->b_flags |= ARC_L2_WRITING;
5135 
5136                         /*
5137                          * Temporarily stash the data buffer in b_tmp_cdata.
5138                          * The subsequent write step will pick it up from
5139                          * there. This is because we can't access ab->b_buf
5140                          * without holding the hash_lock, which we in turn
5141                          * can't access without holding the ARC list locks
5142                          * (which we want to avoid during compression/writing).
5143                          */
5144                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
5145                         l2hdr->b_asize = ab->b_size;
5146                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5147 
5148                         ab->b_l2hdr = l2hdr;
5149 
5150                         list_insert_head(dev->l2ad_buflist, ab);
5151 
5152                         /*
5153                          * Compute and store the buffer cksum before
5154                          * writing.  On debug the cksum is verified first.
5155                          */
5156                         arc_cksum_verify(ab->b_buf);
5157                         arc_cksum_compute(ab->b_buf, B_TRUE);
5158 
5159                         mutex_exit(hash_lock);
5160 
5161                         write_size += buf_aligned_size;
5162                 }
5163 
5164                 mutex_exit(list_lock);
5165 
5166                 if (full == B_TRUE)
5167                         break;
5168         }
5169 
5170         /* No buffers selected for writing? */
5171         if (pio == NULL) {
5172                 ASSERT0(write_size);
5173                 mutex_exit(&l2arc_buflist_mtx);
5174                 kmem_cache_free(hdr_cache, head);
5175                 return (0);
5176         }
5177 
5178         /*
5179          * Now start writing the buffers. We're starting at the write head
5180          * and work backwards, retracing the course of the buffer selector
5181          * loop above.
5182          */
5183         for (ab = list_prev(dev->l2ad_buflist, head); ab;
5184             ab = list_prev(dev->l2ad_buflist, ab)) {
5185                 l2arc_buf_hdr_t *l2hdr;
5186                 uint64_t buf_sz;
5187 
5188                 /*
5189                  * We shouldn't need to lock the buffer here, since we flagged
5190                  * it as ARC_L2_WRITING in the previous step, but we must take
5191                  * care to only access its L2 cache parameters. In particular,
5192                  * ab->b_buf may be invalid by now due to ARC eviction.
 
5197                 if ((ab->b_flags & ARC_L2COMPRESS) &&
5198                     l2hdr->b_asize >= buf_compress_minsz) {
5199                         if (l2arc_compress_buf(l2hdr)) {
5200                                 /*
5201                                  * If compression succeeded, enable headroom
5202                                  * boost on the next scan cycle.
5203                                  */
5204                                 *headroom_boost = B_TRUE;
5205                         }
5206                 }
5207 
5208                 /*
5209                  * Pick up the buffer data we had previously stashed away
5210                  * (and now potentially also compressed).
5211                  */
5212                 buf_data = l2hdr->b_tmp_cdata;
5213                 buf_sz = l2hdr->b_asize;
5214 
5215                 /* Compression may have squashed the buffer to zero length. */
5216                 if (buf_sz != 0) {
5217                         uint64_t buf_aligned_asize;
5218 
5219                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
5220                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5221                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5222                             ZIO_FLAG_CANFAIL, B_FALSE);
5223 
5224                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5225                             zio_t *, wzio);
5226                         (void) zio_nowait(wzio);
5227 
5228                         write_asize += buf_sz;
5229                         /*
5230                          * Keep the clock hand suitably device-aligned.
5231                          */
5232                         buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
5233                             buf_sz);
5234                         write_aligned_asize += buf_aligned_asize;
5235                         dev->l2ad_hand += buf_aligned_asize;
5236                         ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
5237                             dev->l2ad_first);
5238                 }
5239 
5240                 if (l2arc_log_blk_insert(dev, ab)) {
5241                         l2arc_log_blk_commit(dev, pio, cb);
5242                         dev_hdr_update = B_TRUE;
5243                 }
5244         }
5245         mutex_exit(&l2arc_buflist_mtx);
5246 
5247         if (dev_hdr_update)
5248                 l2arc_dev_hdr_update(dev, pio);
5249 
5250         VERIFY3U(write_aligned_asize, <=, target_sz);
5251         ARCSTAT_BUMP(arcstat_l2_writes_sent);
5252         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5253         ARCSTAT_INCR(arcstat_l2_size, write_size);
5254         ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
5255         vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
5256 
5257         /*
5258          * Bump device hand to the device start if it is approaching the end.
5259          * l2arc_evict() will already have evicted ahead for this case.
5260          */
5261         if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
5262             dev->l2ad_end) {
5263                 vdev_space_update(dev->l2ad_vdev,
5264                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
5265                 dev->l2ad_hand = dev->l2ad_start;
5266                 dev->l2ad_evict = dev->l2ad_start;
5267                 dev->l2ad_first = B_FALSE;
5268         }
5269 
5270         dev->l2ad_writing = B_TRUE;
5271         (void) zio_wait(pio);
5272         dev->l2ad_writing = B_FALSE;
5273 
5274         return (write_asize);
5275 }
5276 
5277 /*
5278  * Compresses an L2ARC buffer.
5279  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5280  * size in l2hdr->b_asize. This routine tries to compress the data and
5281  * depending on the compression result there are three possible outcomes:
5282  * *) The buffer was incompressible. The original l2hdr contents were left
 
5504                  * Write ARC buffers.
5505                  */
5506                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5507 
5508                 /*
5509                  * Calculate interval between writes.
5510                  */
5511                 next = l2arc_write_interval(begin, size, wrote);
5512                 spa_config_exit(spa, SCL_L2ARC, dev);
5513         }
5514 
5515         l2arc_thread_exit = 0;
5516         cv_broadcast(&l2arc_feed_thr_cv);
5517         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
5518         thread_exit();
5519 }
5520 
5521 boolean_t
5522 l2arc_vdev_present(vdev_t *vd)
5523 {
5524         return (l2arc_vdev_get(vd) != NULL);
5525 }
5526 
5527 static l2arc_dev_t *
5528 l2arc_vdev_get(vdev_t *vd)
5529 {
5530         l2arc_dev_t     *dev;
5531         boolean_t       held = MUTEX_HELD(&l2arc_dev_mtx);
5532 
5533         if (!held)
5534                 mutex_enter(&l2arc_dev_mtx);
5535         for (dev = list_head(l2arc_dev_list); dev != NULL;
5536             dev = list_next(l2arc_dev_list, dev)) {
5537                 if (dev->l2ad_vdev == vd)
5538                         break;
5539         }
5540         if (!held)
5541                 mutex_exit(&l2arc_dev_mtx);
5542 
5543         return (dev);
5544 }
5545 
5546 /*
5547  * Add a vdev for use by the L2ARC.  By this point the spa has already
5548  * validated the vdev and opened it. The `rebuild' flag indicates whether
5549  * we should attempt an L2ARC persistency rebuild.
5550  */
5551 void
5552 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5553 {
5554         l2arc_dev_t *adddev;
5555 
5556         ASSERT(!l2arc_vdev_present(vd));
5557 
5558         /*
5559          * Create a new l2arc device entry.
5560          */
5561         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5562         adddev->l2ad_spa = spa;
5563         adddev->l2ad_vdev = vd;
5564         /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
5565         adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
5566         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5567         adddev->l2ad_hand = adddev->l2ad_start;
5568         adddev->l2ad_evict = adddev->l2ad_start;
5569         adddev->l2ad_first = B_TRUE;
5570         adddev->l2ad_writing = B_FALSE;
5571 
5572         /*
5573          * This is a list of all ARC buffers that are still valid on the
5574          * device.
5575          */
5576         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5577         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5578             offsetof(arc_buf_hdr_t, b_l2node));
5579 
5580         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5581 
5582         /*
5583          * Add device to global list
5584          */
5585         mutex_enter(&l2arc_dev_mtx);
5586         list_insert_head(l2arc_dev_list, adddev);
5587         atomic_inc_64(&l2arc_ndev);
5588         if (rebuild && l2arc_rebuild_enabled &&
5589             adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
5590                 /*
5591                  * Just mark the device as pending for a rebuild. We won't
5592                  * be starting a rebuild in line here as it would block pool
5593                  * import. Instead spa_load_impl will hand that off to an
5594                  * async task which will call l2arc_spa_rebuild_start.
5595                  */
5596                 adddev->l2ad_rebuild = B_TRUE;
5597         }
5598         mutex_exit(&l2arc_dev_mtx);
5599 }
5600 
5601 /*
5602  * Remove a vdev from the L2ARC.
5603  */
5604 void
5605 l2arc_remove_vdev(vdev_t *vd)
5606 {
5607         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5608 
5609         /*
5610          * Find the device by vdev
5611          */
5612         mutex_enter(&l2arc_dev_mtx);
5613         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5614                 nextdev = list_next(l2arc_dev_list, dev);
5615                 if (vd == dev->l2ad_vdev) {
5616                         remdev = dev;
5617                         break;
 
5684 {
5685         if (!(spa_mode_global & FWRITE))
5686                 return;
5687 
5688         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5689             TS_RUN, minclsyspri);
5690 }
5691 
5692 void
5693 l2arc_stop(void)
5694 {
5695         if (!(spa_mode_global & FWRITE))
5696                 return;
5697 
5698         mutex_enter(&l2arc_feed_thr_lock);
5699         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
5700         l2arc_thread_exit = 1;
5701         while (l2arc_thread_exit != 0)
5702                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5703         mutex_exit(&l2arc_feed_thr_lock);
5704 }
5705 
5706 /*
5707  * Punches out rebuild threads for the L2ARC devices in a spa. This should
5708  * be called as one of the final steps of a pool import.
5709  */
5710 void
5711 l2arc_spa_rebuild_start(spa_t *spa)
5712 {
5713         l2arc_dev_t     *dev;
5714         /*
5715          * Locate the spa's l2arc devices and kick off rebuild threads.
5716          */
5717         mutex_enter(&l2arc_dev_mtx);
5718         for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
5719                 dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
5720                 ASSERT(dev != NULL);
5721                 if (dev->l2ad_rebuild) {
5722                         (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
5723                             dev, 0, &p0, TS_RUN, minclsyspri);
5724                 }
5725         }
5726         mutex_exit(&l2arc_dev_mtx);
5727 }
5728 
5729 /*
5730  * Main entry point for L2ARC rebuilding.
5731  */
5732 static void
5733 l2arc_dev_rebuild_start(l2arc_dev_t *dev)
5734 {
5735         spa_t *spa = dev->l2ad_spa;
5736         vdev_t *vd = dev->l2ad_vdev;
5737 
5738         /* Lock out device removal. */
5739         spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5740         ASSERT(dev->l2ad_rebuild);
5741         (void) l2arc_rebuild(dev);
5742         dev->l2ad_rebuild = B_FALSE;
5743         spa_config_exit(spa, SCL_L2ARC, vd);
5744         thread_exit();
5745 }
5746 
5747 /*
5748  * This function implements the actual L2ARC metadata rebuild. It:
5749  *
5750  * 1) reads the device's header
5751  * 2) if a good device header is found, starts reading the log block chain
5752  * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
5753  *
5754  * Operation stops under any of the following conditions:
5755  *
5756  * 1) We reach the end of the log blk chain (the back-reference in the blk is
5757  *    invalid or loops over our starting point).
5758  * 2) We encounter *any* error condition (cksum errors, io errors, looped
5759  *    blocks, etc.).
5760  * 3) The l2arc_rebuild_timeout is hit - this is a final resort to protect
5761  *    from making severely fragmented L2ARC log blocks or slow L2ARC devices
5762  *    prevent a machine from finishing a pool import (and thus letting the
5763  *    administrator take corrective action, e.g. by kicking the misbehaving
5764  *    L2ARC device out of the pool, or by reimporting the pool with L2ARC
5765  *    rebuilding disabled).
5766  */
5767 static int
5768 l2arc_rebuild(l2arc_dev_t *dev)
5769 {
5770         int                     err;
5771         l2arc_log_blk_phys_t    *this_lb, *next_lb;
5772         uint8_t                 *this_lb_buf, *next_lb_buf;
5773         zio_t                   *this_io = NULL, *next_io = NULL;
5774         int64_t                 deadline;
5775         l2arc_log_blk_ptr_t     lb_ptrs[2];
5776         boolean_t               first_pass;
5777         uint64_t                load_guid;
5778 
5779         load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
5780         deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5781         /*
5782          * Device header processing phase.
5783          */
5784         if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
5785                 /* device header corrupted, start a new one */
5786                 bzero(&dev->l2ad_dev_hdr, sizeof (&dev->l2ad_dev_hdr));
5787                 return (err);
5788         }
5789         if (l2arc_check_rebuild_timeout_hit(deadline))
5790                 return (SET_ERROR(ETIMEDOUT));
5791 
5792         /* Retrieve the persistent L2ARC device state */
5793         dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
5794         dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5795             dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
5796             LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
5797         dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
5798             L2ARC_DEV_HDR_EVICT_FIRST);
5799 
5800         /* Prepare the rebuild processing state */
5801         bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
5802         this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
5803         next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
5804         this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5805         next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5806         first_pass = B_TRUE;
5807 
5808         /* Start the rebuild process */
5809         for (;;) {
5810                 if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
5811                         /* We hit an invalid block address, end the rebuild. */
5812                         break;
5813 
5814                 if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
5815                     this_lb, next_lb, this_lb_buf, next_lb_buf,
5816                     this_io, &next_io)) != 0)
5817                         break;
5818 
5819                 /* Protection against infinite loops of log blocks. */
5820                 if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
5821                     lb_ptrs[0].l2lbp_daddr,
5822                     dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5823                     !first_pass) {
5824                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
5825                         err = SET_ERROR(ELOOP);
5826                         break;
5827                 }
5828 
5829                 /*
5830                  * Our memory pressure valve. If the system is running low
5831                  * on memory, rather than swamping memory with new ARC buf
5832                  * hdrs, we opt not to rebuild the L2ARC. At this point,
5833                  * however, we have already set up our L2ARC dev to chain in
5834                  * new metadata log blk, so the user may choose to re-add the
5835                  * L2ARC dev at a later time to reconstruct it (when there's
5836                  * less memory pressure).
5837                  */
5838                 if (arc_reclaim_needed()) {
5839                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5840                         cmn_err(CE_NOTE, "System running low on memory, "
5841                             "aborting L2ARC rebuild.");
5842                         err = SET_ERROR(ENOMEM);
5843                         break;
5844                 }
5845 
5846                 /*
5847                  * Now that we know that the next_lb checks out alright, we
5848                  * can start reconstruction from this lb - we can be sure
5849                  * that the L2ARC write hand has not yet reached any of our
5850                  * buffers.
5851                  */
5852                 l2arc_log_blk_restore(dev, load_guid, this_lb,
5853                     LBP_GET_PSIZE(&lb_ptrs[0]));
5854 
5855                 /*
5856                  * End of list detection. We can look ahead two steps in the
5857                  * blk chain and if the 2nd blk from this_lb dips below the
5858                  * initial chain starting point, then we know two things:
5859                  *      1) it can't be valid, and
5860                  *      2) the next_lb's ARC entries might have already been
5861                  *      partially overwritten and so we should stop before
5862                  *      we restore it
5863                  */
5864                 if (l2arc_range_check_overlap(
5865                     this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
5866                     dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5867                     !first_pass)
5868                         break;
5869 
5870                 /* log blk restored, continue with next one in the list */
5871                 lb_ptrs[0] = lb_ptrs[1];
5872                 lb_ptrs[1] = this_lb->l2lb_back2_lbp;
5873                 PTR_SWAP(this_lb, next_lb);
5874                 PTR_SWAP(this_lb_buf, next_lb_buf);
5875                 this_io = next_io;
5876                 next_io = NULL;
5877                 first_pass = B_FALSE;
5878 
5879                 if (l2arc_check_rebuild_timeout_hit(deadline)) {
5880                         err = SET_ERROR(ETIMEDOUT);
5881                         break;
5882                 }
5883         }
5884         if (next_io != NULL)
5885                 l2arc_log_blk_prefetch_abort(next_io);
5886         kmem_free(this_lb, sizeof (*this_lb));
5887         kmem_free(next_lb, sizeof (*next_lb));
5888         kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
5889         kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
5890         if (err == 0)
5891                 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5892 
5893         return (err);
5894 }
5895 
5896 /*
5897  * Restores the payload of a log blk to ARC. This creates empty ARC hdr
5898  * entries which only contain an l2arc hdr, essentially restoring the
5899  * buffers to their L2ARC evicted state. This function also updates space
5900  * usage on the L2ARC vdev to make sure it tracks restored buffers.
5901  */
5902 static void
5903 l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
5904     l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
5905 {
5906         uint64_t        size = 0, psize = 0;
5907 
5908         mutex_enter(&l2arc_buflist_mtx);
5909 
5910         for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
5911                 /*
5912                  * Restore goes in the reverse direction to preserve correct
5913                  * temporal ordering of buffers in the l2ad_buflist.
5914                  */
5915                 l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
5916                 size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
5917                 psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
5918         }
5919         mutex_exit(&l2arc_buflist_mtx);
5920 
5921         /*
5922          * Record rebuild stats:
5923          *      size            In-memory size of restored buffer data in ARC
5924          *      psize           Physical size of restored buffers in the L2ARC
5925          *      bufs            # of ARC buffer headers restored
5926          *      log_blks        # of L2ARC log entries processed during restore
5927          */
5928         ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
5929         ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
5930         ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
5931         ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
5932         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
5933         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
5934         vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5935 }
5936 
5937 /*
5938  * Restores a single ARC buf hdr from a log block. The ARC buffer is put
5939  * into a state indicating that it has been evicted to L2ARC.
5940  */
5941 static void
5942 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
5943     uint64_t load_guid)
5944 {
5945         arc_buf_hdr_t   *hdr, *exists;
5946         kmutex_t        *hash_lock;
5947         arc_buf_contents_t      type = LE_GET_TYPE(le);
5948         l2arc_buf_hdr_t         *l2hdr;
5949 
5950         hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
5951         hdr->b_dva = le->l2le_dva;
5952         hdr->b_birth = le->l2le_birth;
5953         hdr->b_cksum0 = le->l2le_cksum0;
5954         hdr->b_size = LE_GET_LSIZE(le);
5955         exists = buf_hash_insert(hdr, &hash_lock);
5956         if (exists) {
5957                 /* Buffer was already cached, no need to restore it. */
5958                 mutex_exit(hash_lock);
5959                 arc_hdr_destroy(hdr);
5960                 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5961                 return;
5962         }
5963         hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
5964         if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
5965                 hdr->b_flags |= ARC_L2COMPRESS;
5966         mutex_enter(&hdr->b_freeze_lock);
5967         ASSERT(hdr->b_freeze_cksum == NULL);
5968         hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
5969         *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
5970         mutex_exit(&hdr->b_freeze_lock);
5971 
5972         /* now rebuild the l2arc entry */
5973         ASSERT(hdr->b_l2hdr == NULL);
5974         l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5975         l2hdr->b_dev = dev;
5976         l2hdr->b_daddr = le->l2le_daddr;
5977         l2hdr->b_asize = LE_GET_PSIZE(le);
5978         l2hdr->b_compress = LE_GET_COMPRESS(le);
5979         hdr->b_l2hdr = l2hdr;
5980         list_insert_tail(dev->l2ad_buflist, hdr);
5981         ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5982         ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5983 
5984         arc_change_state(arc_l2c_only, hdr, hash_lock);
5985         mutex_exit(hash_lock);
5986 }
5987 
5988 /*
5989  * Attempts to read the device header on the provided L2ARC device and writes
5990  * it to `ub'. On success, this function returns 0, otherwise the appropriate
5991  * error code is returned.
5992  */
5993 static int
5994 l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
5995 {
5996         int             err;
5997         uint64_t        guid;
5998         zio_cksum_t     cksum;
5999 
6000         guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6001 
6002         if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
6003             VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
6004             ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6005             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6006             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
6007                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6008                 return (err);
6009         }
6010 
6011         if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
6012                 byteswap_uint64_array(hdr, sizeof (*hdr));
6013 
6014         if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
6015             hdr->l2dh_spa_guid != guid) {
6016                 /*
6017                  * Attempt to rebuild a device containing no actual dev hdr
6018                  * or containing a header from some other pool.
6019                  */
6020                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6021                 return (SET_ERROR(ENOTSUP));
6022         }
6023 
6024         l2arc_dev_hdr_checksum(hdr, &cksum);
6025         if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
6026                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6027                 return (SET_ERROR(EINVAL));
6028         }
6029         if (hdr->l2dh_evict_tail < dev->l2ad_start ||
6030             hdr->l2dh_evict_tail >= dev->l2ad_end) {
6031                 /* Data in dev hdr is invalid for this device. */
6032                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6033                 return (SET_ERROR(EINVAL));
6034         }
6035 
6036         return (0);
6037 }
6038 
6039 /*
6040  * Reads L2ARC log blocks from storage and validates their contents.
6041  *
6042  * This function implements a simple prefetcher to make sure that while
6043  * we're processing one buffer the L2ARC is already prefetching the next
6044  * one in the chain.
6045  *
6046  * The arguments this_lp and next_lp point to the current and next log blk
6047  * address in the block chain. Similarly, this_lb and next_lb hold the
6048  * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
6049  * and next_lb_buf must be buffers of appropriate to hold a raw
6050  * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
6051  * to buffer decompression).
6052  *
6053  * The `this_io' and `next_io' arguments are used for block prefetching.
6054  * When issuing the first blk IO during rebuild, you should pass NULL for
6055  * `this_io'. This function will then issue a sync IO to read the block and
6056  * also issue an async IO to fetch the next block in the block chain. The
6057  * prefetch IO is returned in `next_io'. On subsequent calls to this
6058  * function, pass the value returned in `next_io' from the previous call
6059  * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
6060  * Prior to the call, you should initialize your `next_io' pointer to be
6061  * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
6062  *
6063  * On success, this function returns 0, otherwise it returns an appropriate
6064  * error code. On error the prefetching IO is aborted and cleared before
6065  * returning from this function. Therefore, if we return `success', the
6066  * caller can assume that we have taken care of cleanup of prefetch IOs.
6067  */
6068 static int
6069 l2arc_log_blk_read(l2arc_dev_t *dev,
6070     const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
6071     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
6072     uint8_t *this_lb_buf, uint8_t *next_lb_buf,
6073     zio_t *this_io, zio_t **next_io)
6074 {
6075         int err = 0;
6076         zio_cksum_t cksum;
6077 
6078         ASSERT(this_lbp != NULL && next_lbp != NULL);
6079         ASSERT(this_lb != NULL && next_lb != NULL);
6080         ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
6081         ASSERT(next_io != NULL && *next_io == NULL);
6082         ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
6083 
6084         /*
6085          * Check to see if we have issued the IO for this log blk in a
6086          * previous run. If not, this is the first call, so issue it now.
6087          */
6088         if (this_io == NULL) {
6089                 this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
6090                     this_lb_buf);
6091         }
6092 
6093         /*
6094          * Peek to see if we can start issuing the next IO immediately.
6095          */
6096         if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
6097                 /*
6098                  * Start issuing IO for the next log blk early - this
6099                  * should help keep the L2ARC device busy while we
6100                  * decompress and restore this log blk.
6101                  */
6102                 *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
6103                     next_lb_buf);
6104         }
6105 
6106         /* Wait for the IO to read this log block to complete */
6107         if ((err = zio_wait(this_io)) != 0) {
6108                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6109                 goto cleanup;
6110         }
6111 
6112         /* Make sure the buffer checks out */
6113         fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
6114         if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
6115                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6116                 err = SET_ERROR(EINVAL);
6117                 goto cleanup;
6118         }
6119 
6120         /* Now we can take our time decoding this buffer */
6121         switch (LBP_GET_COMPRESS(this_lbp)) {
6122         case ZIO_COMPRESS_OFF:
6123                 bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
6124                 break;
6125         case ZIO_COMPRESS_LZ4:
6126                 if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
6127                     this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
6128                     sizeof (*this_lb))) != 0) {
6129                         err = SET_ERROR(EINVAL);
6130                         goto cleanup;
6131                 }
6132                 break;
6133         default:
6134                 err = SET_ERROR(EINVAL);
6135                 goto cleanup;
6136         }
6137         if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
6138                 byteswap_uint64_array(this_lb, sizeof (*this_lb));
6139         if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
6140                 err = SET_ERROR(EINVAL);
6141                 goto cleanup;
6142         }
6143 cleanup:
6144         /* Abort an in-flight prefetch I/O in case of error */
6145         if (err != 0 && *next_io != NULL) {
6146                 l2arc_log_blk_prefetch_abort(*next_io);
6147                 *next_io = NULL;
6148         }
6149         return (err);
6150 }
6151 
6152 /*
6153  * Validates an L2ARC log blk address to make sure that it can be read
6154  * from the provided L2ARC device. Returns B_TRUE if the address is
6155  * within the device's bounds, or B_FALSE if not.
6156  */
6157 static boolean_t
6158 l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
6159 {
6160         uint64_t psize = LBP_GET_PSIZE(lbp);
6161         uint64_t end = lbp->l2lbp_daddr + psize;
6162 
6163         /*
6164          * A log block is valid if all of the following conditions are true:
6165          * - it fits entirely between l2ad_start and l2ad_end
6166          * - it has a valid size
6167          * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
6168          *      doesn't sit in the evicted region)
6169          */
6170         return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
6171             psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
6172             lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
6173 }
6174 
6175 /*
6176  * Starts an asynchronous read IO to read a log block. This is used in log
6177  * block reconstruction to start reading the next block before we are done
6178  * decoding and reconstructing the current block, to keep the l2arc device
6179  * nice and hot with read IO to process.
6180  * The returned zio will contain a newly allocated memory buffers for the IO
6181  * data which should then be freed by the caller once the zio is no longer
6182  * needed (i.e. due to it having completed). If you wish to abort this
6183  * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes
6184  * care of disposing of the allocated buffers correctly.
6185  */
6186 static zio_t *
6187 l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
6188     uint8_t *lb_buf)
6189 {
6190         uint32_t psize;
6191         zio_t *pio;
6192 
6193         psize = LBP_GET_PSIZE(lbp);
6194         ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
6195         pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
6196             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6197             ZIO_FLAG_DONT_RETRY);
6198         (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
6199             lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6200             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6201             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
6202 
6203         return (pio);
6204 }
6205 
6206 /*
6207  * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
6208  * buffers allocated for it.
6209  */
6210 static void
6211 l2arc_log_blk_prefetch_abort(zio_t *zio)
6212 {
6213         (void) zio_wait(zio);
6214 }
6215 
6216 /*
6217  * Creates a zio to update the device header on an l2arc device. The zio is
6218  * initiated as a child of `pio'.
6219  */
6220 static void
6221 l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
6222 {
6223         zio_t *wzio;
6224         vdev_stat_t st;
6225         l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
6226 
6227         vdev_get_stats(dev->l2ad_vdev, &st);
6228 
6229         hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
6230         hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6231         hdr->l2dh_evict_tail = dev->l2ad_evict;
6232         hdr->l2dh_alloc_space = st.vs_alloc;
6233         hdr->l2dh_flags = 0;
6234         if (dev->l2ad_first)
6235                 hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
6236 
6237         /* checksum operation goes last */
6238         l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
6239 
6240         CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
6241             sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
6242         wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6243             sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
6244             NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6245         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6246             zio_t *, wzio);
6247         (void) zio_nowait(wzio);
6248 }
6249 
6250 /*
6251  * Commits a log block to the L2ARC device. This routine is invoked from
6252  * l2arc_write_buffers when the log block fills up.
6253  * This function allocates some memory to temporarily hold the serialized
6254  * buffer to be written. This is then released in l2arc_write_done.
6255  */
6256 static void
6257 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
6258     l2arc_write_callback_t *cb)
6259 {
6260         l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
6261         uint64_t                psize, asize;
6262         l2arc_log_blk_buf_t     *lb_buf;
6263         zio_t                   *wzio;
6264 
6265         VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6266 
6267         /* link the buffer into the block chain */
6268         lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
6269         lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
6270 
6271         /* try to compress the buffer */
6272         lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
6273         list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
6274         VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
6275             lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
6276 
6277         /*
6278          * Update the start log blk pointer in the device header to point
6279          * to the log block we're about to write.
6280          */
6281         dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
6282             dev->l2ad_dev_hdr.l2dh_start_lbps[0];
6283         dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
6284         LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
6285         LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
6286         LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6287             ZIO_CHECKSUM_FLETCHER_4);
6288         LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
6289         if (psize < sizeof (*lb)) {
6290                 /* compression succeeded */
6291                 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6292                     ZIO_COMPRESS_LZ4);
6293         } else {
6294                 /* compression failed */
6295                 bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
6296                 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6297                     ZIO_COMPRESS_OFF);
6298         }
6299         /* checksum what we're about to write */
6300         fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
6301             &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
6302 
6303         /* perform the write itself */
6304         CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
6305             L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
6306         wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
6307             psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
6308             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6309         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
6310         (void) zio_nowait(wzio);
6311 
6312         /* realign the device hand */
6313         asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
6314         dev->l2ad_hand += asize;
6315         VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
6316         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
6317 
6318         /* bump the kstats */
6319         ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
6320         ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
6321         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
6322         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
6323             dev->l2ad_log_blk_payload_asize / asize);
6324 
6325         dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
6326 }
6327 
6328 /*
6329  * Computes the checksum of `hdr' and stores it in `cksum'.
6330  */
6331 static void
6332 l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
6333 {
6334         fletcher_4_native((uint8_t *)hdr +
6335             offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6336             sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6337             cksum);
6338 }
6339 
6340 /*
6341  * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
6342  * The buffer being inserted must be present in L2ARC.
6343  * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
6344  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
6345  */
6346 static boolean_t
6347 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
6348 {
6349         l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6350         l2arc_log_ent_phys_t *le;
6351         const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
6352         int index = dev->l2ad_log_ent_idx++;
6353 
6354         ASSERT(l2hdr != NULL);
6355         ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
6356 
6357         le = &lb->l2lb_entries[index];
6358         bzero(le, sizeof (*le));
6359         le->l2le_dva = ab->b_dva;
6360         le->l2le_birth = ab->b_birth;
6361         le->l2le_cksum0 = ab->b_cksum0;
6362         le->l2le_daddr = l2hdr->b_daddr;
6363         LE_SET_LSIZE(le, ab->b_size);
6364         LE_SET_PSIZE(le, l2hdr->b_asize);
6365         LE_SET_COMPRESS(le, l2hdr->b_compress);
6366         le->l2le_freeze_cksum = *ab->b_freeze_cksum;
6367         LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
6368         LE_SET_TYPE(le, ab->b_type);
6369         dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
6370 
6371         return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6372 }
6373 
6374 /*
6375  * Checks whether a given L2ARC device address sits in a time-sequential
6376  * range. The trick here is that the L2ARC is a rotary buffer, so we can't
6377  * just do a range comparison, we need to handle the situation in which the
6378  * range wraps around the end of the L2ARC device. Arguments:
6379  *      bottom  Lower end of the range to check (written to earlier).
6380  *      top     Upper end of the range to check (written to later).
6381  *      check   The address for which we want to determine if it sits in
6382  *              between the top and bottom.
6383  *
6384  * The 3-way conditional below represents the following cases:
6385  *
6386  *      bottom < top : Sequentially ordered case:
6387  *        <check>--------+-------------------+
6388  *                       |  (overlap here?)  |
6389  *       L2ARC dev       V                   V
6390  *       |---------------<bottom>============<top>--------------|
6391  *
6392  *      bottom > top: Looped-around case:
6393  *                            <check>--------+------------------+
6394  *                                           |  (overlap here?) |
6395  *       L2ARC dev                           V                  V
6396  *       |===============<top>---------------<bottom>===========|
6397  *       ^               ^
6398  *       |  (or here?)   |
6399  *       +---------------+---------<check>
6400  *
6401  *      top == bottom : Just a single address comparison.
6402  */
6403 static inline boolean_t
6404 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
6405 {
6406         if (bottom < top)
6407                 return (bottom <= check && check <= top);
6408         else if (bottom > top)
6409                 return (check <= top || bottom <= check);
6410         else
6411                 return (check == top);
6412 }
6413 
6414 /*
6415  * Checks whether a rebuild timeout deadline has been hit and if it has,
6416  * increments the appropriate error counters.
6417  */
6418 static boolean_t
6419 l2arc_check_rebuild_timeout_hit(int64_t deadline)
6420 {
6421         if (deadline != 0 && deadline < ddi_get_lbolt64()) {
6422                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
6423                 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
6424                     "dropping remaining L2ARC metadata.");
6425                 return (B_TRUE);
6426         } else {
6427                 return (B_FALSE);
6428         }
6429 }
 |