3525 Persistent L2ARC

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
... 130 lines elided ...
 131  131  #include <sys/dsl_pool.h>
 132  132  #ifdef _KERNEL
 133  133  #include <sys/vmsystm.h>
 134  134  #include <vm/anon.h>
 135  135  #include <sys/fs/swapnode.h>
 136  136  #include <sys/dnlc.h>
 137  137  #endif
 138  138  #include <sys/callb.h>
 139  139  #include <sys/kstat.h>
 140  140  #include <zfs_fletcher.h>
      141 +#include <sys/byteorder.h>
      142 +#include <sys/spa_impl.h>
 141  143  
 142  144  #ifndef _KERNEL
 143  145  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 144  146  boolean_t arc_watch = B_FALSE;
 145  147  int arc_procfd;
 146  148  #endif
 147  149  
 148  150  static kmutex_t         arc_reclaim_thr_lock;
 149  151  static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 150  152  static uint8_t          arc_thread_exit;
... 158 lines elided ...
 309  311          kstat_named_t arcstat_l2_free_on_write;
 310  312          kstat_named_t arcstat_l2_abort_lowmem;
 311  313          kstat_named_t arcstat_l2_cksum_bad;
 312  314          kstat_named_t arcstat_l2_io_error;
 313  315          kstat_named_t arcstat_l2_size;
 314  316          kstat_named_t arcstat_l2_asize;
 315  317          kstat_named_t arcstat_l2_hdr_size;
 316  318          kstat_named_t arcstat_l2_compress_successes;
 317  319          kstat_named_t arcstat_l2_compress_zeros;
 318  320          kstat_named_t arcstat_l2_compress_failures;
      321 +        kstat_named_t arcstat_l2_log_blk_writes;
      322 +        kstat_named_t arcstat_l2_log_blk_avg_size;
      323 +        kstat_named_t arcstat_l2_data_to_meta_ratio;
      324 +        kstat_named_t arcstat_l2_rebuild_successes;
      325 +        kstat_named_t arcstat_l2_rebuild_abort_unsupported;
      326 +        kstat_named_t arcstat_l2_rebuild_abort_timeout;
      327 +        kstat_named_t arcstat_l2_rebuild_abort_io_errors;
      328 +        kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
      329 +        kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
      330 +        kstat_named_t arcstat_l2_rebuild_abort_lowmem;
      331 +        kstat_named_t arcstat_l2_rebuild_size;
      332 +        kstat_named_t arcstat_l2_rebuild_bufs;
      333 +        kstat_named_t arcstat_l2_rebuild_bufs_precached;
      334 +        kstat_named_t arcstat_l2_rebuild_psize;
      335 +        kstat_named_t arcstat_l2_rebuild_log_blks;
 319  336          kstat_named_t arcstat_memory_throttle_count;
 320  337          kstat_named_t arcstat_duplicate_buffers;
 321  338          kstat_named_t arcstat_duplicate_buffers_size;
 322  339          kstat_named_t arcstat_duplicate_reads;
 323  340          kstat_named_t arcstat_meta_used;
 324  341          kstat_named_t arcstat_meta_limit;
 325  342          kstat_named_t arcstat_meta_max;
 326  343  } arc_stats_t;
 327  344  
 328  345  static arc_stats_t arc_stats = {
... 46 lines elided ...
 375  392          { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 376  393          { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 377  394          { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 378  395          { "l2_io_error",                KSTAT_DATA_UINT64 },
 379  396          { "l2_size",                    KSTAT_DATA_UINT64 },
 380  397          { "l2_asize",                   KSTAT_DATA_UINT64 },
 381  398          { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 382  399          { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 383  400          { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 384  401          { "l2_compress_failures",       KSTAT_DATA_UINT64 },
      402 +        { "l2_log_blk_writes",          KSTAT_DATA_UINT64 },
      403 +        { "l2_log_blk_avg_size",        KSTAT_DATA_UINT64 },
      404 +        { "l2_data_to_meta_ratio",      KSTAT_DATA_UINT64 },
      405 +        { "l2_rebuild_successes",       KSTAT_DATA_UINT64 },
      406 +        { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
      407 +        { "l2_rebuild_timeout",         KSTAT_DATA_UINT64 },
      408 +        { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
      409 +        { "l2_rebuild_cksum_errors",    KSTAT_DATA_UINT64 },
      410 +        { "l2_rebuild_loop_errors",     KSTAT_DATA_UINT64 },
      411 +        { "l2_rebuild_lowmem",          KSTAT_DATA_UINT64 },
       412 +        { "l2_rebuild_size",            KSTAT_DATA_UINT64 },
       413 +        { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
       414 +        { "l2_rebuild_bufs_precached",  KSTAT_DATA_UINT64 },
       415 +        { "l2_rebuild_psize",           KSTAT_DATA_UINT64 },
      416 +        { "l2_rebuild_log_blks",        KSTAT_DATA_UINT64 },
 385  417          { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 386  418          { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 387  419          { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 388  420          { "duplicate_reads",            KSTAT_DATA_UINT64 },
 389  421          { "arc_meta_used",              KSTAT_DATA_UINT64 },
 390  422          { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 391  423          { "arc_meta_max",               KSTAT_DATA_UINT64 }
 392  424  };
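
Note: the counters added above surface through the existing "arcstats" kstat (module "zfs", instance 0), so once this change is in place they can be inspected with, for example, "kstat -p zfs:0:arcstats:l2_rebuild_successes"; no consumer-side changes are needed.
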
 393  425  
 394  426  #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
... 27 lines elided ...
 422  454                          ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 423  455                  }                                                       \
 424  456          } else {                                                        \
 425  457                  if (cond2) {                                            \
 426  458                          ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 427  459                  } else {                                                \
 428  460                          ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 429  461                  }                                                       \
 430  462          }
 431  463  
      464 +/*
      465 + * This macro allows us to use kstats as floating averages. Each time we
      466 + * update this kstat, we first factor it and the update value by
       467 + * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
      468 + * average. This macro assumes that integer loads and stores are atomic, but
      469 + * is not safe for multiple writers updating the kstat in parallel (only the
      470 + * last writer's update will remain).
      471 + */
      472 +#define ARCSTAT_F_AVG_FACTOR    3
      473 +#define ARCSTAT_F_AVG(stat, value) \
      474 +        do { \
      475 +                uint64_t x = ARCSTAT(stat); \
      476 +                x = x - x / ARCSTAT_F_AVG_FACTOR + \
      477 +                    (value) / ARCSTAT_F_AVG_FACTOR; \
      478 +                ARCSTAT(stat) = x; \
      479 +                _NOTE(NOTREACHED) \
      480 +                _NOTE(CONSTCOND) \
      481 +        } while (0)
      482 +
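
To illustrate the averaging behaviour: with an ARCSTAT_F_AVG_FACTOR of 3, repeated updates with a constant value converge on that value, each update retaining 2/3 of the previous average. A sketch, using one of the new kstats:

	ARCSTAT(arcstat_l2_log_blk_avg_size) = 0;
	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, 12000);	/* avg: 4000 */
	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, 12000);	/* avg: 6667 */
	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, 12000);	/* avg: 8445 */
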
 432  483  kstat_t                 *arc_ksp;
 433  484  static arc_state_t      *arc_anon;
 434  485  static arc_state_t      *arc_mru;
 435  486  static arc_state_t      *arc_mru_ghost;
 436  487  static arc_state_t      *arc_mfu;
 437  488  static arc_state_t      *arc_mfu_ghost;
 438  489  static arc_state_t      *arc_l2c_only;
 439  490  
 440  491  /*
 441  492   * There are several ARC variables that are critical to export as kstats --
... 188 lines elided ...
 630  681  uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 631  682  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 632  683  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 633  684  boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 634  685  boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 635  686  boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 636  687  
 637  688  /*
 638  689   * L2ARC Internals
 639  690   */
 640      -typedef struct l2arc_dev {
 641      -        vdev_t                  *l2ad_vdev;     /* vdev */
 642      -        spa_t                   *l2ad_spa;      /* spa */
 643      -        uint64_t                l2ad_hand;      /* next write location */
 644      -        uint64_t                l2ad_start;     /* first addr on device */
 645      -        uint64_t                l2ad_end;       /* last addr on device */
 646      -        uint64_t                l2ad_evict;     /* last addr eviction reached */
 647      -        boolean_t               l2ad_first;     /* first sweep through */
 648      -        boolean_t               l2ad_writing;   /* currently writing */
 649      -        list_t                  *l2ad_buflist;  /* buffer list */
 650      -        list_node_t             l2ad_node;      /* device list node */
 651      -} l2arc_dev_t;
 652      -
      691 +typedef struct l2arc_dev l2arc_dev_t;
 653  692  static list_t L2ARC_dev_list;                   /* device list */
 654  693  static list_t *l2arc_dev_list;                  /* device list pointer */
 655  694  static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 656  695  static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 657  696  static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 658  697  static list_t L2ARC_free_on_write;              /* free after write buf list */
 659  698  static list_t *l2arc_free_on_write;             /* free after write list ptr */
 660  699  static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 661  700  static uint64_t l2arc_ndev;                     /* number of devices */
 662  701  
... 2 lines elided ...
 665  704          spa_t                   *l2rcb_spa;             /* spa */
 666  705          blkptr_t                l2rcb_bp;               /* original blkptr */
 667  706          zbookmark_t             l2rcb_zb;               /* original bookmark */
 668  707          int                     l2rcb_flags;            /* original flags */
 669  708          enum zio_compress       l2rcb_compress;         /* applied compress */
 670  709  } l2arc_read_callback_t;
 671  710  
 672  711  typedef struct l2arc_write_callback {
 673  712          l2arc_dev_t     *l2wcb_dev;             /* device info */
 674  713          arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
      714 +        /* list of in-flight l2arc_log_blk_buf_t's */
      715 +        list_t          l2wcb_log_blk_buf_list;
 675  716  } l2arc_write_callback_t;
 676  717  
 677  718  struct l2arc_buf_hdr {
 678  719          /* protected by arc_buf_hdr  mutex */
 679  720          l2arc_dev_t             *b_dev;         /* L2ARC device */
 680  721          uint64_t                b_daddr;        /* disk address, offset byte */
 681  722          /* compression applied to buffer data */
 682  723          enum zio_compress       b_compress;
 683  724          /* real alloc'd buffer size depending on b_compress applied */
 684  725          int                     b_asize;
... 7 lines elided ...
 692  733          size_t          l2df_size;
 693  734          void            (*l2df_func)(void *, size_t);
 694  735          list_node_t     l2df_list_node;
 695  736  } l2arc_data_free_t;
 696  737  
 697  738  static kmutex_t l2arc_feed_thr_lock;
 698  739  static kcondvar_t l2arc_feed_thr_cv;
 699  740  static uint8_t l2arc_thread_exit;
 700  741  
 701  742  static void l2arc_read_done(zio_t *zio);
 702      -static void l2arc_hdr_stat_add(void);
      743 +static void l2arc_hdr_stat_add(boolean_t from_arc);
 703  744  static void l2arc_hdr_stat_remove(void);
      745 +static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
 704  746  
 705  747  static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 706  748  static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 707  749      enum zio_compress c);
 708  750  static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 709  751  
 710      -static uint64_t
      752 +enum {
      753 +        L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)    /* mirror of l2ad_first */
      754 +};
      755 +
      756 +/*
      757 + * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
      758 + */
      759 +typedef struct l2arc_log_blk_ptr {
      760 +        uint64_t        l2lbp_daddr;    /* device address of log */
      761 +        /*
      762 +         * l2lbp_prop is the same format as the blk_prop in blkptr_t:
      763 +         *      * logical size (in sectors)
      764 +         *      * physical (compressed) size (in sectors)
      765 +         *      * compression algorithm (we always LZ4-compress l2arc logs)
      766 +         *      * checksum algorithm (used for l2lbp_cksum)
      767 +         *      * object type & level (unused for now)
      768 +         */
      769 +        uint64_t        l2lbp_prop;
      770 +        zio_cksum_t     l2lbp_cksum;    /* fletcher4 of log */
      771 +} l2arc_log_blk_ptr_t;
      772 +
      773 +/*
      774 + * The persistent L2ARC device header.
      775 + */
      776 +typedef struct l2arc_dev_hdr_phys {
      777 +        uint64_t        l2dh_magic;
      778 +        zio_cksum_t     l2dh_self_cksum;        /* fletcher4 of fields below */
      779 +
      780 +        /*
      781 +         * Global L2ARC device state and metadata.
      782 +         */
      783 +        uint64_t        l2dh_spa_guid;
      784 +        uint64_t        l2dh_evict_tail;        /* current evict pointer */
      785 +        uint64_t        l2dh_alloc_space;       /* vdev space alloc status */
      786 +        uint64_t        l2dh_flags;             /* l2arc_dev_hdr_flags_t */
      787 +
      788 +        /*
      789 +         * Start of log block chain. [0] -> newest log, [1] -> one older (used
      790 +         * for initiating prefetch).
      791 +         */
      792 +        l2arc_log_blk_ptr_t     l2dh_start_lbps[2];
      793 +
      794 +        const uint64_t  l2dh_pad[43];           /* pad to 512 bytes */
      795 +} l2arc_dev_hdr_phys_t;
      796 +CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
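
The pad size checks out against the CTASSERT: the fixed fields occupy 8 (l2dh_magic) + 32 (l2dh_self_cksum) + 4 x 8 (guid, evict_tail, alloc_space, flags) + 2 x 48 (l2dh_start_lbps) = 168 bytes, and 43 x 8 = 344 bytes of padding brings the structure to SPA_MINBLOCKSIZE (512 bytes).
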
      797 +
      798 +/*
      799 + * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
      800 + */
      801 +typedef struct l2arc_log_ent_phys {
      802 +        dva_t                   l2le_dva;       /* dva of buffer */
      803 +        uint64_t                l2le_birth;     /* birth txg of buffer */
      804 +        uint64_t                l2le_cksum0;
      805 +        zio_cksum_t             l2le_freeze_cksum;
      806 +        /*
      807 +         * l2le_prop is the same format as the blk_prop in blkptr_t:
      808 +         *      * logical size (in sectors)
      809 +         *      * physical (compressed) size (in sectors)
      810 +         *      * compression algorithm
      811 +         *      * checksum algorithm (used for cksum0)
      812 +         *      * object type & level (used to restore arc_buf_contents_t)
      813 +         */
      814 +        uint64_t                l2le_prop;
      815 +        uint64_t                l2le_daddr;     /* buf location on l2dev */
      816 +        const uint64_t          l2le_pad[6];    /* resv'd for future use */
      817 +} l2arc_log_ent_phys_t;
      818 +
      819 +/*
      820 + * These design limits give us the following overhead (before compression):
      821 + *      avg_blk_sz      overhead
      822 + *      1k              12.51 %
      823 + *      2k               6.26 %
      824 + *      4k               3.13 %
      825 + *      8k               1.56 %
      826 + *      16k              0.78 %
      827 + *      32k              0.39 %
      828 + *      64k              0.20 %
      829 + *      128k             0.10 %
       830 + * Compression should be able to squeeze these down by about a factor of 2.
      831 + */
      832 +#define L2ARC_LOG_BLK_SIZE                      (128 * 1024)    /* 128k */
      833 +#define L2ARC_LOG_BLK_HEADER_LEN                (128)
      834 +#define L2ARC_LOG_BLK_ENTRIES                   /* 1023 entries */      \
      835 +        ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) /              \
      836 +        sizeof (l2arc_log_ent_phys_t))
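
The overhead table above follows from these limits: each cached buffer costs one log entry, so before compression the metadata overhead is L2ARC_LOG_BLK_SIZE / (L2ARC_LOG_BLK_ENTRIES * avg_blk_sz); for 4k buffers that works out to 131072 / (1023 * 4096), or about 3.13 %.
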
      837 +/*
      838 + * Maximum amount of data in an l2arc log block (used to terminate rebuilding
      839 + * before we hit the write head and restore potentially corrupted blocks).
      840 + */
      841 +#define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE  \
      842 +        (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
      843 +/*
      844 + * For the persistency and rebuild algorithms to operate reliably we need
      845 + * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
      846 + * excessive log block looping might confuse the log chain end detection).
       847 + * Under normal circumstances this is not a problem, since this amounts to
       848 + * only around 400 MB.
      849 + */
      850 +#define L2ARC_PERSIST_MIN_SIZE  (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
      851 +
      852 +/*
      853 + * A log block of up to 1023 ARC buffer log entries, chained into the
      854 + * persistent L2ARC metadata linked list.
      855 + */
      856 +typedef struct l2arc_log_blk_phys {
      857 +        /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
      858 +        uint64_t                l2lb_magic;
      859 +        l2arc_log_blk_ptr_t     l2lb_back2_lbp; /* back 2 steps in chain */
      860 +        uint64_t                l2lb_pad[9];    /* resv'd for future use */
      861 +        /* Payload */
      862 +        l2arc_log_ent_phys_t    l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
      863 +} l2arc_log_blk_phys_t;
      864 +
      865 +CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
      866 +CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
      867 +    offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
      868 +
      869 +/*
      870 + * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
      871 + * written to the L2ARC device. They may be compressed, hence the uint8_t[].
      872 + */
      873 +typedef struct l2arc_log_blk_buf {
      874 +        uint8_t         l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
      875 +        list_node_t     l2lbb_node;
      876 +} l2arc_log_blk_buf_t;
      877 +
       878 +/* Macros for manipulating fields in the blk_prop format of blkptr_t */
      879 +#define BLKPROP_GET_LSIZE(_obj, _field)         \
      880 +        BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
      881 +#define BLKPROP_SET_LSIZE(_obj, _field, x)      \
      882 +        BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
      883 +#define BLKPROP_GET_PSIZE(_obj, _field)         \
      884 +        BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
      885 +#define BLKPROP_SET_PSIZE(_obj, _field, x)      \
      886 +        BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
      887 +#define BLKPROP_GET_COMPRESS(_obj, _field)      \
      888 +        BF64_GET((_obj)->_field, 32, 8)
      889 +#define BLKPROP_SET_COMPRESS(_obj, _field, x)   \
      890 +        BF64_SET((_obj)->_field, 32, 8, x)
      891 +#define BLKPROP_GET_CHECKSUM(_obj, _field)      \
      892 +        BF64_GET((_obj)->_field, 40, 8)
      893 +#define BLKPROP_SET_CHECKSUM(_obj, _field, x)   \
      894 +        BF64_SET((_obj)->_field, 40, 8, x)
      895 +#define BLKPROP_GET_TYPE(_obj, _field)          \
      896 +        BF64_GET((_obj)->_field, 48, 8)
      897 +#define BLKPROP_SET_TYPE(_obj, _field, x)       \
      898 +        BF64_SET((_obj)->_field, 48, 8, x)
      899 +
      900 +/* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
      901 +#define LBP_GET_LSIZE(_add)             BLKPROP_GET_LSIZE(_add, l2lbp_prop)
      902 +#define LBP_SET_LSIZE(_add, x)          BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
      903 +#define LBP_GET_PSIZE(_add)             BLKPROP_GET_PSIZE(_add, l2lbp_prop)
      904 +#define LBP_SET_PSIZE(_add, x)          BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
      905 +#define LBP_GET_COMPRESS(_add)          BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
      906 +#define LBP_SET_COMPRESS(_add, x)       BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
      907 +    x)
      908 +#define LBP_GET_CHECKSUM(_add)          BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
      909 +#define LBP_SET_CHECKSUM(_add, x)       BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
      910 +    x)
      911 +#define LBP_GET_TYPE(_add)              BLKPROP_GET_TYPE(_add, l2lbp_prop)
      912 +#define LBP_SET_TYPE(_add, x)           BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
      913 +
      914 +/* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
      915 +#define LE_GET_LSIZE(_le)       BLKPROP_GET_LSIZE(_le, l2le_prop)
      916 +#define LE_SET_LSIZE(_le, x)    BLKPROP_SET_LSIZE(_le, l2le_prop, x)
      917 +#define LE_GET_PSIZE(_le)       BLKPROP_GET_PSIZE(_le, l2le_prop)
      918 +#define LE_SET_PSIZE(_le, x)    BLKPROP_SET_PSIZE(_le, l2le_prop, x)
      919 +#define LE_GET_COMPRESS(_le)    BLKPROP_GET_COMPRESS(_le, l2le_prop)
      920 +#define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
      921 +#define LE_GET_CHECKSUM(_le)    BLKPROP_GET_CHECKSUM(_le, l2le_prop)
      922 +#define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
      923 +#define LE_GET_TYPE(_le)        BLKPROP_GET_TYPE(_le, l2le_prop)
      924 +#define LE_SET_TYPE(_le, x)     BLKPROP_SET_TYPE(_le, l2le_prop, x)
      925 +
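
As a hypothetical illustration of this encoding (not part of the change), filling out a pointer for a log block that LZ4-compressed from 128k down to 40k might look like this, assuming "dev" is the l2arc_dev_t in scope:

	l2arc_log_blk_ptr_t lbp = { 0 };

	lbp.l2lbp_daddr = dev->l2ad_hand;
	LBP_SET_LSIZE(&lbp, sizeof (l2arc_log_blk_phys_t));
	LBP_SET_PSIZE(&lbp, 40960);	/* must be a multiple of 512 */
	LBP_SET_COMPRESS(&lbp, ZIO_COMPRESS_LZ4);
	LBP_SET_CHECKSUM(&lbp, ZIO_CHECKSUM_FLETCHER_4);
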
      926 +#define PTR_SWAP(x, y)          \
      927 +        do {                    \
      928 +                void *tmp = (x);\
      929 +                x = y;          \
      930 +                y = tmp;        \
      931 +                _NOTE(CONSTCOND)\
      932 +        } while (0)
      933 +
      934 +#define L2ARC_DEV_HDR_MAGIC     0x12bab10c00000001LLU
      935 +#define L2ARC_LOG_BLK_MAGIC     0x120103b10c000001LLU
      936 +#define L2ARC_REBUILD_TIMEOUT   300     /* a rebuild may take at most 300s */
      937 +
      938 +struct l2arc_dev {
      939 +        vdev_t                  *l2ad_vdev;     /* vdev */
      940 +        spa_t                   *l2ad_spa;      /* spa */
      941 +        uint64_t                l2ad_hand;      /* next write location */
      942 +        uint64_t                l2ad_start;     /* first addr on device */
      943 +        uint64_t                l2ad_end;       /* last addr on device */
      944 +        uint64_t                l2ad_evict;     /* last addr eviction reached */
      945 +        boolean_t               l2ad_first;     /* first sweep through */
      946 +        boolean_t               l2ad_writing;   /* currently writing */
      947 +        list_t                  *l2ad_buflist;  /* buffer list */
      948 +        list_node_t             l2ad_node;      /* device list node */
      949 +        l2arc_dev_hdr_phys_t    l2ad_dev_hdr;   /* persistent device header */
      950 +        l2arc_log_blk_phys_t    l2ad_log_blk;   /* currently open log block */
      951 +        int                     l2ad_log_ent_idx; /* index into cur log blk */
      952 +        /* number of bytes in current log block's payload */
      953 +        uint64_t                l2ad_log_blk_payload_asize;
       954 +        /* flag indicating whether a rebuild is scheduled or in progress */
      955 +        boolean_t               l2ad_rebuild;
      956 +};
      957 +
      958 +/*
      959 + * Performance tuning of L2ARC persistency:
      960 + *
      961 + * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
      962 + *              pool import or when adding one manually later) will attempt
      963 + *              to rebuild L2ARC buffer contents. In special circumstances,
      964 + *              the administrator may want to set this to B_FALSE, if they
      965 + *              are having trouble importing a pool or attaching an L2ARC
      966 + *              device (e.g. the L2ARC device is slow to read in stored log
      967 + *              metadata, or the metadata has become somehow
      968 + *              fragmented/unusable).
       969 + * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to
       970 + *              prevent a slow L2ARC device from stalling pool import. If we
       971 + *              are not done rebuilding an L2ARC device by this time, we
       972 + *              stop the rebuild and return immediately.
      973 + */
      974 +boolean_t l2arc_rebuild_enabled = B_TRUE;
      975 +uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
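
As with other ZFS tunables on illumos, both can be overridden from /etc/system, e.g. "set zfs:l2arc_rebuild_enabled = 0" to disable rebuilds entirely, or "set zfs:l2arc_rebuild_timeout = 60" to shorten the deadline.
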
      976 +
      977 +/*
      978 + * L2ARC persistency rebuild routines.
      979 + */
      980 +static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
      981 +static int l2arc_rebuild(l2arc_dev_t *dev);
      982 +static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
      983 +    l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
      984 +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
      985 +    l2arc_dev_t *dev, uint64_t guid);
      986 +
      987 +/*
      988 + * L2ARC persistency read I/O routines.
      989 + */
      990 +static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
      991 +static int l2arc_log_blk_read(l2arc_dev_t *dev,
      992 +    const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
      993 +    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
      994 +    uint8_t *this_lb_buf, uint8_t *next_lb_buf,
      995 +    zio_t *this_io, zio_t **next_io);
      996 +static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
      997 +    const l2arc_log_blk_ptr_t *lp);
      998 +static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
      999 +    const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
     1000 +static void l2arc_log_blk_prefetch_abort(zio_t *zio);
     1001 +
     1002 +/*
     1003 + * L2ARC persistency write I/O routines.
     1004 + */
     1005 +static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
     1006 +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
     1007 +    l2arc_write_callback_t *cb);
     1008 +
     1009 +/*
      1010 + * L2ARC persistency auxiliary routines.
     1011 + */
     1012 +static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
     1013 +    zio_cksum_t *cksum);
     1014 +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
     1015 +    const arc_buf_hdr_t *ab);
     1016 +static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
     1017 +    uint64_t top, uint64_t check);
     1018 +static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
     1019 +
     1020 +static inline uint64_t
 711 1021  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 712 1022  {
 713 1023          uint8_t *vdva = (uint8_t *)dva;
 714 1024          uint64_t crc = -1ULL;
 715 1025          int i;
 716 1026  
 717 1027          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 718 1028  
 719 1029          for (i = 0; i < sizeof (dva_t); i++)
 720 1030                  crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
... 519 lines elided ...
1240 1550          if (to_delta)
1241 1551                  atomic_add_64(&new_state->arcs_size, to_delta);
1242 1552          if (from_delta) {
1243 1553                  ASSERT3U(old_state->arcs_size, >=, from_delta);
1244 1554                  atomic_add_64(&old_state->arcs_size, -from_delta);
1245 1555          }
1246 1556          ab->b_state = new_state;
1247 1557  
1248 1558          /* adjust l2arc hdr stats */
1249 1559          if (new_state == arc_l2c_only)
1250      -                l2arc_hdr_stat_add();
     1560 +                l2arc_hdr_stat_add(old_state != arc_anon);
1251 1561          else if (old_state == arc_l2c_only)
1252 1562                  l2arc_hdr_stat_remove();
1253 1563  }
1254 1564  
1255 1565  void
1256 1566  arc_space_consume(uint64_t space, arc_space_type_t type)
1257 1567  {
1258 1568          ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259 1569  
1260 1570          switch (type) {
... 83 lines elided ...
1344 1654          hdr->b_buf = buf;
1345 1655          arc_get_data_buf(buf);
1346 1656          hdr->b_datacnt = 1;
1347 1657          hdr->b_flags = 0;
1348 1658          ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349 1659          (void) refcount_add(&hdr->b_refcnt, tag);
1350 1660  
1351 1661          return (buf);
1352 1662  }
1353 1663  
     1664 +/*
     1665 + * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
     1666 + * This is used during l2arc reconstruction to make empty ARC buffers
     1667 + * which circumvent the regular disk->arc->l2arc path and instead come
     1668 + * into being in the reverse order, i.e. l2arc->arc->(disk).
     1669 + */
     1670 +arc_buf_hdr_t *
     1671 +arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
     1672 +{
     1673 +        arc_buf_hdr_t *hdr;
     1674 +
     1675 +        ASSERT3U(size, >, 0);
     1676 +        hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
     1677 +        ASSERT(BUF_EMPTY(hdr));
     1678 +        hdr->b_size = size;
     1679 +        hdr->b_type = type;
     1680 +        hdr->b_spa = guid;
     1681 +        hdr->b_state = arc_anon;
     1682 +        hdr->b_arc_access = 0;
     1683 +        hdr->b_buf = NULL;
     1684 +        hdr->b_datacnt = 0;
     1685 +        hdr->b_flags = 0;
     1686 +        ASSERT(refcount_is_zero(&hdr->b_refcnt));
     1687 +
     1688 +        return (hdr);
     1689 +}
     1690 +
1354 1691  static char *arc_onloan_tag = "onloan";
1355 1692  
1356 1693  /*
1357 1694   * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1358 1695   * flight data by arc_tempreserve_space() until they are "returned". Loaned
1359 1696   * buffers must be returned to the arc before they can be used by the DMU or
1360 1697   * freed.
1361 1698   */
1362 1699  arc_buf_t *
1363 1700  arc_loan_buf(spa_t *spa, int size)
... 217 lines elided ...
1581 1918                   */
1582 1919                  if (!buflist_held) {
1583 1920                          mutex_enter(&l2arc_buflist_mtx);
1584 1921                          l2hdr = hdr->b_l2hdr;
1585 1922                  }
1586 1923  
1587 1924                  if (l2hdr != NULL) {
1588 1925                          list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1589 1926                          ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1590 1927                          ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1591      -                        kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
     1928 +                        kmem_free(l2hdr, sizeof (*l2hdr));
1592 1929                          if (hdr->b_state == arc_l2c_only)
1593 1930                                  l2arc_hdr_stat_remove();
1594 1931                          hdr->b_l2hdr = NULL;
1595 1932                  }
1596 1933  
1597 1934                  if (!buflist_held)
1598 1935                          mutex_exit(&l2arc_buflist_mtx);
1599 1936          }
1600 1937  
1601 1938          if (!BUF_EMPTY(hdr)) {
... 1436 lines elided ...
3038 3375                  acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3039 3376                  acb->acb_done = done;
3040 3377                  acb->acb_private = private;
3041 3378  
3042 3379                  ASSERT(hdr->b_acb == NULL);
3043 3380                  hdr->b_acb = acb;
3044 3381                  hdr->b_flags |= ARC_IO_IN_PROGRESS;
3045 3382  
3046 3383                  if (hdr->b_l2hdr != NULL &&
3047 3384                      (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
     3385 +                        /*
      3386 +                         * Need to stash these before letting go of hash_lock.
     3387 +                         */
3048 3388                          devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3049 3389                          addr = hdr->b_l2hdr->b_daddr;
3050 3390                          b_compress = hdr->b_l2hdr->b_compress;
3051 3391                          b_asize = hdr->b_l2hdr->b_asize;
3052 3392                          /*
3053 3393                           * Lock out device removal.
3054 3394                           */
3055 3395                          if (vdev_is_dead(vd) ||
3056 3396                              !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3057 3397                                  vd = NULL;
... 353 lines elided ...
3411 3751                          mutex_exit(hash_lock);
3412 3752  
3413 3753                  buf_discard_identity(hdr);
3414 3754                  arc_buf_thaw(buf);
3415 3755          }
3416 3756          buf->b_efunc = NULL;
3417 3757          buf->b_private = NULL;
3418 3758  
3419 3759          if (l2hdr) {
3420 3760                  ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3421      -                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
     3761 +                kmem_free(l2hdr, sizeof (*l2hdr));
3422 3762                  ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3423 3763                  mutex_exit(&l2arc_buflist_mtx);
3424 3764          }
3425 3765  }
3426 3766  
3427 3767  int
3428 3768  arc_released(arc_buf_t *buf)
3429 3769  {
3430 3770          int released;
3431 3771  
... 594 lines elided ...
4026 4366   * integrated, and also may become zpool properties.
4027 4367   *
4028 4368   * There are three key functions that control how the L2ARC warms up:
4029 4369   *
4030 4370   *      l2arc_write_eligible()  check if a buffer is eligible to cache
4031 4371   *      l2arc_write_size()      calculate how much to write
4032 4372   *      l2arc_write_interval()  calculate sleep delay between writes
4033 4373   *
4034 4374   * These three functions determine what to write, how much, and how quickly
4035 4375   * to send writes.
     4376 + *
     4377 + * L2ARC persistency:
     4378 + *
     4379 + * When writing buffers to L2ARC, we periodically add some metadata to
     4380 + * make sure we can pick them up after reboot, thus dramatically reducing
     4381 + * the impact that any downtime has on the performance of storage systems
     4382 + * with large caches.
     4383 + *
     4384 + * The implementation works fairly simply by integrating the following two
     4385 + * modifications:
     4386 + *
     4387 + * *) Every now and then we mix in a piece of metadata (called a log block)
     4388 + *    into the L2ARC write. This allows us to understand what's been written,
     4389 + *    so that we can rebuild the arc_buf_hdr_t structures of the main ARC
     4390 + *    buffers. The log block also includes a "back-reference" pointer to the
     4391 + *    previous block, forming a back-linked list of blocks on the L2ARC device.
     4392 + *
     4393 + * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
     4394 + *    for our header bookkeeping purposes. This contains a device header, which
     4395 + *    contains our top-level reference structures. We update it each time we
     4396 + *    write a new log block, so that we're able to locate it in the L2ARC
     4397 + *    device. If this write results in an inconsistent device header (e.g. due
     4398 + *    to power failure), we detect this by verifying the header's checksum
     4399 + *    and simply drop the entries from L2ARC.
     4400 + *
     4401 + * Implementation diagram:
     4402 + *
     4403 + * +=== L2ARC device (not to scale) ======================================+
     4404 + * |       __________newest log block pointers_________                   |
     4405 + * |      /                                  \1 back   \latest            |
     4406 + * |     /                                    V         V                 |
     4407 + * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
     4408 + * |                       ^       / ^       / ^       /                  |
     4409 + * |                       `-prev-'  `-prev-'  `-prev-'                   |
     4410 + * |                         lb        lb        lb                       |
     4411 + * +======================================================================+
     4412 + *
     4413 + * On-device data structures:
     4414 + *
     4415 + * L2ARC device header: l2arc_dev_hdr_phys_t
     4416 + * L2ARC log block:     l2arc_log_blk_phys_t
     4417 + *
     4418 + * L2ARC reconstruction:
     4419 + *
     4420 + * When writing data, we simply write in the standard rotary fashion,
     4421 + * evicting buffers as we go and simply writing new data over them (writing
     4422 + * a new log block every now and then). This obviously means that once we
     4423 + * loop around the end of the device, we will start cutting into an already
     4424 + * committed log block (and its referenced data buffers), like so:
     4425 + *
     4426 + *    current write head__       __old tail
     4427 + *                        \     /
     4428 + *                        V    V
     4429 + * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
     4430 + *                         ^    ^^^^^^^^^___________________________________
     4431 + *                         |                                                \
     4432 + *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
     4433 + *
     4434 + * When importing the pool, we detect this situation and use it to stop
     4435 + * our scanning process (see l2arc_rebuild).
     4436 + *
     4437 + * There is one significant caveat to consider when rebuilding ARC contents
     4438 + * from an L2ARC device: what about invalidated buffers? Given the above
      4439 + * construction, we cannot go back and amend already written log blocks to
      4440 + * remove entries for invalidated buffers. Thus, during reconstruction,
     4441 + * we might be populating the cache with buffers for data that's not on the
     4442 + * main pool anymore, or may have been overwritten!
     4443 + *
     4444 + * As it turns out, this isn't a problem. Every arc_read request includes
     4445 + * both the DVA and, crucially, the birth TXG of the BP the caller is
     4446 + * looking for. So even if the cache were populated by completely rotten
     4447 + * blocks for data that had been long deleted and/or overwritten, we'll
      4448 + * never actually return bad data from the cache, since the DVA together
      4449 + * with the birth TXG uniquely identifies a block in space and time - once
      4450 + * created, a block is immutable on disk. The worst that can happen is that
      4451 + * we waste some time and memory during the l2arc rebuild reconstructing
      4452 + * outdated ARC entries, which will simply get dropped from the l2arc as it
      4453 + * is updated with new blocks.
4036 4454   */
4037 4455  
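
To make the rebuild flow concrete, here is a simplified sketch (not the actual implementation) built from the routines declared earlier; the real l2arc_rebuild() additionally reads and checksums each block via l2arc_log_blk_read(), prefetches the next block in the chain, and bumps the abort kstats on failure:

static void
l2arc_rebuild_sketch(l2arc_dev_t *dev, uint64_t load_guid, int64_t deadline)
{
	l2arc_log_blk_ptr_t lbps[2];	/* [0] = newest, [1] = one older */
	l2arc_log_blk_phys_t *lb = kmem_zalloc(sizeof (*lb), KM_SLEEP);

	lbps[0] = dev->l2ad_dev_hdr.l2dh_start_lbps[0];
	lbps[1] = dev->l2ad_dev_hdr.l2dh_start_lbps[1];

	/* Walk the back-linked log chain from newest to oldest. */
	while (l2arc_log_blk_ptr_valid(dev, &lbps[0]) &&
	    !l2arc_check_rebuild_timeout_hit(deadline)) {
		/* (read and verify the block at lbps[0].l2lbp_daddr into lb) */
		l2arc_log_blk_restore(dev, load_guid, lb,
		    LBP_GET_PSIZE(&lbps[0]));
		/* step back: the block just restored points back 2 entries */
		lbps[0] = lbps[1];
		lbps[1] = lb->l2lb_back2_lbp;
	}
	kmem_free(lb, sizeof (*lb));
}
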
4038 4456  static boolean_t
4039 4457  l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4040 4458  {
4041 4459          /*
4042 4460           * A buffer is *not* eligible for the L2ARC if it:
4043 4461           * 1. belongs to a different spa.
4044 4462           * 2. is already cached on the L2ARC.
4045 4463           * 3. has an I/O in progress (it may be an incomplete read).
... 46 lines elided ...
4092 4510          else
4093 4511                  interval = hz * l2arc_feed_secs;
4094 4512  
4095 4513          now = ddi_get_lbolt();
4096 4514          next = MAX(now, MIN(now + interval, began + interval));
4097 4515  
4098 4516          return (next);
4099 4517  }
4100 4518  
4101 4519  static void
4102      -l2arc_hdr_stat_add(void)
     4520 +l2arc_hdr_stat_add(boolean_t from_arc)
4103 4521  {
4104 4522          ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4105      -        ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
     4523 +        if (from_arc)
     4524 +                ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4106 4525  }
4107 4526  
4108 4527  static void
4109 4528  l2arc_hdr_stat_remove(void)
4110 4529  {
4111 4530          ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4112 4531          ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4113 4532  }
4114 4533  
4115 4534  /*
... 13 lines elided ...
4129 4548          mutex_enter(&spa_namespace_lock);
4130 4549          mutex_enter(&l2arc_dev_mtx);
4131 4550  
4132 4551          /* if there are no vdevs, there is nothing to do */
4133 4552          if (l2arc_ndev == 0)
4134 4553                  goto out;
4135 4554  
4136 4555          first = NULL;
4137 4556          next = l2arc_dev_last;
4138 4557          do {
4139      -                /* loop around the list looking for a non-faulted vdev */
     4558 +                /*
     4559 +                 * Loop around the list looking for a non-faulted vdev
     4560 +                 * and one that isn't currently doing an L2ARC rebuild.
     4561 +                 */
4140 4562                  if (next == NULL) {
4141 4563                          next = list_head(l2arc_dev_list);
4142 4564                  } else {
4143 4565                          next = list_next(l2arc_dev_list, next);
4144 4566                          if (next == NULL)
4145 4567                                  next = list_head(l2arc_dev_list);
4146 4568                  }
4147 4569  
4148 4570                  /* if we have come back to the start, bail out */
4149 4571                  if (first == NULL)
4150 4572                          first = next;
4151 4573                  else if (next == first)
4152 4574                          break;
4153 4575  
4154      -        } while (vdev_is_dead(next->l2ad_vdev));
     4576 +        } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
4155 4577  
4156 4578          /* if we were unable to find any usable vdevs, return NULL */
4157      -        if (vdev_is_dead(next->l2ad_vdev))
     4579 +        if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
4158 4580                  next = NULL;
4159 4581  
4160 4582          l2arc_dev_last = next;
4161 4583  
4162 4584  out:
4163 4585          mutex_exit(&l2arc_dev_mtx);
4164 4586  
4165 4587          /*
4166 4588           * Grab the config lock to prevent the 'next' device from being
4167 4589           * removed while we are writing to it.
... 33 lines elided ...
4201 4623   * A write to a cache device has completed.  Update all headers to allow
4202 4624   * reads from these buffers to begin.
4203 4625   */
4204 4626  static void
4205 4627  l2arc_write_done(zio_t *zio)
4206 4628  {
4207 4629          l2arc_write_callback_t *cb;
4208 4630          l2arc_dev_t *dev;
4209 4631          list_t *buflist;
4210 4632          arc_buf_hdr_t *head, *ab, *ab_prev;
4211      -        l2arc_buf_hdr_t *abl2;
     4633 +        l2arc_buf_hdr_t *l2hdr;
4212 4634          kmutex_t *hash_lock;
     4635 +        l2arc_log_blk_buf_t *lb_buf;
4213 4636  
4214 4637          cb = zio->io_private;
4215 4638          ASSERT(cb != NULL);
4216 4639          dev = cb->l2wcb_dev;
4217 4640          ASSERT(dev != NULL);
4218 4641          head = cb->l2wcb_head;
4219 4642          ASSERT(head != NULL);
4220 4643          buflist = dev->l2ad_buflist;
4221 4644          ASSERT(buflist != NULL);
4222 4645          DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
... 2 lines elided ...
4225 4648          if (zio->io_error != 0)
4226 4649                  ARCSTAT_BUMP(arcstat_l2_writes_error);
4227 4650  
4228 4651          mutex_enter(&l2arc_buflist_mtx);
4229 4652  
4230 4653          /*
4231 4654           * All writes completed, or an error was hit.
4232 4655           */
4233 4656          for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4234 4657                  ab_prev = list_prev(buflist, ab);
     4658 +                l2hdr = ab->b_l2hdr;
4235 4659  
     4660 +                /*
     4661 +                 * Release the temporary compressed buffer as soon as possible.
     4662 +                 */
     4663 +                if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
     4664 +                        l2arc_release_cdata_buf(ab);
     4665 +
4236 4666                  hash_lock = HDR_LOCK(ab);
4237 4667                  if (!mutex_tryenter(hash_lock)) {
4238 4668                          /*
4239 4669                           * This buffer misses out.  It may be in a stage
4240 4670                           * of eviction.  Its ARC_L2_WRITING flag will be
4241 4671                           * left set, denying reads to this buffer.
4242 4672                           */
4243 4673                          ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4244 4674                          continue;
4245 4675                  }
4246 4676  
4247      -                abl2 = ab->b_l2hdr;
4248      -
4249      -                /*
4250      -                 * Release the temporary compressed buffer as soon as possible.
4251      -                 */
4252      -                if (abl2->b_compress != ZIO_COMPRESS_OFF)
4253      -                        l2arc_release_cdata_buf(ab);
4254      -
4255 4677                  if (zio->io_error != 0) {
4256 4678                          /*
4257 4679                           * Error - drop L2ARC entry.
4258 4680                           */
4259 4681                          list_remove(buflist, ab);
4260      -                        ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
     4682 +                        ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4261 4683                          ab->b_l2hdr = NULL;
4262      -                        kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
     4684 +                        kmem_free(l2hdr, sizeof (*l2hdr));
4263 4685                          ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4264 4686                  }
4265 4687  
4266 4688                  /*
4267 4689                   * Allow ARC to begin reads to this L2ARC entry.
4268 4690                   */
4269 4691                  ab->b_flags &= ~ARC_L2_WRITING;
4270 4692  
4271 4693                  mutex_exit(hash_lock);
4272 4694          }
4273 4695  
4274 4696          atomic_inc_64(&l2arc_writes_done);
4275 4697          list_remove(buflist, head);
4276 4698          kmem_cache_free(hdr_cache, head);
4277 4699          mutex_exit(&l2arc_buflist_mtx);
4278 4700  
4279 4701          l2arc_do_free_on_write();
4280 4702  
     4703 +        for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
     4704 +            lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
     4705 +                (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
     4706 +                kmem_free(lb_buf, sizeof (*lb_buf));
     4707 +        }
     4708 +        list_destroy(&cb->l2wcb_log_blk_buf_list);
4281 4709          kmem_free(cb, sizeof (l2arc_write_callback_t));
4282 4710  }
4283 4711  
4284 4712  /*
4285 4713   * A read to a cache device completed.  Validate buffer contents before
4286 4714   * handing over to the regular ARC routines.
4287 4715   */
4288 4716  static void
4289 4717  l2arc_read_done(zio_t *zio)
4290 4718  {
... 103 lines elided ...
4394 4822                  *lock = &arc_mru->arcs_mtx;
4395 4823                  break;
4396 4824          }
4397 4825  
4398 4826          ASSERT(!(MUTEX_HELD(*lock)));
4399 4827          mutex_enter(*lock);
4400 4828          return (list);
4401 4829  }
4402 4830  
4403 4831  /*
     4832 + * Calculates the maximum overhead of L2ARC metadata log blocks for a given
     4833 + * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
     4834 + * overhead in processing to make sure there is enough headroom available
     4835 + * when writing buffers.
     4836 + */
     4837 +static inline uint64_t
     4838 +l2arc_log_blk_overhead(uint64_t write_sz)
     4839 +{
     4840 +        return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
     4841 +            L2ARC_LOG_BLK_SIZE;
     4842 +}
     4843 +
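
The worst case this guards against is a write made up entirely of minimum-sized (512 byte) buffers, each consuming one log entry. For example, a write_sz of 8 MB yields (8388608 / 512 / 1023 + 1) * 128k = 17 * 128k = 2176k of reserved log space; the "+ 1" rounds up to a whole log block.
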
     4844 +/*
4404 4845   * Evict buffers from the device write hand to the distance specified in
4405 4846   * bytes.  This distance may span populated buffers, it may span nothing.
4406 4847   * This is clearing a region on the L2ARC device ready for writing.
4407 4848   * If the 'all' boolean is set, every buffer is evicted.
4408 4849   */
4409 4850  static void
4410 4851  l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4411 4852  {
4412 4853          list_t *buflist;
4413      -        l2arc_buf_hdr_t *abl2;
     4854 +        l2arc_buf_hdr_t *l2hdr;
4414 4855          arc_buf_hdr_t *ab, *ab_prev;
4415 4856          kmutex_t *hash_lock;
4416 4857          uint64_t taddr;
4417 4858  
4418 4859          buflist = dev->l2ad_buflist;
4419 4860  
4420 4861          if (buflist == NULL)
4421 4862                  return;
4422 4863  
4423 4864          if (!all && dev->l2ad_first) {
4424 4865                  /*
4425 4866                   * This is the first sweep through the device.  There is
4426 4867                   * nothing to evict.
4427 4868                   */
4428 4869                  return;
4429 4870          }
4430 4871  
     4872 +        /*
      4873 +         * We need to add in the worst-case log block overhead.
     4874 +         */
     4875 +        distance += l2arc_log_blk_overhead(distance);
4431 4876          if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4432 4877                  /*
4433 4878                   * When nearing the end of the device, evict to the end
4434 4879                   * before the device write hand jumps to the start.
4435 4880                   */
4436 4881                  taddr = dev->l2ad_end;
4437 4882          } else {
4438 4883                  taddr = dev->l2ad_hand + distance;
4439 4884          }
4440 4885          DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
... 62 lines elided ...
4503 4948                           */
4504 4949                          if (HDR_L2_READING(ab)) {
4505 4950                                  ARCSTAT_BUMP(arcstat_l2_evict_reading);
4506 4951                                  ab->b_flags |= ARC_L2_EVICTED;
4507 4952                          }
4508 4953  
4509 4954                          /*
4510 4955                           * Tell ARC this no longer exists in L2ARC.
4511 4956                           */
4512 4957                          if (ab->b_l2hdr != NULL) {
4513      -                                abl2 = ab->b_l2hdr;
4514      -                                ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
     4958 +                                l2hdr = ab->b_l2hdr;
     4959 +                                ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4515 4960                                  ab->b_l2hdr = NULL;
4516      -                                kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
     4961 +                                kmem_free(l2hdr, sizeof (*l2hdr));
4517 4962                                  ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4518 4963                          }
4519 4964                          list_remove(buflist, ab);
4520 4965  
4521 4966                          /*
4522 4967                           * This may have been leftover after a
4523 4968                           * failed write.
4524 4969                           */
4525 4970                          ab->b_flags &= ~ARC_L2_WRITING;
4526 4971                  }
... 15 lines elided ...
4542 4987   *
4543 4988   * Returns the number of bytes actually written (which may be smaller than
4544 4989   * the delta by which the device hand has changed due to alignment).
4545 4990   */
4546 4991  static uint64_t
4547 4992  l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4548 4993      boolean_t *headroom_boost)
4549 4994  {
4550 4995          arc_buf_hdr_t *ab, *ab_prev, *head;
4551 4996          list_t *list;
4552      -        uint64_t write_asize, write_psize, write_sz, headroom,
     4997 +        /*
     4998 +         * These variables mean:
     4999 +         * - write_size: in-memory size of ARC buffers we've written (before
     5000 +         *      compression).
     5001 +         * - write_asize: actual on-disk size of ARC buffers we've written
     5002 +         *      (after compression).
     5003 +         * - write_aligned_asize: actual sum of space taken by ARC buffers
     5004 +         *      on the device (after compression and alignment, so that
     5005 +         *      every buffer starts on a multiple of the device block size).
     5006 +         * - headroom: L2ARC scanning headroom (we won't scan beyond this
     5007 +         *      distance from the list tail).
     5008 +         * - buf_compress_minsz: minimum in-memory ARC buffer size for us
     5009 +         *      to try compressing it.
     5010 +         */
     5011 +        uint64_t write_size, write_asize, write_aligned_asize, headroom,
4553 5012              buf_compress_minsz;
4554 5013          void *buf_data;
4555 5014          kmutex_t *list_lock;
4556 5015          boolean_t full;
4557 5016          l2arc_write_callback_t *cb;
4558 5017          zio_t *pio, *wzio;
4559 5018          uint64_t guid = spa_load_guid(spa);
4560 5019          const boolean_t do_headroom_boost = *headroom_boost;
     5020 +        boolean_t dev_hdr_update = B_FALSE;
4561 5021  
4562 5022          ASSERT(dev->l2ad_vdev != NULL);
4563 5023  
4564 5024          /* Lower the flag now, we might want to raise it again later. */
4565 5025          *headroom_boost = B_FALSE;
4566 5026  
4567 5027          pio = NULL;
4568      -        write_sz = write_asize = write_psize = 0;
     5028 +        cb = NULL;
     5029 +        write_size = write_asize = write_aligned_asize = 0;
4569 5030          full = B_FALSE;
4570 5031          head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4571 5032          head->b_flags |= ARC_L2_WRITE_HEAD;
4572 5033  
4573 5034          /*
4574 5035           * We will want to try to compress buffers that are at least 2x the
4575 5036           * device sector size.
4576 5037           */
4577 5038          buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4578 5039  
... 17 lines elided ...
4596 5057                  else
4597 5058                          ab = list_tail(list);
4598 5059  
4599 5060                  headroom = target_sz * l2arc_headroom;
4600 5061                  if (do_headroom_boost)
4601 5062                          headroom = (headroom * l2arc_headroom_boost) / 100;
4602 5063  
4603 5064                  for (; ab; ab = ab_prev) {
4604 5065                          l2arc_buf_hdr_t *l2hdr;
4605 5066                          kmutex_t *hash_lock;
4606      -                        uint64_t buf_sz;
     5067 +                        uint64_t buf_aligned_size;
4607 5068  
4608 5069                          if (arc_warm == B_FALSE)
4609 5070                                  ab_prev = list_next(list, ab);
4610 5071                          else
4611 5072                                  ab_prev = list_prev(list, ab);
4612 5073  
4613 5074                          hash_lock = HDR_LOCK(ab);
4614 5075                          if (!mutex_tryenter(hash_lock)) {
4615 5076                                  /*
4616 5077                                   * Skip this buffer rather than waiting.
4617 5078                                   */
4618 5079                                  continue;
4619 5080                          }
4620 5081  
4621      -                        passed_sz += ab->b_size;
     5082 +                        /*
     5083 +                         * When examining whether we've met our write target,
     5084 +                         * we must always use the aligned size of the buffer,
     5085 +                         * since that's the maximum amount of space a buffer
     5086 +                         * can take up on the L2ARC device.
     5087 +                         */
     5088 +                        buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
     5089 +                            ab->b_size);
     5090 +                        passed_sz += buf_aligned_size;
4622 5091                          if (passed_sz > headroom) {
4623 5092                                  /*
4624 5093                                   * Searched too far.
4625 5094                                   */
4626 5095                                  mutex_exit(hash_lock);
4627 5096                                  break;
4628 5097                          }
4629 5098  
4630 5099                          if (!l2arc_write_eligible(guid, ab)) {
4631 5100                                  mutex_exit(hash_lock);
4632 5101                                  continue;
4633 5102                          }
4634 5103  
4635      -                        if ((write_sz + ab->b_size) > target_sz) {
     5104 +                        if ((write_size + buf_aligned_size) > target_sz) {
4636 5105                                  full = B_TRUE;
4637 5106                                  mutex_exit(hash_lock);
4638 5107                                  break;
4639 5108                          }
4640 5109  
4641 5110                          if (pio == NULL) {
4642 5111                                  /*
4643 5112                                   * Insert a dummy header on the buflist so
4644 5113                                   * l2arc_write_done() can find where the
4645 5114                                   * write buffers begin without searching.
4646 5115                                   */
4647 5116                                  list_insert_head(dev->l2ad_buflist, head);
4648 5117  
4649      -                                cb = kmem_alloc(
     5118 +                                cb = kmem_zalloc(
4650 5119                                      sizeof (l2arc_write_callback_t), KM_SLEEP);
4651 5120                                  cb->l2wcb_dev = dev;
4652 5121                                  cb->l2wcb_head = head;
     5122 +                                list_create(&cb->l2wcb_log_blk_buf_list,
     5123 +                                    sizeof (l2arc_log_blk_buf_t),
     5124 +                                    offsetof(l2arc_log_blk_buf_t, l2lbb_node));
4653 5125                                  pio = zio_root(spa, l2arc_write_done, cb,
4654 5126                                      ZIO_FLAG_CANFAIL);
4655 5127                          }
4656 5128  
4657 5129                          /*
4658 5130                           * Create and add a new L2ARC header.
4659 5131                           */
4660      -                        l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
     5132 +                        l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
4661 5133                          l2hdr->b_dev = dev;
4662 5134                          ab->b_flags |= ARC_L2_WRITING;
4663 5135  
4664 5136                          /*
4665 5137                           * Temporarily stash the data buffer in b_tmp_cdata.
4666 5138                           * The subsequent write step will pick it up from
4667 5139                           * there. This is because we can't access ab->b_buf
4668 5140                           * without holding the hash_lock, which we in turn
4669 5141                           * can't access without holding the ARC list locks
4670 5142                           * (which we want to avoid during compression/writing).
4671 5143                           */
4672 5144                          l2hdr->b_compress = ZIO_COMPRESS_OFF;
4673 5145                          l2hdr->b_asize = ab->b_size;
4674 5146                          l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4675 5147  
4676      -                        buf_sz = ab->b_size;
4677 5148                          ab->b_l2hdr = l2hdr;
4678 5149  
4679 5150                          list_insert_head(dev->l2ad_buflist, ab);
4680 5151  
4681 5152                          /*
4682 5153                           * Compute and store the buffer cksum before
4683 5154                           * writing.  In debug builds the cksum is verified first.
4684 5155                           */
4685 5156                          arc_cksum_verify(ab->b_buf);
4686 5157                          arc_cksum_compute(ab->b_buf, B_TRUE);
4687 5158  
4688 5159                          mutex_exit(hash_lock);
4689 5160  
4690      -                        write_sz += buf_sz;
     5161 +                        write_size += buf_aligned_size;
4691 5162                  }
4692 5163  
4693 5164                  mutex_exit(list_lock);
4694 5165  
4695 5166                  if (full == B_TRUE)
4696 5167                          break;
4697 5168          }
4698 5169  
4699 5170          /* No buffers selected for writing? */
4700 5171          if (pio == NULL) {
4701      -                ASSERT0(write_sz);
     5172 +                ASSERT0(write_size);
4702 5173                  mutex_exit(&l2arc_buflist_mtx);
4703 5174                  kmem_cache_free(hdr_cache, head);
4704 5175                  return (0);
4705 5176          }
4706 5177  
4707 5178          /*
4708 5179           * Now start writing the buffers. We start at the write head
4709 5180           * and work backwards, retracing the course of the buffer selector
4710 5181           * loop above.
4711 5182           */
↓ open down ↓ 24 lines elided ↑ open up ↑
4736 5207  
4737 5208                  /*
4738 5209                   * Pick up the buffer data we had previously stashed away
4739 5210                   * (and now potentially also compressed).
4740 5211                   */
4741 5212                  buf_data = l2hdr->b_tmp_cdata;
4742 5213                  buf_sz = l2hdr->b_asize;
4743 5214  
4744 5215                  /* Compression may have squashed the buffer to zero length. */
4745 5216                  if (buf_sz != 0) {
4746      -                        uint64_t buf_p_sz;
     5217 +                        uint64_t buf_aligned_asize;
4747 5218  
4748 5219                          wzio = zio_write_phys(pio, dev->l2ad_vdev,
4749 5220                              dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4750 5221                              NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4751 5222                              ZIO_FLAG_CANFAIL, B_FALSE);
4752 5223  
4753 5224                          DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4754 5225                              zio_t *, wzio);
4755 5226                          (void) zio_nowait(wzio);
4756 5227  
4757 5228                          write_asize += buf_sz;
4758 5229                          /*
4759 5230                           * Keep the clock hand suitably device-aligned.
4760 5231                           */
4761      -                        buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4762      -                        write_psize += buf_p_sz;
4763      -                        dev->l2ad_hand += buf_p_sz;
     5232 +                        buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
     5233 +                            buf_sz);
     5234 +                        write_aligned_asize += buf_aligned_asize;
     5235 +                        dev->l2ad_hand += buf_aligned_asize;
     5236 +                        ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
     5237 +                            dev->l2ad_first);
4764 5238                  }
4765      -        }
4766 5239  
     5240 +                if (l2arc_log_blk_insert(dev, ab)) {
     5241 +                        l2arc_log_blk_commit(dev, pio, cb);
     5242 +                        dev_hdr_update = B_TRUE;
     5243 +                }
     5244 +        }
4767 5245          mutex_exit(&l2arc_buflist_mtx);
4768 5246  
4769      -        ASSERT3U(write_asize, <=, target_sz);
     5247 +        if (dev_hdr_update)
     5248 +                l2arc_dev_hdr_update(dev, pio);
     5249 +
     5250 +        VERIFY3U(write_aligned_asize, <=, target_sz);
4770 5251          ARCSTAT_BUMP(arcstat_l2_writes_sent);
4771 5252          ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4772      -        ARCSTAT_INCR(arcstat_l2_size, write_sz);
4773      -        ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4774      -        vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
     5253 +        ARCSTAT_INCR(arcstat_l2_size, write_size);
     5254 +        ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
     5255 +        vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
4775 5256  
4776 5257          /*
4777 5258           * Bump device hand to the device start if it is approaching the end.
4778 5259           * l2arc_evict() will already have evicted ahead for this case.
4779 5260           */
4780      -        if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
     5261 +        if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
     5262 +            dev->l2ad_end) {
4781 5263                  vdev_space_update(dev->l2ad_vdev,
4782 5264                      dev->l2ad_end - dev->l2ad_hand, 0, 0);
4783 5265                  dev->l2ad_hand = dev->l2ad_start;
4784 5266                  dev->l2ad_evict = dev->l2ad_start;
4785 5267                  dev->l2ad_first = B_FALSE;
4786 5268          }
4787 5269  
4788 5270          dev->l2ad_writing = B_TRUE;
4789 5271          (void) zio_wait(pio);
4790 5272          dev->l2ad_writing = B_FALSE;
↓ open down ↓ 241 lines elided ↑ open up ↑
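
The aligned-size accounting above relies on vdev_psize_to_asize() rounding
a physical size up to the vdev's allocation granularity (1 << vdev_ashift).
A minimal user-level sketch of that rounding, assuming the usual
power-of-two round-up semantics and an illustrative ashift of 12 (4 KB
blocks); psize_to_asize() below is a hypothetical stand-in, not the kernel
routine:

	#include <stdint.h>
	#include <assert.h>

	/* Hypothetical stand-in for vdev_psize_to_asize(). */
	static uint64_t
	psize_to_asize(uint64_t psize, unsigned int ashift)
	{
		uint64_t align = 1ULL << ashift;

		return ((psize + align - 1) & ~(align - 1));
	}

	int
	main(void)
	{
		/* A 5000-byte compressed buffer still costs two 4 KB blocks. */
		assert(psize_to_asize(5000, 12) == 8192);
		/* An exactly aligned buffer costs no extra space. */
		assert(psize_to_asize(4096, 12) == 4096);
		return (0);
	}
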
5032 5514  
5033 5515          l2arc_thread_exit = 0;
5034 5516          cv_broadcast(&l2arc_feed_thr_cv);
5035 5517          CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
5036 5518          thread_exit();
5037 5519  }
5038 5520  
5039 5521  boolean_t
5040 5522  l2arc_vdev_present(vdev_t *vd)
5041 5523  {
5042      -        l2arc_dev_t *dev;
     5524 +        return (l2arc_vdev_get(vd) != NULL);
     5525 +}
5043 5526  
5044      -        mutex_enter(&l2arc_dev_mtx);
     5527 +static l2arc_dev_t *
     5528 +l2arc_vdev_get(vdev_t *vd)
     5529 +{
     5530 +        l2arc_dev_t     *dev;
     5531 +        boolean_t       held = MUTEX_HELD(&l2arc_dev_mtx);
     5532 +
     5533 +        if (!held)
     5534 +                mutex_enter(&l2arc_dev_mtx);
5045 5535          for (dev = list_head(l2arc_dev_list); dev != NULL;
5046 5536              dev = list_next(l2arc_dev_list, dev)) {
5047 5537                  if (dev->l2ad_vdev == vd)
5048 5538                          break;
5049 5539          }
5050      -        mutex_exit(&l2arc_dev_mtx);
     5540 +        if (!held)
     5541 +                mutex_exit(&l2arc_dev_mtx);
5051 5542  
5052      -        return (dev != NULL);
     5543 +        return (dev);
5053 5544  }
5054 5545  
5055 5546  /*
5056 5547   * Add a vdev for use by the L2ARC.  By this point the spa has already
5057      - * validated the vdev and opened it.
     5548 + * validated the vdev and opened it. The `rebuild' flag indicates whether
     5549 + * we should attempt an L2ARC persistency rebuild.
5058 5550   */
5059 5551  void
5060      -l2arc_add_vdev(spa_t *spa, vdev_t *vd)
     5552 +l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5061 5553  {
5062 5554          l2arc_dev_t *adddev;
5063 5555  
5064 5556          ASSERT(!l2arc_vdev_present(vd));
5065 5557  
5066 5558          /*
5067 5559           * Create a new l2arc device entry.
5068 5560           */
5069 5561          adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5070 5562          adddev->l2ad_spa = spa;
5071 5563          adddev->l2ad_vdev = vd;
5072      -        adddev->l2ad_start = VDEV_LABEL_START_SIZE;
     5564 +        /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
     5565 +        adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
5073 5566          adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5074 5567          adddev->l2ad_hand = adddev->l2ad_start;
5075 5568          adddev->l2ad_evict = adddev->l2ad_start;
5076 5569          adddev->l2ad_first = B_TRUE;
5077 5570          adddev->l2ad_writing = B_FALSE;
5078 5571  
5079 5572          /*
5080 5573           * This is a list of all ARC buffers that are still valid on the
5081 5574           * device.
5082 5575           */
↓ open down ↓ 2 lines elided ↑ open up ↑
5085 5578              offsetof(arc_buf_hdr_t, b_l2node));
5086 5579  
5087 5580          vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5088 5581  
5089 5582          /*
5090 5583           * Add device to global list
5091 5584           */
5092 5585          mutex_enter(&l2arc_dev_mtx);
5093 5586          list_insert_head(l2arc_dev_list, adddev);
5094 5587          atomic_inc_64(&l2arc_ndev);
     5588 +        if (rebuild && l2arc_rebuild_enabled &&
     5589 +            adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
     5590 +                /*
     5591 +                 * Just mark the device as pending for a rebuild. We won't
     5592 +                 * be starting a rebuild inline here, as that would block pool
     5593 +                 * import. Instead spa_load_impl will hand that off to an
     5594 +                 * async task which will call l2arc_spa_rebuild_start.
     5595 +                 */
     5596 +                adddev->l2ad_rebuild = B_TRUE;
     5597 +        }
5095 5598          mutex_exit(&l2arc_dev_mtx);
5096 5599  }
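
Together with l2arc_dev_hdr_update() and l2arc_dev_hdr_read() below (both
of which address the header at offset VDEV_LABEL_START_SIZE), the
assignments above imply the following on-device layout (a sketch, not to
scale):

	0              VDEV_LABEL_START_SIZE       l2ad_start            l2ad_end
	|--- vdev labels ---|------ dev header ------|==== log blks + data ====|
	                    |<-- SPA_MINBLOCKSIZE -->|
	                                             ^-- l2ad_hand and l2ad_evict
	                                                 start here
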
5097 5600  
5098 5601  /*
5099 5602   * Remove a vdev from the L2ARC.
5100 5603   */
5101 5604  void
5102 5605  l2arc_remove_vdev(vdev_t *vd)
5103 5606  {
5104 5607          l2arc_dev_t *dev, *nextdev, *remdev = NULL;
↓ open down ↓ 86 lines elided ↑ open up ↑
5191 5694  {
5192 5695          if (!(spa_mode_global & FWRITE))
5193 5696                  return;
5194 5697  
5195 5698          mutex_enter(&l2arc_feed_thr_lock);
5196 5699          cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
5197 5700          l2arc_thread_exit = 1;
5198 5701          while (l2arc_thread_exit != 0)
5199 5702                  cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5200 5703          mutex_exit(&l2arc_feed_thr_lock);
     5704 +}
     5705 +
     5706 +/*
     5707 + * Starts rebuild threads for the L2ARC devices in a spa. This should
     5708 + * be called as one of the final steps of a pool import.
     5709 + */
     5710 +void
     5711 +l2arc_spa_rebuild_start(spa_t *spa)
     5712 +{
     5713 +        l2arc_dev_t     *dev;
     5714 +        /*
     5715 +         * Locate the spa's l2arc devices and kick off rebuild threads.
     5716 +         */
     5717 +        mutex_enter(&l2arc_dev_mtx);
     5718 +        for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
     5719 +                dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
     5720 +                ASSERT(dev != NULL);
     5721 +                if (dev->l2ad_rebuild) {
     5722 +                        (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
     5723 +                            dev, 0, &p0, TS_RUN, minclsyspri);
     5724 +                }
     5725 +        }
     5726 +        mutex_exit(&l2arc_dev_mtx);
     5727 +}
     5728 +
     5729 +/*
     5730 + * Main entry point for L2ARC rebuilding.
     5731 + */
     5732 +static void
     5733 +l2arc_dev_rebuild_start(l2arc_dev_t *dev)
     5734 +{
     5735 +        spa_t *spa = dev->l2ad_spa;
     5736 +        vdev_t *vd = dev->l2ad_vdev;
     5737 +
     5738 +        /* Lock out device removal. */
     5739 +        spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
     5740 +        ASSERT(dev->l2ad_rebuild);
     5741 +        (void) l2arc_rebuild(dev);
     5742 +        dev->l2ad_rebuild = B_FALSE;
     5743 +        spa_config_exit(spa, SCL_L2ARC, vd);
     5744 +        thread_exit();
     5745 +}
     5746 +
     5747 +/*
     5748 + * This function implements the actual L2ARC metadata rebuild. It:
     5749 + *
     5750 + * 1) reads the device's header
     5751 + * 2) if a good device header is found, starts reading the log block chain
     5752 + * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
     5753 + *
     5754 + * Operation stops under any of the following conditions:
     5755 + *
     5756 + * 1) We reach the end of the log blk chain (the back-reference in the blk is
     5757 + *    invalid or loops over our starting point).
     5758 + * 2) We encounter *any* error condition (cksum errors, io errors, looped
     5759 + *    blocks, etc.).
     5760 + * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
     5761 + *    severely fragmented L2ARC log blk chains or slow L2ARC devices from
     5762 + *    stalling a pool import indefinitely (thus letting the administrator
     5763 + *    take corrective action, e.g. by kicking the misbehaving L2ARC device
     5764 + *    out of the pool, or by reimporting the pool with L2ARC rebuilding
     5765 + *    disabled).
     5766 + */
     5767 +static int
     5768 +l2arc_rebuild(l2arc_dev_t *dev)
     5769 +{
     5770 +        int                     err;
     5771 +        l2arc_log_blk_phys_t    *this_lb, *next_lb;
     5772 +        uint8_t                 *this_lb_buf, *next_lb_buf;
     5773 +        zio_t                   *this_io = NULL, *next_io = NULL;
     5774 +        int64_t                 deadline;
     5775 +        l2arc_log_blk_ptr_t     lb_ptrs[2];
     5776 +        boolean_t               first_pass;
     5777 +        uint64_t                load_guid;
     5778 +
     5779 +        load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
     5780 +        deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
     5781 +        /*
     5782 +         * Device header processing phase.
     5783 +         */
     5784 +        if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
     5785 +                /* device header corrupted, start a new one */
     5786 +                bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
     5787 +                return (err);
     5788 +        }
     5789 +        if (l2arc_check_rebuild_timeout_hit(deadline))
     5790 +                return (SET_ERROR(ETIMEDOUT));
     5791 +
     5792 +        /* Retrieve the persistent L2ARC device state */
     5793 +        dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
     5794 +        dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
     5795 +            dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
     5796 +            LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
     5797 +        dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
     5798 +            L2ARC_DEV_HDR_EVICT_FIRST);
     5799 +
     5800 +        /* Prepare the rebuild processing state */
     5801 +        bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
     5802 +        this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
     5803 +        next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
     5804 +        this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
     5805 +        next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
     5806 +        first_pass = B_TRUE;
     5807 +
     5808 +        /* Start the rebuild process */
     5809 +        for (;;) {
     5810 +                if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
     5811 +                        /* We hit an invalid block address, end the rebuild. */
     5812 +                        break;
     5813 +
     5814 +                if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
     5815 +                    this_lb, next_lb, this_lb_buf, next_lb_buf,
     5816 +                    this_io, &next_io)) != 0)
     5817 +                        break;
     5818 +
     5819 +                /* Protection against infinite loops of log blocks. */
     5820 +                if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
     5821 +                    lb_ptrs[0].l2lbp_daddr,
     5822 +                    dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
     5823 +                    !first_pass) {
     5824 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
     5825 +                        err = SET_ERROR(ELOOP);
     5826 +                        break;
     5827 +                }
     5828 +
     5829 +                /*
     5830 +                 * Our memory pressure valve. If the system is running low
     5831 +                 * on memory, rather than swamping memory with new ARC buf
     5832 +                 * hdrs, we opt not to rebuild the L2ARC. At this point,
     5833 +                 * however, we have already set up our L2ARC dev to chain in
     5834 +                 * new metadata log blks, so the user may choose to re-add the
     5835 +                 * L2ARC dev at a later time to reconstruct it (when there's
     5836 +                 * less memory pressure).
     5837 +                 */
     5838 +                if (arc_reclaim_needed()) {
     5839 +                        ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
     5840 +                        cmn_err(CE_NOTE, "System running low on memory, "
     5841 +                            "aborting L2ARC rebuild.");
     5842 +                        err = SET_ERROR(ENOMEM);
     5843 +                        break;
     5844 +                }
     5845 +
     5846 +                /*
     5847 +                 * Now that we know that this_lb checks out alright, we
     5848 +                 * can start reconstruction from it - we can be sure
     5849 +                 * that the L2ARC write hand has not yet reached any of our
     5850 +                 * buffers.
     5851 +                 */
     5852 +                l2arc_log_blk_restore(dev, load_guid, this_lb,
     5853 +                    LBP_GET_PSIZE(&lb_ptrs[0]));
     5854 +
     5855 +                /*
     5856 +                 * End of list detection. We can look ahead two steps in the
     5857 +                 * blk chain and if the 2nd blk from this_lb dips below the
     5858 +                 * initial chain starting point, then we know two things:
     5859 +                 *      1) it can't be valid, and
     5860 +                 *      2) the next_lb's ARC entries might have already been
     5861 +                 *      partially overwritten and so we should stop before
     5862 +                 *      we restore it
     5863 +                 */
     5864 +                if (l2arc_range_check_overlap(
     5865 +                    this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
     5866 +                    dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
     5867 +                    !first_pass)
     5868 +                        break;
     5869 +
     5870 +                /* log blk restored, continue with next one in the list */
     5871 +                lb_ptrs[0] = lb_ptrs[1];
     5872 +                lb_ptrs[1] = this_lb->l2lb_back2_lbp;
     5873 +                PTR_SWAP(this_lb, next_lb);
     5874 +                PTR_SWAP(this_lb_buf, next_lb_buf);
     5875 +                this_io = next_io;
     5876 +                next_io = NULL;
     5877 +                first_pass = B_FALSE;
     5878 +
     5879 +                if (l2arc_check_rebuild_timeout_hit(deadline)) {
     5880 +                        err = SET_ERROR(ETIMEDOUT);
     5881 +                        break;
     5882 +                }
     5883 +        }
     5884 +        if (next_io != NULL)
     5885 +                l2arc_log_blk_prefetch_abort(next_io);
     5886 +        kmem_free(this_lb, sizeof (*this_lb));
     5887 +        kmem_free(next_lb, sizeof (*next_lb));
     5888 +        kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
     5889 +        kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
     5890 +        if (err == 0)
     5891 +                ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
     5892 +
     5893 +        return (err);
     5894 +}
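
To make the pointer shuffling at the bottom of the rebuild loop concrete:
l2arc_log_blk_commit() (below) keeps the two most recently written log
block pointers in the device header and links each new block back to the
block committed two writes earlier, so the (lb_ptrs[0], lb_ptrs[1]) pair
slides backwards through the chain one block per iteration. A sketch with
hypothetical block numbering:

	dev hdr: l2dh_start_lbps[0] --> LB[n]        (newest)
	         l2dh_start_lbps[1] --> LB[n-1]
	LB[n].l2lb_back2_lbp   --> LB[n-2]
	LB[n-1].l2lb_back2_lbp --> LB[n-3]
	...
	walk: (LB[n], LB[n-1]) -> (LB[n-1], LB[n-2]) -> (LB[n-2], LB[n-3]) ...
	until an invalid pointer, a loop, an IO/cksum error, low memory or
	the timeout ends the rebuild.
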
     5895 +
     5896 +/*
     5897 + * Restores the payload of a log blk to ARC. This creates empty ARC hdr
     5898 + * entries which only contain an l2arc hdr, essentially restoring the
     5899 + * buffers to their L2ARC evicted state. This function also updates space
     5900 + * usage on the L2ARC vdev to make sure it tracks restored buffers.
     5901 + */
     5902 +static void
     5903 +l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
     5904 +    l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
     5905 +{
     5906 +        uint64_t        size = 0, psize = 0;
     5907 +
     5908 +        mutex_enter(&l2arc_buflist_mtx);
     5909 +
     5910 +        for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
     5911 +                /*
     5912 +                 * Restore goes in the reverse direction to preserve correct
     5913 +                 * temporal ordering of buffers in the l2ad_buflist.
     5914 +                 */
     5915 +                l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
     5916 +                size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
     5917 +                psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
     5918 +        }
     5919 +        mutex_exit(&l2arc_buflist_mtx);
     5920 +
     5921 +        /*
     5922 +         * Record rebuild stats:
     5923 +         *      size            In-memory size of restored buffer data in ARC
     5924 +         *      psize           Physical size of restored buffers in the L2ARC
     5925 +         *      bufs            # of ARC buffer headers restored
     5926 +         *      log_blks        # of L2ARC log entries processed during restore
     5927 +         */
     5928 +        ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
     5929 +        ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
     5930 +        ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
     5931 +        ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
     5932 +        ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
     5933 +        ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
     5934 +        vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
     5935 +}
     5936 +
     5937 +/*
     5938 + * Restores a single ARC buf hdr from a log block. The ARC buffer is put
     5939 + * into a state indicating that it has been evicted to L2ARC.
     5940 + */
     5941 +static void
     5942 +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
     5943 +    uint64_t load_guid)
     5944 +{
     5945 +        arc_buf_hdr_t   *hdr, *exists;
     5946 +        kmutex_t        *hash_lock;
     5947 +        arc_buf_contents_t      type = LE_GET_TYPE(le);
     5948 +        l2arc_buf_hdr_t         *l2hdr;
     5949 +
     5950 +        hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
     5951 +        hdr->b_dva = le->l2le_dva;
     5952 +        hdr->b_birth = le->l2le_birth;
     5953 +        hdr->b_cksum0 = le->l2le_cksum0;
     5954 +        hdr->b_size = LE_GET_LSIZE(le);
     5955 +        exists = buf_hash_insert(hdr, &hash_lock);
     5956 +        if (exists) {
     5957 +                /* Buffer was already cached, no need to restore it. */
     5958 +                mutex_exit(hash_lock);
     5959 +                arc_hdr_destroy(hdr);
     5960 +                ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
     5961 +                return;
     5962 +        }
     5963 +        hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
     5964 +        if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
     5965 +                hdr->b_flags |= ARC_L2COMPRESS;
     5966 +        mutex_enter(&hdr->b_freeze_lock);
     5967 +        ASSERT(hdr->b_freeze_cksum == NULL);
     5968 +        hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
     5969 +        *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
     5970 +        mutex_exit(&hdr->b_freeze_lock);
     5971 +
     5972 +        /* now rebuild the l2arc entry */
     5973 +        ASSERT(hdr->b_l2hdr == NULL);
     5974 +        l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
     5975 +        l2hdr->b_dev = dev;
     5976 +        l2hdr->b_daddr = le->l2le_daddr;
     5977 +        l2hdr->b_asize = LE_GET_PSIZE(le);
     5978 +        l2hdr->b_compress = LE_GET_COMPRESS(le);
     5979 +        hdr->b_l2hdr = l2hdr;
     5980 +        list_insert_tail(dev->l2ad_buflist, hdr);
     5981 +        ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
     5982 +        ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
     5983 +
     5984 +        arc_change_state(arc_l2c_only, hdr, hash_lock);
     5985 +        mutex_exit(hash_lock);
     5986 +}
     5987 +
     5988 +/*
     5989 + * Attempts to read the device header on the provided L2ARC device and writes
     5990 + * it to `hdr'. On success, this function returns 0, otherwise the appropriate
     5991 + * error code is returned.
     5992 + */
     5993 +static int
     5994 +l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
     5995 +{
     5996 +        int             err;
     5997 +        uint64_t        guid;
     5998 +        zio_cksum_t     cksum;
     5999 +
     6000 +        guid = spa_guid(dev->l2ad_vdev->vdev_spa);
     6001 +
     6002 +        if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
     6003 +            VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
     6004 +            ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
     6005 +            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
     6006 +            ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
     6007 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
     6008 +                return (err);
     6009 +        }
     6010 +
     6011 +        if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
     6012 +                byteswap_uint64_array(hdr, sizeof (*hdr));
     6013 +
     6014 +        if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
     6015 +            hdr->l2dh_spa_guid != guid) {
     6016 +                /*
     6017 +                 * Attempt to rebuild a device containing no actual dev hdr
     6018 +                 * or containing a header from some other pool.
     6019 +                 */
     6020 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
     6021 +                return (SET_ERROR(ENOTSUP));
     6022 +        }
     6023 +
     6024 +        l2arc_dev_hdr_checksum(hdr, &cksum);
     6025 +        if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
     6026 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
     6027 +                return (SET_ERROR(EINVAL));
     6028 +        }
     6029 +        if (hdr->l2dh_evict_tail < dev->l2ad_start ||
     6030 +            hdr->l2dh_evict_tail >= dev->l2ad_end) {
     6031 +                /* Data in dev hdr is invalid for this device. */
     6032 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
     6033 +                return (SET_ERROR(EINVAL));
     6034 +        }
     6035 +
     6036 +        return (0);
     6037 +}
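
The magic-number comparison above doubles as endianness detection: a
header written on an opposite-endian machine reads back with a byteswapped
magic, which tells us to byteswap the whole (all-uint64_t) structure
before validating it. A self-contained sketch of the idiom; EXAMPLE_MAGIC
and example_hdr_t are hypothetical, not the on-disk format:

	#include <stdint.h>
	#include <stddef.h>

	#define	EXAMPLE_MAGIC	0x0123456789abcdefULL	/* hypothetical */

	typedef struct example_hdr {
		uint64_t	eh_magic;
		uint64_t	eh_payload;
	} example_hdr_t;

	static void
	bswap_u64_array(uint64_t *buf, size_t nwords)
	{
		for (size_t i = 0; i < nwords; i++)
			buf[i] = __builtin_bswap64(buf[i]);
	}

	/* Returns 0 if `hdr' holds a valid (now native-endian) header. */
	static int
	example_hdr_validate(example_hdr_t *hdr)
	{
		if (hdr->eh_magic == __builtin_bswap64(EXAMPLE_MAGIC))
			bswap_u64_array((uint64_t *)hdr,
			    sizeof (*hdr) / sizeof (uint64_t));
		return (hdr->eh_magic == EXAMPLE_MAGIC ? 0 : -1);
	}
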
     6038 +
     6039 +/*
     6040 + * Reads L2ARC log blocks from storage and validates their contents.
     6041 + *
     6042 + * This function implements a simple prefetcher to make sure that while
     6043 + * we're processing one buffer the L2ARC is already prefetching the next
     6044 + * one in the chain.
     6045 + *
     6046 + * The arguments this_lbp and next_lbp point to the current and next log blk
     6047 + * address in the block chain. Similarly, this_lb and next_lb hold the
     6048 + * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
     6049 + * and next_lb_buf must be buffers of appropriate size to hold a raw
     6050 + * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
     6051 + * to buffer decompression).
     6052 + *
     6053 + * The `this_io' and `next_io' arguments are used for block prefetching.
     6054 + * When issuing the first blk IO during rebuild, you should pass NULL for
     6055 + * `this_io'. This function will then issue a sync IO to read the block and
     6056 + * also issue an async IO to fetch the next block in the block chain. The
     6057 + * prefetch IO is returned in `next_io'. On subsequent calls to this
     6058 + * function, pass the value returned in `next_io' from the previous call
     6059 + * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
     6060 + * Prior to the call, you should initialize your `next_io' pointer to be
     6061 + * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
     6062 + *
     6063 + * On success, this function returns 0, otherwise it returns an appropriate
     6064 + * error code. On error the prefetching IO is aborted and cleared before
     6065 + * returning from this function. Therefore, if we return `success', the
     6066 + * caller can assume that we have taken care of cleanup of prefetch IOs.
     6067 + */
     6068 +static int
     6069 +l2arc_log_blk_read(l2arc_dev_t *dev,
     6070 +    const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
     6071 +    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     6072 +    uint8_t *this_lb_buf, uint8_t *next_lb_buf,
     6073 +    zio_t *this_io, zio_t **next_io)
     6074 +{
     6075 +        int err = 0;
     6076 +        zio_cksum_t cksum;
     6077 +
     6078 +        ASSERT(this_lbp != NULL && next_lbp != NULL);
     6079 +        ASSERT(this_lb != NULL && next_lb != NULL);
     6080 +        ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
     6081 +        ASSERT(next_io != NULL && *next_io == NULL);
     6082 +        ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
     6083 +
     6084 +        /*
     6085 +         * Check to see if we have issued the IO for this log blk in a
     6086 +         * previous run. If not, this is the first call, so issue it now.
     6087 +         */
     6088 +        if (this_io == NULL) {
     6089 +                this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
     6090 +                    this_lb_buf);
     6091 +        }
     6092 +
     6093 +        /*
     6094 +         * Peek to see if we can start issuing the next IO immediately.
     6095 +         */
     6096 +        if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
     6097 +                /*
     6098 +                 * Start issuing IO for the next log blk early - this
     6099 +                 * should help keep the L2ARC device busy while we
     6100 +                 * decompress and restore this log blk.
     6101 +                 */
     6102 +                *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
     6103 +                    next_lb_buf);
     6104 +        }
     6105 +
     6106 +        /* Wait for the IO to read this log block to complete */
     6107 +        if ((err = zio_wait(this_io)) != 0) {
     6108 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
     6109 +                goto cleanup;
     6110 +        }
     6111 +
     6112 +        /* Make sure the buffer checks out */
     6113 +        fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
     6114 +        if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
     6115 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
     6116 +                err = SET_ERROR(EINVAL);
     6117 +                goto cleanup;
     6118 +        }
     6119 +
     6120 +        /* Now we can take our time decoding this buffer */
     6121 +        switch (LBP_GET_COMPRESS(this_lbp)) {
     6122 +        case ZIO_COMPRESS_OFF:
     6123 +                bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
     6124 +                break;
     6125 +        case ZIO_COMPRESS_LZ4:
     6126 +                if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
     6127 +                    this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
     6128 +                    sizeof (*this_lb))) != 0) {
     6129 +                        err = SET_ERROR(EINVAL);
     6130 +                        goto cleanup;
     6131 +                }
     6132 +                break;
     6133 +        default:
     6134 +                err = SET_ERROR(EINVAL);
     6135 +                goto cleanup;
     6136 +        }
     6137 +        if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
     6138 +                byteswap_uint64_array(this_lb, sizeof (*this_lb));
     6139 +        if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
     6140 +                err = SET_ERROR(EINVAL);
     6141 +                goto cleanup;
     6142 +        }
     6143 +cleanup:
     6144 +        /* Abort an in-flight prefetch I/O in case of error */
     6145 +        if (err != 0 && *next_io != NULL) {
     6146 +                l2arc_log_blk_prefetch_abort(*next_io);
     6147 +                *next_io = NULL;
     6148 +        }
     6149 +        return (err);
     6150 +}
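
In caller terms the hand-off described above condenses to the loop already
used by l2arc_rebuild(); a stripped-down sketch (declarations, lb_ptrs
advancement and most stop conditions elided):

	zio_t *this_io = NULL, *next_io = NULL;

	for (;;) {
		/* First pass: this_io == NULL, so a sync read is issued. */
		if (l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
		    this_lb, next_lb, this_lb_buf, next_lb_buf,
		    this_io, &next_io) != 0)
			break;	/* prefetch IO already cleaned up for us */

		/* ... restore this_lb and advance lb_ptrs ... */

		this_io = next_io;	/* hand over the prefetched IO */
		next_io = NULL;
	}
	/* Non-error exits may leave a prefetch in flight; reap it. */
	if (next_io != NULL)
		l2arc_log_blk_prefetch_abort(next_io);
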
     6151 +
     6152 +/*
     6153 + * Validates an L2ARC log blk address to make sure that it can be read
     6154 + * from the provided L2ARC device. Returns B_TRUE if the address is
     6155 + * within the device's bounds, or B_FALSE if not.
     6156 + */
     6157 +static boolean_t
     6158 +l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
     6159 +{
     6160 +        uint64_t psize = LBP_GET_PSIZE(lbp);
     6161 +        uint64_t end = lbp->l2lbp_daddr + psize;
     6162 +
     6163 +        /*
     6164 +         * A log block is valid if all of the following conditions are true:
     6165 +         * - it fits entirely between l2ad_start and l2ad_end
     6166 +         * - it has a valid size
     6167 +         * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
     6168 +         *      doesn't sit in the evicted region)
     6169 +         */
     6170 +        return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
     6171 +            psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
     6172 +            lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
     6173 +}
     6174 +
     6175 +/*
     6176 + * Starts an asynchronous read IO to read a log block. This is used in log
     6177 + * block reconstruction to start reading the next block before we are done
     6178 + * decoding and reconstructing the current block, to keep the l2arc device
     6179 + * nice and hot with read IO to process.
     6180 + * The read is issued into the caller-supplied `lb_buf', which must remain
     6181 + * valid until the zio completes (no buffer is allocated here). If you
     6182 + * wish to abort this zio, you should do so using
     6183 + * l2arc_log_blk_prefetch_abort, which simply waits for the in-flight IO
     6184 + * to drain; afterwards the caller may reuse or free `lb_buf'.
     6185 + */
     6186 +static zio_t *
     6187 +l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
     6188 +    uint8_t *lb_buf)
     6189 +{
     6190 +        uint32_t psize;
     6191 +        zio_t *pio;
     6192 +
     6193 +        psize = LBP_GET_PSIZE(lbp);
     6194 +        ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
     6195 +        pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
     6196 +            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
     6197 +            ZIO_FLAG_DONT_RETRY);
     6198 +        (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
     6199 +            lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
     6200 +            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
     6201 +            ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
     6202 +
     6203 +        return (pio);
     6204 +}
     6205 +
     6206 +/*
     6207 + * Aborts a zio returned from l2arc_log_blk_prefetch by waiting for it to
     6208 + * complete; the data it read is simply discarded.
     6209 + */
     6210 +static void
     6211 +l2arc_log_blk_prefetch_abort(zio_t *zio)
     6212 +{
     6213 +        (void) zio_wait(zio);
     6214 +}
     6215 +
     6216 +/*
     6217 + * Creates a zio to update the device header on an l2arc device. The zio is
     6218 + * initiated as a child of `pio'.
     6219 + */
     6220 +static void
     6221 +l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
     6222 +{
     6223 +        zio_t *wzio;
     6224 +        vdev_stat_t st;
     6225 +        l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
     6226 +
     6227 +        vdev_get_stats(dev->l2ad_vdev, &st);
     6228 +
     6229 +        hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
     6230 +        hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
     6231 +        hdr->l2dh_evict_tail = dev->l2ad_evict;
     6232 +        hdr->l2dh_alloc_space = st.vs_alloc;
     6233 +        hdr->l2dh_flags = 0;
     6234 +        if (dev->l2ad_first)
     6235 +                hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
     6236 +
     6237 +        /* checksum operation goes last */
     6238 +        l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
     6239 +
     6240 +        CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
     6241 +            sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
     6242 +        wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
     6243 +            sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
     6244 +            NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
     6245 +        DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
     6246 +            zio_t *, wzio);
     6247 +        (void) zio_nowait(wzio);
     6248 +}
     6249 +
     6250 +/*
     6251 + * Commits a log block to the L2ARC device. This routine is invoked from
     6252 + * l2arc_write_buffers when the log block fills up.
     6253 + * This function allocates some memory to temporarily hold the serialized
     6254 + * buffer to be written. This is then released in l2arc_write_done.
     6255 + */
     6256 +static void
     6257 +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
     6258 +    l2arc_write_callback_t *cb)
     6259 +{
     6260 +        l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
     6261 +        uint64_t                psize, asize;
     6262 +        l2arc_log_blk_buf_t     *lb_buf;
     6263 +        zio_t                   *wzio;
     6264 +
     6265 +        VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
     6266 +
     6267 +        /* link the buffer into the block chain */
     6268 +        lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
     6269 +        lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
     6270 +
     6271 +        /* try to compress the buffer */
     6272 +        lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
     6273 +        list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
     6274 +        VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
     6275 +            lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
     6276 +
     6277 +        /*
     6278 +         * Update the start log blk pointer in the device header to point
     6279 +         * to the log block we're about to write.
     6280 +         */
     6281 +        dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
     6282 +            dev->l2ad_dev_hdr.l2dh_start_lbps[0];
     6283 +        dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
     6284 +        LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
     6285 +        LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
     6286 +        LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
     6287 +            ZIO_CHECKSUM_FLETCHER_4);
     6288 +        LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
     6289 +        if (psize < sizeof (*lb)) {
     6290 +                /* compression succeeded */
     6291 +                LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
     6292 +                    ZIO_COMPRESS_LZ4);
     6293 +        } else {
     6294 +                /* compression failed */
     6295 +                bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
     6296 +                LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
     6297 +                    ZIO_COMPRESS_OFF);
     6298 +        }
     6299 +        /* checksum what we're about to write */
     6300 +        fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
     6301 +            &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
     6302 +
     6303 +        /* perform the write itself */
     6304 +        CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
     6305 +            L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
     6306 +        wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
     6307 +            psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
     6308 +            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
     6309 +        DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
     6310 +        (void) zio_nowait(wzio);
     6311 +
     6312 +        /* realign the device hand */
     6313 +        asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
     6314 +        dev->l2ad_hand += asize;
     6315 +        VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
     6316 +        vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
     6317 +
     6318 +        /* bump the kstats */
     6319 +        ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
     6320 +        ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
     6321 +        ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
     6322 +        ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
     6323 +            dev->l2ad_log_blk_payload_asize / asize);
     6324 +
     6325 +        dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
     6326 +}
     6327 +
     6328 +/*
     6329 + * Computes the checksum of `hdr' and stores it in `cksum'.
     6330 + */
     6331 +static void
     6332 +l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
     6333 +{
     6334 +        fletcher_4_native((uint8_t *)hdr +
     6335 +            offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
     6336 +            sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
     6337 +            cksum);
     6338 +}
     6339 +
     6340 +/*
     6341 + * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
     6342 + * The buffer being inserted must be present in L2ARC.
     6343 + * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
     6344 + * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
     6345 + */
     6346 +static boolean_t
     6347 +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
     6348 +{
     6349 +        l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
     6350 +        l2arc_log_ent_phys_t *le;
     6351 +        const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
     6352 +        int index = dev->l2ad_log_ent_idx++;
     6353 +
     6354 +        ASSERT(l2hdr != NULL);
     6355 +        ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
     6356 +
     6357 +        le = &lb->l2lb_entries[index];
     6358 +        bzero(le, sizeof (*le));
     6359 +        le->l2le_dva = ab->b_dva;
     6360 +        le->l2le_birth = ab->b_birth;
     6361 +        le->l2le_cksum0 = ab->b_cksum0;
     6362 +        le->l2le_daddr = l2hdr->b_daddr;
     6363 +        LE_SET_LSIZE(le, ab->b_size);
     6364 +        LE_SET_PSIZE(le, l2hdr->b_asize);
     6365 +        LE_SET_COMPRESS(le, l2hdr->b_compress);
     6366 +        le->l2le_freeze_cksum = *ab->b_freeze_cksum;
     6367 +        LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
     6368 +        LE_SET_TYPE(le, ab->b_type);
     6369 +        dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
     6370 +
     6371 +        return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
     6372 +}
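
For reference, the caller side of this helper appears in
l2arc_write_buffers() above: once the log blk fills up, it is committed
and a device header update is scheduled.

	if (l2arc_log_blk_insert(dev, ab)) {
		l2arc_log_blk_commit(dev, pio, cb);
		dev_hdr_update = B_TRUE;
	}
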
     6373 +
     6374 +/*
     6375 + * Checks whether a given L2ARC device address sits in a time-sequential
     6376 + * range. The trick here is that the L2ARC is a rotary buffer, so we can't
     6377 + * just do a range comparison, we need to handle the situation in which the
     6378 + * range wraps around the end of the L2ARC device. Arguments:
     6379 + *      bottom  Lower end of the range to check (written to earlier).
     6380 + *      top     Upper end of the range to check (written to later).
     6381 + *      check   The address for which we want to determine if it sits in
     6382 + *              between the top and bottom.
     6383 + *
     6384 + * The 3-way conditional below represents the following cases:
     6385 + *
     6386 + *      bottom < top : Sequentially ordered case:
     6387 + *        <check>--------+-------------------+
     6388 + *                       |  (overlap here?)  |
     6389 + *       L2ARC dev       V                   V
     6390 + *       |---------------<bottom>============<top>--------------|
     6391 + *
     6392 + *      bottom > top: Looped-around case:
     6393 + *                            <check>--------+------------------+
     6394 + *                                           |  (overlap here?) |
     6395 + *       L2ARC dev                           V                  V
     6396 + *       |===============<top>---------------<bottom>===========|
     6397 + *       ^               ^
     6398 + *       |  (or here?)   |
     6399 + *       +---------------+---------<check>
     6400 + *
     6401 + *      top == bottom : Just a single address comparison.
     6402 + */
     6403 +static inline boolean_t
     6404 +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
     6405 +{
     6406 +        if (bottom < top)
     6407 +                return (bottom <= check && check <= top);
     6408 +        else if (bottom > top)
     6409 +                return (check <= top || bottom <= check);
     6410 +        else
     6411 +                return (check == top);
     6412 +}
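
A few illustrative truth-table checks for the three cases (hypothetical
user-level asserts; the addresses are arbitrary):

	assert(l2arc_range_check_overlap(100, 200, 150) == B_TRUE);
	assert(l2arc_range_check_overlap(100, 200, 250) == B_FALSE);
	/* wrapped around the end of the device: */
	assert(l2arc_range_check_overlap(200, 100, 250) == B_TRUE);
	assert(l2arc_range_check_overlap(200, 100, 50) == B_TRUE);
	assert(l2arc_range_check_overlap(200, 100, 150) == B_FALSE);
	/* degenerate single-address range: */
	assert(l2arc_range_check_overlap(100, 100, 100) == B_TRUE);
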
     6413 +
     6414 +/*
     6415 + * Checks whether a rebuild timeout deadline has been hit and if it has,
     6416 + * increments the appropriate error counters.
     6417 + */
     6418 +static boolean_t
     6419 +l2arc_check_rebuild_timeout_hit(int64_t deadline)
     6420 +{
     6421 +        if (deadline != 0 && deadline < ddi_get_lbolt64()) {
     6422 +                ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
     6423 +                cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
     6424 +                    "dropping remaining L2ARC metadata.");
     6425 +                return (B_TRUE);
     6426 +        } else {
     6427 +                return (B_FALSE);
     6428 +        }
5201 6429  }
    