New ARC buf_hash architecture


  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
  63  * when adjusting the cache size following a cache miss, it's simply
  64  * a matter of choosing a single page to evict.  In our model, we
  65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
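
The policy above never picks a single victim; it keeps evicting until the reclaimed bytes cover the incoming block. A minimal sketch of that selection loop, using invented names (evict_candidate_t, next_eviction_candidate(), evict_one()) rather than the real arc_evict()/arc_adjust() internals:

/*
 * Illustration only: evict a *set* of variable-sized blocks whose
 * total size approximates the space needed by the new block.
 */
typedef struct evict_candidate {
        uint64_t        ec_size;        /* block size, 512B .. 128KB */
} evict_candidate_t;

extern evict_candidate_t *next_eviction_candidate(void);       /* assumed */
extern void evict_one(evict_candidate_t *);                    /* assumed */

static uint64_t
evict_for(uint64_t needed)
{
        uint64_t freed = 0;
        evict_candidate_t *ec;

        while (freed < needed && (ec = next_eviction_candidate()) != NULL) {
                freed += ec->ec_size;
                evict_one(ec);
        }
        return (freed);
}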
  73 
  74 /*
  75  * The locking model:
  76  *
  77  * A new reference to a cache buffer can be obtained in two
  78  * ways: 1) via a hash table lookup using the DVA as a key,
  79  * or 2) via one of the ARC lists.  The arc_read() interface
  80  * uses method 1, while the internal arc algorithms for
  81  * adjusting the cache use method 2.  We therefore provide two
  82  * types of locks: 1) the hash table lock array, and 2) the
  83  * arc list locks.
  84  *
  85  * Buffers do not have their own mutexes, rather they rely on the
  86  * hash table mutexes for the bulk of their protection (i.e. most
  87  * fields in the arc_buf_hdr_t are protected by these mutexes).
  88  *
  89  * buf_hash_find() returns the appropriate mutex (held) when it
  90  * locates the requested buffer in the hash table.  It returns
  91  * NULL for the mutex if the buffer was not in the table.
  92  *
  93  * buf_hash_remove() expects the appropriate hash mutex to be
  94  * already held before it is invoked.
  95  *
  96  * Each arc state also has a mutex which is used to protect the
  97  * buffer list associated with the state.  When attempting to
  98  * obtain a hash table lock while holding an arc list lock you
  99  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 100  * the active state mutex must be held before the ghost state mutex.
 101  *
 102  * Arc buffers may have an associated eviction callback function.
 103  * This function will be invoked prior to removing the buffer (e.g.
 104  * in arc_do_user_evicts()).  Note however that the data associated
 105  * with the buffer may be evicted prior to the callback.  The callback
 106  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 107  * the users of callbacks must ensure that their private data is
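
A sketch of the deadlock-avoidance rule above. The walker below is hypothetical (walk_list_locked() is not a real function); it takes an arc list lock, then acquires each buffer's hash lock only with mutex_tryenter(), skipping buffers whose lock is busy:

/*
 * Hypothetical sketch: while an arc list lock is held, hash locks may
 * only be taken with mutex_tryenter(); on failure the buffer is
 * skipped for this pass rather than risking a deadlock.
 */
static void
walk_list_locked(list_t *list, kmutex_t *list_lock)
{
        arc_buf_hdr_t *hdr;

        mutex_enter(list_lock);
        for (hdr = list_head(list); hdr != NULL;
            hdr = list_next(list, hdr)) {
                kmutex_t *hash_lock = HDR_LOCK(hdr);

                if (!mutex_tryenter(hash_lock))
                        continue;       /* busy; revisit on a later pass */
                /* ... hash-protected hdr fields may be used here ... */
                mutex_exit(hash_lock);
        }
        mutex_exit(list_lock);
}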


 185 
 186 static int arc_dead;
 187 
 188 /*
 189  * The arc has filled available memory and has now warmed up.
 190  */
 191 static boolean_t arc_warm;
 192 
 193 /*
 194  * These tunables are for performance analysis.
 195  */
 196 uint64_t zfs_arc_max;
 197 uint64_t zfs_arc_min;
 198 uint64_t zfs_arc_meta_limit = 0;
 199 int zfs_arc_grow_retry = 0;
 200 int zfs_arc_shrink_shift = 0;
 201 int zfs_arc_p_min_shift = 0;
 202 int zfs_disable_dup_eviction = 0;
 203 
 204 /*
 205  * Note that buffers can be in one of 6 states:
 206  *      ARC_anon        - anonymous (discussed below)
 207  *      ARC_mru         - recently used, currently cached
  208  * ARC_mru_ghost   - recently used, no longer in cache
 209  *      ARC_mfu         - frequently used, currently cached
 210  *      ARC_mfu_ghost   - frequently used, no longer in cache
 211  *      ARC_l2c_only    - exists in L2ARC but not other states
 212  * When there are no active references to the buffer, they are
  213  * linked onto a list in one of these arc states.  These are
 214  * the only buffers that can be evicted or deleted.  Within each
 215  * state there are multiple lists, one for meta-data and one for
 216  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 217  * etc.) is tracked separately so that it can be managed more
 218  * explicitly: favored over data, limited explicitly.
 219  *
 220  * Anonymous buffers are buffers that are not associated with
 221  * a DVA.  These are buffers that hold dirty block copies
 222  * before they are written to stable storage.  By definition,
 223  * they are "ref'd" and are considered part of arc_mru
  224  * that cannot be freed.  Generally, they will acquire a DVA
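
The state/list split described above can be pictured as six state objects, each with separate evictable lists for metadata and data. The sketch below is illustrative only (names are prefixed illus_); the real arc_state_t and arc_buf_contents_t definitions differ in detail:

typedef enum illus_buf_contents {
        ILLUS_BUFC_DATA,
        ILLUS_BUFC_METADATA,
        ILLUS_BUFC_NUMTYPES
} illus_buf_contents_t;

typedef struct illus_arc_state {
        list_t          s_list[ILLUS_BUFC_NUMTYPES];    /* evictable buffers */
        kmutex_t        s_mtx;                          /* protects the lists */
        uint64_t        s_size;                         /* bytes in this state */
} illus_arc_state_t;

/* One instance each for anon, mru, mru_ghost, mfu, mfu_ghost, l2c_only. */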


 571 /*
 572  * Other sizes
 573  */
 574 
 575 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 576 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 577 
 578 /*
 579  * Hash table routines
 580  */
 581 
 582 #define HT_LOCK_PAD     64
 583 
 584 struct ht_lock {
 585         kmutex_t        ht_lock;
 586 #ifdef _KERNEL
 587         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 588 #endif
 589 };
 590 
 591 #define BUF_LOCKS 256
 592 typedef struct buf_hash_table {
 593         uint64_t ht_mask;
 594         arc_buf_hdr_t **ht_table;
 595         struct ht_lock ht_locks[BUF_LOCKS];
 596 } buf_hash_table_t;
 597 
 598 static buf_hash_table_t buf_hash_table;
 599 
 600 #define BUF_HASH_INDEX(spa, dva, birth) \
 601         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 602 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 603 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 604 #define HDR_LOCK(hdr) \
 605         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
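
Because both the bucket count and BUF_LOCKS are powers of two, these masks are cheap modulo operations. A hedged illustration (example_index() is invented; the numeric values are chosen purely for the comment):

/* Illustration only: map a 64-bit hash to its bucket and lock. */
static void
example_index(uint64_t hash)
{
        uint64_t idx = hash & buf_hash_table.ht_mask;   /* bucket index */
        kmutex_t *hash_lock = BUF_HASH_LOCK(idx);       /* 1 of 256 mutexes */

        /*
         * E.g. with ht_mask == 0x1ffff (131072 buckets), a hash ending
         * in ...9abcdef0 selects bucket 0xdef0, and 0xdef0 & 0xff
         * selects lock 0xf0 (240).
         */
        (void) hash_lock;
}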
 606 
 607 uint64_t zfs_crc64_table[256];
 608 
 609 /*
 610  * Level 2 ARC
 611  */
 612 
 613 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 614 #define L2ARC_HEADROOM          2                       /* num of writes */
 615 /*
 616  * If we discover during ARC scan any buffers to be compressed, we boost
 617  * our headroom for the next scanning cycle by this percentage multiple.
 618  */
 619 #define L2ARC_HEADROOM_BOOST    200
 620 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 621 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 622 


 690         /* protected by l2arc_free_on_write_mtx */
 691         void            *l2df_data;
 692         size_t          l2df_size;
 693         void            (*l2df_func)(void *, size_t);
 694         list_node_t     l2df_list_node;
 695 } l2arc_data_free_t;
 696 
 697 static kmutex_t l2arc_feed_thr_lock;
 698 static kcondvar_t l2arc_feed_thr_cv;
 699 static uint8_t l2arc_thread_exit;
 700 
 701 static void l2arc_read_done(zio_t *zio);
 702 static void l2arc_hdr_stat_add(void);
 703 static void l2arc_hdr_stat_remove(void);
 704 
 705 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 706 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 707     enum zio_compress c);
 708 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 709 
 710 static uint64_t
 711 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 712 {
 713         uint8_t *vdva = (uint8_t *)dva;
 714         uint64_t crc = -1ULL;
 715         int i;
 716 
 717         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 718 
 719         for (i = 0; i < sizeof (dva_t); i++)
 720                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 721 
 722         crc ^= (spa>>8) ^ birth;
 723 
 724         return (crc);
 725 }
 726 
 727 #define BUF_EMPTY(buf)                                          \
 728         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 729         (buf)->b_dva.dva_word[1] == 0 &&                     \
 730         (buf)->b_birth == 0)


 827         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 828 
 829         if (buf_hash_table.ht_table[idx] &&
 830             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 831                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 832 }
 833 
 834 /*
 835  * Global data structures and functions for the buf kmem cache.
 836  */
 837 static kmem_cache_t *hdr_cache;
 838 static kmem_cache_t *buf_cache;
 839 
 840 static void
 841 buf_fini(void)
 842 {
 843         int i;
 844 
 845         kmem_free(buf_hash_table.ht_table,
 846             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 847         for (i = 0; i < BUF_LOCKS; i++)
 848                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 849         kmem_cache_destroy(hdr_cache);
 850         kmem_cache_destroy(buf_cache);
 851 }
 852 
 853 /*
 854  * Constructor callback - called when the cache is empty
 855  * and a new buf is requested.
 856  */
 857 /* ARGSUSED */
 858 static int
 859 hdr_cons(void *vbuf, void *unused, int kmflag)
 860 {
 861         arc_buf_hdr_t *buf = vbuf;
 862 
 863         bzero(buf, sizeof (arc_buf_hdr_t));
 864         refcount_create(&buf->b_refcnt);
 865         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 866         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 867         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 868 


 912 /*
 913  * Reclaim callback -- invoked when memory is low.
 914  */
 915 /* ARGSUSED */
 916 static void
 917 hdr_recl(void *unused)
 918 {
 919         dprintf("hdr_recl called\n");
 920         /*
 921          * umem calls the reclaim func when we destroy the buf cache,
 922          * which is after we do arc_fini().
 923          */
 924         if (!arc_dead)
 925                 cv_signal(&arc_reclaim_thr_cv);
 926 }
 927 
 928 static void
 929 buf_init(void)
 930 {
 931         uint64_t *ct;
 932         uint64_t hsize = 1ULL << 12;
 933         int i, j;
 934 
 935         /*
 936          * The hash table is big enough to fill all of physical memory
 937          * with an average 64K block size.  The table will take up
 938          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 939          */
 940         while (hsize * 65536 < physmem * PAGESIZE)
 941                 hsize <<= 1;
 942 retry:
 943         buf_hash_table.ht_mask = hsize - 1;
 944         buf_hash_table.ht_table =
 945             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 946         if (buf_hash_table.ht_table == NULL) {
 947                 ASSERT(hsize > (1ULL << 8));
 948                 hsize >>= 1;
 949                 goto retry;
 950         }
 951 
 952         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 953             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 954         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 955             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 956 
 957         for (i = 0; i < 256; i++)
 958                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 959                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 960 
 961         for (i = 0; i < BUF_LOCKS; i++) {
 962                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 963                     NULL, MUTEX_DEFAULT, NULL);
 964         }
 965 }
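
As a worked example of the sizing loop above (illustrative figures, assuming 4 KB pages and 8-byte pointers):

/*
 * physmem * PAGESIZE == 8 GB == 1ULL << 33:
 *   loop exit:   hsize * 65536 >= (1ULL << 33)  =>  hsize == 1ULL << 17
 *   table size:  (1ULL << 17) * sizeof (void *) == 1 MB
 * i.e. roughly 128 KB of hash table per GB of memory, as the comment says.
 */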
 966 
 967 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 968 
 969 static void
 970 arc_cksum_verify(arc_buf_t *buf)
 971 {
 972         zio_cksum_t zc;
 973 
 974         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 975                 return;
 976 
 977         mutex_enter(&buf->b_hdr->b_freeze_lock);
 978         if (buf->b_hdr->b_freeze_cksum == NULL ||
 979             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 980                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 981                 return;
 982         }
 983         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 984         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))




  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
  63  * when adjusting the cache size following a cache miss, it's simply
  64  * a matter of choosing a single page to evict.  In our model, we
  65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
  73 
  74 /*
  75  * External users typically access ARC buffers via a hash table
  76  * lookup, using the DVA, spa_t pointer value and the birth TXG
  77  * number as the key. The hash value is derived by buf_hash(),
  78  * which spits out a 64-bit hash index. This index is then masked
  79  * with ht_mask to obtain the final index into the hash table:
  80  *
  81  *                     ,---------------- & ht_mask ----------------,
  82  * 64-bit hash value   |             (hash table index)             |
  83  * |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX|
  84  *
  85  * Sizing of the hash table is done at boot from the amount of
  86  * physical memory. We start with a base value of 2^12 hash
  87  * buckets and then evaluate whether this number, multiplied by
  88  * 2^zfs_arc_ht_base_masklen (the minimum mask length), is
  89  * greater than or equal to the amount of physical memory. If not,
  90  * we double the number of hash buckets and repeat. Using the
  91  * default settings these values translate to ~1 MB of hash tables
  92  * for each 1 GB of physical memory.
  93  *
  94  * The locking model:
  95  *
  96  * A new reference to a cache buffer can be obtained in two
  97  * ways: 1) via a hash table lookup using the DVA as a key,
  98  * or 2) via one of the ARC lists.  The arc_read() interface
  99  * uses method 1, while the internal arc algorithms for
 100  * adjusting the cache use method 2.  We therefore provide two
 101  * types of locks: 1) the hash table lock array, and 2) the
 102  * arc list locks.
 103  *
 104  * Buffers do not have their own mutexes, rather they rely on the
 105  * hash table mutexes for the bulk of their protection (i.e. most
 106  * fields in the arc_buf_hdr_t are protected by these mutexes). The
 107  * specific mutex is selected by taking its hash value and masking
 108  * it by ht_lock_mask, which then produces an index into the mutex
 109  * table. The size of the lock table is derived from the amount of
 110  * physical memory, which is simply divided by
 111  * 2^zfs_arc_ht_lock_shift, giving the number of locks, with a
 112  * minimum of MIN_BUF_LOCKS.
 113  *
 114  * buf_hash_find() returns the appropriate mutex (held) when it
 115  * locates the requested buffer in the hash table.  It returns
 116  * NULL for the mutex if the buffer was not in the table.
 117  *
 118  * buf_hash_remove() expects the appropriate hash mutex to be
 119  * already held before it is invoked.
 120  *
 121  * Each arc state also has a mutex which is used to protect the
 122  * buffer list associated with the state.  When attempting to
 123  * obtain a hash table lock while holding an arc list lock you
 124  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 125  * the active state mutex must be held before the ghost state mutex.
 126  *
 127  * Arc buffers may have an associated eviction callback function.
 128  * This function will be invoked prior to removing the buffer (e.g.
 129  * in arc_do_user_evicts()).  Note however that the data associated
 130  * with the buffer may be evicted prior to the callback.  The callback
 131  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 132  * the users of callbacks must ensure that their private data is
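
A hedged sketch of the buf_hash_find() contract described above, assuming the lookup takes the same spa/dva/birth triple as buf_hash() plus a kmutex_t ** out-parameter (example_lookup() itself is invented):

static void
example_lookup(uint64_t spa, const dva_t *dva, uint64_t birth)
{
        kmutex_t *hash_lock = NULL;
        arc_buf_hdr_t *hdr;

        /* On a hit the hash mutex comes back held; on a miss it is NULL. */
        hdr = buf_hash_find(spa, dva, birth, &hash_lock);
        if (hdr != NULL) {
                /* ... inspect hdr under its hash lock ... */
                mutex_exit(hash_lock);
        }
}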


 210 
 211 static int arc_dead;
 212 
 213 /*
 214  * The arc has filled available memory and has now warmed up.
 215  */
 216 static boolean_t arc_warm;
 217 
 218 /*
 219  * These tunables are for performance analysis.
 220  */
 221 uint64_t zfs_arc_max;
 222 uint64_t zfs_arc_min;
 223 uint64_t zfs_arc_meta_limit = 0;
 224 int zfs_arc_grow_retry = 0;
 225 int zfs_arc_shrink_shift = 0;
 226 int zfs_arc_p_min_shift = 0;
 227 int zfs_disable_dup_eviction = 0;
 228 
 229 /*
 230  * Used to calculate the size of ARC hash tables and number of hash locks.
 231  * See big theory block comment at the start of this file.
 232  */
 233 uint64_t zfs_arc_ht_base_masklen = 13;
 234 /*
 235  * We want to allocate one hash lock for every 4GB of memory with a minimum
 236  * of MIN_BUF_LOCKS.
 237  */
 238 uint64_t zfs_arc_ht_lock_shift = 32;
 239 #define MIN_BUF_LOCKS   256
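
A worked example of the rule above (illustrative arithmetic only):

/*
 * One lock per 2^zfs_arc_ht_lock_shift == 4 GB of memory:
 *   1 TB:  (1ULL << 40) >> 32 == 256  -> the MIN_BUF_LOCKS floor applies
 *   2 TB:  (1ULL << 41) >> 32 == 512 hash locks
 */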
 240 
 241 /*
 242  * Note that buffers can be in one of 6 states:
 243  *      ARC_anon        - anonymous (discussed below)
 244  *      ARC_mru         - recently used, currently cached
 245  * ARC_mru_ghost   - recently used, no longer in cache
 246  *      ARC_mfu         - frequently used, currently cached
 247  *      ARC_mfu_ghost   - frequently used, no longer in cache
 248  *      ARC_l2c_only    - exists in L2ARC but not other states
 249  * When there are no active references to the buffer, they are
 250  * linked onto a list in one of these arc states.  These are
 251  * the only buffers that can be evicted or deleted.  Within each
 252  * state there are multiple lists, one for meta-data and one for
 253  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 254  * etc.) is tracked separately so that it can be managed more
 255  * explicitly: favored over data, limited explicitly.
 256  *
 257  * Anonymous buffers are buffers that are not associated with
 258  * a DVA.  These are buffers that hold dirty block copies
 259  * before they are written to stable storage.  By definition,
 260  * they are "ref'd" and are considered part of arc_mru
 261  * that cannot be freed.  Generally, they will acquire a DVA


 608 /*
 609  * Other sizes
 610  */
 611 
 612 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 613 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 614 
 615 /*
 616  * Hash table routines
 617  */
 618 
 619 #define HT_LOCK_PAD     64
 620 
 621 struct ht_lock {
 622         kmutex_t        ht_lock;
 623 #ifdef _KERNEL
 624         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 625 #endif
 626 };
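
The pad keeps each lock on its own 64-byte cache line in kernel builds, so the heavily contended hash mutexes do not false-share. A hedged compile-time check one could add (CTASSERT is the illumos static assertion; this check is not part of the original code):

#ifdef _KERNEL
CTASSERT(sizeof (struct ht_lock) == HT_LOCK_PAD);
#endif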
 627 
 628 typedef struct buf_hash_table {
 629         uint64_t        ht_mask;
 630         arc_buf_hdr_t   **ht_table;
 631         struct ht_lock  *ht_locks;
 632         uint64_t        ht_num_locks, ht_lock_mask;
 633 } buf_hash_table_t;
 634 
 635 static buf_hash_table_t buf_hash_table;
 636 
 637 #define BUF_HASH_INDEX(spa, dva, birth) \
 638         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 639 #define BUF_HASH_LOCK_NTRY(idx) \
 640         (buf_hash_table.ht_locks[idx & buf_hash_table.ht_lock_mask])
 641 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 642 #define HDR_LOCK(hdr) \
 643         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 644 
 645 uint64_t zfs_crc64_table[256];
 646 
 647 /*
 648  * Level 2 ARC
 649  */
 650 
 651 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 652 #define L2ARC_HEADROOM          2                       /* num of writes */
 653 /*
 654  * If we discover during ARC scan any buffers to be compressed, we boost
 655  * our headroom for the next scanning cycle by this percentage multiple.
 656  */
 657 #define L2ARC_HEADROOM_BOOST    200
 658 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 659 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 660 


 728         /* protected by l2arc_free_on_write_mtx */
 729         void            *l2df_data;
 730         size_t          l2df_size;
 731         void            (*l2df_func)(void *, size_t);
 732         list_node_t     l2df_list_node;
 733 } l2arc_data_free_t;
 734 
 735 static kmutex_t l2arc_feed_thr_lock;
 736 static kcondvar_t l2arc_feed_thr_cv;
 737 static uint8_t l2arc_thread_exit;
 738 
 739 static void l2arc_read_done(zio_t *zio);
 740 static void l2arc_hdr_stat_add(void);
 741 static void l2arc_hdr_stat_remove(void);
 742 
 743 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 744 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 745     enum zio_compress c);
 746 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 747 
 748 static inline uint64_t
 749 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 750 {
 751         uint8_t *vdva = (uint8_t *)dva;
 752         uint64_t crc = -1ULL;
 753         int i;
 754 
 755         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 756 
 757         for (i = 0; i < sizeof (dva_t); i++)
 758                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 759 
 760         crc ^= (spa>>8) ^ birth;
 761 
 762         return (crc);
 763 }
 764 
 765 #define BUF_EMPTY(buf)                                          \
 766         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 767         (buf)->b_dva.dva_word[1] == 0 &&                     \
 768         (buf)->b_birth == 0)
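
For readers who want to poke at the hashing scheme outside the kernel, the stand-alone user-space program below mirrors buf_hash() and the CRC table fill from buf_init() further down. The dva_t here is simplified, and ZFS_CRC64_POLY is assumed to be the ECMA-182 reflected polynomial that ZFS defines in sys/spa.h:

#include <stdio.h>
#include <stdint.h>

#define ZFS_CRC64_POLY  0xC96C5795D7870F42ULL   /* assumed value */

typedef struct dva { uint64_t dva_word[2]; } dva_t;     /* simplified */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
        int i, j;
        uint64_t *ct;

        for (i = 0; i < 256; i++)
                for (ct = crc64_table + i, *ct = i, j = 8; j > 0; j--)
                        *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
}

static uint64_t
demo_buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
        const uint8_t *vdva = (const uint8_t *)dva;
        uint64_t crc = -1ULL;
        size_t i;

        for (i = 0; i < sizeof (dva_t); i++)
                crc = (crc >> 8) ^ crc64_table[(crc ^ vdva[i]) & 0xFF];
        return (crc ^ (spa >> 8) ^ birth);
}

int
main(void)
{
        dva_t dva = { { 0x1122334455667788ULL, 0x99aabbccddeeff00ULL } };
        uint64_t ht_mask = (1ULL << 20) - 1;    /* e.g. a 1M-bucket table */

        crc64_init();
        printf("bucket %llu\n", (unsigned long long)
            (demo_buf_hash(0xdeadbeef, &dva, 42) & ht_mask));
        return (0);
}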


 865         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 866 
 867         if (buf_hash_table.ht_table[idx] &&
 868             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 869                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 870 }
 871 
 872 /*
 873  * Global data structures and functions for the buf kmem cache.
 874  */
 875 static kmem_cache_t *hdr_cache;
 876 static kmem_cache_t *buf_cache;
 877 
 878 static void
 879 buf_fini(void)
 880 {
 881         int i;
 882 
 883         kmem_free(buf_hash_table.ht_table,
 884             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 885 
 886         for (i = 0; i < buf_hash_table.ht_num_locks; i++)
 887                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 888         kmem_free(buf_hash_table.ht_locks, sizeof (struct ht_lock) *
 889             buf_hash_table.ht_num_locks);
 890         kmem_cache_destroy(hdr_cache);
 891         kmem_cache_destroy(buf_cache);
 892 }
 893 
 894 /*
 895  * Constructor callback - called when the cache is empty
 896  * and a new buf is requested.
 897  */
 898 /* ARGSUSED */
 899 static int
 900 hdr_cons(void *vbuf, void *unused, int kmflag)
 901 {
 902         arc_buf_hdr_t *buf = vbuf;
 903 
 904         bzero(buf, sizeof (arc_buf_hdr_t));
 905         refcount_create(&buf->b_refcnt);
 906         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 907         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 908         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 909 


 953 /*
 954  * Reclaim callback -- invoked when memory is low.
 955  */
 956 /* ARGSUSED */
 957 static void
 958 hdr_recl(void *unused)
 959 {
 960         dprintf("hdr_recl called\n");
 961         /*
 962          * umem calls the reclaim func when we destroy the buf cache,
 963          * which is after we do arc_fini().
 964          */
 965         if (!arc_dead)
 966                 cv_signal(&arc_reclaim_thr_cv);
 967 }
 968 
 969 static void
 970 buf_init(void)
 971 {
 972         uint64_t        *ct;
 973         uint64_t        ht_masklen = 12;
 974         int             i, j;
 975 
 976         while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) <
 977             physmem * PAGESIZE)
 978                 ht_masklen++;
 979         buf_hash_table.ht_mask = (1ULL << ht_masklen) - 1;
 980         buf_hash_table.ht_table =
 981             kmem_zalloc((1ULL << ht_masklen) * sizeof (void *), KM_SLEEP);
 982 
 983         buf_hash_table.ht_num_locks = MAX((physmem * PAGESIZE) >>
 984             zfs_arc_ht_lock_shift, MIN_BUF_LOCKS);
 985         buf_hash_table.ht_lock_mask = buf_hash_table.ht_num_locks - 1;
 986         buf_hash_table.ht_locks = kmem_zalloc(sizeof (struct ht_lock) *
 987             buf_hash_table.ht_num_locks, KM_SLEEP);
 988         for (i = 0; i < buf_hash_table.ht_num_locks; i++) {
 989                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 990                     NULL, MUTEX_DEFAULT, NULL);
 991         }
 992 
 993         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 994             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 995         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 996             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 997 
 998         for (i = 0; i < 256; i++)
 999                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1000                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1001 }
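
A worked example of the two sizing computations in buf_init() above (illustrative figures, assuming 4 KB pages, 8-byte pointers, and the default tunables):

/*
 * physmem * PAGESIZE == 8 GB == 1ULL << 33:
 *   buckets: grow ht_masklen from 12 until
 *            (1ULL << (ht_masklen + 13)) >= (1ULL << 33)
 *            =>  ht_masklen == 20, 1M buckets, 8 MB of pointers
 *            (~1 MB of table per GB, as the theory comment states)
 *   locks:   (1ULL << 33) >> 32 == 2, below MIN_BUF_LOCKS
 *            =>  256 locks, ht_lock_mask == 0xff
 */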
1002 
1003 #define ARC_MINTIME     (hz>>4) /* 62 ms */
1004 
1005 static void
1006 arc_cksum_verify(arc_buf_t *buf)
1007 {
1008         zio_cksum_t zc;
1009 
1010         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1011                 return;
1012 
1013         mutex_enter(&buf->b_hdr->b_freeze_lock);
1014         if (buf->b_hdr->b_freeze_cksum == NULL ||
1015             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1016                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1017                 return;
1018         }
1019         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1020         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))