Print this page
New ARC buf_hash architecture
*** 70,79 ****
--- 70,98 ----
* See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
* by N. Megiddo & D. Modha, FAST 2003
*/
/*
+ * External users typically access ARC buffers via a hash table
+ * lookup, using the DVA, spa_t pointer value and the birth TXG
+ * number as the key. The hash value is derived by buf_hash(),
+ * which spits out a 64-bit hash index. This index is then masked
+ * with ht_mask to obtain the final index into the hash table:
+ *
+ *                                ,-------- & ht_mask --------,
+ * 64-bit hash value: |XXXXXXXXXX|     (hash table index)    |
+ *                     (high-order bits discarded by mask)
+ *
+ * Sizing of the hash table is done at boot from the amount of
+ * physical memory. We start with a base value of 2^12 hash
+ * buckets and then evaluate whether this number, multiplied by
+ * 2^zfs_arc_ht_base_masklen (the minimum mask length), is
+ * greater than or equal to the amount of physical memory. If not,
+ * we double the number of hash buckets and repeat. Using the
+ * default settings these values translate to ~1 MB of hash tables
+ * for each 1 GB of physical memory.
+ *
* The locking model:
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
*** 82,92 ****
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
! * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
* NULL for the mutex if the buffer was not in the table.
*
--- 101,117 ----
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
! * fields in the arc_buf_hdr_t are protected by these mutexes). The
! * specific mutex is selected by taking the buffer's hash value and
! * masking it with ht_lock_mask, which produces an index into the mutex
! * table. The size of the lock table is derived from the amount of
! * physical memory, which is simply divided by
! * 2^zfs_arc_ht_lock_shift, giving the number of locks, with a
! * minimum of MIN_BUF_LOCKS.
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
* NULL for the mutex if the buffer was not in the table.
*
*** 200,209 ****
--- 225,246 ----
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
/*
+ * Used to calculate the size of ARC hash tables and number of hash locks.
+ * See big theory block comment at the start of this file.
+ */
+ uint64_t zfs_arc_ht_base_masklen = 13;
+ /*
+ * We want to allocate one hash lock for every 4GB of memory with a minimum
+ * of MIN_BUF_LOCKS.
+ */
+ uint64_t zfs_arc_ht_lock_shift = 32;
+ #define MIN_BUF_LOCKS 256
+
+ /*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recentely used, no longer in cache
* ARC_mfu - frequently used, currently cached
*** 586,607 ****
#ifdef _KERNEL
unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};
- #define BUF_LOCKS 256
typedef struct buf_hash_table {
uint64_t ht_mask;
arc_buf_hdr_t **ht_table;
! struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
#define BUF_HASH_INDEX(spa, dva, birth) \
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
! #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
--- 623,645 ----
#ifdef _KERNEL
unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};
typedef struct buf_hash_table {
uint64_t ht_mask;
arc_buf_hdr_t **ht_table;
! struct ht_lock *ht_locks;
! uint64_t ht_num_locks, ht_lock_mask;
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
#define BUF_HASH_INDEX(spa, dva, birth) \
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
! #define BUF_HASH_LOCK_NTRY(idx) \
! (buf_hash_table.ht_locks[idx & buf_hash_table.ht_lock_mask])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
*** 705,715 ****
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
! static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
--- 743,753 ----
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
! static inline uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
*** 842,853 ****
{
int i;
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
! for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_cache);
kmem_cache_destroy(buf_cache);
}
/*
--- 880,894 ----
{
int i;
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
!
! for (i = 0; i < buf_hash_table.ht_num_locks; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_free(buf_hash_table.ht_locks, sizeof (struct ht_lock) *
+ buf_hash_table.ht_num_locks);
kmem_cache_destroy(hdr_cache);
kmem_cache_destroy(buf_cache);
}
/*
*** 927,954 ****
static void
buf_init(void)
{
uint64_t *ct;
! uint64_t hsize = 1ULL << 12;
int i, j;
! /*
! * The hash table is big enough to fill all of physical memory
! * with an average 64K block size. The table will take up
! * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
! */
! while (hsize * 65536 < physmem * PAGESIZE)
! hsize <<= 1;
! retry:
! buf_hash_table.ht_mask = hsize - 1;
buf_hash_table.ht_table =
! kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
! if (buf_hash_table.ht_table == NULL) {
! ASSERT(hsize > (1ULL << 8));
! hsize >>= 1;
! goto retry;
}
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
--- 968,995 ----
static void
buf_init(void)
{
uint64_t *ct;
! uint64_t ht_masklen = 12;
int i, j;
! while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) <
! physmem * PAGESIZE)
! ht_masklen++;
! buf_hash_table.ht_mask = (1ULL << ht_masklen) - 1;
buf_hash_table.ht_table =
! kmem_zalloc((1ULL << ht_masklen) * sizeof (void *), KM_SLEEP);
!
! buf_hash_table.ht_num_locks = 1ULL << highbit64(MAX((physmem *
!     PAGESIZE) >> zfs_arc_ht_lock_shift, MIN_BUF_LOCKS) - 1);
! buf_hash_table.ht_lock_mask = buf_hash_table.ht_num_locks - 1;
! buf_hash_table.ht_locks = kmem_zalloc(sizeof (struct ht_lock) *
! buf_hash_table.ht_num_locks, KM_SLEEP);
! for (i = 0; i < buf_hash_table.ht_num_locks; i++) {
! mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
! NULL, MUTEX_DEFAULT, NULL);
}
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
*** 955,969 ****
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
-
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
}
#define ARC_MINTIME (hz>>4) /* 62 ms */
static void
--- 996,1005 ----