New ARC buf_hash architecture
@@ -70,10 +70,29 @@
* See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
* by N. Megiddo & D. Modha, FAST 2003
*/
/*
+ * External users typically access ARC buffers via a hash table
+ * lookup, using the DVA, spa_t pointer value and the birth TXG
+ * number as the key. The hash value is derived by buf_hash(),
+ * which spits out a 64-bit hash value. This value is then masked
+ * with ht_mask to obtain the final index into the hash table:
+ *
+ *                           ,-------------- & ht_mask --------------,
+ * 64-bit hash value         |          (hash table index)           |
+ * |XXXXXXXXXXXXXXXXXXXXXXXXX|XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX|
+ *
+ * Sizing of the hash table is done at boot from the amount of
+ * physical memory. We start with a base value of 2^12 hash
+ * buckets and then evaluate whether this number, multiplied by
+ * 2^zfs_arc_ht_base_masklen (the minimum mask length), is
+ * greater than or equal to the amount of physical memory. If not,
+ * we double the number of hash buckets and repeat. Using the
+ * default settings these values translate to ~1 MB of hash tables
+ * for each 1 GB of physical memory.
+ *
* The locking model:
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
@@ -82,11 +101,17 @@
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexes).
+ * fields in the arc_buf_hdr_t are protected by these mutexes). The
+ * specific mutex for a buffer is selected by taking the buffer's
+ * hash value and masking it with ht_lock_mask, which yields an
+ * index into the mutex table. The size of the lock table is also
+ * derived from the amount of physical memory, which is simply
+ * divided by 2^zfs_arc_ht_lock_shift to give the number of locks,
+ * subject to a minimum of MIN_BUF_LOCKS.
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
* NULL for the mutex if the buffer was not in the table.
*
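
To make the sizing rule in the comment above concrete, here is a minimal userland sketch of the computation buf_init() performs further down in this patch. The 1 GB memory figure and the sample hash value are illustrative assumptions; names otherwise follow the patch.

#include <stdint.h>
#include <stdio.h>

static uint64_t zfs_arc_ht_base_masklen = 13;	/* default from this patch */

int
main(void)
{
	uint64_t physmem_bytes = 1ULL << 30;	/* assume 1 GB of RAM */
	uint64_t ht_masklen = 12;		/* base of 2^12 hash buckets */
	uint64_t ht_mask, hv, idx;

	/* Double the bucket count until 2^(masklen + base) covers RAM. */
	while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) <
	    physmem_bytes)
		ht_masklen++;
	ht_mask = (1ULL << ht_masklen) - 1;

	/* 1 GB -> ht_masklen = 17: 2^17 buckets * 8 bytes = 1 MB of table. */
	printf("buckets = %llu, table = %llu bytes\n",
	    (unsigned long long)(ht_mask + 1),
	    (unsigned long long)((ht_mask + 1) * sizeof (void *)));

	/* A 64-bit buf_hash() result becomes a bucket index by masking. */
	hv = 0x9e3779b97f4a7c15ULL;	/* stand-in for a buf_hash() value */
	idx = hv & ht_mask;
	printf("index = %llu\n", (unsigned long long)idx);
	return (0);
}
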
@@ -200,10 +225,22 @@
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
/*
+ * Used to calculate the size of ARC hash tables and number of hash locks.
+ * See big theory block comment at the start of this file.
+ */
+uint64_t zfs_arc_ht_base_masklen = 13;
+/*
+ * We want to allocate one hash lock for every 4GB of memory with a minimum
+ * of MIN_BUF_LOCKS.
+ */
+uint64_t zfs_arc_ht_lock_shift = 32;
+#define MIN_BUF_LOCKS 256
+
+/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recently used, no longer in cache
* ARC_mfu - frequently used, currently cached
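
With the defaults introduced above, the lock count sits at the MIN_BUF_LOCKS floor until physical memory exceeds 1 TB. A quick worked example (memory sizes are illustrative):

	locks = MAX(physmem >> zfs_arc_ht_lock_shift, MIN_BUF_LOCKS)

	 16 GB: MAX(2^34 >> 32, 256) = MAX(4, 256)    =  256 locks
	  1 TB: MAX(2^40 >> 32, 256) = MAX(256, 256)  =  256 locks
	  4 TB: MAX(2^42 >> 32, 256) = MAX(1024, 256) = 1024 locks
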
@@ -586,22 +623,23 @@
#ifdef _KERNEL
unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};
-#define BUF_LOCKS 256
typedef struct buf_hash_table {
uint64_t ht_mask;
arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS];
+ struct ht_lock *ht_locks;
+ uint64_t ht_num_locks, ht_lock_mask;
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
#define BUF_HASH_INDEX(spa, dva, birth) \
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK_NTRY(idx) \
+ (buf_hash_table.ht_locks[idx & buf_hash_table.ht_lock_mask])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
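
One buf_hash() value now drives both tables: masked with ht_mask it picks the hash bucket, masked with ht_lock_mask it picks the protecting mutex. A short kernel-style sketch of how the macros above compose (field names are per the existing arc.c; this is an illustration, not code from the patch):

	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);	/* idx & ht_lock_mask */
	arc_buf_hdr_t *ab;

	mutex_enter(hash_lock);
	/* Walk the collision chain rooted at ht_table[idx] under the lock. */
	for (ab = buf_hash_table.ht_table[idx]; ab != NULL;
	    ab = ab->b_hash_next) {
		/* compare ab's spa, dva and birth TXG against the search key */
	}
	mutex_exit(hash_lock);
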
@@ -705,11 +743,11 @@
static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
-static uint64_t
+static inline uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
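
The hunk ends before the function body. For orientation, here is a sketch of the kind of CRC64 fold buf_hash() performs over the DVA bytes, mixing in spa and birth at the end (an illustrative reconstruction from the declarations above, not a quote of the patch):

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
	crc ^= (spa >> 8) ^ birth;
	return (crc);
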
@@ -842,12 +880,15 @@
{
int i;
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
- for (i = 0; i < BUF_LOCKS; i++)
+
+ for (i = 0; i < buf_hash_table.ht_num_locks; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_free(buf_hash_table.ht_locks, sizeof (struct ht_lock) *
+ buf_hash_table.ht_num_locks);
kmem_cache_destroy(hdr_cache);
kmem_cache_destroy(buf_cache);
}
/*
@@ -927,28 +968,28 @@
static void
buf_init(void)
{
uint64_t *ct;
- uint64_t hsize = 1ULL << 12;
+ uint64_t ht_masklen = 12;
int i, j;
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 64K block size. The table will take up
- * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
- */
- while (hsize * 65536 < physmem * PAGESIZE)
- hsize <<= 1;
-retry:
- buf_hash_table.ht_mask = hsize - 1;
+ while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) <
+ physmem * PAGESIZE)
+ ht_masklen++;
+ buf_hash_table.ht_mask = (1ULL << ht_masklen) - 1;
buf_hash_table.ht_table =
- kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
- if (buf_hash_table.ht_table == NULL) {
- ASSERT(hsize > (1ULL << 8));
- hsize >>= 1;
- goto retry;
+ kmem_zalloc((1ULL << ht_masklen) * sizeof (void *), KM_SLEEP);
+
+ buf_hash_table.ht_num_locks = MAX((physmem * PAGESIZE) >>
+ zfs_arc_ht_lock_shift, MIN_BUF_LOCKS);
+ buf_hash_table.ht_lock_mask = buf_hash_table.ht_num_locks - 1;
+ buf_hash_table.ht_locks = kmem_zalloc(sizeof (struct ht_lock) *
+ buf_hash_table.ht_num_locks, KM_SLEEP);
+ for (i = 0; i < buf_hash_table.ht_num_locks; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
}
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
@@ -955,15 +996,10 @@
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
-
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
}
#define ARC_MINTIME (hz>>4) /* 62 ms */
static void