New ARC buf_hash architecture

@@ -70,10 +70,29 @@
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
+ * External users typically access ARC buffers via a hash table
+ * lookup, using the DVA, spa_t pointer value and the birth TXG
+ * number as the key. The hash value is derived by buf_hash(),
+ * which spits out a 64-bit hash index. This index is then masked
+ * with ht_mask to obtain the final index into the hash table:
+ *
+ *                     ,---------------- & ht_mask ----------------,
+ * 64-bit hash value   |             (hash table index)             |
+ * |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX|
+ *
+ * Sizing of the hash table is done at boot from the amount of
+ * physical memory. We start with a base value of 2^12 hash
+ * buckets and then evaluate whether this number, multiplied by
+ * 2^zfs_arc_ht_base_masklen (the minimum mask length), is
+ * greater than or equal to the amount of physical memory. If not,
+ * we double the number of hash buckets and repeat. Using the
+ * default settings these values translate to ~1 MB of hash tables
+ * for each 1 GB of physical memory.
+ *
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface

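For illustration, the sizing rule spelled out in the new comment can be
restated as a short sketch. This is not part of the diff; it simply mirrors
the loop buf_init() runs further down, reusing the patch's
zfs_arc_ht_base_masklen tunable and the physmem and PAGESIZE globals the
patch itself references:

        uint64_t masklen = 12;          /* start from 2^12 hash buckets */

        /* Double the bucket count until buckets * 2^base_masklen covers physmem. */
        while ((1ULL << (masklen + zfs_arc_ht_base_masklen)) <
            physmem * PAGESIZE)
                masklen++;

        /*
         * With the default zfs_arc_ht_base_masklen of 13, a 1 GB (2^30 byte)
         * machine stops at masklen = 17: 2^17 buckets * 8 bytes per pointer
         * is the ~1 MB of hash table per 1 GB of memory quoted above.
         */
        uint64_t ht_mask = (1ULL << masklen) - 1;
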
@@ -82,11 +101,17 @@
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexes).
+ * fields in the arc_buf_hdr_t are protected by these mutexes). The
+ * specific mutex is selected by taking its hash value and masking
+ * it by ht_lock_mask, which then produces an index into the mutex
+ * table. The size of the lock table is derived from the amount of
+ * physical memory, which is simply divided by
+ * 2^zfs_arc_ht_lock_shift, giving the number of locks, with a
+ * minimum of MIN_BUF_LOCKS.
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *

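Similarly, the lock-table sizing described here boils down to the sketch
below (again not part of the diff; it mirrors the buf_init() changes and the
MAX() usage introduced later in the patch):

        /* One hash lock per 2^32 bytes (4 GB) of memory, never fewer than 256. */
        uint64_t num_locks = MAX((physmem * PAGESIZE) >> zfs_arc_ht_lock_shift,
            MIN_BUF_LOCKS);
        uint64_t ht_lock_mask = num_locks - 1;

        /*
         * Machines with 1 TB of memory or less get the MIN_BUF_LOCKS floor
         * of 256 locks; a 2 TB (2^41 byte) machine gets 2^41 >> 32 = 512.
         */
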
@@ -200,10 +225,22 @@
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 
 /*
+ * Used to calculate the size of ARC hash tables and number of hash locks.
+ * See big theory block comment at the start of this file.
+ */
+uint64_t zfs_arc_ht_base_masklen = 13;
+/*
+ * We want to allocate one hash lock for every 4GB of memory with a minimum
+ * of MIN_BUF_LOCKS.
+ */
+uint64_t zfs_arc_ht_lock_shift = 32;
+#define MIN_BUF_LOCKS   256
+
+/*
  * Note that buffers can be in one of 6 states:
  *      ARC_anon        - anonymous (discussed below)
  *      ARC_mru         - recently used, currently cached
  *      ARC_mru_ghost   - recentely used, no longer in cache
  *      ARC_mfu         - frequently used, currently cached

@@ -586,22 +623,23 @@
 #ifdef _KERNEL
         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 #endif
 };
 
-#define BUF_LOCKS 256
 typedef struct buf_hash_table {
         uint64_t ht_mask;
         arc_buf_hdr_t **ht_table;
-        struct ht_lock ht_locks[BUF_LOCKS];
+        struct ht_lock  *ht_locks;
+        uint64_t        ht_num_locks, ht_lock_mask;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 #define BUF_HASH_INDEX(spa, dva, birth) \
         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK_NTRY(idx) \
+        (buf_hash_table.ht_locks[idx & buf_hash_table.ht_lock_mask])
 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 #define HDR_LOCK(hdr) \
         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];

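To see how the reworked macros are consumed, the existing lookup path in
buf_hash_find() goes roughly as sketched below; only the lock selection
changes with this patch (the sketch assumes the BUF_EQUAL() macro and
b_hash_next chaining already present in arc.c):

        uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
        kmutex_t *hash_lock = BUF_HASH_LOCK(idx);  /* picked via ht_lock_mask */
        arc_buf_hdr_t *hdr;

        mutex_enter(hash_lock);
        for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
            hdr = hdr->b_hash_next) {
                if (BUF_EQUAL(spa, dva, birth, hdr))
                        return (hdr);   /* hit: return with hash_lock held */
        }
        mutex_exit(hash_lock);          /* miss: caller sees a NULL lock */
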
@@ -705,11 +743,11 @@
 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
     enum zio_compress c);
 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 
-static uint64_t
+static inline uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
         uint8_t *vdva = (uint8_t *)dva;
         uint64_t crc = -1ULL;
         int i;

@@ -842,12 +880,15 @@
 {
         int i;
 
         kmem_free(buf_hash_table.ht_table,
             (buf_hash_table.ht_mask + 1) * sizeof (void *));
-        for (i = 0; i < BUF_LOCKS; i++)
+
+        for (i = 0; i < buf_hash_table.ht_num_locks; i++)
                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+        kmem_free(buf_hash_table.ht_locks, sizeof (struct ht_lock) *
+            buf_hash_table.ht_num_locks);
         kmem_cache_destroy(hdr_cache);
         kmem_cache_destroy(buf_cache);
 }
 
 /*

@@ -927,28 +968,28 @@
 
 static void
 buf_init(void)
 {
         uint64_t *ct;
-        uint64_t hsize = 1ULL << 12;
+        uint64_t        ht_masklen = 12;
         int i, j;
 
-        /*
-         * The hash table is big enough to fill all of physical memory
-         * with an average 64K block size.  The table will take up
-         * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
-         */
-        while (hsize * 65536 < physmem * PAGESIZE)
-                hsize <<= 1;
-retry:
-        buf_hash_table.ht_mask = hsize - 1;
+        while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) <
+            physmem * PAGESIZE)
+                ht_masklen++;
+        buf_hash_table.ht_mask = (1ULL << ht_masklen) - 1;
         buf_hash_table.ht_table =
-            kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
-        if (buf_hash_table.ht_table == NULL) {
-                ASSERT(hsize > (1ULL << 8));
-                hsize >>= 1;
-                goto retry;
+            kmem_zalloc((1ULL << ht_masklen) * sizeof (void *), KM_SLEEP);
+
+        buf_hash_table.ht_num_locks = MAX((physmem * PAGESIZE) >>
+            zfs_arc_ht_lock_shift, MIN_BUF_LOCKS);
+        buf_hash_table.ht_lock_mask = buf_hash_table.ht_num_locks - 1;
+        buf_hash_table.ht_locks = kmem_zalloc(sizeof (struct ht_lock) *
+            buf_hash_table.ht_num_locks, KM_SLEEP);
+        for (i = 0; i < buf_hash_table.ht_num_locks; i++) {
+                mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+                    NULL, MUTEX_DEFAULT, NULL);
         }
 
         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),

@@ -955,15 +996,10 @@
             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
         for (i = 0; i < 256; i++)
                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
-
-        for (i = 0; i < BUF_LOCKS; i++) {
-                mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
-                    NULL, MUTEX_DEFAULT, NULL);
-        }
 }
 
 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 
 static void