Print this page
New ARC buf_hash architecture

*** 70,79 **** --- 70,98 ---- * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* + * External users typically access ARC buffers via a hash table + * lookup, using the DVA, spa_t pointer value and the birth TXG + * number as the key. The hash value is derived by buf_hash(), + * which spits out a 64-bit hash index. This index is then masked + * with ht_mask to obtain the final index into the hash table: + * + * ,---------------- & ht_mask ----------------, + * 64-bit hash value | (hash table index) | + * |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| + * + * Sizing of the hash table is done at boot from the amount of + * physical memory. We start with a base value of 2^12 hash + * buckets and then evaluate whether this number, multiplied by + * 2^zfs_arc_ht_base_masklen (the minimum mask length), is + * greater than or equal to the amount of physical memory. If not, + * we double the number of hash buckets and repeat. Using the + * default settings these values translate to ~1 MB of hash tables + * for each 1 GB of physical memory. + * * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface
*** 82,92 **** * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most ! * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * --- 101,117 ---- * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most ! * fields in the arc_buf_hdr_t are protected by these mutexes). The ! * specific mutex is selected by taking its hash value and masking ! * it by ht_lock_mask, which then produces an index into the mutex ! * table. The size of the lock table is derived from the amount of ! * physical memory, which is simply divided by ! * 2^zfs_arc_ht_lock_shift, giving the number of locks, with a ! * minimum of MIN_BUF_LOCKS. * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. *
*** 200,209 **** --- 225,246 ---- int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; /* + * Used to calculate the size of ARC hash tables and number of hash locks. + * See big theory block comment at the start of this file. + */ + uint64_t zfs_arc_ht_base_masklen = 13; + /* + * We want to allocate one hash lock for every 4GB of memory with a minimum + * of MIN_BUF_LOCKS. + */ + uint64_t zfs_arc_ht_lock_shift = 32; + #define MIN_BUF_LOCKS 256 + + /* + * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached
*** 586,607 **** #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; - #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; ! struct ht_lock ht_locks[BUF_LOCKS]; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) ! #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; --- 623,645 ---- #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; ! struct ht_lock *ht_locks; ! uint64_t ht_num_locks, ht_lock_mask; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) ! #define BUF_HASH_LOCK_NTRY(idx) \ ! (buf_hash_table.ht_locks[idx & buf_hash_table.ht_lock_mask]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256];
*** 705,715 **** static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c); static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); ! static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; --- 743,753 ---- static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c); static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); ! static inline uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i;
*** 842,853 **** { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); ! for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); kmem_cache_destroy(buf_cache); } /* --- 880,894 ---- { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); ! ! for (i = 0; i < buf_hash_table.ht_num_locks; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_free(buf_hash_table.ht_locks, sizeof (struct ht_lock) * + buf_hash_table.ht_num_locks); kmem_cache_destroy(hdr_cache); kmem_cache_destroy(buf_cache); } /*
*** 927,954 **** static void buf_init(void) { uint64_t *ct; ! uint64_t hsize = 1ULL << 12; int i, j; ! /* ! * The hash table is big enough to fill all of physical memory ! * with an average 64K block size. The table will take up ! * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). ! */ ! while (hsize * 65536 < physmem * PAGESIZE) ! hsize <<= 1; ! retry: ! buf_hash_table.ht_mask = hsize - 1; buf_hash_table.ht_table = ! kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); ! if (buf_hash_table.ht_table == NULL) { ! ASSERT(hsize > (1ULL << 8)); ! hsize >>= 1; ! goto retry; } hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), --- 968,995 ---- static void buf_init(void) { uint64_t *ct; ! uint64_t ht_masklen = 12; int i, j; ! while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) < ! physmem * PAGESIZE) ! ht_masklen++; ! buf_hash_table.ht_mask = (1ULL << ht_masklen) - 1; buf_hash_table.ht_table = ! kmem_zalloc((1ULL << ht_masklen) * sizeof (void *), KM_SLEEP); ! ! buf_hash_table.ht_num_locks = MAX((physmem * PAGESIZE) >> ! zfs_arc_ht_lock_shift, MIN_BUF_LOCKS); ! buf_hash_table.ht_lock_mask = buf_hash_table.ht_num_locks - 1; ! buf_hash_table.ht_locks = kmem_zalloc(sizeof (struct ht_lock) * ! buf_hash_table.ht_num_locks, KM_SLEEP); ! for (i = 0; i < buf_hash_table.ht_num_locks; i++) { ! mutex_init(&buf_hash_table.ht_locks[i].ht_lock, ! NULL, MUTEX_DEFAULT, NULL); } hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
*** 955,969 **** 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } } #define ARC_MINTIME (hz>>4) /* 62 ms */ static void --- 996,1005 ----