New ARC buf_hash architecture

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
↓ open down ↓ 64 lines elided ↑ open up ↑
  65   65   * have variable sized cache blocks (ranging from 512 bytes to
  66   66   * 128K bytes).  We therefore choose a set of blocks to evict to make
  67   67   * space for a cache miss that approximates as closely as possible
  68   68   * the space used by the new block.
  69   69   *
  70   70   * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71   71   * by N. Megiddo & D. Modha, FAST 2003
  72   72   */
  73   73  
  74   74  /*
       75 + * External users typically access ARC buffers via a hash table
       76 + * lookup, using the DVA, spa_t pointer value and the birth TXG
       77 + * number as the key. The hash value is derived by buf_hash(),
        78 + * which produces a 64-bit hash value. This value is then masked
       79 + * with ht_mask to obtain the final index into the hash table:
       80 + *
       81 + *                     ,---------------- & ht_mask ----------------,
       82 + * 64-bit hash value   |             (hash table index)             |
       83 + * |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX|
       84 + *
       85 + * Sizing of the hash table is done at boot from the amount of
       86 + * physical memory. We start with a base value of 2^12 hash
       87 + * buckets and then evaluate whether this number, multiplied by
        88 + * 2^zfs_arc_ht_base_masklen (the memory each bucket covers), is
       89 + * greater than or equal to the amount of physical memory. If not,
       90 + * we double the number of hash buckets and repeat. Using the
       91 + * default settings these values translate to ~1 MB of hash tables
       92 + * for each 1 GB of physical memory.
       93 + *
  75   94   * The locking model:
  76   95   *
  77   96   * A new reference to a cache buffer can be obtained in two
  78   97   * ways: 1) via a hash table lookup using the DVA as a key,
  79   98   * or 2) via one of the ARC lists.  The arc_read() interface
  80   99   * uses method 1, while the internal arc algorithms for
  81  100   * adjusting the cache use method 2.  We therefore provide two
  82  101   * types of locks: 1) the hash table lock array, and 2) the
  83  102   * arc list locks.
  84  103   *
  85  104   * Buffers do not have their own mutexes, rather they rely on the
  86  105   * hash table mutexes for the bulk of their protection (i.e. most
  87      - * fields in the arc_buf_hdr_t are protected by these mutexes).
      106 + * fields in the arc_buf_hdr_t are protected by these mutexes). The
      107 + * specific mutex is selected by taking its hash value and masking
       108 + * it with ht_lock_mask, which then produces an index into the mutex
      109 + * table. The size of the lock table is derived from the amount of
      110 + * physical memory, which is simply divided by
      111 + * 2^zfs_arc_ht_lock_shift, giving the number of locks, with a
      112 + * minimum of MIN_BUF_LOCKS.
  88  113   *
  89  114   * buf_hash_find() returns the appropriate mutex (held) when it
  90  115   * locates the requested buffer in the hash table.  It returns
  91  116   * NULL for the mutex if the buffer was not in the table.
  92  117   *
  93  118   * buf_hash_remove() expects the appropriate hash mutex to be
  94  119   * already held before it is invoked.
  95  120   *
  96  121   * Each arc state also has a mutex which is used to protect the
  97  122   * buffer list associated with the state.  When attempting to
↓ open down ↓ 97 lines elided ↑ open up ↑
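A rough sketch of the indexing scheme described above, as a stand-alone user-space illustration (the function names and separate mask arguments are placeholders, not part of the patch, which uses the ZFS CRC64 hash and the masks stored in buf_hash_table):

    #include <stdint.h>

    /*
     * Illustrative only: reduce a 64-bit hash value to a hash table
     * index and to the index of the mutex protecting that bucket.
     * ht_mask is (number of buckets - 1) and ht_lock_mask is
     * (number of locks - 1).
     */
    static uint64_t
    example_bucket_index(uint64_t hash, uint64_t ht_mask)
    {
            return (hash & ht_mask);        /* index into ht_table[] */
    }

    static uint64_t
    example_lock_index(uint64_t hash, uint64_t ht_mask, uint64_t ht_lock_mask)
    {
            /* the bucket's protecting mutex lives in ht_locks[] */
            return ((hash & ht_mask) & ht_lock_mask);
    }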
 195  220   */
 196  221  uint64_t zfs_arc_max;
 197  222  uint64_t zfs_arc_min;
 198  223  uint64_t zfs_arc_meta_limit = 0;
 199  224  int zfs_arc_grow_retry = 0;
 200  225  int zfs_arc_shrink_shift = 0;
 201  226  int zfs_arc_p_min_shift = 0;
 202  227  int zfs_disable_dup_eviction = 0;
 203  228  
 204  229  /*
      230 + * Used to calculate the size of ARC hash tables and number of hash locks.
      231 + * See big theory block comment at the start of this file.
      232 + */
      233 +uint64_t zfs_arc_ht_base_masklen = 13;
      234 +/*
      235 + * We want to allocate one hash lock for every 4GB of memory with a minimum
      236 + * of MIN_BUF_LOCKS.
      237 + */
      238 +uint64_t zfs_arc_ht_lock_shift = 32;
      239 +#define MIN_BUF_LOCKS   256
      240 +
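To make these defaults concrete, a worked example assuming a machine with 8 GB (2^33 bytes) of physical memory; the numbers follow from the sizing logic in buf_init() below and are not stated in the patch itself:

    sizing loop:  smallest ht_masklen with 2^(ht_masklen + 13) >= 2^33
                  => ht_masklen = 20, i.e. 2^20 hash buckets
    table size:   2^20 buckets * 8 bytes per pointer = 8 MB (~1 MB per GB)
    hash locks:   MAX(2^33 >> 32, MIN_BUF_LOCKS) = MAX(2, 256) = 256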
      241 +/*
 205  242   * Note that buffers can be in one of 6 states:
 206  243   *      ARC_anon        - anonymous (discussed below)
 207  244   *      ARC_mru         - recently used, currently cached
  208  245   *      ARC_mru_ghost   - recently used, no longer in cache
 209  246   *      ARC_mfu         - frequently used, currently cached
 210  247   *      ARC_mfu_ghost   - frequently used, no longer in cache
 211  248   *      ARC_l2c_only    - exists in L2ARC but not other states
 212  249   * When there are no active references to the buffer, they are
  213  250   * linked onto a list in one of these arc states.  These are
 214  251   * the only buffers that can be evicted or deleted.  Within each
↓ open down ↓ 366 lines elided ↑ open up ↑
 581  618  
 582  619  #define HT_LOCK_PAD     64
 583  620  
 584  621  struct ht_lock {
 585  622          kmutex_t        ht_lock;
 586  623  #ifdef _KERNEL
 587  624          unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 588  625  #endif
 589  626  };
 590  627  
 591      -#define BUF_LOCKS 256
 592  628  typedef struct buf_hash_table {
 593      -        uint64_t ht_mask;
 594      -        arc_buf_hdr_t **ht_table;
 595      -        struct ht_lock ht_locks[BUF_LOCKS];
      629 +        uint64_t        ht_mask;
      630 +        arc_buf_hdr_t   **ht_table;
      631 +        struct ht_lock  *ht_locks;
      632 +        uint64_t        ht_num_locks, ht_lock_mask;
 596  633  } buf_hash_table_t;
 597  634  
 598  635  static buf_hash_table_t buf_hash_table;
 599  636  
 600  637  #define BUF_HASH_INDEX(spa, dva, birth) \
 601  638          (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 602      -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
      639 +#define BUF_HASH_LOCK_NTRY(idx) \
      640 +        (buf_hash_table.ht_locks[idx & buf_hash_table.ht_lock_mask])
 603  641  #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 604  642  #define HDR_LOCK(hdr) \
 605  643          (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 606  644  
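A sketch of how these macros are used on the lookup path (heavily simplified; the real buf_hash_find() in this file performs a full DVA/spa/birth comparison and is what callers such as arc_read() actually use, so the equality test below is only a placeholder):

    static arc_buf_hdr_t *
    example_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth,
        kmutex_t **lockp)
    {
            uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
            kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
            arc_buf_hdr_t *buf;

            mutex_enter(hash_lock);
            for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
                buf = buf->b_hash_next) {
                    if (buf->b_birth == birth /* ...and DVA and spa match */) {
                            *lockp = hash_lock;     /* returned held */
                            return (buf);
                    }
            }
            mutex_exit(hash_lock);
            *lockp = NULL;
            return (NULL);
    }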
 607  645  uint64_t zfs_crc64_table[256];
 608  646  
 609  647  /*
 610  648   * Level 2 ARC
 611  649   */
 612  650  
↓ open down ↓ 87 lines elided ↑ open up ↑
 700  738  
 701  739  static void l2arc_read_done(zio_t *zio);
 702  740  static void l2arc_hdr_stat_add(void);
 703  741  static void l2arc_hdr_stat_remove(void);
 704  742  
 705  743  static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 706  744  static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 707  745      enum zio_compress c);
 708  746  static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 709  747  
 710      -static uint64_t
      748 +static inline uint64_t
 711  749  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 712  750  {
 713  751          uint8_t *vdva = (uint8_t *)dva;
 714  752          uint64_t crc = -1ULL;
 715  753          int i;
 716  754  
 717  755          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 718  756  
 719  757          for (i = 0; i < sizeof (dva_t); i++)
 720  758                  crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
↓ open down ↓ 116 lines elided ↑ open up ↑
 837  875  static kmem_cache_t *hdr_cache;
 838  876  static kmem_cache_t *buf_cache;
 839  877  
 840  878  static void
 841  879  buf_fini(void)
 842  880  {
 843  881          int i;
 844  882  
 845  883          kmem_free(buf_hash_table.ht_table,
 846  884              (buf_hash_table.ht_mask + 1) * sizeof (void *));
 847      -        for (i = 0; i < BUF_LOCKS; i++)
      885 +
      886 +        for (i = 0; i < buf_hash_table.ht_num_locks; i++)
 848  887                  mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
      888 +        kmem_free(buf_hash_table.ht_locks, sizeof (struct ht_lock) *
      889 +            buf_hash_table.ht_num_locks);
 849  890          kmem_cache_destroy(hdr_cache);
 850  891          kmem_cache_destroy(buf_cache);
 851  892  }
 852  893  
 853  894  /*
 854  895   * Constructor callback - called when the cache is empty
 855  896   * and a new buf is requested.
 856  897   */
 857  898  /* ARGSUSED */
 858  899  static int
↓ open down ↓ 62 lines elided ↑ open up ↑
 921  962           * umem calls the reclaim func when we destroy the buf cache,
 922  963           * which is after we do arc_fini().
 923  964           */
 924  965          if (!arc_dead)
 925  966                  cv_signal(&arc_reclaim_thr_cv);
 926  967  }
 927  968  
 928  969  static void
 929  970  buf_init(void)
 930  971  {
 931      -        uint64_t *ct;
 932      -        uint64_t hsize = 1ULL << 12;
 933      -        int i, j;
      972 +        uint64_t        *ct;
      973 +        uint64_t        ht_masklen = 12;
      974 +        int             i, j;
 934  975  
 935      -        /*
 936      -         * The hash table is big enough to fill all of physical memory
 937      -         * with an average 64K block size.  The table will take up
 938      -         * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 939      -         */
 940      -        while (hsize * 65536 < physmem * PAGESIZE)
 941      -                hsize <<= 1;
 942      -retry:
 943      -        buf_hash_table.ht_mask = hsize - 1;
      976 +        while ((1ULL << (ht_masklen + zfs_arc_ht_base_masklen)) <
      977 +            physmem * PAGESIZE)
      978 +                ht_masklen++;
      979 +        buf_hash_table.ht_mask = (1ULL << ht_masklen) - 1;
 944  980          buf_hash_table.ht_table =
 945      -            kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 946      -        if (buf_hash_table.ht_table == NULL) {
 947      -                ASSERT(hsize > (1ULL << 8));
 948      -                hsize >>= 1;
 949      -                goto retry;
      981 +            kmem_zalloc((1ULL << ht_masklen) * sizeof (void *), KM_SLEEP);
      982 +
      983 +        buf_hash_table.ht_num_locks = MAX((physmem * PAGESIZE) >>
      984 +            zfs_arc_ht_lock_shift, MIN_BUF_LOCKS);
      985 +        buf_hash_table.ht_lock_mask = buf_hash_table.ht_num_locks - 1;
      986 +        buf_hash_table.ht_locks = kmem_zalloc(sizeof (struct ht_lock) *
      987 +            buf_hash_table.ht_num_locks, KM_SLEEP);
      988 +        for (i = 0; i < buf_hash_table.ht_num_locks; i++) {
      989 +                mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
      990 +                    NULL, MUTEX_DEFAULT, NULL);
 950  991          }
 951  992  
 952  993          hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 953  994              0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 954  995          buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 955  996              0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 956  997  
 957  998          for (i = 0; i < 256; i++)
 958  999                  for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 959 1000                          *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 960      -
 961      -        for (i = 0; i < BUF_LOCKS; i++) {
 962      -                mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 963      -                    NULL, MUTEX_DEFAULT, NULL);
 964      -        }
 965 1001  }
 966 1002  
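For a quick sanity check of the sizing behaviour outside the kernel, a minimal stand-alone sketch that mirrors the arithmetic in buf_init() above (the hard-coded defaults and the helper name are assumptions for illustration only):

    #include <stdio.h>
    #include <stdint.h>

    #define MIN_BUF_LOCKS   256

    /* Mirror of the buf_init() sizing logic, using the default tunables. */
    static void
    print_arc_ht_sizes(uint64_t physmem_bytes)
    {
            uint64_t base_masklen = 13;     /* zfs_arc_ht_base_masklen */
            uint64_t lock_shift = 32;       /* zfs_arc_ht_lock_shift */
            uint64_t ht_masklen = 12;
            uint64_t nlocks;

            while ((1ULL << (ht_masklen + base_masklen)) < physmem_bytes)
                    ht_masklen++;

            nlocks = physmem_bytes >> lock_shift;
            if (nlocks < MIN_BUF_LOCKS)
                    nlocks = MIN_BUF_LOCKS;

            printf("%llu GB: %llu buckets (%llu KB of table), %llu locks\n",
                (unsigned long long)(physmem_bytes >> 30),
                (unsigned long long)(1ULL << ht_masklen),
                (unsigned long long)(((1ULL << ht_masklen) *
                sizeof (void *)) >> 10),
                (unsigned long long)nlocks);
    }

    int
    main(void)
    {
            print_arc_ht_sizes(1ULL << 30);         /* 1 GB */
            print_arc_ht_sizes(1ULL << 33);         /* 8 GB */
            print_arc_ht_sizes(1ULL << 37);         /* 128 GB */
            return (0);
    }

With the defaults this prints roughly 1 MB of hash table per GB of memory and shows that the MIN_BUF_LOCKS floor of 256 locks dominates until physical memory exceeds 1 TB.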
 967 1003  #define ARC_MINTIME     (hz>>4) /* 62 ms */
 968 1004  
 969 1005  static void
 970 1006  arc_cksum_verify(arc_buf_t *buf)
 971 1007  {
 972 1008          zio_cksum_t zc;
 973 1009  
 974 1010          if (!(zfs_flags & ZFS_DEBUG_MODIFY))
↓ open down ↓ 4225 lines elided ↑ open up ↑