3742 zfs comments need cleaner, more consistent style
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>
Reviewed by:    George Wilson <george.wilson@delphix.com>
Reviewed by:    Eric Schrock <eric.schrock@delphix.com>


  40  * subset of the blocks in the cache are un-evictable because we
  41  * have handed out a reference to them.  Blocks are only evictable
  42  * when there are no external references active.  This makes
  43  * eviction far more problematic:  we choose to evict the evictable
  44  * blocks that are the "lowest" in the list.
  45  *
  46  * There are times when it is not possible to evict the requested
  47  * space.  In these circumstances we are unable to adjust the cache
  48  * size.  To prevent the cache growing unbounded at these times we
  49  * implement a "cache throttle" that slows the flow of new data
  50  * into the cache until we can make space available.
  51  *
  52  * 2. The Megiddo and Modha model assumes a fixed cache size.
  53  * Pages are evicted when the cache is full and there is a cache
  54  * miss.  Our model has a variable sized cache.  It grows with
  55  * high use, but also tries to react to memory pressure from the
  56  * operating system: decreasing its size when system memory is
  57  * tight.
  58  *
  59  * 3. The Megiddo and Modha model assumes a fixed page size. All
  60  * elements of the cache are therefor exactly the same size.  So
  61  * when adjusting the cache size following a cache miss, its simply
  62  * a matter of choosing a single page to evict.  In our model, we
  63  * have variable sized cache blocks (rangeing from 512 bytes to
  64  * 128K bytes).  We therefor choose a set of blocks to evict to make
  65  * space for a cache miss that approximates as closely as possible
  66  * the space used by the new block.
  67  *
  68  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69  * by N. Megiddo & D. Modha, FAST 2003
  70  */
  71 
  72 /*
  73  * The locking model:
  74  *
  75  * A new reference to a cache buffer can be obtained in two
  76  * ways: 1) via a hash table lookup using the DVA as a key,
  77  * or 2) via one of the ARC lists.  The arc_read() interface
  78  * uses method 1, while the internal arc algorithms for
  79  * adjusting the cache use method 2.  We therefor provide two
  80  * types of locks: 1) the hash table lock array, and 2) the
  81  * arc list locks.
  82  *
  83  * Buffers do not have their own mutexes, rather they rely on the
  84  * hash table mutexes for the bulk of their protection (i.e. most
  85  * fields in the arc_buf_hdr_t are protected by these mutexes).
  86  *
  87  * buf_hash_find() returns the appropriate mutex (held) when it
  88  * locates the requested buffer in the hash table.  It returns
  89  * NULL for the mutex if the buffer was not in the table.
  90  *
  91  * buf_hash_remove() expects the appropriate hash mutex to be
  92  * already held before it is invoked.
  93  *
  94  * Each arc state also has a mutex which is used to protect the
  95  * buffer list associated with the state.  When attempting to
  96  * obtain a hash table lock while holding an arc list lock you
  97  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  98  * the active state mutex must be held before the ghost state mutex.
  99  *


 358         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 359         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 360         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 361         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 362         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 363         { "l2_io_error",                KSTAT_DATA_UINT64 },
 364         { "l2_size",                    KSTAT_DATA_UINT64 },
 365         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 366         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 367         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 368         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 369         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 370         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 371         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 372         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 373 };
 374 
 375 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 376 
 377 #define ARCSTAT_INCR(stat, val) \
 378         atomic_add_64(&arc_stats.stat.value.ui64, (val));
 379 
 380 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 381 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 382 
 383 #define ARCSTAT_MAX(stat, val) {                                        \
 384         uint64_t m;                                                     \
 385         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 386             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 387                 continue;                                               \
 388 }
 389 
 390 #define ARCSTAT_MAXSTAT(stat) \
 391         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 392 
 393 /*
 394  * We define a macro to allow ARC hits/misses to be easily broken down by
 395  * two separate conditions, giving a total of four different subtypes for
 396  * each of hits and misses (so eight statistics total).
 397  */
 398 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \


 578         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 579 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 580 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 581 #define HDR_LOCK(hdr) \
 582         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 583 
 584 uint64_t zfs_crc64_table[256];
 585 
 586 /*
 587  * Level 2 ARC
 588  */
 589 
 590 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 591 #define L2ARC_HEADROOM          2               /* num of writes */
 592 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 593 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 594 
 595 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 596 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 597 
 598 /*
 599  * L2ARC Performance Tunables
 600  */
 601 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 602 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 603 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 604 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 605 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 606 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 607 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 608 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 609 
 610 /*
 611  * L2ARC Internals
 612  */
 613 typedef struct l2arc_dev {
 614         vdev_t                  *l2ad_vdev;     /* vdev */
 615         spa_t                   *l2ad_spa;      /* spa */
 616         uint64_t                l2ad_hand;      /* next write location */
 617         uint64_t                l2ad_write;     /* desired write size, bytes */
 618         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 619         uint64_t                l2ad_start;     /* first addr on device */
 620         uint64_t                l2ad_end;       /* last addr on device */
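
A sketch of the arc_read()-style lookup that the hash macros above support: hash (spa, dva, birth) to an index, take the lock covering that index from the BUF_LOCKS-sized array, and walk the chain, returning the mutex still held on a hit, as buf_hash_find() is described to do. The ht_table field, the b_hash_next link, and the equality test are assumptions filled in for the sketch; only the macros and header fields appear in the excerpt.

    static arc_buf_hdr_t *
    buf_hash_find_sketch(uint64_t spa, const dva_t *dva, uint64_t birth,
        kmutex_t **lockp)
    {
            uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
            kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
            arc_buf_hdr_t *hdr;

            mutex_enter(hash_lock);
            for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
                hdr = hdr->b_hash_next) {
                    if (hdr->b_spa == spa && hdr->b_birth == birth &&
                        DVA_EQUAL(&hdr->b_dva, dva)) {
                            *lockp = hash_lock;     /* returned held */
                            return (hdr);
                    }
            }
            mutex_exit(hash_lock);
            *lockp = NULL;
            return (NULL);
    }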


3528          */
3529         if (spa_get_random(10000) == 0) {
3530                 dprintf("forcing random failure\n");
3531                 return (SET_ERROR(ERESTART));
3532         }
3533 #endif
3534         if (reserve > arc_c/4 && !arc_no_grow)
3535                 arc_c = MIN(arc_c_max, reserve * 4);
3536         if (reserve > arc_c)
3537                 return (SET_ERROR(ENOMEM));
3538 
3539         /*
3540          * Don't count loaned bufs as in flight dirty data to prevent long
3541          * network delays from blocking transactions that are ready to be
3542          * assigned to a txg.
3543          */
3544         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3545 
3546         /*
3547          * Writes will, almost always, require additional memory allocations
3548          * in order to compress/encrypt/etc the data.  We therefor need to
3549          * make sure that there is sufficient available memory for this.
3550          */
3551         if (error = arc_memory_throttle(reserve, anon_size, txg))
3552                 return (error);
3553 
3554         /*
3555          * Throttle writes when the amount of dirty data in the cache
3556          * gets too large.  We try to keep the cache less than half full
3557          * of dirty blocks so that our sync times don't grow too large.
3558          * Note: if two requests come in concurrently, we might let them
3559          * both succeed, when one of them should fail.  Not a huge deal.
3560          */
3561 
3562         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3563             anon_size > arc_c / 4) {
3564                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3565                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3566                     arc_tempreserve>>10,
3567                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3568                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,




  40  * subset of the blocks in the cache are un-evictable because we
  41  * have handed out a reference to them.  Blocks are only evictable
  42  * when there are no external references active.  This makes
  43  * eviction far more problematic:  we choose to evict the evictable
  44  * blocks that are the "lowest" in the list.
  45  *
  46  * There are times when it is not possible to evict the requested
  47  * space.  In these circumstances we are unable to adjust the cache
  48  * size.  To prevent the cache growing unbounded at these times we
  49  * implement a "cache throttle" that slows the flow of new data
  50  * into the cache until we can make space available.
  51  *
  52  * 2. The Megiddo and Modha model assumes a fixed cache size.
  53  * Pages are evicted when the cache is full and there is a cache
  54  * miss.  Our model has a variable sized cache.  It grows with
  55  * high use, but also tries to react to memory pressure from the
  56  * operating system: decreasing its size when system memory is
  57  * tight.
  58  *
  59  * 3. The Megiddo and Modha model assumes a fixed page size. All
  60  * elements of the cache are therefore exactly the same size.  So
  61  * when adjusting the cache size following a cache miss, its simply
  62  * a matter of choosing a single page to evict.  In our model, we
  63  * have variable sized cache blocks (rangeing from 512 bytes to
  64  * 128K bytes).  We therefore choose a set of blocks to evict to make
  65  * space for a cache miss that approximates as closely as possible
  66  * the space used by the new block.
  67  *
  68  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69  * by N. Megiddo & D. Modha, FAST 2003
  70  */
  71 
  72 /*
  73  * The locking model:
  74  *
  75  * A new reference to a cache buffer can be obtained in two
  76  * ways: 1) via a hash table lookup using the DVA as a key,
  77  * or 2) via one of the ARC lists.  The arc_read() interface
  78  * uses method 1, while the internal arc algorithms for
  79  * adjusting the cache use method 2.  We therefore provide two
  80  * types of locks: 1) the hash table lock array, and 2) the
  81  * arc list locks.
  82  *
  83  * Buffers do not have their own mutexes, rather they rely on the
  84  * hash table mutexes for the bulk of their protection (i.e. most
  85  * fields in the arc_buf_hdr_t are protected by these mutexes).
  86  *
  87  * buf_hash_find() returns the appropriate mutex (held) when it
  88  * locates the requested buffer in the hash table.  It returns
  89  * NULL for the mutex if the buffer was not in the table.
  90  *
  91  * buf_hash_remove() expects the appropriate hash mutex to be
  92  * already held before it is invoked.
  93  *
  94  * Each arc state also has a mutex which is used to protect the
  95  * buffer list associated with the state.  When attempting to
  96  * obtain a hash table lock while holding an arc list lock you
  97  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  98  * the active state mutex must be held before the ghost state mutex.
  99  *


 358         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 359         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 360         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 361         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 362         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 363         { "l2_io_error",                KSTAT_DATA_UINT64 },
 364         { "l2_size",                    KSTAT_DATA_UINT64 },
 365         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 366         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 367         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 368         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 369         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 370         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 371         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 372         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 373 };
 374 
 375 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 376 
 377 #define ARCSTAT_INCR(stat, val) \
 378         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 379 
 380 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 381 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 382 
 383 #define ARCSTAT_MAX(stat, val) {                                        \
 384         uint64_t m;                                                     \
 385         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 386             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 387                 continue;                                               \
 388 }
 389 
 390 #define ARCSTAT_MAXSTAT(stat) \
 391         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 392 
 393 /*
 394  * We define a macro to allow ARC hits/misses to be easily broken down by
 395  * two separate conditions, giving a total of four different subtypes for
 396  * each of hits and misses (so eight statistics total).
 397  */
 398 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
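
The ARCSTAT_CONDSTAT definition is cut off above, but its effect is to bump exactly one of four counters named by joining the selected condition names with the base stat. An illustrative call in the shape arc.c uses; the flag and type tests are assumptions here, not shown in the excerpt.

    /*
     * Exactly one of these is incremented:
     *   arcstat_demand_data_hits        cond1 true,  cond2 true
     *   arcstat_demand_metadata_hits    cond1 true,  cond2 false
     *   arcstat_prefetch_data_hits      cond1 false, cond2 true
     *   arcstat_prefetch_metadata_hits  cond1 false, cond2 false
     */
    ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
        hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);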


 578         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 579 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 580 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 581 #define HDR_LOCK(hdr) \
 582         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 583 
 584 uint64_t zfs_crc64_table[256];
 585 
 586 /*
 587  * Level 2 ARC
 588  */
 589 
 590 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 591 #define L2ARC_HEADROOM          2               /* num of writes */
 592 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 593 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 594 
 595 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 596 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 597 
 598 /* L2ARC Performance Tunables */


 599 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 600 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 601 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 602 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 603 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 604 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 605 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 606 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 607 
 608 /*
 609  * L2ARC Internals
 610  */
 611 typedef struct l2arc_dev {
 612         vdev_t                  *l2ad_vdev;     /* vdev */
 613         spa_t                   *l2ad_spa;      /* spa */
 614         uint64_t                l2ad_hand;      /* next write location */
 615         uint64_t                l2ad_write;     /* desired write size, bytes */
 616         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 617         uint64_t                l2ad_start;     /* first addr on device */
 618         uint64_t                l2ad_end;       /* last addr on device */
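
One way to read the l2ad_write / l2ad_boost pair above: each feed cycle writes up to the normal target plus the warmup boost while the main ARC has not yet filled. A hedged sketch of that computation; the arc_warm flag is an assumption not shown in the excerpt.

    static uint64_t
    l2arc_write_size_sketch(l2arc_dev_t *dev)
    {
            uint64_t size = dev->l2ad_write;        /* from l2arc_write_max */

            if (!arc_warm)                          /* still warming up */
                    size += dev->l2ad_boost;        /* from l2arc_write_boost */

            return (size);
    }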


3526          */
3527         if (spa_get_random(10000) == 0) {
3528                 dprintf("forcing random failure\n");
3529                 return (SET_ERROR(ERESTART));
3530         }
3531 #endif
3532         if (reserve > arc_c/4 && !arc_no_grow)
3533                 arc_c = MIN(arc_c_max, reserve * 4);
3534         if (reserve > arc_c)
3535                 return (SET_ERROR(ENOMEM));
3536 
3537         /*
3538          * Don't count loaned bufs as in flight dirty data to prevent long
3539          * network delays from blocking transactions that are ready to be
3540          * assigned to a txg.
3541          */
3542         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3543 
3544         /*
3545          * Writes will, almost always, require additional memory allocations
3546          * in order to compress/encrypt/etc the data.  We therefore need to
3547          * make sure that there is sufficient available memory for this.
3548          */
3549         if (error = arc_memory_throttle(reserve, anon_size, txg))
3550                 return (error);
3551 
3552         /*
3553          * Throttle writes when the amount of dirty data in the cache
3554          * gets too large.  We try to keep the cache less than half full
3555          * of dirty blocks so that our sync times don't grow too large.
3556          * Note: if two requests come in concurrently, we might let them
3557          * both succeed, when one of them should fail.  Not a huge deal.
3558          */
3559 
3560         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3561             anon_size > arc_c / 4) {
3562                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3563                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3564                     arc_tempreserve>>10,
3565                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3566                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,