4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
... 119 lines elided ...
 120  120   */
 121  121  
 122  122  #include <sys/spa.h>
 123  123  #include <sys/zio.h>
 124  124  #include <sys/zio_compress.h>
 125  125  #include <sys/zfs_context.h>
 126  126  #include <sys/arc.h>
 127  127  #include <sys/refcount.h>
 128  128  #include <sys/vdev.h>
 129  129  #include <sys/vdev_impl.h>
      130 +#include <sys/dsl_pool.h>
 130  131  #ifdef _KERNEL
 131  132  #include <sys/vmsystm.h>
 132  133  #include <vm/anon.h>
 133  134  #include <sys/fs/swapnode.h>
 134  135  #include <sys/dnlc.h>
 135  136  #endif
 136  137  #include <sys/callb.h>
 137  138  #include <sys/kstat.h>
 138  139  #include <zfs_fletcher.h>
 139  140  
 140  141  #ifndef _KERNEL
 141  142  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 142  143  boolean_t arc_watch = B_FALSE;
 143  144  int arc_procfd;
 144  145  #endif
 145  146  
 146  147  static kmutex_t         arc_reclaim_thr_lock;
 147  148  static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 148  149  static uint8_t          arc_thread_exit;
 149  150  
 150      -extern int zfs_write_limit_shift;
 151      -extern uint64_t zfs_write_limit_max;
 152      -extern kmutex_t zfs_write_limit_lock;
 153      -
 154  151  #define ARC_REDUCE_DNLC_PERCENT 3
 155  152  uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 156  153  
 157  154  typedef enum arc_reclaim_strategy {
 158  155          ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 159  156          ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 160  157  } arc_reclaim_strategy_t;
 161  158  
      159 +/*
      160 + * The number of iterations through arc_evict_*() before we
      161 + * drop & reacquire the lock.
      162 + */
      163 +int arc_evict_iterations = 100;
      164 +
 162  165  /* number of seconds before growing cache again */
 163  166  static int              arc_grow_retry = 60;
 164  167  
 165  168  /* shift of arc_c for calculating both min and max arc_p */
 166  169  static int              arc_p_min_shift = 4;
 167  170  
 168  171  /* log2(fraction of arc to reclaim) */
 169  172  static int              arc_shrink_shift = 5;
 170  173  
 171  174  /*
 172  175   * minimum lifespan of a prefetch block in clock ticks
 173  176   * (initialized in arc_init())
 174  177   */
 175  178  static int              arc_min_prefetch_lifespan;
 176  179  
      180 +/*
      181 + * If this percent of memory is free, don't throttle.
      182 + */
      183 +int arc_lotsfree_percent = 10;
      184 +
 177  185  static int arc_dead;
 178  186  
 179  187  /*
 180  188   * The arc has filled available memory and has now warmed up.
 181  189   */
 182  190  static boolean_t arc_warm;
 183  191  
 184  192  /*
 185  193   * These tunables are for performance analysis.
 186  194   */
... 275 lines elided ...
 462  470          arc_buf_t               *acb_buf;
 463  471          zio_t                   *acb_zio_dummy;
 464  472          arc_callback_t          *acb_next;
 465  473  };
 466  474  
 467  475  typedef struct arc_write_callback arc_write_callback_t;
 468  476  
 469  477  struct arc_write_callback {
 470  478          void            *awcb_private;
 471  479          arc_done_func_t *awcb_ready;
      480 +        arc_done_func_t *awcb_physdone;
 472  481          arc_done_func_t *awcb_done;
 473  482          arc_buf_t       *awcb_buf;
 474  483  };
 475  484  
 476  485  struct arc_buf_hdr {
 477  486          /* protected by hash lock */
 478  487          dva_t                   b_dva;
 479  488          uint64_t                b_birth;
 480  489          uint64_t                b_cksum0;
 481  490  
... 674 lines elided ...
1156 1165   * for the buffer must be held by the caller.
1157 1166   */
1158 1167  static void
1159 1168  arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1160 1169  {
1161 1170          arc_state_t *old_state = ab->b_state;
1162 1171          int64_t refcnt = refcount_count(&ab->b_refcnt);
1163 1172          uint64_t from_delta, to_delta;
1164 1173  
1165 1174          ASSERT(MUTEX_HELD(hash_lock));
1166      -        ASSERT(new_state != old_state);
     1175 +        ASSERT3P(new_state, !=, old_state);
1167 1176          ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1168 1177          ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1169 1178          ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1170 1179  
1171 1180          from_delta = to_delta = ab->b_datacnt * ab->b_size;
1172 1181  
1173 1182          /*
1174 1183           * If this buffer is evictable, transfer it from the
1175 1184           * old state list to the new state list.
1176 1185           */
... 594 lines elided ...
1771 1780  arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1772 1781      arc_buf_contents_t type)
1773 1782  {
1774 1783          arc_state_t *evicted_state;
1775 1784          uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1776 1785          arc_buf_hdr_t *ab, *ab_prev = NULL;
1777 1786          list_t *list = &state->arcs_list[type];
1778 1787          kmutex_t *hash_lock;
1779 1788          boolean_t have_lock;
1780 1789          void *stolen = NULL;
     1790 +        arc_buf_hdr_t marker = { 0 };
     1791 +        int count = 0;
1781 1792  
1782 1793          ASSERT(state == arc_mru || state == arc_mfu);
1783 1794  
1784 1795          evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1785 1796  
1786 1797          mutex_enter(&state->arcs_mtx);
1787 1798          mutex_enter(&evicted_state->arcs_mtx);
1788 1799  
1789 1800          for (ab = list_tail(list); ab; ab = ab_prev) {
1790 1801                  ab_prev = list_prev(list, ab);
... 3 lines elided ...
1794 1805                      (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1795 1806                      ddi_get_lbolt() - ab->b_arc_access <
1796 1807                      arc_min_prefetch_lifespan)) {
1797 1808                          skipped++;
1798 1809                          continue;
1799 1810                  }
1800 1811                  /* "lookahead" for better eviction candidate */
1801 1812                  if (recycle && ab->b_size != bytes &&
1802 1813                      ab_prev && ab_prev->b_size == bytes)
1803 1814                          continue;
     1815 +
     1816 +                /* ignore markers */
     1817 +                if (ab->b_spa == 0)
     1818 +                        continue;
     1819 +
     1820 +                /*
     1821 +                 * It may take a long time to evict all the bufs requested.
     1822 +                 * To avoid blocking all arc activity, periodically drop
     1823 +                 * the arcs_mtx and give other threads a chance to run
     1824 +                 * before reacquiring the lock.
     1825 +                 *
     1826 +                 * If we are looking for a buffer to recycle, we are in
     1827 +                 * the hot code path, so don't sleep.
     1828 +                 */
     1829 +                if (!recycle && count++ > arc_evict_iterations) {
     1830 +                        list_insert_after(list, ab, &marker);
     1831 +                        mutex_exit(&evicted_state->arcs_mtx);
     1832 +                        mutex_exit(&state->arcs_mtx);
     1833 +                        kpreempt(KPREEMPT_SYNC);
     1834 +                        mutex_enter(&state->arcs_mtx);
     1835 +                        mutex_enter(&evicted_state->arcs_mtx);
     1836 +                        ab_prev = list_prev(list, &marker);
     1837 +                        list_remove(list, &marker);
     1838 +                        count = 0;
     1839 +                        continue;
     1840 +                }
     1841 +
1804 1842                  hash_lock = HDR_LOCK(ab);
1805 1843                  have_lock = MUTEX_HELD(hash_lock);
1806 1844                  if (have_lock || mutex_tryenter(hash_lock)) {
1807 1845                          ASSERT0(refcount_count(&ab->b_refcnt));
1808 1846                          ASSERT(ab->b_datacnt > 0);
1809 1847                          while (ab->b_buf) {
1810 1848                                  arc_buf_t *buf = ab->b_buf;
1811 1849                                  if (!mutex_tryenter(&buf->b_evict_lock)) {
1812 1850                                          missed += 1;
1813 1851                                          break;
... 61 lines elided ...
1875 1913                  dprintf("only evicted %lld bytes from %x",
1876 1914                      (longlong_t)bytes_evicted, state);
1877 1915  
1878 1916          if (skipped)
1879 1917                  ARCSTAT_INCR(arcstat_evict_skip, skipped);
1880 1918  
1881 1919          if (missed)
1882 1920                  ARCSTAT_INCR(arcstat_mutex_miss, missed);
1883 1921  
1884 1922          /*
1885      -         * We have just evicted some data into the ghost state, make
1886      -         * sure we also adjust the ghost state size if necessary.
     1923 +         * Note: we have just evicted some data into the ghost state,
     1924 +         * potentially putting the ghost size over the desired size.  Rather
     1925 +         * than evicting from the ghost list in this hot code path, leave
     1926 +         * this chore to the arc_reclaim_thread().
1887 1927           */
1888      -        if (arc_no_grow &&
1889      -            arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1890      -                int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1891      -                    arc_mru_ghost->arcs_size - arc_c;
1892 1928  
1893      -                if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1894      -                        int64_t todelete =
1895      -                            MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1896      -                        arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1897      -                } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1898      -                        int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1899      -                            arc_mru_ghost->arcs_size +
1900      -                            arc_mfu_ghost->arcs_size - arc_c);
1901      -                        arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1902      -                }
1903      -        }
1904      -
1905 1929          return (stolen);
1906 1930  }
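
The loop above, and the matching one in arc_evict_ghost() below, now bound how long arcs_mtx is held: every arc_evict_iterations entries they park a marker header in the list, drop the lock, yield, and resume from the marker. A minimal user-land sketch of that marker-based yield pattern, using pthreads and hypothetical dlist_t/node_t types in place of the kernel's kmutex, list_t, and arc_buf_hdr_t (so a sketch of the technique, not the kernel code itself), might look like this:

#include <pthread.h>
#include <sched.h>
#include <string.h>

typedef struct node {
	struct node *prev, *next;
	int is_marker;		/* markers carry no payload and are skipped */
	void *payload;
} node_t;

typedef struct dlist {
	pthread_mutex_t lock;
	node_t head;		/* circular list with a sentinel head node */
} dlist_t;

#define	YIELD_INTERVAL	100	/* plays the role of arc_evict_iterations */

/* Link m in between at->prev and at, i.e. on the not-yet-visited side. */
static void
marker_park(node_t *at, node_t *m)
{
	m->next = at;
	m->prev = at->prev;
	at->prev->next = m;
	at->prev = m;
}

static void
marker_unpark(node_t *m)
{
	m->prev->next = m->next;
	m->next->prev = m->prev;
}

/*
 * Walk the list from tail to head, calling process() on each node, but
 * drop the list lock every YIELD_INTERVAL nodes so that other threads
 * contending for it are not starved by a long walk.
 */
void
walk_with_yield(dlist_t *l, void (*process)(node_t *))
{
	node_t marker;
	int count = 0;

	memset(&marker, 0, sizeof (marker));
	marker.is_marker = 1;

	pthread_mutex_lock(&l->lock);
	for (node_t *n = l->head.prev, *prev; n != &l->head; n = prev) {
		prev = n->prev;

		if (n->is_marker)	/* skip markers parked by other walkers */
			continue;

		if (count++ > YIELD_INTERVAL) {
			/*
			 * Park the marker, let other threads run, then pick
			 * the walk back up at whatever now precedes the
			 * marker; nodes removed while the lock was dropped
			 * are simply never visited.
			 */
			marker_park(n, &marker);
			pthread_mutex_unlock(&l->lock);
			sched_yield();	/* kpreempt(KPREEMPT_SYNC) in the kernel */
			pthread_mutex_lock(&l->lock);
			prev = marker.prev;
			marker_unpark(&marker);
			count = 0;
			continue;
		}

		process(n);
	}
	pthread_mutex_unlock(&l->lock);
}

arc_evict() additionally skips the yield when recycle is set, since that is a hot code path that must not sleep.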
1907 1931  
1908 1932  /*
1909 1933   * Remove buffers from list until we've removed the specified number of
1910 1934   * bytes.  Destroy the buffers that are removed.
1911 1935   */
1912 1936  static void
1913 1937  arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1914 1938  {
1915 1939          arc_buf_hdr_t *ab, *ab_prev;
1916 1940          arc_buf_hdr_t marker = { 0 };
1917 1941          list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1918 1942          kmutex_t *hash_lock;
1919 1943          uint64_t bytes_deleted = 0;
1920 1944          uint64_t bufs_skipped = 0;
     1945 +        int count = 0;
1921 1946  
1922 1947          ASSERT(GHOST_STATE(state));
1923 1948  top:
1924 1949          mutex_enter(&state->arcs_mtx);
1925 1950          for (ab = list_tail(list); ab; ab = ab_prev) {
1926 1951                  ab_prev = list_prev(list, ab);
     1952 +                if (ab->b_type > ARC_BUFC_NUMTYPES)
     1953 +                        panic("invalid ab=%p", (void *)ab);
1927 1954                  if (spa && ab->b_spa != spa)
1928 1955                          continue;
1929 1956  
1930 1957                  /* ignore markers */
1931 1958                  if (ab->b_spa == 0)
1932 1959                          continue;
1933 1960  
1934 1961                  hash_lock = HDR_LOCK(ab);
1935 1962                  /* caller may be trying to modify this buffer, skip it */
1936 1963                  if (MUTEX_HELD(hash_lock))
1937 1964                          continue;
     1965 +
     1966 +                /*
     1967 +                 * It may take a long time to evict all the bufs requested.
     1968 +                 * To avoid blocking all arc activity, periodically drop
     1969 +                 * the arcs_mtx and give other threads a chance to run
     1970 +                 * before reacquiring the lock.
     1971 +                 */
     1972 +                if (count++ > arc_evict_iterations) {
     1973 +                        list_insert_after(list, ab, &marker);
     1974 +                        mutex_exit(&state->arcs_mtx);
     1975 +                        kpreempt(KPREEMPT_SYNC);
     1976 +                        mutex_enter(&state->arcs_mtx);
     1977 +                        ab_prev = list_prev(list, &marker);
     1978 +                        list_remove(list, &marker);
     1979 +                        count = 0;
     1980 +                        continue;
     1981 +                }
1938 1982                  if (mutex_tryenter(hash_lock)) {
1939 1983                          ASSERT(!HDR_IO_IN_PROGRESS(ab));
1940 1984                          ASSERT(ab->b_buf == NULL);
1941 1985                          ARCSTAT_BUMP(arcstat_deleted);
1942 1986                          bytes_deleted += ab->b_size;
1943 1987  
1944 1988                          if (ab->b_l2hdr != NULL) {
1945 1989                                  /*
1946 1990                                   * This buffer is cached on the 2nd Level ARC;
1947 1991                                   * don't destroy the header.
... 15 lines elided ...
1963 2007                           * hash lock to become available. Once its
1964 2008                           * available, restart from where we left off.
1965 2009                           */
1966 2010                          list_insert_after(list, ab, &marker);
1967 2011                          mutex_exit(&state->arcs_mtx);
1968 2012                          mutex_enter(hash_lock);
1969 2013                          mutex_exit(hash_lock);
1970 2014                          mutex_enter(&state->arcs_mtx);
1971 2015                          ab_prev = list_prev(list, &marker);
1972 2016                          list_remove(list, &marker);
1973      -                } else
     2017 +                } else {
1974 2018                          bufs_skipped += 1;
     2019 +                }
     2020 +
1975 2021          }
1976 2022          mutex_exit(&state->arcs_mtx);
1977 2023  
1978 2024          if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1979 2025              (bytes < 0 || bytes_deleted < bytes)) {
1980 2026                  list = &state->arcs_list[ARC_BUFC_METADATA];
1981 2027                  goto top;
1982 2028          }
1983 2029  
1984 2030          if (bufs_skipped) {
... 833 lines elided ...
2818 2864   * either wait for the in-progress read to complete (and return the
2819 2865   * results); or, if this is a read with a "done" func, add a record
2820 2866   * to the read to invoke the "done" func when the read completes,
2821 2867   * and return; or just return.
2822 2868   *
2823 2869   * arc_read_done() will invoke all the requested "done" functions
2824 2870   * for readers of this block.
2825 2871   */
2826 2872  int
2827 2873  arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2828      -    void *private, int priority, int zio_flags, uint32_t *arc_flags,
     2874 +    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2829 2875      const zbookmark_t *zb)
2830 2876  {
2831 2877          arc_buf_hdr_t *hdr;
2832 2878          arc_buf_t *buf = NULL;
2833 2879          kmutex_t *hash_lock;
2834 2880          zio_t *rzio;
2835 2881          uint64_t guid = spa_load_guid(spa);
2836 2882  
2837 2883  top:
2838 2884          hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
... 582 lines elided ...
3421 3467                  if (hdr->b_freeze_cksum != NULL) {
3422 3468                          kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3423 3469                          hdr->b_freeze_cksum = NULL;
3424 3470                  }
3425 3471                  mutex_exit(&hdr->b_freeze_lock);
3426 3472          }
3427 3473          arc_cksum_compute(buf, B_FALSE);
3428 3474          hdr->b_flags |= ARC_IO_IN_PROGRESS;
3429 3475  }
3430 3476  
     3477 +/*
     3478 + * The SPA calls this callback for each physical write that happens on behalf
     3479 + * of a logical write.  See the comment in dbuf_write_physdone() for details.
     3480 + */
3431 3481  static void
     3482 +arc_write_physdone(zio_t *zio)
     3483 +{
     3484 +        arc_write_callback_t *cb = zio->io_private;
     3485 +        if (cb->awcb_physdone != NULL)
     3486 +                cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
     3487 +}
     3488 +
     3489 +static void
3432 3490  arc_write_done(zio_t *zio)
3433 3491  {
3434 3492          arc_write_callback_t *callback = zio->io_private;
3435 3493          arc_buf_t *buf = callback->awcb_buf;
3436 3494          arc_buf_hdr_t *hdr = buf->b_hdr;
3437 3495  
3438 3496          ASSERT(hdr->b_acb == NULL);
3439 3497  
3440 3498          if (zio->io_error == 0) {
3441 3499                  hdr->b_dva = *BP_IDENTITY(zio->io_bp);
... 59 lines elided ...
3501 3559  
3502 3560          ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3503 3561          callback->awcb_done(zio, buf, callback->awcb_private);
3504 3562  
3505 3563          kmem_free(callback, sizeof (arc_write_callback_t));
3506 3564  }
3507 3565  
3508 3566  zio_t *
3509 3567  arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3510 3568      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3511      -    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3512      -    void *private, int priority, int zio_flags, const zbookmark_t *zb)
     3569 +    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
     3570 +    arc_done_func_t *done, void *private, zio_priority_t priority,
     3571 +    int zio_flags, const zbookmark_t *zb)
3513 3572  {
3514 3573          arc_buf_hdr_t *hdr = buf->b_hdr;
3515 3574          arc_write_callback_t *callback;
3516 3575          zio_t *zio;
3517 3576  
3518 3577          ASSERT(ready != NULL);
3519 3578          ASSERT(done != NULL);
3520 3579          ASSERT(!HDR_IO_ERROR(hdr));
3521 3580          ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3522 3581          ASSERT(hdr->b_acb == NULL);
3523 3582          if (l2arc)
3524 3583                  hdr->b_flags |= ARC_L2CACHE;
3525 3584          if (l2arc_compress)
3526 3585                  hdr->b_flags |= ARC_L2COMPRESS;
3527 3586          callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3528 3587          callback->awcb_ready = ready;
     3588 +        callback->awcb_physdone = physdone;
3529 3589          callback->awcb_done = done;
3530 3590          callback->awcb_private = private;
3531 3591          callback->awcb_buf = buf;
3532 3592  
3533 3593          zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3534      -            arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
     3594 +            arc_write_ready, arc_write_physdone, arc_write_done, callback,
     3595 +            priority, zio_flags, zb);
3535 3596  
3536 3597          return (zio);
3537 3598  }
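
Callers of arc_write() now supply the extra physdone callback between ready and done. In this changeset the caller of interest is dbuf_write(), which passes dbuf_write_physdone() so that each physical write completion can feed the new per-pool dirty-data accounting. The call ends up shaped roughly like the sketch below; the local variable names are illustrative rather than quoted from dbuf.c:

	dr->dr_zio = arc_write(pio, os->os_spa, txg, db->db_blkptr,
	    data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp,
	    dbuf_write_ready, dbuf_write_physdone, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);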
3538 3599  
3539 3600  static int
3540      -arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
     3601 +arc_memory_throttle(uint64_t reserve, uint64_t txg)
3541 3602  {
3542 3603  #ifdef _KERNEL
3543 3604          uint64_t available_memory = ptob(freemem);
3544 3605          static uint64_t page_load = 0;
3545 3606          static uint64_t last_txg = 0;
3546 3607  
3547 3608  #if defined(__i386)
3548 3609          available_memory =
3549 3610              MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3550 3611  #endif
3551      -        if (available_memory >= zfs_write_limit_max)
     3612 +
     3613 +        if (freemem > physmem * arc_lotsfree_percent / 100)
3552 3614                  return (0);
3553 3615  
3554 3616          if (txg > last_txg) {
3555 3617                  last_txg = txg;
3556 3618                  page_load = 0;
3557 3619          }
3558 3620          /*
3559 3621           * If we are in pageout, we know that memory is already tight,
3560 3622           * the arc is already going to be evicting, so we just want to
3561 3623           * continue to let page writes occur as quickly as possible.
... 3 lines elided ...
3565 3627                          return (SET_ERROR(ERESTART));
3566 3628                  /* Note: reserve is inflated, so we deflate */
3567 3629                  page_load += reserve / 8;
3568 3630                  return (0);
3569 3631          } else if (page_load > 0 && arc_reclaim_needed()) {
3570 3632                  /* memory is low, delay before restarting */
3571 3633                  ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3572 3634                  return (SET_ERROR(EAGAIN));
3573 3635          }
3574 3636          page_load = 0;
3575      -
3576      -        if (arc_size > arc_c_min) {
3577      -                uint64_t evictable_memory =
3578      -                    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3579      -                    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3580      -                    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3581      -                    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3582      -                available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3583      -        }
3584      -
3585      -        if (inflight_data > available_memory / 4) {
3586      -                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3587      -                return (SET_ERROR(ERESTART));
3588      -        }
3589 3637  #endif
3590 3638          return (0);
3591 3639  }
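
With the check above, arc_memory_throttle() now returns immediately whenever more than arc_lotsfree_percent of physical memory is free. For example, at the default of 10 percent on a 64 GB machine the function is a no-op until free memory drops below roughly 6.4 GB; only then does it begin accumulating page_load and returning ERESTART or EAGAIN to push back on writers. The old comparison against zfs_write_limit_max goes away along with the rest of the write-limit machinery.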
3592 3640  
3593 3641  void
3594 3642  arc_tempreserve_clear(uint64_t reserve)
3595 3643  {
3596 3644          atomic_add_64(&arc_tempreserve, -reserve);
3597 3645          ASSERT((int64_t)arc_tempreserve >= 0);
3598 3646  }
3599 3647  
3600 3648  int
3601 3649  arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3602 3650  {
3603 3651          int error;
3604 3652          uint64_t anon_size;
3605 3653  
3606      -#ifdef ZFS_DEBUG
3607      -        /*
3608      -         * Once in a while, fail for no reason.  Everything should cope.
3609      -         */
3610      -        if (spa_get_random(10000) == 0) {
3611      -                dprintf("forcing random failure\n");
3612      -                return (SET_ERROR(ERESTART));
3613      -        }
3614      -#endif
3615 3654          if (reserve > arc_c/4 && !arc_no_grow)
3616 3655                  arc_c = MIN(arc_c_max, reserve * 4);
3617 3656          if (reserve > arc_c)
3618 3657                  return (SET_ERROR(ENOMEM));
3619 3658  
3620 3659          /*
3621 3660           * Don't count loaned bufs as in flight dirty data to prevent long
3622 3661           * network delays from blocking transactions that are ready to be
3623 3662           * assigned to a txg.
3624 3663           */
3625 3664          anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3626 3665  
3627 3666          /*
3628 3667           * Writes will, almost always, require additional memory allocations
3629 3668           * in order to compress/encrypt/etc the data.  We therefore need to
3630 3669           * make sure that there is sufficient available memory for this.
3631 3670           */
3632      -        if (error = arc_memory_throttle(reserve, anon_size, txg))
     3671 +        error = arc_memory_throttle(reserve, txg);
     3672 +        if (error != 0)
3633 3673                  return (error);
3634 3674  
3635 3675          /*
3636 3676           * Throttle writes when the amount of dirty data in the cache
3637 3677           * gets too large.  We try to keep the cache less than half full
3638 3678           * of dirty blocks so that our sync times don't grow too large.
3639 3679           * Note: if two requests come in concurrently, we might let them
3640 3680           * both succeed, when one of them should fail.  Not a huge deal.
3641 3681           */
3642 3682  
... 128 lines elided ...
3771 3811                  arc_ksp->ks_data = &arc_stats;
3772 3812                  kstat_install(arc_ksp);
3773 3813          }
3774 3814  
3775 3815          (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776 3816              TS_RUN, minclsyspri);
3777 3817  
3778 3818          arc_dead = FALSE;
3779 3819          arc_warm = B_FALSE;
3780 3820  
3781      -        if (zfs_write_limit_max == 0)
3782      -                zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3783      -        else
3784      -                zfs_write_limit_shift = 0;
3785      -        mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
     3821 +        /*
     3822 +         * Calculate maximum amount of dirty data per pool.
     3823 +         *
     3824 +         * If it has been set by /etc/system, take that.
     3825 +         * Otherwise, use a percentage of physical memory defined by
     3826 +         * zfs_dirty_data_max_percent (default 10%) with a cap at
     3827 +         * zfs_dirty_data_max_max (default 4GB).
     3828 +         */
     3829 +        if (zfs_dirty_data_max == 0) {
     3830 +                zfs_dirty_data_max = physmem * PAGESIZE *
     3831 +                    zfs_dirty_data_max_percent / 100;
     3832 +                zfs_dirty_data_max = MIN(zfs_dirty_data_max,
     3833 +                    zfs_dirty_data_max_max);
     3834 +        }
3786 3835  }
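
To make the defaults concrete: a machine with 16 GB of physical memory ends up with zfs_dirty_data_max of about 1.6 GB (10 percent), while any machine with 40 GB or more hits the 4 GB zfs_dirty_data_max_max ceiling. An administrator who wants a different limit can pin it from /etc/system, e.g. "set zfs:zfs_dirty_data_max = 0x200000000" for 8 GB, in which case the percentage calculation above is skipped.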
3787 3836  
3788 3837  void
3789 3838  arc_fini(void)
3790 3839  {
3791 3840          mutex_enter(&arc_reclaim_thr_lock);
3792 3841          arc_thread_exit = 1;
3793 3842          while (arc_thread_exit != 0)
3794 3843                  cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3795 3844          mutex_exit(&arc_reclaim_thr_lock);
... 20 lines elided ...
3816 3865          list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3817 3866          list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3818 3867  
3819 3868          mutex_destroy(&arc_anon->arcs_mtx);
3820 3869          mutex_destroy(&arc_mru->arcs_mtx);
3821 3870          mutex_destroy(&arc_mru_ghost->arcs_mtx);
3822 3871          mutex_destroy(&arc_mfu->arcs_mtx);
3823 3872          mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3824 3873          mutex_destroy(&arc_l2c_only->arcs_mtx);
3825 3874  
3826      -        mutex_destroy(&zfs_write_limit_lock);
3827      -
3828 3875          buf_fini();
3829 3876  
3830 3877          ASSERT(arc_loaned_bytes == 0);
3831 3878  }
3832 3879  
3833 3880  /*
3834 3881   * Level 2 ARC
3835 3882   *
3836 3883   * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3837 3884   * It uses dedicated storage devices to hold cached data, which are populated
... 1304 lines elided ...