Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dsl_dir.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
↓ open down ↓ 576 lines elided ↑ open up ↑
 577  577                  myspace = MIN(parentspace, quota - used);
 578  578          }
 579  579  
 580  580          mutex_exit(&dd->dd_lock);
 581  581  
 582  582          return (myspace);
 583  583  }
 584  584  
 585  585  struct tempreserve {
 586  586          list_node_t tr_node;
 587      -        dsl_pool_t *tr_dp;
 588  587          dsl_dir_t *tr_ds;
 589  588          uint64_t tr_size;
 590  589  };
 591  590  
 592  591  static int
 593  592  dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 594  593      boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
 595  594      dmu_tx_t *tx, boolean_t first)
 596  595  {
 597  596          uint64_t txg = tx->tx_txg;
↓ open down ↓ 130 lines elided ↑ open up ↑
 728  727          ASSERT3S(asize, >, 0);
 729  728          ASSERT3S(fsize, >=, 0);
 730  729  
 731  730          err = arc_tempreserve_space(lsize, tx->tx_txg);
 732  731          if (err == 0) {
 733  732                  struct tempreserve *tr;
 734  733  
 735  734                  tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 736  735                  tr->tr_size = lsize;
 737  736                  list_insert_tail(tr_list, tr);
 738      -
 739      -                err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 740  737          } else {
 741  738                  if (err == EAGAIN) {
      739 +                        /*
      740 +                         * If arc_memory_throttle() detected that pageout
      741 +                         * is running and we are low on memory, we delay new
      742 +                         * non-pageout transactions to give pageout an
      743 +                         * advantage.
      744 +                         *
      745 +                         * It is unfortunate to be delaying while the caller's
      746 +                         * locks are held.
      747 +                         */
 742  748                          txg_delay(dd->dd_pool, tx->tx_txg,
 743  749                              MSEC2NSEC(10), MSEC2NSEC(10));
 744  750                          err = SET_ERROR(ERESTART);
 745  751                  }
 746      -                dsl_pool_memory_pressure(dd->dd_pool);
 747  752          }
 748  753  
 749  754          if (err == 0) {
 750      -                struct tempreserve *tr;
 751      -
 752      -                tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 753      -                tr->tr_dp = dd->dd_pool;
 754      -                tr->tr_size = asize;
 755      -                list_insert_tail(tr_list, tr);
 756      -
 757  755                  err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 758  756                      FALSE, asize > usize, tr_list, tx, TRUE);
 759  757          }
 760  758  
 761  759          if (err != 0)
 762  760                  dsl_dir_tempreserve_clear(tr_list, tx);
 763  761          else
 764  762                  *tr_cookiep = tr_list;
 765  763  
 766  764          return (err);
↓ open down ↓ 8 lines elided ↑ open up ↑
 775  773  {
 776  774          int txgidx = tx->tx_txg & TXG_MASK;
 777  775          list_t *tr_list = tr_cookie;
 778  776          struct tempreserve *tr;
 779  777  
 780  778          ASSERT3U(tx->tx_txg, !=, 0);
 781  779  
 782  780          if (tr_cookie == NULL)
 783  781                  return;
 784  782  
 785      -        while (tr = list_head(tr_list)) {
 786      -                if (tr->tr_dp) {
 787      -                        dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
 788      -                } else if (tr->tr_ds) {
      783 +        while ((tr = list_head(tr_list)) != NULL) {
      784 +                if (tr->tr_ds) {
 789  785                          mutex_enter(&tr->tr_ds->dd_lock);
 790  786                          ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 791  787                              tr->tr_size);
 792  788                          tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 793  789                          mutex_exit(&tr->tr_ds->dd_lock);
 794  790                  } else {
 795  791                          arc_tempreserve_clear(tr->tr_size);
 796  792                  }
 797  793                  list_remove(tr_list, tr);
 798  794                  kmem_free(tr, sizeof (struct tempreserve));
 799  795          }
 800  796  
 801  797          kmem_free(tr_list, sizeof (list_t));
 802  798  }
 803  799  
 804      -static void
 805      -dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
      800 +/*
      801 + * This should be called from open context when we think we're going to write
      802 + * or free space, for example when dirtying data. Be conservative; it's okay
      803 + * to write less space or free more, but we don't want to write more or free
      804 + * less than the amount specified.
      805 + */
      806 +void
      807 +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 806  808  {
 807  809          int64_t parent_space;
 808  810          uint64_t est_used;
 809  811  
 810  812          mutex_enter(&dd->dd_lock);
 811  813          if (space > 0)
 812  814                  dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 813  815  
 814  816          est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
 815  817          parent_space = parent_delta(dd, est_used, space);
 816  818          mutex_exit(&dd->dd_lock);
 817  819  
 818  820          /* Make sure that we clean up dd_space_to* */
 819  821          dsl_dir_dirty(dd, tx);
 820  822  
 821  823          /* XXX this is potentially expensive and unnecessary... */
 822  824          if (parent_space && dd->dd_parent)
 823      -                dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
      825 +                dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
 824  826  }
 825  827  
 826      -/*
 827      - * Call in open context when we think we're going to write/free space,
 828      - * eg. when dirtying data.  Be conservative (ie. OK to write less than
 829      - * this or free more than this, but don't write more or free less).
 830      - */
 831      -void
 832      -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 833      -{
 834      -        dsl_pool_willuse_space(dd->dd_pool, space, tx);
 835      -        dsl_dir_willuse_space_impl(dd, space, tx);
 836      -}
 837      -
 838  828  /* call from syncing context when we actually write/free space for this dd */
 839  829  void
 840  830  dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 841  831      int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 842  832  {
 843  833          int64_t accounted_delta;
 844  834  
 845  835          /*
 846  836           * dsl_dataset_set_refreservation_sync_impl() calls this with
 847  837           * dd_lock held, so that it can atomically update
↓ open down ↓ 517 lines elided ↑ open up ↑
↓ remainder of diff elided ↓