4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
    
          --- old/usr/src/uts/common/fs/zfs/dsl_pool.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_pool.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 Steven Hartland. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dsl_pool.h>
  28   28  #include <sys/dsl_dataset.h>
  29   29  #include <sys/dsl_prop.h>
  30   30  #include <sys/dsl_dir.h>
  31   31  #include <sys/dsl_synctask.h>
  32   32  #include <sys/dsl_scan.h>
  33   33  #include <sys/dnode.h>
  34   34  #include <sys/dmu_tx.h>
  35   35  #include <sys/dmu_objset.h>
  36   36  #include <sys/arc.h>
  37   37  #include <sys/zap.h>
  38   38  #include <sys/zio.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/fs/zfs.h>
  41   41  #include <sys/zfs_znode.h>
  42   42  #include <sys/spa_impl.h>
  43   43  #include <sys/dsl_deadlist.h>
  44   44  #include <sys/bptree.h>
  45   45  #include <sys/zfeature.h>
  46   46  #include <sys/zil_impl.h>
  47   47  #include <sys/dsl_userhold.h>
  48   48  
  49      -int zfs_no_write_throttle = 0;
  50      -int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  51      -int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
       49 +/*
       50 + * ZFS Write Throttle
       51 + * ------------------
       52 + *
       53 + * ZFS must limit the rate of incoming writes to the rate at which it is able
       54 + * to sync data modifications to the backend storage. Throttling by too much
       55 + * creates an artificial limit; throttling by too little can only be sustained
       56 + * for short periods and would lead to highly lumpy performance. On a per-pool
       57 + * basis, ZFS tracks the amount of modified (dirty) data. As operations change
       58 + * data, the amount of dirty data increases; as ZFS syncs out data, the amount
       59 + * of dirty data decreases. When the amount of dirty data exceeds a
       60 + * predetermined threshold further modifications are blocked until the amount
       61 + * of dirty data decreases (as data is synced out).
       62 + *
       63 + * The limit on dirty data is tunable, and should be adjusted according to
       64 + * both the IO capacity and available memory of the system. The larger the
       65 + * window, the more ZFS is able to aggregate and amortize metadata (and data)
       66 + * changes. However, memory is a limited resource, and allowing for more dirty
       67 + * data comes at the cost of keeping other useful data in memory (for example
       68 + * ZFS data cached by the ARC).
       69 + *
       70 + * Implementation
       71 + *
       72 + * As buffers are modified dsl_pool_willuse_space() increments both the per-
       73 + * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
       74 + * dirty space used; dsl_pool_dirty_space() decrements those values as data
       75 + * is synced out from dsl_pool_sync(). While only the poolwide value is
       76 + * relevant, the per-txg value is useful for debugging. The tunable
       77 + * zfs_dirty_data_max determines the dirty space limit. Once that value is
       78 + * exceeded, new writes are halted until space frees up.
       79 + *
       80 + * The zfs_dirty_data_sync tunable dictates the threshold at which we
       81 + * ensure that there is a txg syncing (see the comment in txg.c for a full
       82 + * description of transaction group stages).
       83 + *
       84 + * The IO scheduler uses both the dirty space limit and current amount of
       85 + * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
       86 + * issues. See the comment in vdev_queue.c for details of the IO scheduler.
       87 + *
       88 + * The delay is also calculated based on the amount of dirty data.  See the
       89 + * comment above dmu_tx_delay() for details.
       90 + */
  52   91  
  53      -uint64_t zfs_write_limit_min = 32 << 20;        /* min write limit is 32MB */
  54      -uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
  55      -uint64_t zfs_write_limit_inflated = 0;
  56      -uint64_t zfs_write_limit_override = 0;
       92 +/*
       93 + * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
       94 + * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
       95 + */
       96 +uint64_t zfs_dirty_data_max;
       97 +uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
       98 +int zfs_dirty_data_max_percent = 10;
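The initialization that the comment above describes happens outside this file, so it is not part of this webrev. As a rough sketch only (assuming illumos-style physmem and ptob(), per the logic the comment describes), the sizing would look something like:

/*
 * Sketch only, not part of this change: apply the default sizing policy
 * unless zfs_dirty_data_max was already overridden (e.g. via /etc/system).
 */
if (zfs_dirty_data_max == 0) {
	zfs_dirty_data_max = ptob(physmem) *
	    zfs_dirty_data_max_percent / 100;
	zfs_dirty_data_max = MIN(zfs_dirty_data_max,
	    zfs_dirty_data_max_max);
}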
  57   99  
  58      -kmutex_t zfs_write_limit_lock;
      100 +/*
      101 + * If there is at least this much dirty data, push out a txg.
      102 + */
      103 +uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
  59  104  
  60      -static pgcnt_t old_physmem = 0;
      105 +/*
      106 + * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
      107 + * and delay each transaction.
      108 + * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
      109 + */
      110 +int zfs_delay_min_dirty_percent = 60;
  61  111  
      112 +/*
      113 + * This controls how quickly the delay approaches infinity.
      114 + * Larger values cause it to delay less for a given amount of dirty data.
      115 + * Therefore larger values will cause there to be more dirty data for a
      116 + * given throughput.
      117 + *
      118 + * For the smoothest delay, this value should be about 1 billion divided
      119 + * by the maximum number of operations per second.  This will smoothly
      120 + * handle between 10x and 1/10th this number.
      121 + *
      122 + * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
      123 + * multiply in dmu_tx_delay().
      124 + */
      125 +uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
      126 +
      127 +
      128 +/*
      129 + * XXX someday maybe turn these into #defines, and you have to tune it on a
      130 + * per-pool basis using zfs.conf.
      131 + */
      132 +
      133 +
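For a sense of how zfs_delay_min_dirty_percent and zfs_delay_scale combine, here is a rough sketch of the hyperbolic relationship the comments above describe. The authoritative formula is documented above dmu_tx_delay(), which is outside this hunk; the function name below is illustrative only.

/*
 * Illustrative sketch, not the dmu_tx_delay() implementation: the
 * per-operation delay is zero until dirty data reaches
 * zfs_delay_min_dirty_percent of zfs_dirty_data_max, then grows toward
 * infinity as dirty data approaches the limit.  Per the guidance above,
 * the default scale of 1e9 / 2000 corresponds to a backend sustaining
 * roughly 2000 operations per second.
 */
static uint64_t
example_delay_ns(uint64_t dirty)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	if (dirty <= delay_min_bytes)
		return (0);
	return (zfs_delay_scale * (dirty - delay_min_bytes) /
	    (zfs_dirty_data_max - dirty));
}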
  62  134  hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
  63  135  hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
  64  136  
  65  137  int
  66  138  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  67  139  {
  68  140          uint64_t obj;
  69  141          int err;
  70  142  
  71  143          err = zap_lookup(dp->dp_meta_objset,
  72  144              dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
  73  145              name, sizeof (obj), 1, &obj);
  74  146          if (err)
  75  147                  return (err);
  76  148  
  77  149          return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
  78  150  }
  79  151  
  80  152  static dsl_pool_t *
  81  153  dsl_pool_open_impl(spa_t *spa, uint64_t txg)
  82  154  {
  83  155          dsl_pool_t *dp;
  84  156          blkptr_t *bp = spa_get_rootblkptr(spa);
  85  157  
  86  158          dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
  87  159          dp->dp_spa = spa;
  88  160          dp->dp_meta_rootbp = *bp;
  89  161          rrw_init(&dp->dp_config_rwlock, B_TRUE);
  90      -        dp->dp_write_limit = zfs_write_limit_min;
  91  162          txg_init(dp, txg);
  92  163  
  93  164          txg_list_create(&dp->dp_dirty_datasets,
  94  165              offsetof(dsl_dataset_t, ds_dirty_link));
  95  166          txg_list_create(&dp->dp_dirty_zilogs,
  96  167              offsetof(zilog_t, zl_dirty_link));
  97  168          txg_list_create(&dp->dp_dirty_dirs,
  98  169              offsetof(dsl_dir_t, dd_dirty_link));
  99  170          txg_list_create(&dp->dp_sync_tasks,
 100  171              offsetof(dsl_sync_task_t, dst_node));
 101  172  
 102  173          mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
      174 +        cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 103  175  
 104  176          dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 105  177              1, 4, 0);
 106  178  
 107  179          return (dp);
 108  180  }
 109  181  
 110  182  int
 111  183  dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 112  184  {
 113  185          int err;
 114  186          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 115  187  
 116  188          err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 117  189              &dp->dp_meta_objset);
 118  190          if (err != 0)
 119  191                  dsl_pool_close(dp);
 120  192          else
 121  193                  *dpp = dp;
 122  194  
 123  195          return (err);
 124  196  }
 125  197  
 126  198  int
 127  199  dsl_pool_open(dsl_pool_t *dp)
 128  200  {
 129  201          int err;
 130  202          dsl_dir_t *dd;
 131  203          dsl_dataset_t *ds;
 132  204          uint64_t obj;
 133  205  
 134  206          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 135  207          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 136  208              DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 137  209              &dp->dp_root_dir_obj);
 138  210          if (err)
 139  211                  goto out;
 140  212  
 141  213          err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 142  214              NULL, dp, &dp->dp_root_dir);
 143  215          if (err)
 144  216                  goto out;
 145  217  
 146  218          err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 147  219          if (err)
 148  220                  goto out;
 149  221  
 150  222          if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 151  223                  err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 152  224                  if (err)
 153  225                          goto out;
 154  226                  err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 155  227                      FTAG, &ds);
 156  228                  if (err == 0) {
 157  229                          err = dsl_dataset_hold_obj(dp,
 158  230                              ds->ds_phys->ds_prev_snap_obj, dp,
 159  231                              &dp->dp_origin_snap);
 160  232                          dsl_dataset_rele(ds, FTAG);
 161  233                  }
 162  234                  dsl_dir_rele(dd, dp);
 163  235                  if (err)
 164  236                          goto out;
 165  237          }
 166  238  
 167  239          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 168  240                  err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 169  241                      &dp->dp_free_dir);
 170  242                  if (err)
 171  243                          goto out;
 172  244  
 173  245                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 174  246                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 175  247                  if (err)
 176  248                          goto out;
 177  249                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 178  250                      dp->dp_meta_objset, obj));
 179  251          }
 180  252  
 181  253          if (spa_feature_is_active(dp->dp_spa,
 182  254              &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 183  255                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 184  256                      DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 185  257                      &dp->dp_bptree_obj);
 186  258                  if (err != 0)
 187  259                          goto out;
 188  260          }
 189  261  
 190  262          if (spa_feature_is_active(dp->dp_spa,
 191  263              &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
 192  264                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 193  265                      DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 194  266                      &dp->dp_empty_bpobj);
 195  267                  if (err != 0)
 196  268                          goto out;
 197  269          }
 198  270  
 199  271          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 200  272              DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 201  273              &dp->dp_tmp_userrefs_obj);
 202  274          if (err == ENOENT)
 203  275                  err = 0;
 204  276          if (err)
 205  277                  goto out;
 206  278  
 207  279          err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 208  280  
 209  281  out:
 210  282          rrw_exit(&dp->dp_config_rwlock, FTAG);
 211  283          return (err);
 212  284  }
 213  285  
 214  286  void
 215  287  dsl_pool_close(dsl_pool_t *dp)
 216  288  {
 217      -        /* drop our references from dsl_pool_open() */
 218      -
 219  289          /*
      290 +         * Drop our references from dsl_pool_open().
      291 +         *
 220  292           * Since we held the origin_snap from "syncing" context (which
 221  293           * includes pool-opening context), it actually only got a "ref"
 222  294           * and not a hold, so just drop that here.
 223  295           */
 224  296          if (dp->dp_origin_snap)
 225  297                  dsl_dataset_rele(dp->dp_origin_snap, dp);
 226  298          if (dp->dp_mos_dir)
 227  299                  dsl_dir_rele(dp->dp_mos_dir, dp);
 228  300          if (dp->dp_free_dir)
 229  301                  dsl_dir_rele(dp->dp_free_dir, dp);
 230  302          if (dp->dp_root_dir)
 231  303                  dsl_dir_rele(dp->dp_root_dir, dp);
 232  304  
 233  305          bpobj_close(&dp->dp_free_bpobj);
 234  306  
 235  307          /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 236  308          if (dp->dp_meta_objset)
 237  309                  dmu_objset_evict(dp->dp_meta_objset);
 238  310  
 239  311          txg_list_destroy(&dp->dp_dirty_datasets);
 240  312          txg_list_destroy(&dp->dp_dirty_zilogs);
 241  313          txg_list_destroy(&dp->dp_sync_tasks);
 242  314          txg_list_destroy(&dp->dp_dirty_dirs);
 243  315  
 244  316          arc_flush(dp->dp_spa);
 245  317          txg_fini(dp);
 246  318          dsl_scan_fini(dp);
 247  319          rrw_destroy(&dp->dp_config_rwlock);
 248  320          mutex_destroy(&dp->dp_lock);
 249  321          taskq_destroy(dp->dp_vnrele_taskq);
 250  322          if (dp->dp_blkstats)
 251  323                  kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 252  324          kmem_free(dp, sizeof (dsl_pool_t));
 253  325  }
 254  326  
 255  327  dsl_pool_t *
 256  328  dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 257  329  {
 258  330          int err;
 259  331          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 260  332          dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 261  333          objset_t *os;
 262  334          dsl_dataset_t *ds;
 263  335          uint64_t obj;
 264  336  
 265  337          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 266  338  
 267  339          /* create and open the MOS (meta-objset) */
 268  340          dp->dp_meta_objset = dmu_objset_create_impl(spa,
 269  341              NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 270  342  
 271  343          /* create the pool directory */
 272  344          err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 273  345              DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 274  346          ASSERT0(err);
 275  347  
 276  348          /* Initialize scan structures */
 277  349          VERIFY0(dsl_scan_init(dp, txg));
 278  350  
 279  351          /* create and open the root dir */
 280  352          dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 281  353          VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 282  354              NULL, dp, &dp->dp_root_dir));
 283  355  
 284  356          /* create and open the meta-objset dir */
 285  357          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 286  358          VERIFY0(dsl_pool_open_special_dir(dp,
 287  359              MOS_DIR_NAME, &dp->dp_mos_dir));
 288  360  
 289  361          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 290  362                  /* create and open the free dir */
 291  363                  (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 292  364                      FREE_DIR_NAME, tx);
 293  365                  VERIFY0(dsl_pool_open_special_dir(dp,
 294  366                      FREE_DIR_NAME, &dp->dp_free_dir));
 295  367  
 296  368                  /* create and open the free_bplist */
 297  369                  obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
 298  370                  VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 299  371                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 300  372                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 301  373                      dp->dp_meta_objset, obj));
 302  374          }
 303  375  
 304  376          if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 305  377                  dsl_pool_create_origin(dp, tx);
 306  378  
 307  379          /* create the root dataset */
 308  380          obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 309  381  
 310  382          /* create the root objset */
 311  383          VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 312  384          os = dmu_objset_create_impl(dp->dp_spa, ds,
 313  385              dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 314  386  #ifdef _KERNEL
 315  387          zfs_create_fs(os, kcred, zplprops, tx);
 316  388  #endif
 317  389          dsl_dataset_rele(ds, FTAG);
 318  390  
 319  391          dmu_tx_commit(tx);
 320  392  
 321  393          rrw_exit(&dp->dp_config_rwlock, FTAG);
 322  394  
 323  395          return (dp);
 324  396  }
 325  397  
 326  398  /*
 327  399   * Account for the meta-objset space in its placeholder dsl_dir.
 328  400   */
 329  401  void
 330  402  dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 331  403      int64_t used, int64_t comp, int64_t uncomp)
 332  404  {
 333  405          ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 334  406          mutex_enter(&dp->dp_lock);
 335  407          dp->dp_mos_used_delta += used;
 336  408          dp->dp_mos_compressed_delta += comp;
 337  409          dp->dp_mos_uncompressed_delta += uncomp;
 338  410          mutex_exit(&dp->dp_lock);
 339  411  }
 340  412  
 341  413  static int
 342  414  deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 343  415  {
 344  416          dsl_deadlist_t *dl = arg;
 345  417          dsl_deadlist_insert(dl, bp, tx);
 346  418          return (0);
 347  419  }
 348  420  
      421 +static void
      422 +dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
      423 +{
      424 +        zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
      425 +        dmu_objset_sync(dp->dp_meta_objset, zio, tx);
      426 +        VERIFY0(zio_wait(zio));
      427 +        dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
      428 +        spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
      429 +}
      430 +
      431 +static void
      432 +dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
      433 +{
      434 +        ASSERT(MUTEX_HELD(&dp->dp_lock));
      435 +
      436 +        if (delta < 0)
      437 +                ASSERT3U(-delta, <=, dp->dp_dirty_total);
      438 +
      439 +        dp->dp_dirty_total += delta;
      440 +
      441 +        /*
      442 +         * Note: we signal even when increasing dp_dirty_total.
      443 +         * This ensures forward progress -- each thread wakes the next waiter.
      444 +         */
      445 +        if (dp->dp_dirty_total <= zfs_dirty_data_max)
      446 +                cv_signal(&dp->dp_spaceavail_cv);
      447 +}
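dp_spaceavail_cv pairs with a wait loop on the DMU side (in dmu_tx.c, which is not part of this file); presumably that side is shaped roughly like the following, with dsl_pool_dirty_delta() above providing the wakeup as data is synced out:

/*
 * Sketch of the waiter, which lives outside this file: a thread that
 * wants to dirty more data sleeps here until enough dirty data has been
 * synced out that dp_dirty_total drops back under zfs_dirty_data_max.
 */
mutex_enter(&dp->dp_lock);
while (dp->dp_dirty_total >= zfs_dirty_data_max)
	cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
mutex_exit(&dp->dp_lock);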
      448 +
 349  449  void
 350  450  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 351  451  {
 352  452          zio_t *zio;
 353  453          dmu_tx_t *tx;
 354  454          dsl_dir_t *dd;
 355  455          dsl_dataset_t *ds;
 356  456          objset_t *mos = dp->dp_meta_objset;
 357      -        hrtime_t start, write_time;
 358      -        uint64_t data_written;
 359      -        int err;
 360  457          list_t synced_datasets;
 361  458  
 362  459          list_create(&synced_datasets, sizeof (dsl_dataset_t),
 363  460              offsetof(dsl_dataset_t, ds_synced_link));
 364  461  
 365      -        /*
 366      -         * We need to copy dp_space_towrite() before doing
 367      -         * dsl_sync_task_sync(), because
 368      -         * dsl_dataset_snapshot_reserve_space() will increase
 369      -         * dp_space_towrite but not actually write anything.
 370      -         */
 371      -        data_written = dp->dp_space_towrite[txg & TXG_MASK];
 372      -
 373  462          tx = dmu_tx_create_assigned(dp, txg);
 374  463  
 375      -        dp->dp_read_overhead = 0;
 376      -        start = gethrtime();
 377      -
      464 +        /*
      465 +         * Write out all dirty blocks of dirty datasets.
      466 +         */
 378  467          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 379      -        while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
      468 +        while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 380  469                  /*
 381  470                   * We must not sync any non-MOS datasets twice, because
 382  471                   * we may have taken a snapshot of them.  However, we
 383  472                   * may sync newly-created datasets on pass 2.
 384  473                   */
 385  474                  ASSERT(!list_link_active(&ds->ds_synced_link));
 386  475                  list_insert_tail(&synced_datasets, ds);
 387  476                  dsl_dataset_sync(ds, zio, tx);
 388  477          }
 389      -        DTRACE_PROBE(pool_sync__1setup);
 390      -        err = zio_wait(zio);
      478 +        VERIFY0(zio_wait(zio));
 391  479  
 392      -        write_time = gethrtime() - start;
 393      -        ASSERT(err == 0);
 394      -        DTRACE_PROBE(pool_sync__2rootzio);
      480 +        /*
      481 +         * We have written all of the accounted dirty data, so our
      482 +         * dp_space_towrite should now be zero.  However, some seldom-used
      483 +         * code paths do not adhere to this (e.g. dbuf_undirty(), also
      484 +         * rounding error in dbuf_write_physdone).
      485 +         * Shore up the accounting of any dirtied space now.
      486 +         */
      487 +        dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 395  488  
 396  489          /*
 397  490           * After the data blocks have been written (ensured by the zio_wait()
 398  491           * above), update the user/group space accounting.
 399  492           */
 400      -        for (ds = list_head(&synced_datasets); ds;
 401      -            ds = list_next(&synced_datasets, ds))
      493 +        for (ds = list_head(&synced_datasets); ds != NULL;
      494 +            ds = list_next(&synced_datasets, ds)) {
 402  495                  dmu_objset_do_userquota_updates(ds->ds_objset, tx);
      496 +        }
 403  497  
 404  498          /*
 405  499           * Sync the datasets again to push out the changes due to
 406  500           * userspace updates.  This must be done before we process the
 407  501           * sync tasks, so that any snapshots will have the correct
 408  502           * user accounting information (and we won't get confused
 409  503           * about which blocks are part of the snapshot).
 410  504           */
 411  505          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 412      -        while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
      506 +        while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 413  507                  ASSERT(list_link_active(&ds->ds_synced_link));
 414  508                  dmu_buf_rele(ds->ds_dbuf, ds);
 415  509                  dsl_dataset_sync(ds, zio, tx);
 416  510          }
 417      -        err = zio_wait(zio);
      511 +        VERIFY0(zio_wait(zio));
 418  512  
 419  513          /*
 420  514           * Now that the datasets have been completely synced, we can
 421  515           * clean up our in-memory structures accumulated while syncing:
 422  516           *
 423  517           *  - move dead blocks from the pending deadlist to the on-disk deadlist
 424  518           *  - release hold from dsl_dataset_dirty()
 425  519           */
 426      -        while (ds = list_remove_head(&synced_datasets)) {
      520 +        while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 427  521                  objset_t *os = ds->ds_objset;
 428  522                  bplist_iterate(&ds->ds_pending_deadlist,
 429  523                      deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 430  524                  ASSERT(!dmu_objset_is_dirty(os, txg));
 431  525                  dmu_buf_rele(ds->ds_dbuf, ds);
 432  526          }
 433      -
 434      -        start = gethrtime();
 435      -        while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
      527 +        while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 436  528                  dsl_dir_sync(dd, tx);
 437      -        write_time += gethrtime() - start;
      529 +        }
 438  530  
 439  531          /*
 440  532           * The MOS's space is accounted for in the pool/$MOS
 441  533           * (dp_mos_dir).  We can't modify the mos while we're syncing
 442  534           * it, so we remember the deltas and apply them here.
 443  535           */
 444  536          if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 445  537              dp->dp_mos_uncompressed_delta != 0) {
 446  538                  dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 447  539                      dp->dp_mos_used_delta,
 448  540                      dp->dp_mos_compressed_delta,
 449  541                      dp->dp_mos_uncompressed_delta, tx);
 450  542                  dp->dp_mos_used_delta = 0;
 451  543                  dp->dp_mos_compressed_delta = 0;
 452  544                  dp->dp_mos_uncompressed_delta = 0;
 453  545          }
 454  546  
 455      -        start = gethrtime();
 456  547          if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 457  548              list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 458      -                zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 459      -                dmu_objset_sync(mos, zio, tx);
 460      -                err = zio_wait(zio);
 461      -                ASSERT(err == 0);
 462      -                dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 463      -                spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
      549 +                dsl_pool_sync_mos(dp, tx);
 464  550          }
 465      -        write_time += gethrtime() - start;
 466      -        DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
 467      -            hrtime_t, dp->dp_read_overhead);
 468      -        write_time -= dp->dp_read_overhead;
 469  551  
 470  552          /*
 471  553           * If we modify a dataset in the same txg that we want to destroy it,
 472  554           * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 473  555           * dsl_dir_destroy_check() will fail if there are unexpected holds.
 474  556           * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 475  557           * and clearing the hold on it) before we process the sync_tasks.
 476  558           * The MOS data dirtied by the sync_tasks will be synced on the next
 477  559           * pass.
 478  560           */
 479      -        DTRACE_PROBE(pool_sync__3task);
 480  561          if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 481  562                  dsl_sync_task_t *dst;
 482  563                  /*
 483  564                   * No more sync tasks should have been added while we
 484  565                   * were syncing.
 485  566                   */
 486      -                ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 487      -                while (dst = txg_list_remove(&dp->dp_sync_tasks, txg))
      567 +                ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
      568 +                while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 488  569                          dsl_sync_task_sync(dst, tx);
 489  570          }
 490  571  
 491  572          dmu_tx_commit(tx);
 492  573  
 493      -        dp->dp_space_towrite[txg & TXG_MASK] = 0;
 494      -        ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 495      -
 496      -        /*
 497      -         * If the write limit max has not been explicitly set, set it
 498      -         * to a fraction of available physical memory (default 1/8th).
 499      -         * Note that we must inflate the limit because the spa
 500      -         * inflates write sizes to account for data replication.
 501      -         * Check this each sync phase to catch changing memory size.
 502      -         */
 503      -        if (physmem != old_physmem && zfs_write_limit_shift) {
 504      -                mutex_enter(&zfs_write_limit_lock);
 505      -                old_physmem = physmem;
 506      -                zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 507      -                zfs_write_limit_inflated = MAX(zfs_write_limit_min,
 508      -                    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
 509      -                mutex_exit(&zfs_write_limit_lock);
 510      -        }
 511      -
 512      -        /*
 513      -         * Attempt to keep the sync time consistent by adjusting the
 514      -         * amount of write traffic allowed into each transaction group.
 515      -         * Weight the throughput calculation towards the current value:
 516      -         *      thru = 3/4 old_thru + 1/4 new_thru
 517      -         *
 518      -         * Note: write_time is in nanosecs while dp_throughput is expressed in
 519      -         * bytes per millisecond.
 520      -         */
 521      -        ASSERT(zfs_write_limit_min > 0);
 522      -        if (data_written > zfs_write_limit_min / 8 &&
 523      -            write_time > MSEC2NSEC(1)) {
 524      -                uint64_t throughput = data_written / NSEC2MSEC(write_time);
 525      -
 526      -                if (dp->dp_throughput)
 527      -                        dp->dp_throughput = throughput / 4 +
 528      -                            3 * dp->dp_throughput / 4;
 529      -                else
 530      -                        dp->dp_throughput = throughput;
 531      -                dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 532      -                    MAX(zfs_write_limit_min,
 533      -                    dp->dp_throughput * zfs_txg_synctime_ms));
 534      -        }
      574 +        DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 535  575  }
 536  576  
 537  577  void
 538  578  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 539  579  {
 540  580          zilog_t *zilog;
 541      -        dsl_dataset_t *ds;
 542  581  
 543  582          while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
 544      -                ds = dmu_objset_ds(zilog->zl_os);
      583 +                dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 545  584                  zil_clean(zilog, txg);
 546  585                  ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 547  586                  dmu_buf_rele(ds->ds_dbuf, zilog);
 548  587          }
 549  588          ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 550  589  }
 551  590  
 552  591  /*
 553  592   * TRUE if the current thread is the tx_sync_thread or if we
 554  593   * are being called from SPA context during pool initialization.
 555  594   */
 556  595  int
 557  596  dsl_pool_sync_context(dsl_pool_t *dp)
 558  597  {
 559  598          return (curthread == dp->dp_tx.tx_sync_thread ||
 560  599              spa_is_initializing(dp->dp_spa));
 561  600  }
 562  601  
 563  602  uint64_t
 564  603  dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 565  604  {
 566  605          uint64_t space, resv;
 567  606  
 568  607          /*
 569  608           * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 570  609           * efficiency.
 571  610           * XXX The intent log is not accounted for, so it must fit
 572  611           * within this slop.
 573  612           *
 574  613           * If we're trying to assess whether it's OK to do a free,
 575  614           * cut the reservation in half to allow forward progress
 576  615           * (e.g. make it possible to rm(1) files from a full pool).
 577  616           */
 578  617          space = spa_get_dspace(dp->dp_spa);
 579  618          resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 580  619          if (netfree)
 581  620                  resv >>= 1;
 582  621  
 583  622          return (space - resv);
 584  623  }
 585  624  
 586      -int
 587      -dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
      625 +boolean_t
      626 +dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 588  627  {
 589      -        uint64_t reserved = 0;
 590      -        uint64_t write_limit = (zfs_write_limit_override ?
 591      -            zfs_write_limit_override : dp->dp_write_limit);
      628 +        uint64_t delay_min_bytes =
      629 +            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
      630 +        boolean_t rv;
 592  631  
 593      -        if (zfs_no_write_throttle) {
 594      -                atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
 595      -                    space);
 596      -                return (0);
 597      -        }
 598      -
 599      -        /*
 600      -         * Check to see if we have exceeded the maximum allowed IO for
 601      -         * this transaction group.  We can do this without locks since
 602      -         * a little slop here is ok.  Note that we do the reserved check
 603      -         * with only half the requested reserve: this is because the
 604      -         * reserve requests are worst-case, and we really don't want to
 605      -         * throttle based off of worst-case estimates.
 606      -         */
 607      -        if (write_limit > 0) {
 608      -                reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
 609      -                    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
 610      -
 611      -                if (reserved && reserved > write_limit)
 612      -                        return (SET_ERROR(ERESTART));
 613      -        }
 614      -
 615      -        atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
 616      -
 617      -        /*
 618      -         * If this transaction group is over 7/8ths capacity, delay
 619      -         * the caller 1 clock tick.  This will slow down the "fill"
 620      -         * rate until the sync process can catch up with us.
 621      -         */
 622      -        if (reserved && reserved > (write_limit - (write_limit >> 3))) {
 623      -                txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
 624      -                    zfs_throttle_resolution);
 625      -        }
 626      -
 627      -        return (0);
      632 +        mutex_enter(&dp->dp_lock);
      633 +        if (dp->dp_dirty_total > zfs_dirty_data_sync)
      634 +                txg_kick(dp);
      635 +        rv = (dp->dp_dirty_total > delay_min_bytes);
      636 +        mutex_exit(&dp->dp_lock);
      637 +        return (rv);
 628  638  }
 629  639  
 630  640  void
 631      -dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
      641 +dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 632  642  {
 633      -        ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
 634      -        atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
      643 +        if (space > 0) {
      644 +                mutex_enter(&dp->dp_lock);
      645 +                dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
      646 +                dsl_pool_dirty_delta(dp, space);
      647 +                mutex_exit(&dp->dp_lock);
      648 +        }
 635  649  }
 636  650  
 637  651  void
 638      -dsl_pool_memory_pressure(dsl_pool_t *dp)
 639      -{
 640      -        uint64_t space_inuse = 0;
 641      -        int i;
 642      -
 643      -        if (dp->dp_write_limit == zfs_write_limit_min)
      652 +dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) {
      653 +        ASSERT3S(space, >=, 0);
      654 +        if (space == 0)
 644  655                  return;
 645      -
 646      -        for (i = 0; i < TXG_SIZE; i++) {
 647      -                space_inuse += dp->dp_space_towrite[i];
 648      -                space_inuse += dp->dp_tempreserved[i];
      656 +        mutex_enter(&dp->dp_lock);
      657 +        if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
      658 +                /* XXX writing something we didn't dirty? */
      659 +                space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 649  660          }
 650      -        dp->dp_write_limit = MAX(zfs_write_limit_min,
 651      -            MIN(dp->dp_write_limit, space_inuse / 4));
      661 +        ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
      662 +        dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
      663 +        ASSERT3U(dp->dp_dirty_total, >=, space);
      664 +        dsl_pool_dirty_delta(dp, -space);
      665 +        mutex_exit(&dp->dp_lock);
 652  666  }
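Taken together, dsl_pool_dirty_space() and dsl_pool_undirty_space() bracket the life of dirty data: space is charged against the open txg as buffers are modified, and released once the data is on disk. A minimal illustration of the pairing (the caller context and the nbytes/tx variables are hypothetical; real callers include the dbuf dirty path and dsl_pool_sync()/dbuf_write_physdone()):

/* Illustration only: charge dirty space when a buffer is modified... */
dsl_pool_dirty_space(dp, nbytes, tx);
/* ...and release the accounting from syncing context once written. */
dsl_pool_undirty_space(dp, nbytes, tx->tx_txg);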
 653  667  
 654      -void
 655      -dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 656      -{
 657      -        if (space > 0) {
 658      -                mutex_enter(&dp->dp_lock);
 659      -                dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
 660      -                mutex_exit(&dp->dp_lock);
 661      -        }
 662      -}
 663      -
 664  668  /* ARGSUSED */
 665  669  static int
 666  670  upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 667  671  {
 668  672          dmu_tx_t *tx = arg;
 669  673          dsl_dataset_t *ds, *prev = NULL;
 670  674          int err;
 671  675  
 672  676          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 673  677          if (err)
 674  678                  return (err);
 675  679  
 676  680          while (ds->ds_phys->ds_prev_snap_obj != 0) {
 677  681                  err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 678  682                      FTAG, &prev);
 679  683                  if (err) {
 680  684                          dsl_dataset_rele(ds, FTAG);
 681  685                          return (err);
 682  686                  }
 683  687  
 684  688                  if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
 685  689                          break;
 686  690                  dsl_dataset_rele(ds, FTAG);
 687  691                  ds = prev;
 688  692                  prev = NULL;
 689  693          }
 690  694  
 691  695          if (prev == NULL) {
 692  696                  prev = dp->dp_origin_snap;
 693  697  
 694  698                  /*
 695  699                   * The $ORIGIN can't have any data, or the accounting
 696  700                   * will be wrong.
 697  701                   */
 698  702                  ASSERT0(prev->ds_phys->ds_bp.blk_birth);
 699  703  
 700  704                  /* The origin doesn't get attached to itself */
 701  705                  if (ds->ds_object == prev->ds_object) {
 702  706                          dsl_dataset_rele(ds, FTAG);
 703  707                          return (0);
 704  708                  }
 705  709  
 706  710                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
 707  711                  ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
 708  712                  ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
 709  713  
 710  714                  dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 711  715                  ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
 712  716  
 713  717                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 714  718                  prev->ds_phys->ds_num_children++;
 715  719  
 716  720                  if (ds->ds_phys->ds_next_snap_obj == 0) {
 717  721                          ASSERT(ds->ds_prev == NULL);
 718  722                          VERIFY0(dsl_dataset_hold_obj(dp,
 719  723                              ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 720  724                  }
 721  725          }
 722  726  
 723  727          ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
 724  728          ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
 725  729  
 726  730          if (prev->ds_phys->ds_next_clones_obj == 0) {
 727  731                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 728  732                  prev->ds_phys->ds_next_clones_obj =
 729  733                      zap_create(dp->dp_meta_objset,
 730  734                      DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 731  735          }
 732  736          VERIFY0(zap_add_int(dp->dp_meta_objset,
 733  737              prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 734  738  
 735  739          dsl_dataset_rele(ds, FTAG);
 736  740          if (prev != dp->dp_origin_snap)
 737  741                  dsl_dataset_rele(prev, FTAG);
 738  742          return (0);
 739  743  }
 740  744  
 741  745  void
 742  746  dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 743  747  {
 744  748          ASSERT(dmu_tx_is_syncing(tx));
 745  749          ASSERT(dp->dp_origin_snap != NULL);
 746  750  
 747  751          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 748  752              tx, DS_FIND_CHILDREN));
 749  753  }
 750  754  
 751  755  /* ARGSUSED */
 752  756  static int
 753  757  upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 754  758  {
 755  759          dmu_tx_t *tx = arg;
 756  760          objset_t *mos = dp->dp_meta_objset;
 757  761  
 758  762          if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
 759  763                  dsl_dataset_t *origin;
 760  764  
 761  765                  VERIFY0(dsl_dataset_hold_obj(dp,
 762  766                      ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 763  767  
 764  768                  if (origin->ds_dir->dd_phys->dd_clones == 0) {
 765  769                          dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 766  770                          origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
 767  771                              DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 768  772                  }
 769  773  
 770  774                  VERIFY0(zap_add_int(dp->dp_meta_objset,
 771  775                      origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
 772  776  
 773  777                  dsl_dataset_rele(origin, FTAG);
 774  778          }
 775  779          return (0);
 776  780  }
 777  781  
 778  782  void
 779  783  dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 780  784  {
 781  785          ASSERT(dmu_tx_is_syncing(tx));
 782  786          uint64_t obj;
 783  787  
 784  788          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 785  789          VERIFY0(dsl_pool_open_special_dir(dp,
 786  790              FREE_DIR_NAME, &dp->dp_free_dir));
 787  791  
 788  792          /*
 789  793           * We can't use bpobj_alloc(), because spa_version() still
 790  794           * returns the old version, and we need a new-version bpobj with
 791  795           * subobj support.  So call dmu_object_alloc() directly.
 792  796           */
 793  797          obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 794  798              SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 795  799          VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 796  800              DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 797  801          VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 798  802  
 799  803          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 800  804              upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 801  805  }
 802  806  
 803  807  void
 804  808  dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 805  809  {
 806  810          uint64_t dsobj;
 807  811          dsl_dataset_t *ds;
 808  812  
 809  813          ASSERT(dmu_tx_is_syncing(tx));
 810  814          ASSERT(dp->dp_origin_snap == NULL);
 811  815          ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 812  816  
 813  817          /* create the origin dir, ds, & snap-ds */
 814  818          dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 815  819              NULL, 0, kcred, tx);
 816  820          VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 817  821          dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 818  822          VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 819  823              dp, &dp->dp_origin_snap));
 820  824          dsl_dataset_rele(ds, FTAG);
 821  825  }
 822  826  
 823  827  taskq_t *
 824  828  dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 825  829  {
 826  830          return (dp->dp_vnrele_taskq);
 827  831  }
 828  832  
 829  833  /*
 830  834   * Walk through the pool-wide zap object of temporary snapshot user holds
 831  835   * and release them.
 832  836   */
 833  837  void
 834  838  dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 835  839  {
 836  840          zap_attribute_t za;
 837  841          zap_cursor_t zc;
 838  842          objset_t *mos = dp->dp_meta_objset;
 839  843          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 840  844          nvlist_t *holds;
 841  845  
 842  846          if (zapobj == 0)
 843  847                  return;
 844  848          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 845  849  
 846  850          holds = fnvlist_alloc();
 847  851  
 848  852          for (zap_cursor_init(&zc, mos, zapobj);
 849  853              zap_cursor_retrieve(&zc, &za) == 0;
 850  854              zap_cursor_advance(&zc)) {
 851  855                  char *htag;
 852  856                  nvlist_t *tags;
 853  857  
 854  858                  htag = strchr(za.za_name, '-');
 855  859                  *htag = '\0';
 856  860                  ++htag;
 857  861                  if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 858  862                          tags = fnvlist_alloc();
 859  863                          fnvlist_add_boolean(tags, htag);
 860  864                          fnvlist_add_nvlist(holds, za.za_name, tags);
 861  865                          fnvlist_free(tags);
 862  866                  } else {
 863  867                          fnvlist_add_boolean(tags, htag);
 864  868                  }
 865  869          }
 866  870          dsl_dataset_user_release_tmp(dp, holds);
 867  871          fnvlist_free(holds);
 868  872          zap_cursor_fini(&zc);
 869  873  }
 870  874  
 871  875  /*
 872  876   * Create the pool-wide zap object for storing temporary snapshot holds.
 873  877   */
 874  878  void
 875  879  dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 876  880  {
 877  881          objset_t *mos = dp->dp_meta_objset;
 878  882  
 879  883          ASSERT(dp->dp_tmp_userrefs_obj == 0);
 880  884          ASSERT(dmu_tx_is_syncing(tx));
 881  885  
 882  886          dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 883  887              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 884  888  }
 885  889  
 886  890  static int
 887  891  dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 888  892      const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 889  893  {
 890  894          objset_t *mos = dp->dp_meta_objset;
 891  895          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 892  896          char *name;
 893  897          int error;
 894  898  
 895  899          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 896  900          ASSERT(dmu_tx_is_syncing(tx));
 897  901  
 898  902          /*
 899  903           * If the pool was created prior to SPA_VERSION_USERREFS, the
 900  904           * zap object for temporary holds might not exist yet.
 901  905           */
 902  906          if (zapobj == 0) {
 903  907                  if (holding) {
 904  908                          dsl_pool_user_hold_create_obj(dp, tx);
 905  909                          zapobj = dp->dp_tmp_userrefs_obj;
 906  910                  } else {
 907  911                          return (SET_ERROR(ENOENT));
 908  912                  }
 909  913          }
 910  914  
 911  915          name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 912  916          if (holding)
 913  917                  error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 914  918          else
 915  919                  error = zap_remove(mos, zapobj, name, tx);
 916  920          strfree(name);
 917  921  
 918  922          return (error);
 919  923  }
 920  924  
 921  925  /*
 922  926   * Add a temporary hold for the given dataset object and tag.
 923  927   */
 924  928  int
 925  929  dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 926  930      uint64_t now, dmu_tx_t *tx)
 927  931  {
 928  932          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 929  933  }
 930  934  
 931  935  /*
 932  936   * Release a temporary hold for the given dataset object and tag.
 933  937   */
 934  938  int
 935  939  dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 936  940      dmu_tx_t *tx)
 937  941  {
 938  942          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
 939  943              tx, B_FALSE));
 940  944  }
 941  945  
 942  946  /*
 943  947   * DSL Pool Configuration Lock
 944  948   *
 945  949   * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 946  950   * creation / destruction / rename / property setting).  It must be held for
 947  951   * read to hold a dataset or dsl_dir.  I.e. you must call
 948  952   * dsl_pool_config_enter() or dsl_pool_hold() before calling
 949  953   * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 950  954   * must be held continuously until all datasets and dsl_dirs are released.
 951  955   *
 952  956   * The only exception to this rule is that if a "long hold" is placed on
 953  957   * a dataset, then the dp_config_rwlock may be dropped while the dataset
 954  958   * is still held.  The long hold will prevent the dataset from being
 955  959   * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 956  960   * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 957  961   * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 958  962   *
 959  963   * Legitimate long-holders (including owners) should be long-running, cancelable
 960  964   * tasks that should cause "zfs destroy" to fail.  This includes DMU
 961  965   * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 962  966   * "zfs send", and "zfs diff".  There are several other long-holders whose
 963  967   * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 964  968   *
 965  969   * The usual formula for long-holding would be:
 966  970   * dsl_pool_hold()
 967  971   * dsl_dataset_hold()
 968  972   * ... perform checks ...
 969  973   * dsl_dataset_long_hold()
 970  974   * dsl_pool_rele()
 971  975   * ... perform long-running task ...
 972  976   * dsl_dataset_long_rele()
 973  977   * dsl_dataset_rele()
 974  978   *
 975  979   * Note that when the long hold is released, the dataset is still held but
 976  980   * the pool is not held.  The dataset may change arbitrarily during this time
 977  981   * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 978  982   * dataset except release it.
 979  983   *
 980  984   * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 981  985   * or modifying operations.
 982  986   *
 983  987   * Modifying operations should generally use dsl_sync_task().  The synctask
 984  988   * infrastructure enforces proper locking strategy with respect to the
 985  989   * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 986  990   *
 987  991   * Read-only operations will manually hold the pool, then the dataset, obtain
 988  992   * information from the dataset, then release the pool and dataset.
 989  993   * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 990  994   * hold/rele.
 991  995   */
 992  996  
 993  997  int
 994  998  dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 995  999  {
 996 1000          spa_t *spa;
 997 1001          int error;
 998 1002  
 999 1003          error = spa_open(name, &spa, tag);
1000 1004          if (error == 0) {
1001 1005                  *dp = spa_get_dsl(spa);
1002 1006                  dsl_pool_config_enter(*dp, tag);
1003 1007          }
1004 1008          return (error);
1005 1009  }
1006 1010  
1007 1011  void
1008 1012  dsl_pool_rele(dsl_pool_t *dp, void *tag)
1009 1013  {
1010 1014          dsl_pool_config_exit(dp, tag);
1011 1015          spa_close(dp->dp_spa, tag);
1012 1016  }
1013 1017  
1014 1018  void
1015 1019  dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
1016 1020  {
1017 1021          /*
1018 1022           * We use a "reentrant" reader-writer lock, but not reentrantly.
1019 1023           *
1020 1024           * The rrwlock can (with the track_all flag) track all reading threads,
1021 1025           * which is very useful for debugging which code path failed to release
1022 1026           * the lock, and for verifying that the *current* thread does hold
1023 1027           * the lock.
1024 1028           *
1025 1029           * (Unlike a rwlock, which knows that N threads hold it for
1026 1030           * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
1027 1031           * if any thread holds it for read, even if this thread doesn't).
1028 1032           */
1029 1033          ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
1030 1034          rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
1031 1035  }
1032 1036  
1033 1037  void
1034 1038  dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
1035 1039  {
1036 1040          rrw_exit(&dp->dp_config_rwlock, tag);
1037 1041  }
1038 1042  
1039 1043  boolean_t
1040 1044  dsl_pool_config_held(dsl_pool_t *dp)
1041 1045  {
1042 1046          return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
1043 1047  }