illumos-3740-r6 Wdiff usr/src/uts/common/fs/zfs/dsl_pool.c

Print this page

3740 Poor ZFS send / receive performance due to snapshot hold / release processing
Submitted by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dsl_pool.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_pool.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
       24 + * Copyright (c) 2013 Steven Hartland. All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/dsl_pool.h>
  27   28  #include <sys/dsl_dataset.h>
  28   29  #include <sys/dsl_prop.h>
  29   30  #include <sys/dsl_dir.h>
  30   31  #include <sys/dsl_synctask.h>
  31   32  #include <sys/dsl_scan.h>
  32   33  #include <sys/dnode.h>
  33   34  #include <sys/dmu_tx.h>

  34   35  #include <sys/dmu_objset.h>
  35   36  #include <sys/arc.h>
  36   37  #include <sys/zap.h>
  37   38  #include <sys/zio.h>
  38   39  #include <sys/zfs_context.h>
  39   40  #include <sys/fs/zfs.h>
  40   41  #include <sys/zfs_znode.h>
  41   42  #include <sys/spa_impl.h>
  42   43  #include <sys/dsl_deadlist.h>
  43   44  #include <sys/bptree.h>
  44   45  #include <sys/zfeature.h>
  45   46  #include <sys/zil_impl.h>
  46   47  #include <sys/dsl_userhold.h>
  47   48  
  48   49  int zfs_no_write_throttle = 0;  /* nonzero: skip the write throttle */
  49   50  int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  50   51  int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  51   52  
  52   53  uint64_t zfs_write_limit_min = 32 << 20;        /* min write limit is 32MB */
  53   54  uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
           /*
            * zfs_write_limit_max after spa_get_asize() inflation, floored at
            * zfs_write_limit_min; recomputed in dsl_pool_sync().
            */
  54   55  uint64_t zfs_write_limit_inflated = 0;
           /* if nonzero, used in place of dp_write_limit when reserving space */
  55   56  uint64_t zfs_write_limit_override = 0;
  56   57  
           /* held in dsl_pool_sync() while the max/inflated limits are recomputed */
  57   58  kmutex_t zfs_write_limit_lock;
  58   59  
           /* physmem value last used by dsl_pool_sync() to derive the limits */
  59   60  static pgcnt_t old_physmem = 0;
  60   61  
           /* both are handed to txg_delay() by dsl_pool_tempreserve_space() */
  61   62  hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
  62   63  hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
  63   64  
  64   65  int
  65   66  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  66   67  {
                   /*
                    * Look up the entry called `name` in the root dsl_dir's
                    * child-directory ZAP (so dp->dp_root_dir must already be held)
                    * and hold the directory object it names, using `dp` as the tag.
                    *
                    * Returns the zap_lookup() error if the lookup fails, without
                    * writing *ddp here; otherwise returns whatever
                    * dsl_dir_hold_obj() returns, with the result passed back
                    * through ddp.
                    */
  67   68          uint64_t obj;
  68   69          int err;
  69   70  
  70   71          err = zap_lookup(dp->dp_meta_objset,
  71   72              dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
  72   73              name, sizeof (obj), 1, &obj);
  73   74          if (err)
  74   75                  return (err);
  75   76  
  76   77          return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
  77   78  }
  78   79  
  79   80  static dsl_pool_t *
  80   81  dsl_pool_open_impl(spa_t *spa, uint64_t txg)
  81   82  {
                   /*
                    * Allocate and set up the in-core dsl_pool_t shared by
                    * dsl_pool_init() and dsl_pool_create(): record the spa and a
                    * copy of its root block pointer, initialise the locks, txg
                    * state, per-txg dirty lists and the vnode-release taskq.
                    * Nothing is looked up from the MOS here.  Returns the new pool;
                    * dsl_pool_close() tears it down again.
                    */
  82   83          dsl_pool_t *dp;
  83   84          blkptr_t *bp = spa_get_rootblkptr(spa);
  84   85  
                   /*
                    * kmem_zalloc() hands back zeroed memory, so every field not set
                    * below starts out 0/NULL; dsl_pool_close() tests several of
                    * those pointers before releasing them.
                    */
  85   86          dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
  86   87          dp->dp_spa = spa;
  87   88          dp->dp_meta_rootbp = *bp;
  88   89          rrw_init(&dp->dp_config_rwlock, B_TRUE);
                   /* start at the minimum; dsl_pool_sync() adjusts it later */
  89   90          dp->dp_write_limit = zfs_write_limit_min;
  90   91          txg_init(dp, txg);
  91   92  
                   /* one per-txg list for each kind of object that can be dirtied */
  92   93          txg_list_create(&dp->dp_dirty_datasets,
  93   94              offsetof(dsl_dataset_t, ds_dirty_link));
  94   95          txg_list_create(&dp->dp_dirty_zilogs,
  95   96              offsetof(zilog_t, zl_dirty_link));
  96   97          txg_list_create(&dp->dp_dirty_dirs,
  97   98              offsetof(dsl_dir_t, dd_dirty_link));
  98   99          txg_list_create(&dp->dp_sync_tasks,
  99  100              offsetof(dsl_sync_task_t, dst_node));
 100  101  
 101  102          mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 102  103  
 103  104          dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 104  105              1, 4, 0);
 105  106  
 106  107          return (dp);
 107  108  }
 108  109  
 109  110  int
 110  111  dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 111  112  {
                   /*
                    * Build the in-core pool for `spa` and open its meta-objset from
                    * the root block pointer copied by dsl_pool_open_impl().
                    *
                    * On success the new pool is stored in *dpp and 0 is returned.
                    * On failure the partially built pool is destroyed with
                    * dsl_pool_close(), *dpp is left untouched, and the error from
                    * dmu_objset_open_impl() is returned.
                    */
 112  113          int err;
 113  114          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 114  115  
 115  116          err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 116  117              &dp->dp_meta_objset);
 117  118          if (err != 0)
 118  119                  dsl_pool_close(dp);
 119  120          else
 120  121                  *dpp = dp;
 121  122  
 122  123          return (err);
 123  124  }
 124  125  
 125  126  int
 126  127  dsl_pool_open(dsl_pool_t *dp)
 127  128  {
                   /*
                    * Fill in dp from the on-disk pool directory: hold the root, MOS,
                    * origin and free directories/datasets that apply to this pool's
                    * version, look up the object numbers that its active features
                    * need, and finally initialise scan state.  The whole sequence
                    * runs with dp_config_rwlock held as writer.
                    *
                    * Returns 0 on success or the first error encountered.  Holds
                    * already stored in dp are not undone here on error;
                    * dsl_pool_close() NULL-checks each of them before releasing.
                    */
 128  129          int err;
 129  130          dsl_dir_t *dd;
 130  131          dsl_dataset_t *ds;
 131  132          uint64_t obj;
 132  133  
 133  134          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 134  135          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 135  136              DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 136  137              &dp->dp_root_dir_obj);
 137  138          if (err)
 138  139                  goto out;
 139  140  
 140  141          err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 141  142              NULL, dp, &dp->dp_root_dir);
 142  143          if (err)
 143  144                  goto out;
 144  145  
 145  146          err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 146  147          if (err)
 147  148                  goto out;
 148  149  
 149  150          if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
                           /*
                            * dd and ds are only stepping stones to the origin
                            * snapshot (the head dataset's previous snapshot); both
                            * are released again whether or not the final hold
                            * succeeded.  Only dp_origin_snap keeps a hold, tagged
                            * with dp.
                            */
 150  151                  err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 151  152                  if (err)
 152  153                          goto out;
 153  154                  err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 154  155                      FTAG, &ds);
 155  156                  if (err == 0) {
 156  157                          err = dsl_dataset_hold_obj(dp,
 157  158                              ds->ds_phys->ds_prev_snap_obj, dp,
 158  159                              &dp->dp_origin_snap);
 159  160                          dsl_dataset_rele(ds, FTAG);
 160  161                  }
 161  162                  dsl_dir_rele(dd, dp);
 162  163                  if (err)
 163  164                          goto out;
 164  165          }
 165  166  
 166  167          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 167  168                  err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 168  169                      &dp->dp_free_dir);
 169  170                  if (err)
 170  171                          goto out;
 171  172  
 172  173                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 173  174                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 174  175                  if (err)
 175  176                          goto out;
                           /* unlike the lookups, a bpobj_open() failure is fatal */
 176  177                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 177  178                      dp->dp_meta_objset, obj));
 178  179          }
 179  180  
 180  181          if (spa_feature_is_active(dp->dp_spa,
 181  182              &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 182  183                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 183  184                      DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 184  185                      &dp->dp_bptree_obj);
 185  186                  if (err != 0)
 186  187                          goto out;
 187  188          }
 188  189  
 189  190          if (spa_feature_is_active(dp->dp_spa,
 190  191              &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
 191  192                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 192  193                      DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 193  194                      &dp->dp_empty_bpobj);
 194  195                  if (err != 0)
 195  196                          goto out;
 196  197          }
 197  198  
                   /*
                    * The temporary-userrefs entry is optional: a missing entry
                    * (ENOENT) is not treated as an error, and dp_tmp_userrefs_obj
                    * is then left as it was.
                    */
 198  199          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 199  200              DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 200  201              &dp->dp_tmp_userrefs_obj);
 201  202          if (err == ENOENT)
 202  203                  err = 0;
 203  204          if (err)
 204  205                  goto out;
 205  206  
 206  207          err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 207  208  
           /* single exit point so the config lock is dropped on every path */
 208  209  out:
 209  210          rrw_exit(&dp->dp_config_rwlock, FTAG);
 210  211          return (err);
 211  212  }
 212  213  
 213  214  void
 214  215  dsl_pool_close(dsl_pool_t *dp)
 215  216  {
                   /*
                    * Tear down a pool built by dsl_pool_open_impl() and free it.
                    * This is also the error path of dsl_pool_init(), so it must
                    * cope with a pool that was never fully opened: the holds and
                    * the meta-objset are only released when their pointers are
                    * non-NULL.  dp itself is freed at the end, so callers must not
                    * touch it afterwards.
                    */

 216  217          /* drop our references from dsl_pool_open() (or dsl_pool_create()) */
 217  218  
 218  219          /*
 219  220           * Since we held the origin_snap from "syncing" context (which
 220  221           * includes pool-opening context), it actually only got a "ref"
 221  222           * and not a hold, so just drop that here.
 222  223           */
 223  224          if (dp->dp_origin_snap)
 224  225                  dsl_dataset_rele(dp->dp_origin_snap, dp);
 225  226          if (dp->dp_mos_dir)
 226  227                  dsl_dir_rele(dp->dp_mos_dir, dp);
 227  228          if (dp->dp_free_dir)
 228  229                  dsl_dir_rele(dp->dp_free_dir, dp);
 229  230          if (dp->dp_root_dir)
 230  231                  dsl_dir_rele(dp->dp_root_dir, dp);
 231  232  
 232  233          bpobj_close(&dp->dp_free_bpobj);
 233  234  
 234  235          /*
                    * undo the dmu_objset_open_impl(mos) from dsl_pool_init(), or
                    * the dmu_objset_create_impl(mos) from dsl_pool_create()
                    */
 235  236          if (dp->dp_meta_objset)
 236  237                  dmu_objset_evict(dp->dp_meta_objset);
 237  238  
 238  239          txg_list_destroy(&dp->dp_dirty_datasets);
 239  240          txg_list_destroy(&dp->dp_dirty_zilogs);
 240  241          txg_list_destroy(&dp->dp_sync_tasks);
 241  242          txg_list_destroy(&dp->dp_dirty_dirs);
 242  243  
 243  244          arc_flush(dp->dp_spa);
 244  245          txg_fini(dp);
 245  246          dsl_scan_fini(dp);
 246  247          rrw_destroy(&dp->dp_config_rwlock);
 247  248          mutex_destroy(&dp->dp_lock);
 248  249          taskq_destroy(dp->dp_vnrele_taskq);
 249  250          if (dp->dp_blkstats)
 250  251                  kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 251  252          kmem_free(dp, sizeof (dsl_pool_t));
 252  253  }
 253  254  
 254  255  dsl_pool_t *
 255  256  dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 256  257  {
                   /*
                    * Create the on-disk DSL structures for a brand-new pool in
                    * transaction group `txg` and return the matching in-core pool:
                    * the MOS and its pool directory, scan state, the root dir, the
                    * $MOS dir, (version permitting) the free dir with its bpobj and
                    * the origin, and finally the root dataset with its objset.
                    *
                    * All of the work is done in a single assigned tx under the
                    * config lock held as writer.  Failures of the individual steps
                    * are asserted/verified rather than returned, so this function
                    * has no error return.
                    */
 257  258          int err;
 258  259          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 259  260          dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 260  261          objset_t *os;
 261  262          dsl_dataset_t *ds;
 262  263          uint64_t obj;
 263  264  
 264  265          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 265  266  
 266  267          /* create and open the MOS (meta-objset) */
 267  268          dp->dp_meta_objset = dmu_objset_create_impl(spa,
 268  269              NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 269  270  
 270  271          /* create the pool directory */
 271  272          err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 272  273              DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 273  274          ASSERT0(err);
 274  275  
 275  276          /* Initialize scan structures */
 276  277          VERIFY0(dsl_scan_init(dp, txg));
 277  278  
 278  279          /* create and open the root dir */
 279  280          dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 280  281          VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 281  282              NULL, dp, &dp->dp_root_dir));
 282  283  
 283  284          /* create and open the meta-objset dir */
 284  285          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 285  286          VERIFY0(dsl_pool_open_special_dir(dp,
 286  287              MOS_DIR_NAME, &dp->dp_mos_dir));
 287  288  
 288  289          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 289  290                  /* create and open the free dir */
 290  291                  (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 291  292                      FREE_DIR_NAME, tx);
 292  293                  VERIFY0(dsl_pool_open_special_dir(dp,
 293  294                      FREE_DIR_NAME, &dp->dp_free_dir));
 294  295  
 295  296                  /* create and open the free_bplist */
 296  297                  obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
 297  298                  VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 298  299                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 299  300                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 300  301                      dp->dp_meta_objset, obj));
 301  302          }
 302  303  
 303  304          if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 304  305                  dsl_pool_create_origin(dp, tx);
 305  306  
 306  307          /* create the root dataset */
 307  308          obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 308  309  
 309  310          /* create the root objset */
 310  311          VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 311  312          os = dmu_objset_create_impl(dp->dp_spa, ds,
 312  313              dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
           /* the ZPL filesystem is only populated in kernel builds */
 313  314  #ifdef _KERNEL
 314  315          zfs_create_fs(os, kcred, zplprops, tx);
 315  316  #endif
 316  317          dsl_dataset_rele(ds, FTAG);
 317  318  
 318  319          dmu_tx_commit(tx);
 319  320  
 320  321          rrw_exit(&dp->dp_config_rwlock, FTAG);
 321  322  
 322  323          return (dp);
 323  324  }
 324  325  
 325  326  /*
 326  327   * Account for the meta-objset space in its placeholder dsl_dir.
            *
            * The three deltas are only accumulated here, under dp_lock; they are
            * applied to dp_mos_dir and reset to zero by dsl_pool_sync().  `comp`
            * and `uncomp` are asserted to be equal.
 327  328   */
 328  329  void
 329  330  dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 330  331      int64_t used, int64_t comp, int64_t uncomp)
 331  332  {
 332  333          ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 333  334          mutex_enter(&dp->dp_lock);
 334  335          dp->dp_mos_used_delta += used;
 335  336          dp->dp_mos_compressed_delta += comp;
 336  337          dp->dp_mos_uncompressed_delta += uncomp;
 337  338          mutex_exit(&dp->dp_lock);
 338  339  }
 339  340  
 340  341  static int
 341  342  deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 342  343  {
                   /*
                    * bplist_iterate() callback used by dsl_pool_sync(): `arg` is the
                    * dataset's dsl_deadlist_t, and each block pointer visited is
                    * inserted into it.  Always returns 0.
                    */
 343  344          dsl_deadlist_t *dl = arg;
 344  345          dsl_deadlist_insert(dl, bp, tx);
 345  346          return (0);
 346  347  }
 347  348  
 348  349  void
 349  350  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 350  351  {
                   /*
                    * Run one sync pass for transaction group `txg`.  In order:
                    *
                    *  1. sync every dirty dataset and wait for the writes;
                    *  2. update user/group accounting, then sync the datasets that
                    *     were dirtied again by that update;
                    *  3. move pending dead blocks to the on-disk deadlists and drop
                    *     the per-dataset dbuf holds;
                    *  4. sync dirty dsl_dirs and apply the accumulated MOS space
                    *     deltas;
                    *  5. sync the MOS itself if it has dirty or freed dnodes;
                    *  6. run any queued sync tasks;
                    *  7. recompute the global write limits and this pool's
                    *     throughput-based write limit.
                    *
                    * The order of these steps is significant; see the comments on
                    * the individual steps below.
                    */
 351  352          zio_t *zio;
 352  353          dmu_tx_t *tx;
 353  354          dsl_dir_t *dd;
 354  355          dsl_dataset_t *ds;
 355  356          objset_t *mos = dp->dp_meta_objset;
 356  357          hrtime_t start, write_time;
 357  358          uint64_t data_written;
 358  359          int err;
 359  360          list_t synced_datasets;
 360  361  
 361  362          list_create(&synced_datasets, sizeof (dsl_dataset_t),
 362  363              offsetof(dsl_dataset_t, ds_synced_link));
 363  364  
 364  365          /*
 365  366           * We need to copy dp_space_towrite[] before doing
 366  367           * dsl_sync_task_sync(), because
 367  368           * dsl_dataset_snapshot_reserve_space() will increase
 368  369           * dp_space_towrite but not actually write anything.
 369  370           */
 370  371          data_written = dp->dp_space_towrite[txg & TXG_MASK];
 371  372  
 372  373          tx = dmu_tx_create_assigned(dp, txg);
 373  374  
 374  375          dp->dp_read_overhead = 0;
 375  376          start = gethrtime();
 376  377  
 377  378          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 378  379          while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 379  380                  /*
 380  381                   * We must not sync any non-MOS datasets twice, because
 381  382                   * we may have taken a snapshot of them.  However, we
 382  383                   * may sync newly-created datasets on pass 2.
 383  384                   */
 384  385                  ASSERT(!list_link_active(&ds->ds_synced_link));
 385  386                  list_insert_tail(&synced_datasets, ds);
 386  387                  dsl_dataset_sync(ds, zio, tx);
 387  388          }
 388  389          DTRACE_PROBE(pool_sync__1setup);
 389  390          err = zio_wait(zio);
 390  391  
 391  392          write_time = gethrtime() - start;
 392  393          ASSERT(err == 0);
 393  394          DTRACE_PROBE(pool_sync__2rootzio);
 394  395  
 395  396          /*
 396  397           * After the data blocks have been written (ensured by the zio_wait()
 397  398           * above), update the user/group space accounting.
 398  399           */
 399  400          for (ds = list_head(&synced_datasets); ds;
 400  401              ds = list_next(&synced_datasets, ds))
 401  402                  dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 402  403  
 403  404          /*
 404  405           * Sync the datasets again to push out the changes due to
 405  406           * userspace updates.  This must be done before we process the
 406  407           * sync tasks, so that any snapshots will have the correct
 407  408           * user accounting information (and we won't get confused
 408  409           * about which blocks are part of the snapshot).
 409  410           */
 410  411          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 411  412          while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
                           /*
                            * Every dataset here is already on synced_datasets, whose
                            * entry keeps one dbuf hold until the cleanup loop below,
                            * so the extra hold from being dirtied again is dropped
                            * right away.
                            */
 412  413                  ASSERT(list_link_active(&ds->ds_synced_link));
 413  414                  dmu_buf_rele(ds->ds_dbuf, ds);
 414  415                  dsl_dataset_sync(ds, zio, tx);
 415  416          }
                   /*
                    * NOTE(review): unlike the other two zio_wait() calls in this
                    * function, the result stored in err here is never asserted.
                    */
 416  417          err = zio_wait(zio);
 417  418  
 418  419          /*
 419  420           * Now that the datasets have been completely synced, we can
 420  421           * clean up our in-memory structures accumulated while syncing:
 421  422           *
 422  423           *  - move dead blocks from the pending deadlist to the on-disk deadlist
 423  424           *  - release hold from dsl_dataset_dirty()
 424  425           */
 425  426          while (ds = list_remove_head(&synced_datasets)) {
 426  427                  objset_t *os = ds->ds_objset;
 427  428                  bplist_iterate(&ds->ds_pending_deadlist,
 428  429                      deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 429  430                  ASSERT(!dmu_objset_is_dirty(os, txg));
 430  431                  dmu_buf_rele(ds->ds_dbuf, ds);
 431  432          }
 432  433  
                   /* time spent syncing dirty dirs also counts towards write_time */
 433  434          start = gethrtime();
 434  435          while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
 435  436                  dsl_dir_sync(dd, tx);
 436  437          write_time += gethrtime() - start;
 437  438  
 438  439          /*
 439  440           * The MOS's space is accounted for in the pool/$MOS
 440  441           * (dp_mos_dir).  We can't modify the mos while we're syncing
 441  442           * it, so we remember the deltas and apply them here.
 442  443           */
 443  444          if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 444  445              dp->dp_mos_uncompressed_delta != 0) {
 445  446                  dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 446  447                      dp->dp_mos_used_delta,
 447  448                      dp->dp_mos_compressed_delta,
 448  449                      dp->dp_mos_uncompressed_delta, tx);
 449  450                  dp->dp_mos_used_delta = 0;
 450  451                  dp->dp_mos_compressed_delta = 0;
 451  452                  dp->dp_mos_uncompressed_delta = 0;
 452  453          }
 453  454  
                   /* only sync the MOS if this txg dirtied or freed any of its dnodes */
 454  455          start = gethrtime();
 455  456          if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 456  457              list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 457  458                  zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 458  459                  dmu_objset_sync(mos, zio, tx);
 459  460                  err = zio_wait(zio);
 460  461                  ASSERT(err == 0);
 461  462                  dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 462  463                  spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 463  464          }
 464  465          write_time += gethrtime() - start;
 465  466          DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
 466  467              hrtime_t, dp->dp_read_overhead);
 467  468          write_time -= dp->dp_read_overhead;
 468  469  
 469  470          /*
 470  471           * If we modify a dataset in the same txg that we want to destroy it,
 471  472           * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 472  473           * dsl_dir_destroy_check() will fail if there are unexpected holds.
 473  474           * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 474  475           * and clearing the hold on it) before we process the sync_tasks.
 475  476           * The MOS data dirtied by the sync_tasks will be synced on the next
 476  477           * pass.
 477  478           */
 478  479          DTRACE_PROBE(pool_sync__3task);
 479  480          if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 480  481                  dsl_sync_task_t *dst;
 481  482                  /*
 482  483                   * No more sync tasks should have been added while we
 483  484                   * were syncing.
 484  485                   */
 485  486                  ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 486  487                  while (dst = txg_list_remove(&dp->dp_sync_tasks, txg))
 487  488                          dsl_sync_task_sync(dst, tx);
 488  489          }
 489  490  
 490  491          dmu_tx_commit(tx);
 491  492  
 492  493          dp->dp_space_towrite[txg & TXG_MASK] = 0;
 493  494          ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 494  495  
 495  496          /*
 496  497           * If the write limit max has not been explicitly set, set it
 497  498           * to a fraction of available physical memory (default 1/8th).
 498  499           * Note that we must inflate the limit because the spa
 499  500           * inflates write sizes to account for data replication.
 500  501           * Check this each sync phase to catch changing memory size.
 501  502           */
 502  503          if (physmem != old_physmem && zfs_write_limit_shift) {
 503  504                  mutex_enter(&zfs_write_limit_lock);
 504  505                  old_physmem = physmem;
 505  506                  zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 506  507                  zfs_write_limit_inflated = MAX(zfs_write_limit_min,
 507  508                      spa_get_asize(dp->dp_spa, zfs_write_limit_max));
 508  509                  mutex_exit(&zfs_write_limit_lock);
 509  510          }
 510  511  
 511  512          /*
 512  513           * Attempt to keep the sync time consistent by adjusting the
 513  514           * amount of write traffic allowed into each transaction group.
 514  515           * Weight the throughput calculation towards the current value:
 515  516           *      thru = 3/4 old_thru + 1/4 new_thru
 516  517           *
 517  518           * Note: write_time is in nanosecs while dp_throughput is expressed in
 518  519           * bytes per millisecond.
 519  520           */
 520  521          ASSERT(zfs_write_limit_min > 0);
                   /*
                    * Skip the update for small or very fast syncs.  The
                    * write_time > 1ms test also presumably keeps the divisor
                    * NSEC2MSEC(write_time) below from being zero.
                    */
 521  522          if (data_written > zfs_write_limit_min / 8 &&
 522  523              write_time > MSEC2NSEC(1)) {
 523  524                  uint64_t throughput = data_written / NSEC2MSEC(write_time);
 524  525  
 525  526                  if (dp->dp_throughput)
 526  527                          dp->dp_throughput = throughput / 4 +
 527  528                              3 * dp->dp_throughput / 4;
 528  529                  else
 529  530                          dp->dp_throughput = throughput;
                           /* clamp to [zfs_write_limit_min, zfs_write_limit_inflated] */
 530  531                  dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 531  532                      MAX(zfs_write_limit_min,
 532  533                      dp->dp_throughput * zfs_txg_synctime_ms));
 533  534          }
 534  535  }
 535  536  
 536  537  void
 537  538  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 538  539  {
                   /*
                    * Post-sync cleanup for `txg`: drain the dirty-zilog list, clean
                    * each zilog for this txg and drop the dataset dbuf hold that was
                    * tagged with the zilog.  By the end neither those objsets nor
                    * the MOS may still be dirty in this txg (asserted).
                    */
 539  540          zilog_t *zilog;
 540  541          dsl_dataset_t *ds;
 541  542  
 542  543          while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
 543  544                  ds = dmu_objset_ds(zilog->zl_os);
 544  545                  zil_clean(zilog, txg);
 545  546                  ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 546  547                  dmu_buf_rele(ds->ds_dbuf, zilog);
 547  548          }
 548  549          ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 549  550  }
 550  551  
 551  552  /*
 552  553   * TRUE if the current thread is the tx_sync_thread or if we
 553  554   * are being called from SPA context during pool initialization.
            *
            * The result is an int used as a boolean: nonzero for TRUE, 0 otherwise.
 554  555   */
 555  556  int
 556  557  dsl_pool_sync_context(dsl_pool_t *dp)
 557  558  {
 558  559          return (curthread == dp->dp_tx.tx_sync_thread ||
 559  560              spa_is_initializing(dp->dp_spa));
 560  561  }
 561  562  
 562  563  uint64_t
 563  564  dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 564  565  {
                   /*
                    * Return the pool's dspace (from spa_get_dspace()) minus a slop
                    * reservation.  With `netfree` set, only half of the reservation
                    * is withheld.
                    */
 565  566          uint64_t space, resv;
 566  567  
 567  568          /*
 568  569           * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 569  570           * efficiency.
 570  571           * XXX The intent log is not accounted for, so it must fit
 571  572           * within this slop.
 572  573           *
 573  574           * If we're trying to assess whether it's OK to do a free,
 574  575           * cut the reservation in half to allow forward progress
 575  576           * (e.g. make it possible to rm(1) files from a full pool).
 576  577           */
 577  578          space = spa_get_dspace(dp->dp_spa);
 578  579          resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 579  580          if (netfree)
 580  581                  resv >>= 1;
 581  582  
                   /*
                    * NOTE(review): both operands are unsigned, so this would wrap to
                    * a huge value if space were ever smaller than resv; presumably
                    * no real pool is that small -- confirm.
                    */
 582  583          return (space - resv);
 583  584  }
 584  585  
 585  586  int
 586  587  dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
 587  588  {
                   /*
                    * Temporarily reserve `space` bytes against tx's transaction
                    * group, throttling the caller when the group is getting full.
                    *
                    * Returns 0 once the reservation has been added to
                    * dp_tempreserved, or ERESTART (nothing reserved) if the group
                    * is already over its write limit.  The limit is
                    * zfs_write_limit_override when that is nonzero, otherwise
                    * dp_write_limit.
                    */
 588  589          uint64_t reserved = 0;
 589  590          uint64_t write_limit = (zfs_write_limit_override ?
 590  591              zfs_write_limit_override : dp->dp_write_limit);
 591  592  
                   /* throttle disabled: just record the reservation */
 592  593          if (zfs_no_write_throttle) {
 593  594                  atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
 594  595                      space);
 595  596                  return (0);
 596  597          }
 597  598  
 598  599          /*
 599  600           * Check to see if we have exceeded the maximum allowed IO for
 600  601           * this transaction group.  We can do this without locks since
 601  602           * a little slop here is ok.  Note that we do the reserved check
 602  603           * with only half the requested reserve: this is because the
 603  604           * reserve requests are worst-case, and we really don't want to
 604  605           * throttle based off of worst-case estimates.
 605  606           */
 606  607          if (write_limit > 0) {
 607  608                  reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
 608  609                      + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
 609  610  
 610  611                  if (reserved && reserved > write_limit)
 611  612                          return (SET_ERROR(ERESTART));
 612  613          }
 613  614  
 614  615          atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
 615  616  
 616  617          /*
 617  618           * If this transaction group is over 7/8ths capacity, delay
 618  619           * the caller 1 clock tick.  This will slow down the "fill"
 619  620           * rate until the sync process can catch up with us.
 620  621           */
                   /*
                    * `reserved` was sampled before this request was added, and is
                    * still 0 when write_limit is 0, in which case no delay happens.
                    */
 621  622          if (reserved && reserved > (write_limit - (write_limit >> 3))) {
 622  623                  txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
 623  624                      zfs_throttle_resolution);
 624  625          }
 625  626  
 626  627          return (0);
 627  628  }
 628  629  
 629  630  void
 630  631  dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 631  632  {
                   /*
                    * Give back `space` bytes previously added to tx's transaction
                    * group by dsl_pool_tempreserve_space().  The counter is asserted
                    * to hold at least that much before it is atomically reduced.
                    */
 632  633          ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
 633  634          atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
 634  635  }
 635  636  
 636  637  void
 637  638  dsl_pool_memory_pressure(dsl_pool_t *dp)
 638  639  {
                   /*
                    * Lower this pool's write limit to a quarter of the space
                    * currently in use (to-write plus temporarily reserved, summed
                    * over all txg slots).  The limit is never raised by this
                    * function and never drops below zfs_write_limit_min; if it is
                    * already at the minimum there is nothing to do.  No lock is
                    * taken in this function.
                    */
 639  640          uint64_t space_inuse = 0;
 640  641          int i;
 641  642  
 642  643          if (dp->dp_write_limit == zfs_write_limit_min)
 643  644                  return;
 644  645  
 645  646          for (i = 0; i < TXG_SIZE; i++) {
 646  647                  space_inuse += dp->dp_space_towrite[i];
 647  648                  space_inuse += dp->dp_tempreserved[i];
 648  649          }
 649  650          dp->dp_write_limit = MAX(zfs_write_limit_min,
 650  651              MIN(dp->dp_write_limit, space_inuse / 4));
 651  652  }
 652  653  
 653  654  void
 654  655  dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 655  656  {
                   /*
                    * Add `space` bytes to the to-write total of tx's transaction
                    * group, under dp_lock.  Zero and negative amounts are ignored;
                    * the total is reset to 0 at the end of dsl_pool_sync().
                    */
 656  657          if (space > 0) {
 657  658                  mutex_enter(&dp->dp_lock);
 658  659                  dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
 659  660                  mutex_exit(&dp->dp_lock);
 660  661          }
 661  662  }
 662  663  
 663  664  /* ARGSUSED */
 664  665  static int
 665  666  upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 666  667  {
 667  668          dmu_tx_t *tx = arg;
 668  669          dsl_dataset_t *ds, *prev = NULL;
 669  670          int err;
 670  671  
 671  672          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 672  673          if (err)
 673  674                  return (err);
 674  675  
 675  676          while (ds->ds_phys->ds_prev_snap_obj != 0) {
 676  677                  err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 677  678                      FTAG, &prev);
 678  679                  if (err) {
 679  680                          dsl_dataset_rele(ds, FTAG);
 680  681                          return (err);
 681  682                  }
 682  683  
 683  684                  if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
 684  685                          break;
 685  686                  dsl_dataset_rele(ds, FTAG);
 686  687                  ds = prev;
 687  688                  prev = NULL;
 688  689          }
 689  690  
 690  691          if (prev == NULL) {
 691  692                  prev = dp->dp_origin_snap;
 692  693  
 693  694                  /*
 694  695                   * The $ORIGIN can't have any data, or the accounting
 695  696                   * will be wrong.
 696  697                   */
 697  698                  ASSERT0(prev->ds_phys->ds_bp.blk_birth);
 698  699  
 699  700                  /* The origin doesn't get attached to itself */
 700  701                  if (ds->ds_object == prev->ds_object) {
 701  702                          dsl_dataset_rele(ds, FTAG);
 702  703                          return (0);
 703  704                  }
 704  705  
 705  706                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
 706  707                  ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
 707  708                  ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
 708  709  
 709  710                  dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 710  711                  ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
 711  712  
 712  713                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 713  714                  prev->ds_phys->ds_num_children++;
 714  715  
 715  716                  if (ds->ds_phys->ds_next_snap_obj == 0) {
 716  717                          ASSERT(ds->ds_prev == NULL);
 717  718                          VERIFY0(dsl_dataset_hold_obj(dp,
 718  719                              ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 719  720                  }
 720  721          }
 721  722  
 722  723          ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
 723  724          ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
 724  725  
 725  726          if (prev->ds_phys->ds_next_clones_obj == 0) {
 726  727                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 727  728                  prev->ds_phys->ds_next_clones_obj =
 728  729                      zap_create(dp->dp_meta_objset,
 729  730                      DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 730  731          }
 731  732          VERIFY0(zap_add_int(dp->dp_meta_objset,
 732  733              prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 733  734  
 734  735          dsl_dataset_rele(ds, FTAG);
 735  736          if (prev != dp->dp_origin_snap)
 736  737                  dsl_dataset_rele(prev, FTAG);
 737  738          return (0);
 738  739  }
 739  740  
 740  741  void
 741  742  dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 742  743  {
 743  744          ASSERT(dmu_tx_is_syncing(tx));
 744  745          ASSERT(dp->dp_origin_snap != NULL);
 745  746  
 746  747          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 747  748              tx, DS_FIND_CHILDREN));
 748  749  }
 749  750  
 750  751  /* ARGSUSED */
 751  752  static int
 752  753  upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 753  754  {
 754  755          dmu_tx_t *tx = arg;
 755  756          objset_t *mos = dp->dp_meta_objset;
 756  757  
 757  758          if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
 758  759                  dsl_dataset_t *origin;
 759  760  
 760  761                  VERIFY0(dsl_dataset_hold_obj(dp,
 761  762                      ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 762  763  
 763  764                  if (origin->ds_dir->dd_phys->dd_clones == 0) {
 764  765                          dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 765  766                          origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
 766  767                              DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 767  768                  }
 768  769  
 769  770                  VERIFY0(zap_add_int(dp->dp_meta_objset,
 770  771                      origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
 771  772  
 772  773                  dsl_dataset_rele(origin, FTAG);
 773  774          }
 774  775          return (0);
 775  776  }
 776  777  
 777  778  void
 778  779  dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 779  780  {
 780  781          ASSERT(dmu_tx_is_syncing(tx));
 781  782          uint64_t obj;
 782  783  
 783  784          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 784  785          VERIFY0(dsl_pool_open_special_dir(dp,
 785  786              FREE_DIR_NAME, &dp->dp_free_dir));
 786  787  
 787  788          /*
 788  789           * We can't use bpobj_alloc(), because spa_version() still
 789  790           * returns the old version, and we need a new-version bpobj with
 790  791           * subobj support.  So call dmu_object_alloc() directly.
 791  792           */
 792  793          obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 793  794              SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 794  795          VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 795  796              DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 796  797          VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 797  798  
 798  799          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 799  800              upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 800  801  }
 801  802  
 802  803  void
 803  804  dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 804  805  {
 805  806          uint64_t dsobj;
 806  807          dsl_dataset_t *ds;
 807  808  
 808  809          ASSERT(dmu_tx_is_syncing(tx));
 809  810          ASSERT(dp->dp_origin_snap == NULL);
 810  811          ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 811  812  
 812  813          /* create the origin dir, ds, & snap-ds */
 813  814          dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 814  815              NULL, 0, kcred, tx);
 815  816          VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 816  817          dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 817  818          VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 818  819              dp, &dp->dp_origin_snap));
 819  820          dsl_dataset_rele(ds, FTAG);
 820  821  }
 821  822  
 822  823  taskq_t *
 823  824  dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 824  825  {
 825  826          return (dp->dp_vnrele_taskq);
 826  827  }
 827  828  
 828  829  /*

↓ open down ↓

795 lines elided

↑ open up ↑

 829  830   * Walk through the pool-wide zap object of temporary snapshot user holds
 830  831   * and release them.
 831  832   */
 832  833  void
 833  834  dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 834  835  {
 835  836          zap_attribute_t za;
 836  837          zap_cursor_t zc;
 837  838          objset_t *mos = dp->dp_meta_objset;
 838  839          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
      840 +        nvlist_t *holds;
 839  841  
 840  842          if (zapobj == 0)
 841  843                  return;
 842  844          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 843  845  
      846 +        holds = fnvlist_alloc();
      847 +
 844  848          for (zap_cursor_init(&zc, mos, zapobj);
 845  849              zap_cursor_retrieve(&zc, &za) == 0;
 846  850              zap_cursor_advance(&zc)) {
 847  851                  char *htag;
 848      -                uint64_t dsobj;
      852 +                nvlist_t *tags;
 849  853  
 850  854                  htag = strchr(za.za_name, '-');
 851  855                  *htag = '\0';
 852  856                  ++htag;
 853      -                dsobj = strtonum(za.za_name, NULL);
 854      -                dsl_dataset_user_release_tmp(dp, dsobj, htag);
      857 +                if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
      858 +                        tags = fnvlist_alloc();
      859 +                        fnvlist_add_boolean(tags, htag);
      860 +                        fnvlist_add_nvlist(holds, za.za_name, tags);
      861 +                        fnvlist_free(tags);
      862 +                } else {
      863 +                        fnvlist_add_boolean(tags, htag);
      864 +                }
 855  865          }
      866 +        dsl_dataset_user_release_tmp(dp, holds);
      867 +        fnvlist_free(holds);
 856  868          zap_cursor_fini(&zc);
 857  869  }
 858  870  
 859  871  /*
 860  872   * Create the pool-wide zap object for storing temporary snapshot holds.
 861  873   */
 862  874  void
 863  875  dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 864  876  {
 865  877          objset_t *mos = dp->dp_meta_objset;

 866  878  
 867  879          ASSERT(dp->dp_tmp_userrefs_obj == 0);
 868  880          ASSERT(dmu_tx_is_syncing(tx));
 869  881  
 870  882          dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 871  883              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 872  884  }
 873  885  
 874  886  static int
 875  887  dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 876  888      const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 877  889  {
 878  890          objset_t *mos = dp->dp_meta_objset;
 879  891          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 880  892          char *name;
 881  893          int error;
 882  894  
 883  895          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 884  896          ASSERT(dmu_tx_is_syncing(tx));
 885  897  
 886  898          /*
 887  899           * If the pool was created prior to SPA_VERSION_USERREFS, the
 888  900           * zap object for temporary holds might not exist yet.
 889  901           */
 890  902          if (zapobj == 0) {
 891  903                  if (holding) {
 892  904                          dsl_pool_user_hold_create_obj(dp, tx);
 893  905                          zapobj = dp->dp_tmp_userrefs_obj;
 894  906                  } else {
 895  907                          return (SET_ERROR(ENOENT));
 896  908                  }
 897  909          }
 898  910  
 899  911          name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 900  912          if (holding)
 901  913                  error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 902  914          else
 903  915                  error = zap_remove(mos, zapobj, name, tx);
 904  916          strfree(name);
 905  917  
 906  918          return (error);
 907  919  }
 908  920  
 909  921  /*
 910  922   * Add a temporary hold for the given dataset object and tag.
 911  923   */
 912  924  int
 913  925  dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 914  926      uint64_t now, dmu_tx_t *tx)
 915  927  {
 916  928          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 917  929  }
 918  930  
 919  931  /*
 920  932   * Release a temporary hold for the given dataset object and tag.
 921  933   */
 922  934  int
 923  935  dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 924  936      dmu_tx_t *tx)
 925  937  {
 926  938          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
 927  939              tx, B_FALSE));
 928  940  }
 929  941  
 930  942  /*
 931  943   * DSL Pool Configuration Lock
 932  944   *
 933  945   * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 934  946   * creation / destruction / rename / property setting).  It must be held for
 935  947   * read to hold a dataset or dsl_dir.  I.e. you must call
 936  948   * dsl_pool_config_enter() or dsl_pool_hold() before calling
 937  949   * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 938  950   * must be held continuously until all datasets and dsl_dirs are released.
 939  951   *
 940  952   * The only exception to this rule is that if a "long hold" is placed on
 941  953   * a dataset, then the dp_config_rwlock may be dropped while the dataset
 942  954   * is still held.  The long hold will prevent the dataset from being
 943  955   * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 944  956   * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 945  957   * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 946  958   *
 947  959   * Legitimate long-holders (including owners) should be long-running, cancelable
 948  960   * tasks that should cause "zfs destroy" to fail.  This includes DMU
 949  961   * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 950  962   * "zfs send", and "zfs diff".  There are several other long-holders whose
 951  963   * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 952  964   *
 953  965   * The usual formula for long-holding would be:
 954  966   * dsl_pool_hold()
 955  967   * dsl_dataset_hold()
 956  968   * ... perform checks ...
 957  969   * dsl_dataset_long_hold()
 958  970   * dsl_pool_rele()
 959  971   * ... perform long-running task ...
 960  972   * dsl_dataset_long_rele()
 961  973   * dsl_dataset_rele()
 962  974   *
 963  975   * Note that when the long hold is released, the dataset is still held but
 964  976   * the pool is not held.  The dataset may change arbitrarily during this time
 965  977   * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 966  978   * dataset except release it.
 967  979   *
 968  980   * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 969  981   * or modifying operations.
 970  982   *
 971  983   * Modifying operations should generally use dsl_sync_task().  The synctask
 972  984   * infrastructure enforces proper locking strategy with respect to the
 973  985   * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 974  986   *
 975  987   * Read-only operations will manually hold the pool, then the dataset, obtain
 976  988   * information from the dataset, then release the pool and dataset.
 977  989   * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 978  990   * hold/rele.
 979  991   */
 980  992  
 981  993  int
 982  994  dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 983  995  {
 984  996          spa_t *spa;
 985  997          int error;
 986  998  
 987  999          error = spa_open(name, &spa, tag);
 988 1000          if (error == 0) {
 989 1001                  *dp = spa_get_dsl(spa);
 990 1002                  dsl_pool_config_enter(*dp, tag);
 991 1003          }
 992 1004          return (error);
 993 1005  }
 994 1006  
 995 1007  void
 996 1008  dsl_pool_rele(dsl_pool_t *dp, void *tag)
 997 1009  {
 998 1010          dsl_pool_config_exit(dp, tag);
 999 1011          spa_close(dp->dp_spa, tag);
1000 1012  }
1001 1013  
1002 1014  void
1003 1015  dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
1004 1016  {
1005 1017          /*
1006 1018           * We use a "reentrant" reader-writer lock, but not reentrantly.
1007 1019           *
1008 1020           * The rrwlock can (with the track_all flag) track all reading threads,
1009 1021           * which is very useful for debugging which code path failed to release
1010 1022           * the lock, and for verifying that the *current* thread does hold
1011 1023           * the lock.
1012 1024           *
1013 1025           * (Unlike a rwlock, which knows that N threads hold it for
1014 1026           * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
1015 1027           * if any thread holds it for read, even if this thread doesn't).
1016 1028           */
1017 1029          ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
1018 1030          rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
1019 1031  }
1020 1032  
1021 1033  void
1022 1034  dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
1023 1035  {
1024 1036          rrw_exit(&dp->dp_config_rwlock, tag);
1025 1037  }
1026 1038  
1027 1039  boolean_t
1028 1040  dsl_pool_config_held(dsl_pool_t *dp)
1029 1041  {
1030 1042          return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
1031 1043  }

↓ open down ↓

166 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX