illumos-illumos-3740-r3 Wdiff usr/src/uts/common/fs/zfs/dsl_pool.c

Print this page

3740 Poor ZFS send / receive performance due to snapshot hold / release processing
Submitted by: Steven Hartland <steven.hartland@multiplay.co.uk>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dsl_pool.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_pool.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  #include <sys/dsl_pool.h>
  27   27  #include <sys/dsl_dataset.h>
  28   28  #include <sys/dsl_prop.h>
  29   29  #include <sys/dsl_dir.h>
  30   30  #include <sys/dsl_synctask.h>
  31   31  #include <sys/dsl_scan.h>
  32   32  #include <sys/dnode.h>
  33   33  #include <sys/dmu_tx.h>
  34   34  #include <sys/dmu_objset.h>
  35   35  #include <sys/arc.h>
  36   36  #include <sys/zap.h>
  37   37  #include <sys/zio.h>
  38   38  #include <sys/zfs_context.h>
  39   39  #include <sys/fs/zfs.h>
  40   40  #include <sys/zfs_znode.h>
  41   41  #include <sys/spa_impl.h>
  42   42  #include <sys/dsl_deadlist.h>
  43   43  #include <sys/bptree.h>
  44   44  #include <sys/zfeature.h>
  45   45  #include <sys/zil_impl.h>
  46   46  #include <sys/dsl_userhold.h>
  47   47  
  48   48  int zfs_no_write_throttle = 0;               /* nonzero: dsl_pool_tempreserve_space() skips throttling */
  49   49  int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  50   50  int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  51   51  
  52   52  uint64_t zfs_write_limit_min = 32 << 20;        /* min write limit is 32MB */
  53   53  uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
  54   54  uint64_t zfs_write_limit_inflated = 0;          /* spa_get_asize() of the max, floored at the min */
  55   55  uint64_t zfs_write_limit_override = 0;          /* if nonzero, used in place of dp_write_limit */
  56   56  
  57   57  kmutex_t zfs_write_limit_lock;                  /* held while dsl_pool_sync() recomputes max/inflated */
  58   58  
  59   59  static pgcnt_t old_physmem = 0;                 /* physmem value the limits were last derived from */
  60   60  
  61   61  hrtime_t zfs_throttle_delay = MSEC2NSEC(10);    /* passed to txg_delay() when nearly over the limit */
  62   62  hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);       /* passed to txg_delay() alongside the delay */
  63   63  
  64   64  int
  65   65  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  66   66  {
  67   67          uint64_t obj;
  68   68          int err;
  69   69  
  70   70          err = zap_lookup(dp->dp_meta_objset,
  71   71              dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
  72   72              name, sizeof (obj), 1, &obj);
  73   73          if (err)
  74   74                  return (err);
  75   75  
  76   76          return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
  77   77  }
  78   78  
  79   79  static dsl_pool_t *
  80   80  dsl_pool_open_impl(spa_t *spa, uint64_t txg)
  81   81  {
  82   82          dsl_pool_t *dp;
  83   83          blkptr_t *bp = spa_get_rootblkptr(spa);
  84   84  
  85   85          dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
  86   86          dp->dp_spa = spa;
  87   87          dp->dp_meta_rootbp = *bp;
  88   88          rrw_init(&dp->dp_config_rwlock, B_TRUE);
  89   89          dp->dp_write_limit = zfs_write_limit_min;
  90   90          txg_init(dp, txg);
  91   91  
  92   92          txg_list_create(&dp->dp_dirty_datasets,
  93   93              offsetof(dsl_dataset_t, ds_dirty_link));
  94   94          txg_list_create(&dp->dp_dirty_zilogs,
  95   95              offsetof(zilog_t, zl_dirty_link));
  96   96          txg_list_create(&dp->dp_dirty_dirs,
  97   97              offsetof(dsl_dir_t, dd_dirty_link));
  98   98          txg_list_create(&dp->dp_sync_tasks,
  99   99              offsetof(dsl_sync_task_t, dst_node));
 100  100  
 101  101          mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 102  102  
 103  103          dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 104  104              1, 4, 0);
 105  105  
 106  106          return (dp);
 107  107  }
 108  108  
 109  109  int
 110  110  dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 111  111  {
 112  112          int err;
 113  113          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 114  114  
 115  115          err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 116  116              &dp->dp_meta_objset);
 117  117          if (err != 0)
 118  118                  dsl_pool_close(dp);
 119  119          else
 120  120                  *dpp = dp;
 121  121  
 122  122          return (err);
 123  123  }
 124  124  
 125  125  int
 126  126  dsl_pool_open(dsl_pool_t *dp)
 127  127  {
 128  128          int err;
 129  129          dsl_dir_t *dd;
 130  130          dsl_dataset_t *ds;
 131  131          uint64_t obj;
 132  132  
 133  133          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 134  134          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 135  135              DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 136  136              &dp->dp_root_dir_obj);
 137  137          if (err)
 138  138                  goto out;
 139  139  
 140  140          err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 141  141              NULL, dp, &dp->dp_root_dir);
 142  142          if (err)
 143  143                  goto out;
 144  144  
 145  145          err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 146  146          if (err)
 147  147                  goto out;
 148  148  
 149  149          if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 150  150                  err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 151  151                  if (err)
 152  152                          goto out;
 153  153                  err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 154  154                      FTAG, &ds);
 155  155                  if (err == 0) {
 156  156                          err = dsl_dataset_hold_obj(dp,
 157  157                              ds->ds_phys->ds_prev_snap_obj, dp,
 158  158                              &dp->dp_origin_snap);
 159  159                          dsl_dataset_rele(ds, FTAG);
 160  160                  }
 161  161                  dsl_dir_rele(dd, dp);
 162  162                  if (err)
 163  163                          goto out;
 164  164          }
 165  165  
 166  166          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 167  167                  err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 168  168                      &dp->dp_free_dir);
 169  169                  if (err)
 170  170                          goto out;
 171  171  
 172  172                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 173  173                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 174  174                  if (err)
 175  175                          goto out;
 176  176                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 177  177                      dp->dp_meta_objset, obj));
 178  178          }
 179  179  
 180  180          if (spa_feature_is_active(dp->dp_spa,
 181  181              &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 182  182                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 183  183                      DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 184  184                      &dp->dp_bptree_obj);
 185  185                  if (err != 0)
 186  186                          goto out;
 187  187          }
 188  188  
 189  189          if (spa_feature_is_active(dp->dp_spa,
 190  190              &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
 191  191                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 192  192                      DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 193  193                      &dp->dp_empty_bpobj);
 194  194                  if (err != 0)
 195  195                          goto out;
 196  196          }
 197  197  
 198  198          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 199  199              DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 200  200              &dp->dp_tmp_userrefs_obj);
 201  201          if (err == ENOENT)
 202  202                  err = 0;
 203  203          if (err)
 204  204                  goto out;
 205  205  
 206  206          err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 207  207  
 208  208  out:
 209  209          rrw_exit(&dp->dp_config_rwlock, FTAG);
 210  210          return (err);
 211  211  }
 212  212  
 213  213  void
 214  214  dsl_pool_close(dsl_pool_t *dp)
 215  215  {
 216  216          /* drop our references from dsl_pool_open() */
 217  217  
 218  218          /*
 219  219           * Since we held the origin_snap from "syncing" context (which
 220  220           * includes pool-opening context), it actually only got a "ref"
 221  221           * and not a hold, so just drop that here.
 222  222           */
 223  223          if (dp->dp_origin_snap)
 224  224                  dsl_dataset_rele(dp->dp_origin_snap, dp);
 225  225          if (dp->dp_mos_dir)
 226  226                  dsl_dir_rele(dp->dp_mos_dir, dp);
 227  227          if (dp->dp_free_dir)
 228  228                  dsl_dir_rele(dp->dp_free_dir, dp);
 229  229          if (dp->dp_root_dir)
 230  230                  dsl_dir_rele(dp->dp_root_dir, dp);
 231  231  
 232  232          bpobj_close(&dp->dp_free_bpobj);
 233  233  
 234  234          /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 235  235          if (dp->dp_meta_objset)
 236  236                  dmu_objset_evict(dp->dp_meta_objset);
 237  237  
 238  238          txg_list_destroy(&dp->dp_dirty_datasets);
 239  239          txg_list_destroy(&dp->dp_dirty_zilogs);
 240  240          txg_list_destroy(&dp->dp_sync_tasks);
 241  241          txg_list_destroy(&dp->dp_dirty_dirs);
 242  242  
 243  243          arc_flush(dp->dp_spa);
 244  244          txg_fini(dp);
 245  245          dsl_scan_fini(dp);
 246  246          rrw_destroy(&dp->dp_config_rwlock);
 247  247          mutex_destroy(&dp->dp_lock);
 248  248          taskq_destroy(dp->dp_vnrele_taskq);
 249  249          if (dp->dp_blkstats)
 250  250                  kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 251  251          kmem_free(dp, sizeof (dsl_pool_t));
 252  252  }
 253  253  
 254  254  dsl_pool_t *
 255  255  dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 256  256  {
 257  257          int err;
 258  258          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 259  259          dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 260  260          objset_t *os;
 261  261          dsl_dataset_t *ds;
 262  262          uint64_t obj;
 263  263  
 264  264          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 265  265  
 266  266          /* create and open the MOS (meta-objset) */
 267  267          dp->dp_meta_objset = dmu_objset_create_impl(spa,
 268  268              NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 269  269  
 270  270          /* create the pool directory */
 271  271          err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 272  272              DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 273  273          ASSERT0(err);
 274  274  
 275  275          /* Initialize scan structures */
 276  276          VERIFY0(dsl_scan_init(dp, txg));
 277  277  
 278  278          /* create and open the root dir */
 279  279          dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 280  280          VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 281  281              NULL, dp, &dp->dp_root_dir));
 282  282  
 283  283          /* create and open the meta-objset dir */
 284  284          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 285  285          VERIFY0(dsl_pool_open_special_dir(dp,
 286  286              MOS_DIR_NAME, &dp->dp_mos_dir));
 287  287  
 288  288          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 289  289                  /* create and open the free dir */
 290  290                  (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 291  291                      FREE_DIR_NAME, tx);
 292  292                  VERIFY0(dsl_pool_open_special_dir(dp,
 293  293                      FREE_DIR_NAME, &dp->dp_free_dir));
 294  294  
 295  295                  /* create and open the free_bplist */
 296  296                  obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
 297  297                  VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 298  298                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 299  299                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 300  300                      dp->dp_meta_objset, obj));
 301  301          }
 302  302  
 303  303          if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 304  304                  dsl_pool_create_origin(dp, tx);
 305  305  
 306  306          /* create the root dataset */
 307  307          obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 308  308  
 309  309          /* create the root objset */
 310  310          VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 311  311          os = dmu_objset_create_impl(dp->dp_spa, ds,
 312  312              dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 313  313  #ifdef _KERNEL
 314  314          zfs_create_fs(os, kcred, zplprops, tx);
 315  315  #endif
 316  316          dsl_dataset_rele(ds, FTAG);
 317  317  
 318  318          dmu_tx_commit(tx);
 319  319  
 320  320          rrw_exit(&dp->dp_config_rwlock, FTAG);
 321  321  
 322  322          return (dp);
 323  323  }
 324  324  
 325  325  /*
 326  326   * Account for the meta-objset space in its placeholder dsl_dir.
 327  327   */
 328  328  void
 329  329  dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 330  330      int64_t used, int64_t comp, int64_t uncomp)
 331  331  {
 332  332          ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 333  333          mutex_enter(&dp->dp_lock);
 334  334          dp->dp_mos_used_delta += used;
 335  335          dp->dp_mos_compressed_delta += comp;
 336  336          dp->dp_mos_uncompressed_delta += uncomp;
 337  337          mutex_exit(&dp->dp_lock);
 338  338  }
 339  339  
 340  340  static int
 341  341  deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 342  342  {
 343  343          dsl_deadlist_t *dl = arg;
 344  344          dsl_deadlist_insert(dl, bp, tx);
 345  345          return (0);
 346  346  }
 347  347  
 348  348  void
 349  349  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 350  350  {
 351  351          zio_t *zio;
 352  352          dmu_tx_t *tx;
 353  353          dsl_dir_t *dd;
 354  354          dsl_dataset_t *ds;
 355  355          objset_t *mos = dp->dp_meta_objset;
 356  356          hrtime_t start, write_time;
 357  357          uint64_t data_written;
 358  358          int err;
 359  359          list_t synced_datasets;
 360  360  
 361  361          list_create(&synced_datasets, sizeof (dsl_dataset_t),
 362  362              offsetof(dsl_dataset_t, ds_synced_link));
 363  363  
 364  364          /*
 365  365           * We need to copy dp_space_towrite() before doing
 366  366           * dsl_sync_task_sync(), because
 367  367           * dsl_dataset_snapshot_reserve_space() will increase
 368  368           * dp_space_towrite but not actually write anything.
 369  369           */
 370  370          data_written = dp->dp_space_towrite[txg & TXG_MASK];
 371  371  
 372  372          tx = dmu_tx_create_assigned(dp, txg);
 373  373  
 374  374          dp->dp_read_overhead = 0;
 375  375          start = gethrtime();
 376  376  
 377  377          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 378  378          while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 379  379                  /*
 380  380                   * We must not sync any non-MOS datasets twice, because
 381  381                   * we may have taken a snapshot of them.  However, we
 382  382                   * may sync newly-created datasets on pass 2.
 383  383                   */
 384  384                  ASSERT(!list_link_active(&ds->ds_synced_link));
 385  385                  list_insert_tail(&synced_datasets, ds);
 386  386                  dsl_dataset_sync(ds, zio, tx);
 387  387          }
 388  388          DTRACE_PROBE(pool_sync__1setup);
 389  389          err = zio_wait(zio);
 390  390  
 391  391          write_time = gethrtime() - start;
 392  392          ASSERT(err == 0);
 393  393          DTRACE_PROBE(pool_sync__2rootzio);
 394  394  
 395  395          /*
 396  396           * After the data blocks have been written (ensured by the zio_wait()
 397  397           * above), update the user/group space accounting.
 398  398           */
 399  399          for (ds = list_head(&synced_datasets); ds;
 400  400              ds = list_next(&synced_datasets, ds))
 401  401                  dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 402  402  
 403  403          /*
 404  404           * Sync the datasets again to push out the changes due to
 405  405           * userspace updates.  This must be done before we process the
 406  406           * sync tasks, so that any snapshots will have the correct
 407  407           * user accounting information (and we won't get confused
 408  408           * about which blocks are part of the snapshot).
 409  409           */
 410  410          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 411  411          while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 412  412                  ASSERT(list_link_active(&ds->ds_synced_link));
 413  413                  dmu_buf_rele(ds->ds_dbuf, ds);
 414  414                  dsl_dataset_sync(ds, zio, tx);
 415  415          }
 416  416          err = zio_wait(zio);
 417  417  
 418  418          /*
 419  419           * Now that the datasets have been completely synced, we can
 420  420           * clean up our in-memory structures accumulated while syncing:
 421  421           *
 422  422           *  - move dead blocks from the pending deadlist to the on-disk deadlist
 423  423           *  - release hold from dsl_dataset_dirty()
 424  424           */
 425  425          while (ds = list_remove_head(&synced_datasets)) {
 426  426                  objset_t *os = ds->ds_objset;
 427  427                  bplist_iterate(&ds->ds_pending_deadlist,
 428  428                      deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 429  429                  ASSERT(!dmu_objset_is_dirty(os, txg));
 430  430                  dmu_buf_rele(ds->ds_dbuf, ds);
 431  431          }
 432  432  
 433  433          start = gethrtime();
 434  434          while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
 435  435                  dsl_dir_sync(dd, tx);
 436  436          write_time += gethrtime() - start;
 437  437  
 438  438          /*
 439  439           * The MOS's space is accounted for in the pool/$MOS
 440  440           * (dp_mos_dir).  We can't modify the mos while we're syncing
 441  441           * it, so we remember the deltas and apply them here.
 442  442           */
 443  443          if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 444  444              dp->dp_mos_uncompressed_delta != 0) {
 445  445                  dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 446  446                      dp->dp_mos_used_delta,
 447  447                      dp->dp_mos_compressed_delta,
 448  448                      dp->dp_mos_uncompressed_delta, tx);
 449  449                  dp->dp_mos_used_delta = 0;
 450  450                  dp->dp_mos_compressed_delta = 0;
 451  451                  dp->dp_mos_uncompressed_delta = 0;
 452  452          }
 453  453  
 454  454          start = gethrtime();
 455  455          if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 456  456              list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 457  457                  zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 458  458                  dmu_objset_sync(mos, zio, tx);
 459  459                  err = zio_wait(zio);
 460  460                  ASSERT(err == 0);
 461  461                  dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 462  462                  spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 463  463          }
 464  464          write_time += gethrtime() - start;
 465  465          DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
 466  466              hrtime_t, dp->dp_read_overhead);
 467  467          write_time -= dp->dp_read_overhead;
 468  468  
 469  469          /*
 470  470           * If we modify a dataset in the same txg that we want to destroy it,
 471  471           * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 472  472           * dsl_dir_destroy_check() will fail if there are unexpected holds.
 473  473           * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 474  474           * and clearing the hold on it) before we process the sync_tasks.
 475  475           * The MOS data dirtied by the sync_tasks will be synced on the next
 476  476           * pass.
 477  477           */
 478  478          DTRACE_PROBE(pool_sync__3task);
 479  479          if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 480  480                  dsl_sync_task_t *dst;
 481  481                  /*
 482  482                   * No more sync tasks should have been added while we
 483  483                   * were syncing.
 484  484                   */
 485  485                  ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 486  486                  while (dst = txg_list_remove(&dp->dp_sync_tasks, txg))
 487  487                          dsl_sync_task_sync(dst, tx);
 488  488          }
 489  489  
 490  490          dmu_tx_commit(tx);
 491  491  
 492  492          dp->dp_space_towrite[txg & TXG_MASK] = 0;
 493  493          ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 494  494  
 495  495          /*
 496  496           * If the write limit max has not been explicitly set, set it
 497  497           * to a fraction of available physical memory (default 1/8th).
 498  498           * Note that we must inflate the limit because the spa
 499  499           * inflates write sizes to account for data replication.
 500  500           * Check this each sync phase to catch changing memory size.
 501  501           */
 502  502          if (physmem != old_physmem && zfs_write_limit_shift) {
 503  503                  mutex_enter(&zfs_write_limit_lock);
 504  504                  old_physmem = physmem;
 505  505                  zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 506  506                  zfs_write_limit_inflated = MAX(zfs_write_limit_min,
 507  507                      spa_get_asize(dp->dp_spa, zfs_write_limit_max));
 508  508                  mutex_exit(&zfs_write_limit_lock);
 509  509          }
 510  510  
 511  511          /*
 512  512           * Attempt to keep the sync time consistent by adjusting the
 513  513           * amount of write traffic allowed into each transaction group.
 514  514           * Weight the throughput calculation towards the current value:
 515  515           *      thru = 3/4 old_thru + 1/4 new_thru
 516  516           *
 517  517           * Note: write_time is in nanosecs while dp_throughput is expressed in
 518  518           * bytes per millisecond.
 519  519           */
 520  520          ASSERT(zfs_write_limit_min > 0);
 521  521          if (data_written > zfs_write_limit_min / 8 &&
 522  522              write_time > MSEC2NSEC(1)) {
 523  523                  uint64_t throughput = data_written / NSEC2MSEC(write_time);
 524  524  
 525  525                  if (dp->dp_throughput)
 526  526                          dp->dp_throughput = throughput / 4 +
 527  527                              3 * dp->dp_throughput / 4;
 528  528                  else
 529  529                          dp->dp_throughput = throughput;
 530  530                  dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 531  531                      MAX(zfs_write_limit_min,
 532  532                      dp->dp_throughput * zfs_txg_synctime_ms));
 533  533          }
 534  534  }
 535  535  
 536  536  void
 537  537  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 538  538  {
 539  539          zilog_t *zilog;
 540  540          dsl_dataset_t *ds;
 541  541  
 542  542          while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
 543  543                  ds = dmu_objset_ds(zilog->zl_os);
 544  544                  zil_clean(zilog, txg);
 545  545                  ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 546  546                  dmu_buf_rele(ds->ds_dbuf, zilog);
 547  547          }
 548  548          ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 549  549  }
 550  550  
 551  551  /*
 552  552   * TRUE if the current thread is the tx_sync_thread or if we
 553  553   * are being called from SPA context during pool initialization.
 554  554   */
 555  555  int
 556  556  dsl_pool_sync_context(dsl_pool_t *dp)
 557  557  {
 558  558          return (curthread == dp->dp_tx.tx_sync_thread ||
 559  559              spa_is_initializing(dp->dp_spa));
 560  560  }
 561  561  
 562  562  uint64_t
 563  563  dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 564  564  {
 565  565          uint64_t space, resv;
 566  566  
 567  567          /*
 568  568           * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 569  569           * efficiency.
 570  570           * XXX The intent log is not accounted for, so it must fit
 571  571           * within this slop.
 572  572           *
 573  573           * If we're trying to assess whether it's OK to do a free,
 574  574           * cut the reservation in half to allow forward progress
 575  575           * (e.g. make it possible to rm(1) files from a full pool).
 576  576           */
 577  577          space = spa_get_dspace(dp->dp_spa);
 578  578          resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 579  579          if (netfree)
 580  580                  resv >>= 1;
 581  581  
 582  582          return (space - resv);
 583  583  }
 584  584  
 585  585  int
 586  586  dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
 587  587  {
 588  588          uint64_t reserved = 0;
 589  589          uint64_t write_limit = (zfs_write_limit_override ?
 590  590              zfs_write_limit_override : dp->dp_write_limit);
 591  591  
 592  592          if (zfs_no_write_throttle) {
 593  593                  atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
 594  594                      space);
 595  595                  return (0);
 596  596          }
 597  597  
 598  598          /*
 599  599           * Check to see if we have exceeded the maximum allowed IO for
 600  600           * this transaction group.  We can do this without locks since
 601  601           * a little slop here is ok.  Note that we do the reserved check
 602  602           * with only half the requested reserve: this is because the
 603  603           * reserve requests are worst-case, and we really don't want to
 604  604           * throttle based off of worst-case estimates.
 605  605           */
 606  606          if (write_limit > 0) {
 607  607                  reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
 608  608                      + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
 609  609  
 610  610                  if (reserved && reserved > write_limit)
 611  611                          return (SET_ERROR(ERESTART));
 612  612          }
 613  613  
 614  614          atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
 615  615  
 616  616          /*
 617  617           * If this transaction group is over 7/8ths capacity, delay
 618  618           * the caller 1 clock tick.  This will slow down the "fill"
 619  619           * rate until the sync process can catch up with us.
 620  620           */
 621  621          if (reserved && reserved > (write_limit - (write_limit >> 3))) {
 622  622                  txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
 623  623                      zfs_throttle_resolution);
 624  624          }
 625  625  
 626  626          return (0);
 627  627  }
 628  628  
 629  629  void
 630  630  dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 631  631  {
 632  632          ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
 633  633          atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
 634  634  }
 635  635  
 636  636  void
 637  637  dsl_pool_memory_pressure(dsl_pool_t *dp)
 638  638  {
 639  639          uint64_t space_inuse = 0;
 640  640          int i;
 641  641  
 642  642          if (dp->dp_write_limit == zfs_write_limit_min)
 643  643                  return;
 644  644  
 645  645          for (i = 0; i < TXG_SIZE; i++) {
 646  646                  space_inuse += dp->dp_space_towrite[i];
 647  647                  space_inuse += dp->dp_tempreserved[i];
 648  648          }
 649  649          dp->dp_write_limit = MAX(zfs_write_limit_min,
 650  650              MIN(dp->dp_write_limit, space_inuse / 4));
 651  651  }
 652  652  
 653  653  void
 654  654  dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 655  655  {
 656  656          if (space > 0) {
 657  657                  mutex_enter(&dp->dp_lock);
 658  658                  dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
 659  659                  mutex_exit(&dp->dp_lock);
 660  660          }
 661  661  }
 662  662  
 663  663  /* ARGSUSED */
           /*
            * Dataset callback handed to dmu_objset_find_dp() by
            * dsl_pool_upgrade_clones(); arg is the dmu_tx_t used to dirty buffers.
            *
            * Starting at hds, follow ds_prev_snap_obj backwards for as long as the
            * previous snapshot's ds_next_snap_obj points back at the dataset we came
            * from.  The walk ends in one of two ways:
            *   - a previous snapshot that points elsewhere is found (left in prev,
            *     still held), or
            *   - the chain runs out (prev == NULL), in which case the dataset is
            *     attached to dp->dp_origin_snap, unless it is the origin itself.
            * The dataset's object number is then added to prev's ds_next_clones_obj
            * ZAP, which is created first if it does not exist yet.
            *
            * Returns 0, or the error from a failed dsl_dataset_hold_obj().
            */
 664  664  static int
 665  665  upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 666  666  {
 667  667          dmu_tx_t *tx = arg;
 668  668          dsl_dataset_t *ds, *prev = NULL;
 669  669          int err;
 670  670  
                   /* Take our own FTAG hold on hds so it can be released in the walk. */
 671  671          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 672  672          if (err)
 673  673                  return (err);
 674  674  
 675  675          while (ds->ds_phys->ds_prev_snap_obj != 0) {
 676  676                  err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 677  677                      FTAG, &prev);
 678  678                  if (err) {
 679  679                          dsl_dataset_rele(ds, FTAG);
 680  680                          return (err);
 681  681                  }
 682  682  
                           /*
                            * prev does not link forward to ds: stop here with both
                            * ds and prev still held.
                            */
 683  683                  if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
 684  684                          break;
                           /* Same chain: step back one snapshot and keep walking. */
 685  685                  dsl_dataset_rele(ds, FTAG);
 686  686                  ds = prev;
 687  687                  prev = NULL;
 688  688          }
 689  689  
                   /* The chain ran out: hook ds up to the pool's origin snapshot. */
 690  690          if (prev == NULL) {
 691  691                  prev = dp->dp_origin_snap;
 692  692  
 693  693                  /*
 694  694                   * The $ORIGIN can't have any data, or the accounting
 695  695                   * will be wrong.
 696  696                   */
 697  697                  ASSERT0(prev->ds_phys->ds_bp.blk_birth);
 698  698  
 699  699                  /* The origin doesn't get attached to itself */
 700  700                  if (ds->ds_object == prev->ds_object) {
 701  701                          dsl_dataset_rele(ds, FTAG);
 702  702                          return (0);
 703  703                  }
 704  704  
 705  705                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
 706  706                  ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
 707  707                  ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
 708  708  
 709  709                  dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 710  710                  ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
 711  711  
 712  712                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 713  713                  prev->ds_phys->ds_num_children++;
 714  714  
                           /*
                            * ds has no next snapshot: also fill in ds->ds_prev.  This
                            * hold is tagged with ds itself rather than FTAG and is
                            * not released in this function (presumably it is dropped
                            * together with the dataset -- TODO confirm).
                            */
 715  715                  if (ds->ds_phys->ds_next_snap_obj == 0) {
 716  716                          ASSERT(ds->ds_prev == NULL);
 717  717                          VERIFY0(dsl_dataset_hold_obj(dp,
 718  718                              ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 719  719                  }
 720  720          }
 721  721  
 722  722          ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
 723  723          ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
 724  724  
                   /* Create prev's next-clones ZAP on first use, then record ds in it. */
 725  725          if (prev->ds_phys->ds_next_clones_obj == 0) {
 726  726                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 727  727                  prev->ds_phys->ds_next_clones_obj =
 728  728                      zap_create(dp->dp_meta_objset,
 729  729                      DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 730  730          }
 731  731          VERIFY0(zap_add_int(dp->dp_meta_objset,
 732  732              prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 733  733  
 734  734          dsl_dataset_rele(ds, FTAG);
                   /*
                    * dp_origin_snap was not held under FTAG here, so only a prev
                    * obtained by the walk above is released.
                    */
 735  735          if (prev != dp->dp_origin_snap)
 736  736                  dsl_dataset_rele(prev, FTAG);
 737  737          return (0);
 738  738  }
 739  739  
 740  740  void
 741  741  dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 742  742  {
                   /*
                    * Run upgrade_clones_cb() through dmu_objset_find_dp(), starting
                    * at the pool's root dir with DS_FIND_CHILDREN, passing tx as the
                    * callback argument.  The asserts require syncing context and an
                    * already-present dp_origin_snap; the walk itself is wrapped in
                    * VERIFY0, so a non-zero result is not tolerated.
                    */
 743  743          ASSERT(dmu_tx_is_syncing(tx));
 744  744          ASSERT(dp->dp_origin_snap != NULL);
 745  745  
 746  746          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 747  747              tx, DS_FIND_CHILDREN));
 748  748  }
 749  749  
 750  750  /* ARGSUSED */
           /*
            * Dataset callback handed to dmu_objset_find_dp() by
            * dsl_pool_upgrade_dir_clones(); arg is the dmu_tx_t used to dirty buffers.
            *
            * If the dataset's dsl_dir has an origin (dd_origin_obj != 0), the origin
            * dataset is held, a dd_clones ZAP is created on the origin's dsl_dir if it
            * has none, and this dataset's object number is added to that ZAP.
            * Datasets without an origin are left alone.  Always returns 0; failures
            * of the hold or the ZAP add are wrapped in VERIFY0.
            */
 751  751  static int
 752  752  upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 753  753  {
 754  754          dmu_tx_t *tx = arg;
 755  755          objset_t *mos = dp->dp_meta_objset;
 756  756  
 757  757          if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
 758  758                  dsl_dataset_t *origin;
 759  759  
 760  760                  VERIFY0(dsl_dataset_hold_obj(dp,
 761  761                      ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 762  762  
                           /* Create the origin dir's clones ZAP on first use. */
 763  763                  if (origin->ds_dir->dd_phys->dd_clones == 0) {
 764  764                          dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 765  765                          origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
 766  766                              DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 767  767                  }
 768  768  
 769  769                  VERIFY0(zap_add_int(dp->dp_meta_objset,
 770  770                      origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
 771  771  
 772  772                  dsl_dataset_rele(origin, FTAG);
 773  773          }
 774  774          return (0);
 775  775  }
 776  776  
 777  777  void
 778  778  dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 779  779  {
                   /*
                    * Three steps, all in syncing context:
                    *  1. create the FREE_DIR_NAME dir under the root dir and open it
                    *     into dp->dp_free_dir;
                    *  2. allocate a bpobj object, register it in the pool directory
                    *     under DMU_POOL_FREE_BPOBJ and open it as dp->dp_free_bpobj;
                    *  3. run upgrade_dir_clones_cb() through dmu_objset_find_dp(),
                    *     starting at the root dir with DS_FIND_CHILDREN.
                    * Every step except the dir creation (whose return value is
                    * discarded) is wrapped in VERIFY0.
                    */
 780  780          ASSERT(dmu_tx_is_syncing(tx));
 781  781          uint64_t obj;
 782  782  
 783  783          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 784  784          VERIFY0(dsl_pool_open_special_dir(dp,
 785  785              FREE_DIR_NAME, &dp->dp_free_dir));
 786  786  
 787  787          /*
 788  788           * We can't use bpobj_alloc(), because spa_version() still
 789  789           * returns the old version, and we need a new-version bpobj with
 790  790           * subobj support.  So call dmu_object_alloc() directly.
 791  791           */
 792  792          obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 793  793              SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 794  794          VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 795  795              DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 796  796          VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 797  797  
 798  798          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 799  799              upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 800  800  }
 801  801  
 802  802  void
 803  803  dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 804  804  {
                   /*
                    * Create the dataset ORIGIN_DIR_NAME under the root dir, take a
                    * snapshot of it (also named ORIGIN_DIR_NAME) and keep a hold on
                    * that snapshot in dp->dp_origin_snap, tagged with dp.
                    *
                    * The asserts require syncing context, that no origin snapshot
                    * is set yet, and that dp_config_rwlock is held as writer.
                    */
 805  805          uint64_t dsobj;
 806  806          dsl_dataset_t *ds;
 807  807  
 808  808          ASSERT(dmu_tx_is_syncing(tx));
 809  809          ASSERT(dp->dp_origin_snap == NULL);
 810  810          ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 811  811  
 812  812          /* create the origin dir, ds, & snap-ds */
 813  813          dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 814  814              NULL, 0, kcred, tx);
 815  815          VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 816  816          dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
                   /*
                    * ds_prev_snap_obj is read after the snapshot call; presumably it
                    * now names the snapshot just created -- TODO confirm.
                    */
 817  817          VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 818  818              dp, &dp->dp_origin_snap));
                   /* Only the temporary FTAG hold is dropped; dp_origin_snap stays held. */
 819  819          dsl_dataset_rele(ds, FTAG);
 820  820  }
 821  821  
 822  822  taskq_t *
 823  823  dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 824  824  {
                   /* Accessor: hand back the pool's dp_vnrele_taskq pointer as-is. */
 825  825          return (dp->dp_vnrele_taskq);
 826  826  }
 827  827  
 828  828  /*

↓ open down ↓

828 lines elided

↑ open up ↑

 829  829   * Walk through the pool-wide zap object of temporary snapshot user holds
 830  830   * and release them.
 831  831   */
 832  832  void
 833  833  dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 834  834  {
                   /*
                    * Entry names in the temporary-holds ZAP have the form
                    * "<dsobj in hex>-<tag>" (see the kmem_asprintf() call in
                    * dsl_pool_user_hold_rele_impl()).  Each name is split at its
                    * first '-' and the tags are grouped per dataset-object string
                    * in `holds`: key = the hex dsobj string, value = an nvlist with
                    * one boolean entry per tag.  The whole set is then handed to
                    * dsl_dataset_user_release_tmp() in a single call.
                    *
                    * In this listing, lines marked '-' are the previous code (one
                    * release call per ZAP entry) and lines marked '+' replace it.
                    *
                    * Does nothing if the pool has no temporary-holds ZAP.
                    */
 835  835          zap_attribute_t za;
 836  836          zap_cursor_t zc;
 837  837          objset_t *mos = dp->dp_meta_objset;
 838  838          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
      839 +        nvlist_t *holds;
 839  840  
 840  841          if (zapobj == 0)
 841  842                  return;
 842  843          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 843  844  
      845 +        holds = fnvlist_alloc();
      846 +
 844  847          for (zap_cursor_init(&zc, mos, zapobj);
 845  848              zap_cursor_retrieve(&zc, &za) == 0;
 846  849              zap_cursor_advance(&zc)) {
 847  850                  char *htag;
 848  851                  uint64_t dsobj;
      852 +                nvlist_t *tags;
 849  853  
                           /*
                            * Split za_name in place: it becomes the dsobj string and
                            * htag points just past the '-'.
                            * NOTE(review): the strchr() result is not checked, so a
                            * name without '-' would be a NULL dereference; this
                            * assumes every entry was written in the "%llx-%s" form.
                            */
 850  854                  htag = strchr(za.za_name, '-');
 851  855                  *htag = '\0';
 852  856                  ++htag;
 853      -                dsobj = strtonum(za.za_name, NULL);
 854      -                dsl_dataset_user_release_tmp(dp, dsobj, htag);
                           /*
                            * First tag seen for this dataset: build a fresh tag list
                            * and add it to holds.  The local list is freed right
                            * after the add, so holds is relied on to keep its own
                            * copy.  Otherwise append the tag to the list that the
                            * lookup found inside holds.
                            */
      857 +                if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
      858 +                        tags = fnvlist_alloc();
      859 +                        fnvlist_add_boolean(tags, htag);
      860 +                        fnvlist_add_nvlist(holds, za.za_name, tags);
      861 +                        fnvlist_free(tags);
      862 +                } else {
      863 +                        fnvlist_add_boolean(tags, htag);
      864 +                }
 855  865          }
      866 +        dsl_dataset_user_release_tmp(dp, holds);
      867 +        fnvlist_free(holds);
 856  868          zap_cursor_fini(&zc);
 857  869  }
 858  870  
 859  871  /*
 860  872   * Create the pool-wide zap object for storing temporary snapshot holds.
            *
            * The new ZAP (type DMU_OT_USERREFS) is linked into the pool directory
            * object under DMU_POOL_TMP_USERREFS and its object number is stored in
            * dp->dp_tmp_userrefs_obj.  The asserts require syncing context and that
            * no such object has been recorded yet.
 861  873   */
 862  874  void
 863  875  dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 864  876  {
 865  877          objset_t *mos = dp->dp_meta_objset;

 866  878  
 867  879          ASSERT(dp->dp_tmp_userrefs_obj == 0);
 868  880          ASSERT(dmu_tx_is_syncing(tx));
 869  881  
 870  882          dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 871  883              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 872  884  }
 873  885  
 874  886  static int
 875  887  dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 876  888      const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 877  889  {
                   /*
                    * Shared worker for dsl_pool_user_hold() and
                    * dsl_pool_user_release().  The ZAP entry is named
                    * "<dsobj in hex>-<tag>".
                    *
                    * holding == B_TRUE:  add the entry with `now` as its 8-byte
                    *                     value, creating the ZAP first if needed.
                    * holding == B_FALSE: remove the entry; `now` is not used.
                    *                     Returns ENOENT if the ZAP does not exist.
                    *
                    * Otherwise returns the result of zap_add() / zap_remove().
                    */
 878  890          objset_t *mos = dp->dp_meta_objset;
 879  891          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 880  892          char *name;
 881  893          int error;
 882  894  
 883  895          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 884  896          ASSERT(dmu_tx_is_syncing(tx));
 885  897  
 886  898          /*
 887  899           * If the pool was created prior to SPA_VERSION_USERREFS, the
 888  900           * zap object for temporary holds might not exist yet.
 889  901           */
 890  902          if (zapobj == 0) {
 891  903                  if (holding) {
 892  904                          dsl_pool_user_hold_create_obj(dp, tx);
 893  905                          zapobj = dp->dp_tmp_userrefs_obj;
 894  906                  } else {
 895  907                          return (SET_ERROR(ENOENT));
 896  908                  }
 897  909          }
 898  910  
                   /* The formatted name is only needed for the ZAP call; free it after. */
 899  911          name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 900  912          if (holding)
 901  913                  error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 902  914          else
 903  915                  error = zap_remove(mos, zapobj, name, tx);
 904  916          strfree(name);
 905  917  
 906  918          return (error);
 907  919  }
 908  920  
 909  921  /*
 910  922   * Add a temporary hold for the given dataset object and tag.
            *
            * Thin wrapper around dsl_pool_user_hold_rele_impl() with holding set to
            * B_TRUE: `now` is stored as the value of the new entry, and the
            * temporary-holds ZAP is created first if the pool does not have one.
            * Returns the helper's result.
 911  923   */
 912  924  int
 913  925  dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 914  926      uint64_t now, dmu_tx_t *tx)
 915  927  {
 916  928          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 917  929  }
 918  930  
 919  931  /*
 920  932   * Release a temporary hold for the given dataset object and tag.
            *
            * Thin wrapper around dsl_pool_user_hold_rele_impl() with holding set to
            * B_FALSE.  Returns ENOENT if the pool has no temporary-holds ZAP,
            * otherwise the result of removing the entry.
            *
            * The helper's `now` parameter is a uint64_t that is only used when a
            * hold is added, so a plain integer 0 is passed for it; NULL is a pointer
            * constant and does not belong in an integer argument.
 921  933   */
 922  934  int
 923  935  dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 924  936      dmu_tx_t *tx)
 925  937  {
 926  938          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
 927  939              tx, B_FALSE));
 928  940  }
 929  941  
 930  942  /*
 931  943   * DSL Pool Configuration Lock
 932  944   *
 933  945   * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 934  946   * creation / destruction / rename / property setting).  It must be held for
 935  947   * read to hold a dataset or dsl_dir.  I.e. you must call
 936  948   * dsl_pool_config_enter() or dsl_pool_hold() before calling
 937  949   * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 938  950   * must be held continuously until all datasets and dsl_dirs are released.
 939  951   *
 940  952   * The only exception to this rule is that if a "long hold" is placed on
 941  953   * a dataset, then the dp_config_rwlock may be dropped while the dataset
 942  954   * is still held.  The long hold will prevent the dataset from being
 943  955   * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 944  956   * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 945  957   * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 946  958   *
 947  959   * Legitimate long-holders (including owners) should be long-running, cancelable
 948  960   * tasks that should cause "zfs destroy" to fail.  This includes DMU
 949  961   * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 950  962   * "zfs send", and "zfs diff".  There are several other long-holders whose
 951  963   * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 952  964   *
 953  965   * The usual formula for long-holding would be:
 954  966   * dsl_pool_hold()
 955  967   * dsl_dataset_hold()
 956  968   * ... perform checks ...
 957  969   * dsl_dataset_long_hold()
 958  970   * dsl_pool_rele()
 959  971   * ... perform long-running task ...
 960  972   * dsl_dataset_long_rele()
 961  973   * dsl_dataset_rele()
 962  974   *
 963  975   * Note that when the long hold is released, the dataset is still held but
 964  976   * the pool is not held.  The dataset may change arbitrarily during this time
 965  977   * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 966  978   * dataset except release it.
 967  979   *
 968  980   * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 969  981   * or modifying operations.
 970  982   *
 971  983   * Modifying operations should generally use dsl_sync_task().  The synctask
 972  984   * infrastructure enforces proper locking strategy with respect to the
 973  985   * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 974  986   *
 975  987   * Read-only operations will manually hold the pool, then the dataset, obtain
 976  988   * information from the dataset, then release the pool and dataset.
 977  989   * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 978  990   * hold/rele.
 979  991   */
 980  992  
 981  993  int
 982  994  dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 983  995  {
                   /*
                    * Open the pool `name` with spa_open() and, on success, store
                    * its dsl_pool_t in *dp and take the config lock through
                    * dsl_pool_config_enter(), both under `tag`.  On failure *dp is
                    * left unwritten.  Returns the spa_open() result.
                    *
                    * Undo with dsl_pool_rele() using the same tag.
                    */
 984  996          spa_t *spa;
 985  997          int error;
 986  998  
 987  999          error = spa_open(name, &spa, tag);
 988 1000          if (error == 0) {
 989 1001                  *dp = spa_get_dsl(spa);
 990 1002                  dsl_pool_config_enter(*dp, tag);
 991 1003          }
 992 1004          return (error);
 993 1005  }
 994 1006  
 995 1007  void
 996 1008  dsl_pool_rele(dsl_pool_t *dp, void *tag)
 997 1009  {
                   /*
                    * Counterpart of dsl_pool_hold(): drop the config lock first,
                    * then close the spa, both with the same tag.
                    */
 998 1010          dsl_pool_config_exit(dp, tag);
 999 1011          spa_close(dp->dp_spa, tag);
1000 1012  }
1001 1013  
1002 1014  void
1003 1015  dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
1004 1016  {
                   /*
                    * Take dp_config_rwlock as a reader, registering `tag` with the
                    * lock.  The assert below rejects a caller that already holds the
                    * lock for read.
                    */
1005 1017          /*
1006 1018           * We use a "reentrant" reader-writer lock, but not reentrantly.
1007 1019           *
1008 1020           * The rrwlock can (with the track_all flag) track all reading threads,
1009 1021           * which is very useful for debugging which code path failed to release
1010 1022           * the lock, and for verifying that the *current* thread does hold
1011 1023           * the lock.
1012 1024           *
1013 1025           * (Unlike a rwlock, which knows that N threads hold it for
1014 1026           * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
1015 1027           * if any thread holds it for read, even if this thread doesn't).
1016 1028           */
1017 1029          ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
1018 1030          rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
1019 1031  }
1020 1032  
1021 1033  void
1022 1034  dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
1023 1035  {
                   /* Release dp_config_rwlock, passing `tag` through to rrw_exit(). */
1024 1036          rrw_exit(&dp->dp_config_rwlock, tag);
1025 1037  }
1026 1038  
1027 1039  boolean_t
1028 1040  dsl_pool_config_held(dsl_pool_t *dp)
1029 1041  {
                   /* Report whatever RRW_LOCK_HELD() says about dp_config_rwlock. */
1030 1042          return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
1031 1043  }

↓ open down ↓

166 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX