illumos-gate Wdiff usr/src/uts/common/fs/zfs/dsl_pool.c

Print this page

5981 Deadlock in dmu_objset_find_dp

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dsl_pool.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_pool.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 Steven Hartland. All rights reserved.
  25   25   * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  26   26   */
  27   27  
  28   28  #include <sys/dsl_pool.h>
  29   29  #include <sys/dsl_dataset.h>
  30   30  #include <sys/dsl_prop.h>
  31   31  #include <sys/dsl_dir.h>
  32   32  #include <sys/dsl_synctask.h>
  33   33  #include <sys/dsl_scan.h>
  34   34  #include <sys/dnode.h>
  35   35  #include <sys/dmu_tx.h>
  36   36  #include <sys/dmu_objset.h>
  37   37  #include <sys/arc.h>
  38   38  #include <sys/zap.h>
  39   39  #include <sys/zio.h>
  40   40  #include <sys/zfs_context.h>
  41   41  #include <sys/fs/zfs.h>
  42   42  #include <sys/zfs_znode.h>
  43   43  #include <sys/spa_impl.h>
  44   44  #include <sys/dsl_deadlist.h>
  45   45  #include <sys/bptree.h>
  46   46  #include <sys/zfeature.h>
  47   47  #include <sys/zil_impl.h>
  48   48  #include <sys/dsl_userhold.h>
  49   49  
  50   50  /*
  51   51   * ZFS Write Throttle
  52   52   * ------------------
  53   53   *
  54   54   * ZFS must limit the rate of incoming writes to the rate at which it is able
  55   55   * to sync data modifications to the backend storage. Throttling by too much
  56   56   * creates an artificial limit; throttling by too little can only be sustained
  57   57   * for short periods and would lead to highly lumpy performance. On a per-pool
  58   58   * basis, ZFS tracks the amount of modified (dirty) data. As operations change
  59   59   * data, the amount of dirty data increases; as ZFS syncs out data, the amount
  60   60   * of dirty data decreases. When the amount of dirty data exceeds a
  61   61   * predetermined threshold further modifications are blocked until the amount
  62   62   * of dirty data decreases (as data is synced out).
  63   63   *
  64   64   * The limit on dirty data is tunable, and should be adjusted according to
  65   65   * both the IO capacity and available memory of the system. The larger the
  66   66   * window, the more ZFS is able to aggregate and amortize metadata (and data)
  67   67   * changes. However, memory is a limited resource, and allowing for more dirty
  68   68   * data comes at the cost of keeping other useful data in memory (for example
  69   69   * ZFS data cached by the ARC).
  70   70   *
  71   71   * Implementation
  72   72   *
  73   73   * As buffers are modified dsl_pool_dirty_space() increments both the per-
  74   74   * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
  75   75   * dirty space used; dsl_pool_undirty_space() decrements those values as
  76   76   * data is synced out from dsl_pool_sync(). While only the poolwide value is
  77   77   * relevant, the per-txg value is useful for debugging. The tunable
  78   78   * zfs_dirty_data_max determines the dirty space limit. Once that value is
  79   79   * exceeded, new writes are halted until space frees up.
  80   80   *
  81   81   * The zfs_dirty_data_sync tunable dictates the threshold at which we
  82   82   * ensure that there is a txg syncing (see the comment in txg.c for a full
  83   83   * description of transaction group stages).
  84   84   *
  85   85   * The IO scheduler uses both the dirty space limit and current amount of
  86   86   * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
  87   87   * issues. See the comment in vdev_queue.c for details of the IO scheduler.
  88   88   *
  89   89   * The delay is also calculated based on the amount of dirty data.  See the
  90   90   * comment above dmu_tx_delay() for details.
  91   91   */
  92   92  
  93   93  /*
  94   94   * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  95   95   * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
  96   96   */
  97   97  uint64_t zfs_dirty_data_max;
  98   98  uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
  99   99  int zfs_dirty_data_max_percent = 10;
 100  100  
 101  101  /*
 102  102   * If there is at least this much dirty data, push out a txg.
 103  103   */
 104  104  uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
 105  105  
 106  106  /*
 107  107   * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
 108  108   * and delay each transaction.
 109  109   * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 110  110   */
 111  111  int zfs_delay_min_dirty_percent = 60;
 112  112  
 113  113  /*
 114  114   * This controls how quickly the delay approaches infinity.
 115  115   * Larger values cause it to delay more for a given amount of dirty data.
 116  116   * Therefore larger values will cause there to be less dirty data for a
 117  117   * given throughput.
 118  118   *
 119  119   * For the smoothest delay, this value should be about 1 billion divided
 120  120   * by the maximum number of operations per second.  This will smoothly
 121  121   * handle between 10x and 1/10th this number.
 122  122   *
 123  123   * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 124  124   * multiply in dmu_tx_delay().
 125  125   */
 126  126  uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 127  127  
 128  128  
 129  129  hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
 130  130  hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
 131  131  
 132  132  int
 133  133  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 134  134  {
 135  135          uint64_t obj;
 136  136          int err;
 137  137  
 138  138          err = zap_lookup(dp->dp_meta_objset,
 139  139              dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 140  140              name, sizeof (obj), 1, &obj);
 141  141          if (err)
 142  142                  return (err);
 143  143  
 144  144          return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 145  145  }
 146  146  
 147  147  static dsl_pool_t *
 148  148  dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 149  149  {
 150  150          dsl_pool_t *dp;
 151  151          blkptr_t *bp = spa_get_rootblkptr(spa);
 152  152  
 153  153          dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 154  154          dp->dp_spa = spa;
 155  155          dp->dp_meta_rootbp = *bp;
 156  156          rrw_init(&dp->dp_config_rwlock, B_TRUE);
 157  157          txg_init(dp, txg);
 158  158  
 159  159          txg_list_create(&dp->dp_dirty_datasets,
 160  160              offsetof(dsl_dataset_t, ds_dirty_link));
 161  161          txg_list_create(&dp->dp_dirty_zilogs,
 162  162              offsetof(zilog_t, zl_dirty_link));
 163  163          txg_list_create(&dp->dp_dirty_dirs,
 164  164              offsetof(dsl_dir_t, dd_dirty_link));
 165  165          txg_list_create(&dp->dp_sync_tasks,
 166  166              offsetof(dsl_sync_task_t, dst_node));
 167  167  
 168  168          mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 169  169          cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 170  170  
 171  171          dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 172  172              1, 4, 0);
 173  173  
 174  174          return (dp);
 175  175  }
 176  176  
 177  177  int
 178  178  dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 179  179  {
 180  180          int err;
 181  181          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 182  182  
 183  183          err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 184  184              &dp->dp_meta_objset);
 185  185          if (err != 0)
 186  186                  dsl_pool_close(dp);
 187  187          else
 188  188                  *dpp = dp;
 189  189  
 190  190          return (err);
 191  191  }
 192  192  
 193  193  int
 194  194  dsl_pool_open(dsl_pool_t *dp)
 195  195  {
 196  196          int err;
 197  197          dsl_dir_t *dd;
 198  198          dsl_dataset_t *ds;
 199  199          uint64_t obj;
 200  200  
 201  201          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 202  202          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 203  203              DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 204  204              &dp->dp_root_dir_obj);
 205  205          if (err)
 206  206                  goto out;
 207  207  
 208  208          err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 209  209              NULL, dp, &dp->dp_root_dir);
 210  210          if (err)
 211  211                  goto out;
 212  212  
 213  213          err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 214  214          if (err)
 215  215                  goto out;
 216  216  
 217  217          if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 218  218                  err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 219  219                  if (err)
 220  220                          goto out;
 221  221                  err = dsl_dataset_hold_obj(dp,
 222  222                      dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 223  223                  if (err == 0) {
 224  224                          err = dsl_dataset_hold_obj(dp,
 225  225                              dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 226  226                              &dp->dp_origin_snap);
 227  227                          dsl_dataset_rele(ds, FTAG);
 228  228                  }
 229  229                  dsl_dir_rele(dd, dp);
 230  230                  if (err)
 231  231                          goto out;
 232  232          }
 233  233  
 234  234          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 235  235                  err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 236  236                      &dp->dp_free_dir);
 237  237                  if (err)
 238  238                          goto out;
 239  239  
 240  240                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 241  241                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 242  242                  if (err)
 243  243                          goto out;
 244  244                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 245  245                      dp->dp_meta_objset, obj));
 246  246          }
 247  247  
 248  248          /*
 249  249           * Note: errors ignored, because the leak dir will not exist if we
 250  250           * have not encountered a leak yet.
 251  251           */
 252  252          (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
 253  253              &dp->dp_leak_dir);
 254  254  
 255  255          if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 256  256                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 257  257                      DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 258  258                      &dp->dp_bptree_obj);
 259  259                  if (err != 0)
 260  260                          goto out;
 261  261          }
 262  262  
 263  263          if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 264  264                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 265  265                      DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 266  266                      &dp->dp_empty_bpobj);
 267  267                  if (err != 0)
 268  268                          goto out;
 269  269          }
 270  270  
 271  271          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 272  272              DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 273  273              &dp->dp_tmp_userrefs_obj);
 274  274          if (err == ENOENT)
 275  275                  err = 0;
 276  276          if (err)
 277  277                  goto out;
 278  278  
 279  279          err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 280  280  
 281  281  out:
 282  282          rrw_exit(&dp->dp_config_rwlock, FTAG);
 283  283          return (err);
 284  284  }
 285  285  
 286  286  void
 287  287  dsl_pool_close(dsl_pool_t *dp)
 288  288  {
 289  289          /*
 290  290           * Drop our references from dsl_pool_open().
 291  291           *
 292  292           * Since we held the origin_snap from "syncing" context (which
 293  293           * includes pool-opening context), it actually only got a "ref"
 294  294           * and not a hold, so just drop that here.
 295  295           */
 296  296          if (dp->dp_origin_snap)
 297  297                  dsl_dataset_rele(dp->dp_origin_snap, dp);
 298  298          if (dp->dp_mos_dir)
 299  299                  dsl_dir_rele(dp->dp_mos_dir, dp);
 300  300          if (dp->dp_free_dir)
 301  301                  dsl_dir_rele(dp->dp_free_dir, dp);
 302  302          if (dp->dp_leak_dir)
 303  303                  dsl_dir_rele(dp->dp_leak_dir, dp);
 304  304          if (dp->dp_root_dir)
 305  305                  dsl_dir_rele(dp->dp_root_dir, dp);
 306  306  
 307  307          bpobj_close(&dp->dp_free_bpobj);
 308  308  
 309  309          /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 310  310          if (dp->dp_meta_objset)
 311  311                  dmu_objset_evict(dp->dp_meta_objset);
 312  312  
 313  313          txg_list_destroy(&dp->dp_dirty_datasets);
 314  314          txg_list_destroy(&dp->dp_dirty_zilogs);
 315  315          txg_list_destroy(&dp->dp_sync_tasks);
 316  316          txg_list_destroy(&dp->dp_dirty_dirs);
 317  317  
 318  318          /*
 319  319           * We can't set retry to TRUE since we're explicitly specifying
 320  320           * a spa to flush. This is good enough; any missed buffers for
 321  321           * this spa won't cause trouble, and they'll eventually fall
 322  322           * out of the ARC just like any other unused buffer.
 323  323           */
 324  324          arc_flush(dp->dp_spa, FALSE);
 325  325  
 326  326          txg_fini(dp);
 327  327          dsl_scan_fini(dp);
 328  328          dmu_buf_user_evict_wait();
 329  329  
 330  330          rrw_destroy(&dp->dp_config_rwlock);
 331  331          mutex_destroy(&dp->dp_lock);
 332  332          taskq_destroy(dp->dp_vnrele_taskq);
 333  333          if (dp->dp_blkstats)
 334  334                  kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 335  335          kmem_free(dp, sizeof (dsl_pool_t));
 336  336  }
 337  337  
 338  338  dsl_pool_t *
 339  339  dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 340  340  {
 341  341          int err;
 342  342          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 343  343          dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 344  344          objset_t *os;
 345  345          dsl_dataset_t *ds;
 346  346          uint64_t obj;
 347  347  
 348  348          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 349  349  
 350  350          /* create and open the MOS (meta-objset) */
 351  351          dp->dp_meta_objset = dmu_objset_create_impl(spa,
 352  352              NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 353  353  
 354  354          /* create the pool directory */
 355  355          err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 356  356              DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 357  357          ASSERT0(err);
 358  358  
 359  359          /* Initialize scan structures */
 360  360          VERIFY0(dsl_scan_init(dp, txg));
 361  361  
 362  362          /* create and open the root dir */
 363  363          dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 364  364          VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 365  365              NULL, dp, &dp->dp_root_dir));
 366  366  
 367  367          /* create and open the meta-objset dir */
 368  368          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 369  369          VERIFY0(dsl_pool_open_special_dir(dp,
 370  370              MOS_DIR_NAME, &dp->dp_mos_dir));
 371  371  
 372  372          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 373  373                  /* create and open the free dir */
 374  374                  (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 375  375                      FREE_DIR_NAME, tx);
 376  376                  VERIFY0(dsl_pool_open_special_dir(dp,
 377  377                      FREE_DIR_NAME, &dp->dp_free_dir));
 378  378  
 379  379                  /* create and open the free_bplist */
 380  380                  obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 381  381                  VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 382  382                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 383  383                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 384  384                      dp->dp_meta_objset, obj));
 385  385          }
 386  386  
 387  387          if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 388  388                  dsl_pool_create_origin(dp, tx);
 389  389  
 390  390          /* create the root dataset */
 391  391          obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 392  392  
 393  393          /* create the root objset */
 394  394          VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 395  395          os = dmu_objset_create_impl(dp->dp_spa, ds,
 396  396              dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 397  397  #ifdef _KERNEL
 398  398          zfs_create_fs(os, kcred, zplprops, tx);
 399  399  #endif
 400  400          dsl_dataset_rele(ds, FTAG);
 401  401  
 402  402          dmu_tx_commit(tx);
 403  403  
 404  404          rrw_exit(&dp->dp_config_rwlock, FTAG);
 405  405  
 406  406          return (dp);
 407  407  }
 408  408  
 409  409  /*
 410  410   * Account for the meta-objset space in its placeholder dsl_dir.
 411  411   */
 412  412  void
 413  413  dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 414  414      int64_t used, int64_t comp, int64_t uncomp)
 415  415  {
 416  416          ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 417  417          mutex_enter(&dp->dp_lock);
 418  418          dp->dp_mos_used_delta += used;
 419  419          dp->dp_mos_compressed_delta += comp;
 420  420          dp->dp_mos_uncompressed_delta += uncomp;
 421  421          mutex_exit(&dp->dp_lock);
 422  422  }
 423  423  
 424  424  static int
 425  425  deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 426  426  {
 427  427          dsl_deadlist_t *dl = arg;
 428  428          dsl_deadlist_insert(dl, bp, tx);
 429  429          return (0);
 430  430  }
 431  431  
 432  432  static void
 433  433  dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 434  434  {
 435  435          zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 436  436          dmu_objset_sync(dp->dp_meta_objset, zio, tx);
 437  437          VERIFY0(zio_wait(zio));
 438  438          dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 439  439          spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 440  440  }
 441  441  
 442  442  static void
 443  443  dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 444  444  {
 445  445          ASSERT(MUTEX_HELD(&dp->dp_lock));
 446  446  
 447  447          if (delta < 0)
 448  448                  ASSERT3U(-delta, <=, dp->dp_dirty_total);
 449  449  
 450  450          dp->dp_dirty_total += delta;
 451  451  
 452  452          /*
 453  453           * Note: we signal even when increasing dp_dirty_total.
 454  454           * This ensures forward progress -- each thread wakes the next waiter.
 455  455           */
 456  456          if (dp->dp_dirty_total <= zfs_dirty_data_max)
 457  457                  cv_signal(&dp->dp_spaceavail_cv);
 458  458  }
 459  459  
 460  460  void
 461  461  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 462  462  {
 463  463          zio_t *zio;
 464  464          dmu_tx_t *tx;
 465  465          dsl_dir_t *dd;
 466  466          dsl_dataset_t *ds;
 467  467          objset_t *mos = dp->dp_meta_objset;
 468  468          list_t synced_datasets;
 469  469  
 470  470          list_create(&synced_datasets, sizeof (dsl_dataset_t),
 471  471              offsetof(dsl_dataset_t, ds_synced_link));
 472  472  
 473  473          tx = dmu_tx_create_assigned(dp, txg);
 474  474  
 475  475          /*
 476  476           * Write out all dirty blocks of dirty datasets.
 477  477           */
 478  478          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 479  479          while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 480  480                  /*
 481  481                   * We must not sync any non-MOS datasets twice, because
 482  482                   * we may have taken a snapshot of them.  However, we
 483  483                   * may sync newly-created datasets on pass 2.
 484  484                   */
 485  485                  ASSERT(!list_link_active(&ds->ds_synced_link));
 486  486                  list_insert_tail(&synced_datasets, ds);
 487  487                  dsl_dataset_sync(ds, zio, tx);
 488  488          }
 489  489          VERIFY0(zio_wait(zio));
 490  490  
 491  491          /*
 492  492           * We have written all of the accounted dirty data, so our
 493  493           * dp_space_towrite should now be zero.  However, some seldom-used
 494  494           * code paths do not adhere to this (e.g. dbuf_undirty(), also
 495  495           * rounding error in dbuf_write_physdone).
 496  496           * Shore up the accounting of any dirtied space now.
 497  497           */
 498  498          dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 499  499  
 500  500          /*
 501  501           * After the data blocks have been written (ensured by the zio_wait()
 502  502           * above), update the user/group space accounting.
 503  503           */
 504  504          for (ds = list_head(&synced_datasets); ds != NULL;
 505  505              ds = list_next(&synced_datasets, ds)) {
 506  506                  dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 507  507          }
 508  508  
 509  509          /*
 510  510           * Sync the datasets again to push out the changes due to
 511  511           * userspace updates.  This must be done before we process the
 512  512           * sync tasks, so that any snapshots will have the correct
 513  513           * user accounting information (and we won't get confused
 514  514           * about which blocks are part of the snapshot).
 515  515           */
 516  516          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 517  517          while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 518  518                  ASSERT(list_link_active(&ds->ds_synced_link));
 519  519                  dmu_buf_rele(ds->ds_dbuf, ds);
 520  520                  dsl_dataset_sync(ds, zio, tx);
 521  521          }
 522  522          VERIFY0(zio_wait(zio));
 523  523  
 524  524          /*
 525  525           * Now that the datasets have been completely synced, we can
 526  526           * clean up our in-memory structures accumulated while syncing:
 527  527           *
 528  528           *  - move dead blocks from the pending deadlist to the on-disk deadlist
 529  529           *  - release hold from dsl_dataset_dirty()
 530  530           */
 531  531          while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 532  532                  objset_t *os = ds->ds_objset;
 533  533                  bplist_iterate(&ds->ds_pending_deadlist,
 534  534                      deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 535  535                  ASSERT(!dmu_objset_is_dirty(os, txg));
 536  536                  dmu_buf_rele(ds->ds_dbuf, ds);
 537  537          }
 538  538          while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 539  539                  dsl_dir_sync(dd, tx);
 540  540          }
 541  541  
 542  542          /*
 543  543           * The MOS's space is accounted for in the pool/$MOS
 544  544           * (dp_mos_dir).  We can't modify the mos while we're syncing
 545  545           * it, so we remember the deltas and apply them here.
 546  546           */
 547  547          if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 548  548              dp->dp_mos_uncompressed_delta != 0) {
 549  549                  dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 550  550                      dp->dp_mos_used_delta,
 551  551                      dp->dp_mos_compressed_delta,
 552  552                      dp->dp_mos_uncompressed_delta, tx);
 553  553                  dp->dp_mos_used_delta = 0;
 554  554                  dp->dp_mos_compressed_delta = 0;
 555  555                  dp->dp_mos_uncompressed_delta = 0;
 556  556          }
 557  557  
 558  558          if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 559  559              list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 560  560                  dsl_pool_sync_mos(dp, tx);
 561  561          }
 562  562  
 563  563          /*
 564  564           * If we modify a dataset in the same txg that we want to destroy it,
 565  565           * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 566  566           * dsl_dir_destroy_check() will fail if there are unexpected holds.
 567  567           * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 568  568           * and clearing the hold on it) before we process the sync_tasks.
 569  569           * The MOS data dirtied by the sync_tasks will be synced on the next
 570  570           * pass.
 571  571           */
 572  572          if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 573  573                  dsl_sync_task_t *dst;
 574  574                  /*
 575  575                   * No more sync tasks should have been added while we
 576  576                   * were syncing.
 577  577                   */
 578  578                  ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 579  579                  while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 580  580                          dsl_sync_task_sync(dst, tx);
 581  581          }
 582  582  
 583  583          dmu_tx_commit(tx);
 584  584  
 585  585          DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 586  586  }
 587  587  
 588  588  void
 589  589  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 590  590  {
 591  591          zilog_t *zilog;
 592  592  
 593  593          while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
 594  594                  dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 595  595                  zil_clean(zilog, txg);
 596  596                  ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 597  597                  dmu_buf_rele(ds->ds_dbuf, zilog);
 598  598          }
 599  599          ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 600  600  }
 601  601  
 602  602  /*
 603  603   * TRUE if the current thread is the tx_sync_thread or if we
 604  604   * are being called from SPA context during pool initialization.
 605  605   */
 606  606  int
 607  607  dsl_pool_sync_context(dsl_pool_t *dp)
 608  608  {
 609  609          return (curthread == dp->dp_tx.tx_sync_thread ||
 610  610              spa_is_initializing(dp->dp_spa));
 611  611  }
 612  612  
 613  613  uint64_t
 614  614  dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 615  615  {
 616  616          uint64_t space, resv;
 617  617  
 618  618          /*
 619  619           * If we're trying to assess whether it's OK to do a free,
 620  620           * cut the reservation in half to allow forward progress
 621  621           * (e.g. make it possible to rm(1) files from a full pool).
 622  622           */
 623  623          space = spa_get_dspace(dp->dp_spa);
 624  624          resv = spa_get_slop_space(dp->dp_spa);
 625  625          if (netfree)
 626  626                  resv >>= 1;
 627  627  
 628  628          return (space - resv);
 629  629  }
 630  630  
 631  631  boolean_t
 632  632  dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 633  633  {
 634  634          uint64_t delay_min_bytes =
 635  635              zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 636  636          boolean_t rv;
 637  637  
 638  638          mutex_enter(&dp->dp_lock);
 639  639          if (dp->dp_dirty_total > zfs_dirty_data_sync)
 640  640                  txg_kick(dp);
 641  641          rv = (dp->dp_dirty_total > delay_min_bytes);
 642  642          mutex_exit(&dp->dp_lock);
 643  643          return (rv);
 644  644  }
 645  645  
 646  646  void
 647  647  dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 648  648  {
 649  649          if (space > 0) {
 650  650                  mutex_enter(&dp->dp_lock);
 651  651                  dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 652  652                  dsl_pool_dirty_delta(dp, space);
 653  653                  mutex_exit(&dp->dp_lock);
 654  654          }
 655  655  }
 656  656  
 657  657  void
 658  658  dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 659  659  {
 660  660          ASSERT3S(space, >=, 0);
 661  661          if (space == 0)
 662  662                  return;
 663  663          mutex_enter(&dp->dp_lock);
 664  664          if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
 665  665                  /* XXX writing something we didn't dirty? */
 666  666                  space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 667  667          }
 668  668          ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 669  669          dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 670  670          ASSERT3U(dp->dp_dirty_total, >=, space);
 671  671          dsl_pool_dirty_delta(dp, -space);
 672  672          mutex_exit(&dp->dp_lock);
 673  673  }
 674  674  
 675  675  /* ARGSUSED */
 676  676  static int
 677  677  upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 678  678  {
 679  679          dmu_tx_t *tx = arg;
 680  680          dsl_dataset_t *ds, *prev = NULL;
 681  681          int err;
 682  682  
 683  683          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 684  684          if (err)
 685  685                  return (err);
 686  686  
 687  687          while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 688  688                  err = dsl_dataset_hold_obj(dp,
 689  689                      dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 690  690                  if (err) {
 691  691                          dsl_dataset_rele(ds, FTAG);
 692  692                          return (err);
 693  693                  }
 694  694  
 695  695                  if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 696  696                          break;
 697  697                  dsl_dataset_rele(ds, FTAG);
 698  698                  ds = prev;
 699  699                  prev = NULL;
 700  700          }
 701  701  
 702  702          if (prev == NULL) {
 703  703                  prev = dp->dp_origin_snap;
 704  704  
 705  705                  /*
 706  706                   * The $ORIGIN can't have any data, or the accounting
 707  707                   * will be wrong.
 708  708                   */
 709  709                  ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
 710  710  
 711  711                  /* The origin doesn't get attached to itself */
 712  712                  if (ds->ds_object == prev->ds_object) {
 713  713                          dsl_dataset_rele(ds, FTAG);
 714  714                          return (0);
 715  715                  }
 716  716  
 717  717                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
 718  718                  dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
 719  719                  dsl_dataset_phys(ds)->ds_prev_snap_txg =
 720  720                      dsl_dataset_phys(prev)->ds_creation_txg;
 721  721  
 722  722                  dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 723  723                  dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 724  724  
 725  725                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 726  726                  dsl_dataset_phys(prev)->ds_num_children++;
 727  727  
 728  728                  if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 729  729                          ASSERT(ds->ds_prev == NULL);
 730  730                          VERIFY0(dsl_dataset_hold_obj(dp,
 731  731                              dsl_dataset_phys(ds)->ds_prev_snap_obj,
 732  732                              ds, &ds->ds_prev));
 733  733                  }
 734  734          }
 735  735  
 736  736          ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
 737  737          ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 738  738  
 739  739          if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 740  740                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 741  741                  dsl_dataset_phys(prev)->ds_next_clones_obj =
 742  742                      zap_create(dp->dp_meta_objset,
 743  743                      DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 744  744          }
 745  745          VERIFY0(zap_add_int(dp->dp_meta_objset,
 746  746              dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 747  747  
 748  748          dsl_dataset_rele(ds, FTAG);
 749  749          if (prev != dp->dp_origin_snap)
 750  750                  dsl_dataset_rele(prev, FTAG);
 751  751          return (0);
 752  752  }
 753  753  
 754  754  void
 755  755  dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 756  756  {
 757  757          ASSERT(dmu_tx_is_syncing(tx));
 758  758          ASSERT(dp->dp_origin_snap != NULL);
 759  759  
 760  760          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 761  761              tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 762  762  }
 763  763  
 764  764  /* ARGSUSED */
 765  765  static int
 766  766  upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 767  767  {
 768  768          dmu_tx_t *tx = arg;
 769  769          objset_t *mos = dp->dp_meta_objset;
 770  770  
 771  771          if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
 772  772                  dsl_dataset_t *origin;
 773  773  
 774  774                  VERIFY0(dsl_dataset_hold_obj(dp,
 775  775                      dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 776  776  
 777  777                  if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 778  778                          dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 779  779                          dsl_dir_phys(origin->ds_dir)->dd_clones =
 780  780                              zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
 781  781                              0, tx);
 782  782                  }
 783  783  
 784  784                  VERIFY0(zap_add_int(dp->dp_meta_objset,
 785  785                      dsl_dir_phys(origin->ds_dir)->dd_clones,
 786  786                      ds->ds_object, tx));
 787  787  
 788  788                  dsl_dataset_rele(origin, FTAG);
 789  789          }
 790  790          return (0);
 791  791  }
 792  792  
 793  793  void
 794  794  dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 795  795  {
 796  796          ASSERT(dmu_tx_is_syncing(tx));
 797  797          uint64_t obj;
 798  798  
 799  799          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 800  800          VERIFY0(dsl_pool_open_special_dir(dp,
 801  801              FREE_DIR_NAME, &dp->dp_free_dir));
 802  802  
 803  803          /*
 804  804           * We can't use bpobj_alloc(), because spa_version() still
 805  805           * returns the old version, and we need a new-version bpobj with
 806  806           * subobj support.  So call dmu_object_alloc() directly.
 807  807           */
 808  808          obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 809  809              SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 810  810          VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 811  811              DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 812  812          VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 813  813  
 814  814          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 815  815              upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 816  816  }
 817  817  
 818  818  void
 819  819  dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 820  820  {
 821  821          uint64_t dsobj;
 822  822          dsl_dataset_t *ds;
 823  823  
 824  824          ASSERT(dmu_tx_is_syncing(tx));
 825  825          ASSERT(dp->dp_origin_snap == NULL);
 826  826          ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 827  827  
 828  828          /* create the origin dir, ds, & snap-ds */
 829  829          dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 830  830              NULL, 0, kcred, tx);
 831  831          VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 832  832          dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 833  833          VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 834  834              dp, &dp->dp_origin_snap));
 835  835          dsl_dataset_rele(ds, FTAG);
 836  836  }
 837  837  
 838  838  taskq_t *
 839  839  dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 840  840  {
 841  841          return (dp->dp_vnrele_taskq);
 842  842  }
 843  843  
 844  844  /*
 845  845   * Walk through the pool-wide zap object of temporary snapshot user holds
 846  846   * and release them.
 847  847   */
 848  848  void
 849  849  dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 850  850  {
 851  851          zap_attribute_t za;
 852  852          zap_cursor_t zc;
 853  853          objset_t *mos = dp->dp_meta_objset;
 854  854          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 855  855          nvlist_t *holds;
 856  856  
 857  857          if (zapobj == 0)
 858  858                  return;
 859  859          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 860  860  
 861  861          holds = fnvlist_alloc();
 862  862  
 863  863          for (zap_cursor_init(&zc, mos, zapobj);
 864  864              zap_cursor_retrieve(&zc, &za) == 0;
 865  865              zap_cursor_advance(&zc)) {
 866  866                  char *htag;
 867  867                  nvlist_t *tags;
 868  868  
 869  869                  htag = strchr(za.za_name, '-');
 870  870                  *htag = '\0';
 871  871                  ++htag;
 872  872                  if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 873  873                          tags = fnvlist_alloc();
 874  874                          fnvlist_add_boolean(tags, htag);
 875  875                          fnvlist_add_nvlist(holds, za.za_name, tags);
 876  876                          fnvlist_free(tags);
 877  877                  } else {
 878  878                          fnvlist_add_boolean(tags, htag);
 879  879                  }
 880  880          }
 881  881          dsl_dataset_user_release_tmp(dp, holds);
 882  882          fnvlist_free(holds);
 883  883          zap_cursor_fini(&zc);
 884  884  }
 885  885  
 886  886  /*
 887  887   * Create the pool-wide zap object for storing temporary snapshot holds.
 888  888   */
 889  889  void
 890  890  dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 891  891  {
 892  892          objset_t *mos = dp->dp_meta_objset;
 893  893  
 894  894          ASSERT(dp->dp_tmp_userrefs_obj == 0);
 895  895          ASSERT(dmu_tx_is_syncing(tx));
 896  896  
 897  897          dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 898  898              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 899  899  }
 900  900  
 901  901  static int
 902  902  dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 903  903      const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 904  904  {
 905  905          objset_t *mos = dp->dp_meta_objset;
 906  906          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 907  907          char *name;
 908  908          int error;
 909  909  
 910  910          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 911  911          ASSERT(dmu_tx_is_syncing(tx));
 912  912  
 913  913          /*
 914  914           * If the pool was created prior to SPA_VERSION_USERREFS, the
 915  915           * zap object for temporary holds might not exist yet.
 916  916           */
 917  917          if (zapobj == 0) {
 918  918                  if (holding) {
 919  919                          dsl_pool_user_hold_create_obj(dp, tx);
 920  920                          zapobj = dp->dp_tmp_userrefs_obj;
 921  921                  } else {
 922  922                          return (SET_ERROR(ENOENT));
 923  923                  }
 924  924          }
 925  925  
 926  926          name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 927  927          if (holding)
 928  928                  error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 929  929          else
 930  930                  error = zap_remove(mos, zapobj, name, tx);
 931  931          strfree(name);
 932  932  
 933  933          return (error);
 934  934  }
 935  935  
 936  936  /*
 937  937   * Add a temporary hold for the given dataset object and tag.
 938  938   */
 939  939  int
 940  940  dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 941  941      uint64_t now, dmu_tx_t *tx)
 942  942  {
 943  943          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 944  944  }
 945  945  
 946  946  /*
 947  947   * Release a temporary hold for the given dataset object and tag.
 948  948   */
 949  949  int
 950  950  dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 951  951      dmu_tx_t *tx)
 952  952  {
 953  953          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
 954  954              tx, B_FALSE));
 955  955  }
 956  956  
 957  957  /*
 958  958   * DSL Pool Configuration Lock
 959  959   *
 960  960   * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 961  961   * creation / destruction / rename / property setting).  It must be held for
 962  962   * read to hold a dataset or dsl_dir.  I.e. you must call
 963  963   * dsl_pool_config_enter() or dsl_pool_hold() before calling
 964  964   * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 965  965   * must be held continuously until all datasets and dsl_dirs are released.
 966  966   *
 967  967   * The only exception to this rule is that if a "long hold" is placed on
 968  968   * a dataset, then the dp_config_rwlock may be dropped while the dataset
 969  969   * is still held.  The long hold will prevent the dataset from being
 970  970   * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 971  971   * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 972  972   * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 973  973   *
 974  974   * Legitimate long-holders (including owners) should be long-running, cancelable
 975  975   * tasks that should cause "zfs destroy" to fail.  This includes DMU
 976  976   * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 977  977   * "zfs send", and "zfs diff".  There are several other long-holders whose
 978  978   * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 979  979   *
 980  980   * The usual formula for long-holding would be:
 981  981   * dsl_pool_hold()
 982  982   * dsl_dataset_hold()
 983  983   * ... perform checks ...
 984  984   * dsl_dataset_long_hold()
 985  985   * dsl_pool_rele()
 986  986   * ... perform long-running task ...
 987  987   * dsl_dataset_long_rele()
 988  988   * dsl_dataset_rele()
 989  989   *
 990  990   * Note that when the long hold is released, the dataset is still held but
 991  991   * the pool is not held.  The dataset may change arbitrarily during this time
 992  992   * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 993  993   * dataset except release it.
 994  994   *
 995  995   * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 996  996   * or modifying operations.
 997  997   *
 998  998   * Modifying operations should generally use dsl_sync_task().  The synctask
 999  999   * infrastructure enforces proper locking strategy with respect to the
1000 1000   * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
1001 1001   *
1002 1002   * Read-only operations will manually hold the pool, then the dataset, obtain
1003 1003   * information from the dataset, then release the pool and dataset.
1004 1004   * dmu_objset_{hold,rele}() are convenience routines that also do the pool
1005 1005   * hold/rele.
1006 1006   */
1007 1007  
1008 1008  int
1009 1009  dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
1010 1010  {
1011 1011          spa_t *spa;
1012 1012          int error;
1013 1013  
1014 1014          error = spa_open(name, &spa, tag);
1015 1015          if (error == 0) {
1016 1016                  *dp = spa_get_dsl(spa);
1017 1017                  dsl_pool_config_enter(*dp, tag);
1018 1018          }
1019 1019          return (error);
1020 1020  }
1021 1021  
1022 1022  void
1023 1023  dsl_pool_rele(dsl_pool_t *dp, void *tag)
1024 1024  {
1025 1025          dsl_pool_config_exit(dp, tag);
1026 1026          spa_close(dp->dp_spa, tag);
1027 1027  }
1028 1028  
1029 1029  void
1030 1030  dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
1031 1031  {
1032 1032          /*
1033 1033           * We use a "reentrant" reader-writer lock, but not reentrantly.
1034 1034           *
1035 1035           * The rrwlock can (with the track_all flag) track all reading threads,
1036 1036           * which is very useful for debugging which code path failed to release
1037 1037           * the lock, and for verifying that the *current* thread does hold
1038 1038           * the lock.

↓ open down ↓

1038 lines elided

↑ open up ↑

1039 1039           *
1040 1040           * (Unlike a rwlock, which knows that N threads hold it for
1041 1041           * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
1042 1042           * if any thread holds it for read, even if this thread doesn't).
1043 1043           */
1044 1044          ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
1045 1045          rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
1046 1046  }
1047 1047  
1048 1048  void
     1049 +dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
     1050 +{
     1051 +        ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
     1052 +        rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
     1053 +}
     1054 +
     1055 +void
1049 1056  dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
1050 1057  {
1051 1058          rrw_exit(&dp->dp_config_rwlock, tag);
1052 1059  }
1053 1060  
1054 1061  boolean_t
1055 1062  dsl_pool_config_held(dsl_pool_t *dp)
1056 1063  {
1057 1064          return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
1058 1065  }
1059 1066  
1060 1067  boolean_t
1061 1068  dsl_pool_config_held_writer(dsl_pool_t *dp)
1062 1069  {
1063 1070          return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
1064 1071  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX