3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
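
This webrev replaces the idiom VERIFY3U(0, ==, x) / ASSERT3U(err, ==, 0)
with the dedicated zero-checking forms VERIFY0(x) / ASSERT0(err)
throughout dsl_pool.c. A minimal sketch of the new macros, assuming they
simply wrap the existing three-argument forms (the actual definitions
live in sys/debug.h and handle the integer widening there):

        /* Sketch only; see sys/debug.h for the real definitions. */
        #define VERIFY0(x)      VERIFY3U((x), ==, 0)    /* checked in all builds */
        #define ASSERT0(x)      ASSERT3U((x), ==, 0)    /* DEBUG builds only, like ASSERT */

The call sites below are otherwise unchanged; only the spelling of the
assertion differs between the old and new columns.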
    
      
          --- old/usr/src/uts/common/fs/zfs/dsl_pool.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_pool.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  #include <sys/dsl_pool.h>
  27   27  #include <sys/dsl_dataset.h>
  28   28  #include <sys/dsl_prop.h>
  29   29  #include <sys/dsl_dir.h>
  30   30  #include <sys/dsl_synctask.h>
  31   31  #include <sys/dsl_scan.h>
  32   32  #include <sys/dnode.h>
  33   33  #include <sys/dmu_tx.h>
  34   34  #include <sys/dmu_objset.h>
  35   35  #include <sys/arc.h>
  36   36  #include <sys/zap.h>
  37   37  #include <sys/zio.h>
  38   38  #include <sys/zfs_context.h>
  39   39  #include <sys/fs/zfs.h>
  40   40  #include <sys/zfs_znode.h>
  41   41  #include <sys/spa_impl.h>
  42   42  #include <sys/dsl_deadlist.h>
  43   43  #include <sys/bptree.h>
  44   44  #include <sys/zfeature.h>
  45   45  
  46   46  int zfs_no_write_throttle = 0;
  47   47  int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  48   48  int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  49   49  
  50   50  uint64_t zfs_write_limit_min = 32 << 20;        /* min write limit is 32MB */
  51   51  uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
  52   52  uint64_t zfs_write_limit_inflated = 0;
  53   53  uint64_t zfs_write_limit_override = 0;
  54   54  
  55   55  kmutex_t zfs_write_limit_lock;
  56   56  
  57   57  static pgcnt_t old_physmem = 0;
  58   58  
  59   59  int
  60   60  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  61   61  {
  62   62          uint64_t obj;
  63   63          int err;
  64   64  
  65   65          err = zap_lookup(dp->dp_meta_objset,
  66   66              dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
  67   67              name, sizeof (obj), 1, &obj);
  68   68          if (err)
  69   69                  return (err);
  70   70  
  71   71          return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
  72   72  }
  73   73  
  74   74  static dsl_pool_t *
  75   75  dsl_pool_open_impl(spa_t *spa, uint64_t txg)
  76   76  {
  77   77          dsl_pool_t *dp;
  78   78          blkptr_t *bp = spa_get_rootblkptr(spa);
  79   79  
  80   80          dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
  81   81          dp->dp_spa = spa;
  82   82          dp->dp_meta_rootbp = *bp;
  83   83          rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
  84   84          dp->dp_write_limit = zfs_write_limit_min;
  85   85          txg_init(dp, txg);
  86   86  
  87   87          txg_list_create(&dp->dp_dirty_datasets,
  88   88              offsetof(dsl_dataset_t, ds_dirty_link));
  89   89          txg_list_create(&dp->dp_dirty_dirs,
  90   90              offsetof(dsl_dir_t, dd_dirty_link));
  91   91          txg_list_create(&dp->dp_sync_tasks,
  92   92              offsetof(dsl_sync_task_group_t, dstg_node));
  93   93          list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
  94   94              offsetof(dsl_dataset_t, ds_synced_link));
  95   95  
  96   96          mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
  97   97  
  98   98          dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
  99   99              1, 4, 0);
 100  100  
 101  101          return (dp);
 102  102  }
 103  103  
 104  104  int
 105  105  dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 106  106  {
 107  107          int err;
 108  108          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 109  109  
 110  110          err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 111  111              &dp->dp_meta_objset);
 112  112          if (err != 0)
 113  113                  dsl_pool_close(dp);
 114  114          else
 115  115                  *dpp = dp;
 116  116  
 117  117          return (err);
 118  118  }
 119  119  
 120  120  int
 121  121  dsl_pool_open(dsl_pool_t *dp)
 122  122  {
 123  123          int err;
 124  124          dsl_dir_t *dd;
 125  125          dsl_dataset_t *ds;
 126  126          uint64_t obj;
 127  127  
 128  128          ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset));
 129  129  
 130  130          rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 131  131          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 132  132              DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 133  133              &dp->dp_root_dir_obj);
 134  134          if (err)
 135  135                  goto out;
 136  136  
 137  137          err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 138  138              NULL, dp, &dp->dp_root_dir);
 139  139          if (err)
 140  140                  goto out;
 141  141  
 142  142          err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 143  143          if (err)
 144  144                  goto out;
 145  145  
 146  146          if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 147  147                  err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 148  148                  if (err)
 149  149                          goto out;
 150  150                  err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 151  151                      FTAG, &ds);
 152  152                  if (err == 0) {
 153  153                          err = dsl_dataset_hold_obj(dp,
 154  154                              ds->ds_phys->ds_prev_snap_obj, dp,
 155  155                              &dp->dp_origin_snap);
 156  156                          dsl_dataset_rele(ds, FTAG);
 157  157                  }
 158  158                  dsl_dir_close(dd, dp);
 159  159                  if (err)
 160  160                          goto out;
 161  161          }
 162  162  
  
 163  163          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 164  164                  err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 165  165                      &dp->dp_free_dir);
 166  166                  if (err)
 167  167                          goto out;
 168  168  
 169  169                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 170  170                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 171  171                  if (err)
 172  172                          goto out;
 173      -                VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
      173 +                VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 174  174                      dp->dp_meta_objset, obj));
 175  175          }
 176  176  
 177  177          if (spa_feature_is_active(dp->dp_spa,
 178  178              &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 179  179                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 180  180                      DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 181  181                      &dp->dp_bptree_obj);
 182  182                  if (err != 0)
 183  183                          goto out;
 184  184          }
 185  185  
 186  186          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 187  187              DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 188  188              &dp->dp_tmp_userrefs_obj);
 189  189          if (err == ENOENT)
 190  190                  err = 0;
 191  191          if (err)
 192  192                  goto out;
 193  193  
 194  194          err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 195  195  
 196  196  out:
 197  197          rw_exit(&dp->dp_config_rwlock);
 198  198          return (err);
 199  199  }
 200  200  
 201  201  void
 202  202  dsl_pool_close(dsl_pool_t *dp)
 203  203  {
 204  204          /* drop our references from dsl_pool_open() */
 205  205  
 206  206          /*
 207  207           * Since we held the origin_snap from "syncing" context (which
 208  208           * includes pool-opening context), it actually only got a "ref"
 209  209           * and not a hold, so just drop that here.
 210  210           */
 211  211          if (dp->dp_origin_snap)
 212  212                  dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 213  213          if (dp->dp_mos_dir)
 214  214                  dsl_dir_close(dp->dp_mos_dir, dp);
 215  215          if (dp->dp_free_dir)
 216  216                  dsl_dir_close(dp->dp_free_dir, dp);
 217  217          if (dp->dp_root_dir)
 218  218                  dsl_dir_close(dp->dp_root_dir, dp);
 219  219  
 220  220          bpobj_close(&dp->dp_free_bpobj);
 221  221  
 222  222          /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 223  223          if (dp->dp_meta_objset)
 224  224                  dmu_objset_evict(dp->dp_meta_objset);
 225  225  
 226  226          txg_list_destroy(&dp->dp_dirty_datasets);
 227  227          txg_list_destroy(&dp->dp_sync_tasks);
 228  228          txg_list_destroy(&dp->dp_dirty_dirs);
 229  229          list_destroy(&dp->dp_synced_datasets);
 230  230  
 231  231          arc_flush(dp->dp_spa);
 232  232          txg_fini(dp);
 233  233          dsl_scan_fini(dp);
 234  234          rw_destroy(&dp->dp_config_rwlock);
 235  235          mutex_destroy(&dp->dp_lock);
 236  236          taskq_destroy(dp->dp_vnrele_taskq);
 237  237          if (dp->dp_blkstats)
 238  238                  kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 239  239          kmem_free(dp, sizeof (dsl_pool_t));
 240  240  }
 241  241  
 242  242  dsl_pool_t *
 243  243  dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 244  244  {
 245  245          int err;
 246  246          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 247  247          dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 248  248          objset_t *os;
  
 249  249          dsl_dataset_t *ds;
 250  250          uint64_t obj;
 251  251  
 252  252          /* create and open the MOS (meta-objset) */
 253  253          dp->dp_meta_objset = dmu_objset_create_impl(spa,
 254  254              NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 255  255  
 256  256          /* create the pool directory */
 257  257          err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 258  258              DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 259      -        ASSERT3U(err, ==, 0);
      259 +        ASSERT0(err);
 260  260  
 261  261          /* Initialize scan structures */
 262      -        VERIFY3U(0, ==, dsl_scan_init(dp, txg));
      262 +        VERIFY0(dsl_scan_init(dp, txg));
 263  263  
 264  264          /* create and open the root dir */
 265  265          dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 266  266          VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 267  267              NULL, dp, &dp->dp_root_dir));
 268  268  
 269  269          /* create and open the meta-objset dir */
 270  270          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 271  271          VERIFY(0 == dsl_pool_open_special_dir(dp,
 272  272              MOS_DIR_NAME, &dp->dp_mos_dir));
 273  273  
 274  274          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
  
 275  275                  /* create and open the free dir */
 276  276                  (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 277  277                      FREE_DIR_NAME, tx);
 278  278                  VERIFY(0 == dsl_pool_open_special_dir(dp,
 279  279                      FREE_DIR_NAME, &dp->dp_free_dir));
 280  280  
 281  281                  /* create and open the free_bplist */
 282  282                  obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
 283  283                  VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 284  284                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 285      -                VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
      285 +                VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 286  286                      dp->dp_meta_objset, obj));
 287  287          }
 288  288  
 289  289          if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 290  290                  dsl_pool_create_origin(dp, tx);
 291  291  
 292  292          /* create the root dataset */
 293  293          obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 294  294  
 295  295          /* create the root objset */
 296  296          VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 297  297          os = dmu_objset_create_impl(dp->dp_spa, ds,
 298  298              dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 299  299  #ifdef _KERNEL
 300  300          zfs_create_fs(os, kcred, zplprops, tx);
 301  301  #endif
 302  302          dsl_dataset_rele(ds, FTAG);
 303  303  
 304  304          dmu_tx_commit(tx);
 305  305  
 306  306          return (dp);
 307  307  }
 308  308  
 309  309  static int
 310  310  deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 311  311  {
 312  312          dsl_deadlist_t *dl = arg;
 313  313          dsl_pool_t *dp = dmu_objset_pool(dl->dl_os);
 314  314          rw_enter(&dp->dp_config_rwlock, RW_READER);
 315  315          dsl_deadlist_insert(dl, bp, tx);
 316  316          rw_exit(&dp->dp_config_rwlock);
 317  317          return (0);
 318  318  }
 319  319  
 320  320  void
 321  321  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 322  322  {
 323  323          zio_t *zio;
 324  324          dmu_tx_t *tx;
 325  325          dsl_dir_t *dd;
 326  326          dsl_dataset_t *ds;
 327  327          dsl_sync_task_group_t *dstg;
 328  328          objset_t *mos = dp->dp_meta_objset;
 329  329          hrtime_t start, write_time;
 330  330          uint64_t data_written;
 331  331          int err;
 332  332  
 333  333          /*
 334  334           * We need to copy dp_space_towrite() before doing
 335  335           * dsl_sync_task_group_sync(), because
 336  336           * dsl_dataset_snapshot_reserve_space() will increase
 337  337           * dp_space_towrite but not actually write anything.
 338  338           */
 339  339          data_written = dp->dp_space_towrite[txg & TXG_MASK];
 340  340  
 341  341          tx = dmu_tx_create_assigned(dp, txg);
 342  342  
 343  343          dp->dp_read_overhead = 0;
 344  344          start = gethrtime();
 345  345  
 346  346          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 347  347          while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 348  348                  /*
 349  349                   * We must not sync any non-MOS datasets twice, because
 350  350                   * we may have taken a snapshot of them.  However, we
 351  351                   * may sync newly-created datasets on pass 2.
 352  352                   */
 353  353                  ASSERT(!list_link_active(&ds->ds_synced_link));
 354  354                  list_insert_tail(&dp->dp_synced_datasets, ds);
 355  355                  dsl_dataset_sync(ds, zio, tx);
 356  356          }
 357  357          DTRACE_PROBE(pool_sync__1setup);
 358  358          err = zio_wait(zio);
 359  359  
 360  360          write_time = gethrtime() - start;
 361  361          ASSERT(err == 0);
 362  362          DTRACE_PROBE(pool_sync__2rootzio);
 363  363  
 364  364          for (ds = list_head(&dp->dp_synced_datasets); ds;
 365  365              ds = list_next(&dp->dp_synced_datasets, ds))
 366  366                  dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 367  367  
 368  368          /*
 369  369           * Sync the datasets again to push out the changes due to
 370  370           * userspace updates.  This must be done before we process the
 371  371           * sync tasks, because that could cause a snapshot of a dataset
 372  372           * whose ds_bp will be rewritten when we do this 2nd sync.
 373  373           */
 374  374          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 375  375          while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
 376  376                  ASSERT(list_link_active(&ds->ds_synced_link));
 377  377                  dmu_buf_rele(ds->ds_dbuf, ds);
 378  378                  dsl_dataset_sync(ds, zio, tx);
 379  379          }
 380  380          err = zio_wait(zio);
 381  381  
 382  382          /*
 383  383           * Move dead blocks from the pending deadlist to the on-disk
 384  384           * deadlist.
 385  385           */
 386  386          for (ds = list_head(&dp->dp_synced_datasets); ds;
 387  387              ds = list_next(&dp->dp_synced_datasets, ds)) {
 388  388                  bplist_iterate(&ds->ds_pending_deadlist,
 389  389                      deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 390  390          }
 391  391  
 392  392          while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
 393  393                  /*
 394  394                   * No more sync tasks should have been added while we
 395  395                   * were syncing.
 396  396                   */
 397  397                  ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 398  398                  dsl_sync_task_group_sync(dstg, tx);
 399  399          }
 400  400          DTRACE_PROBE(pool_sync__3task);
 401  401  
 402  402          start = gethrtime();
 403  403          while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
 404  404                  dsl_dir_sync(dd, tx);
 405  405          write_time += gethrtime() - start;
 406  406  
 407  407          start = gethrtime();
 408  408          if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 409  409              list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 410  410                  zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 411  411                  dmu_objset_sync(mos, zio, tx);
 412  412                  err = zio_wait(zio);
 413  413                  ASSERT(err == 0);
 414  414                  dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 415  415                  spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 416  416          }
 417  417          write_time += gethrtime() - start;
 418  418          DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
 419  419              hrtime_t, dp->dp_read_overhead);
 420  420          write_time -= dp->dp_read_overhead;
 421  421  
 422  422          dmu_tx_commit(tx);
 423  423  
 424  424          dp->dp_space_towrite[txg & TXG_MASK] = 0;
 425  425          ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 426  426  
 427  427          /*
 428  428           * If the write limit max has not been explicitly set, set it
 429  429           * to a fraction of available physical memory (default 1/8th).
 430  430           * Note that we must inflate the limit because the spa
 431  431           * inflates write sizes to account for data replication.
 432  432           * Check this each sync phase to catch changing memory size.
 433  433           */
 434  434          if (physmem != old_physmem && zfs_write_limit_shift) {
 435  435                  mutex_enter(&zfs_write_limit_lock);
 436  436                  old_physmem = physmem;
 437  437                  zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 438  438                  zfs_write_limit_inflated = MAX(zfs_write_limit_min,
 439  439                      spa_get_asize(dp->dp_spa, zfs_write_limit_max));
 440  440                  mutex_exit(&zfs_write_limit_lock);
 441  441          }
 442  442  
 443  443          /*
 444  444           * Attempt to keep the sync time consistent by adjusting the
 445  445           * amount of write traffic allowed into each transaction group.
 446  446           * Weight the throughput calculation towards the current value:
 447  447           *      thru = 3/4 old_thru + 1/4 new_thru
 448  448           *
 449  449           * Note: write_time is in nanosecs, so write_time/MICROSEC
 450  450           * yields millisecs
 451  451           */
 452  452          ASSERT(zfs_write_limit_min > 0);
 453  453          if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
 454  454                  uint64_t throughput = data_written / (write_time / MICROSEC);
 455  455  
 456  456                  if (dp->dp_throughput)
 457  457                          dp->dp_throughput = throughput / 4 +
 458  458                              3 * dp->dp_throughput / 4;
 459  459                  else
 460  460                          dp->dp_throughput = throughput;
 461  461                  dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 462  462                      MAX(zfs_write_limit_min,
 463  463                      dp->dp_throughput * zfs_txg_synctime_ms));
 464  464          }
 465  465  }
 466  466  
 467  467  void
 468  468  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 469  469  {
 470  470          dsl_dataset_t *ds;
 471  471          objset_t *os;
 472  472  
 473  473          while (ds = list_head(&dp->dp_synced_datasets)) {
 474  474                  list_remove(&dp->dp_synced_datasets, ds);
 475  475                  os = ds->ds_objset;
 476  476                  zil_clean(os->os_zil, txg);
 477  477                  ASSERT(!dmu_objset_is_dirty(os, txg));
 478  478                  dmu_buf_rele(ds->ds_dbuf, ds);
 479  479          }
 480  480          ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 481  481  }
 482  482  
 483  483  /*
 484  484   * TRUE if the current thread is the tx_sync_thread or if we
 485  485   * are being called from SPA context during pool initialization.
 486  486   */
 487  487  int
 488  488  dsl_pool_sync_context(dsl_pool_t *dp)
 489  489  {
 490  490          return (curthread == dp->dp_tx.tx_sync_thread ||
 491  491              spa_is_initializing(dp->dp_spa));
 492  492  }
 493  493  
 494  494  uint64_t
 495  495  dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 496  496  {
 497  497          uint64_t space, resv;
 498  498  
 499  499          /*
 500  500           * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 501  501           * efficiency.
 502  502           * XXX The intent log is not accounted for, so it must fit
 503  503           * within this slop.
 504  504           *
 505  505           * If we're trying to assess whether it's OK to do a free,
 506  506           * cut the reservation in half to allow forward progress
 507  507           * (e.g. make it possible to rm(1) files from a full pool).
 508  508           */
 509  509          space = spa_get_dspace(dp->dp_spa);
 510  510          resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 511  511          if (netfree)
 512  512                  resv >>= 1;
 513  513  
 514  514          return (space - resv);
 515  515  }
 516  516  
 517  517  int
 518  518  dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
 519  519  {
 520  520          uint64_t reserved = 0;
 521  521          uint64_t write_limit = (zfs_write_limit_override ?
 522  522              zfs_write_limit_override : dp->dp_write_limit);
 523  523  
 524  524          if (zfs_no_write_throttle) {
 525  525                  atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
 526  526                      space);
 527  527                  return (0);
 528  528          }
 529  529  
 530  530          /*
 531  531           * Check to see if we have exceeded the maximum allowed IO for
 532  532           * this transaction group.  We can do this without locks since
 533  533           * a little slop here is ok.  Note that we do the reserved check
 534  534           * with only half the requested reserve: this is because the
 535  535           * reserve requests are worst-case, and we really don't want to
 536  536           * throttle based off of worst-case estimates.
 537  537           */
 538  538          if (write_limit > 0) {
 539  539                  reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
 540  540                      + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
 541  541  
 542  542                  if (reserved && reserved > write_limit)
 543  543                          return (ERESTART);
 544  544          }
 545  545  
 546  546          atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
 547  547  
 548  548          /*
 549  549           * If this transaction group is over 7/8ths capacity, delay
 550  550           * the caller 1 clock tick.  This will slow down the "fill"
 551  551           * rate until the sync process can catch up with us.
 552  552           */
 553  553          if (reserved && reserved > (write_limit - (write_limit >> 3)))
 554  554                  txg_delay(dp, tx->tx_txg, 1);
 555  555  
 556  556          return (0);
 557  557  }
 558  558  
 559  559  void
 560  560  dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 561  561  {
 562  562          ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
 563  563          atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
 564  564  }
 565  565  
 566  566  void
 567  567  dsl_pool_memory_pressure(dsl_pool_t *dp)
 568  568  {
 569  569          uint64_t space_inuse = 0;
 570  570          int i;
 571  571  
 572  572          if (dp->dp_write_limit == zfs_write_limit_min)
 573  573                  return;
 574  574  
 575  575          for (i = 0; i < TXG_SIZE; i++) {
 576  576                  space_inuse += dp->dp_space_towrite[i];
 577  577                  space_inuse += dp->dp_tempreserved[i];
 578  578          }
 579  579          dp->dp_write_limit = MAX(zfs_write_limit_min,
 580  580              MIN(dp->dp_write_limit, space_inuse / 4));
 581  581  }
 582  582  
 583  583  void
 584  584  dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 585  585  {
 586  586          if (space > 0) {
 587  587                  mutex_enter(&dp->dp_lock);
 588  588                  dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
 589  589                  mutex_exit(&dp->dp_lock);
 590  590          }
 591  591  }
 592  592  
 593  593  /* ARGSUSED */
 594  594  static int
 595  595  upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 596  596  {
 597  597          dmu_tx_t *tx = arg;
 598  598          dsl_dataset_t *ds, *prev = NULL;
 599  599          int err;
 600  600          dsl_pool_t *dp = spa_get_dsl(spa);
 601  601  
 602  602          err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
 603  603          if (err)
 604  604                  return (err);
 605  605  
 606  606          while (ds->ds_phys->ds_prev_snap_obj != 0) {
 607  607                  err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 608  608                      FTAG, &prev);
 609  609                  if (err) {
 610  610                          dsl_dataset_rele(ds, FTAG);
 611  611                          return (err);
 612  612                  }
 613  613  
 614  614                  if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
 615  615                          break;
 616  616                  dsl_dataset_rele(ds, FTAG);
 617  617                  ds = prev;
 618  618                  prev = NULL;
 619  619          }
 620  620  
 621  621          if (prev == NULL) {
 622  622                  prev = dp->dp_origin_snap;
 623  623  
 624  624                  /*
 625  625                   * The $ORIGIN can't have any data, or the accounting
 626  626                   * will be wrong.
 627  627                   */
 628  628                  ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
 629  629  
 630  630                  /* The origin doesn't get attached to itself */
 631  631                  if (ds->ds_object == prev->ds_object) {
 632  632                          dsl_dataset_rele(ds, FTAG);
 633  633                          return (0);
 634  634                  }
 635  635  
 636  636                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
 637  637                  ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
 638  638                  ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
 639  639  
 640  640                  dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 641  641                  ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
 642  642  
 643  643                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 644  644                  prev->ds_phys->ds_num_children++;
 645  645  
 646  646                  if (ds->ds_phys->ds_next_snap_obj == 0) {
 647  647                          ASSERT(ds->ds_prev == NULL);
 648  648                          VERIFY(0 == dsl_dataset_hold_obj(dp,
 649  649                              ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 650  650                  }
 651  651          }
 652  652  
 653  653          ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
 654  654          ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
 655  655  
 656  656          if (prev->ds_phys->ds_next_clones_obj == 0) {
 657  657                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 658  658                  prev->ds_phys->ds_next_clones_obj =
 659  659                      zap_create(dp->dp_meta_objset,
 660  660                      DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 661  661          }
 662  662          VERIFY(0 == zap_add_int(dp->dp_meta_objset,
 663  663              prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 664  664  
 665  665          dsl_dataset_rele(ds, FTAG);
 666  666          if (prev != dp->dp_origin_snap)
  
 667  667                  dsl_dataset_rele(prev, FTAG);
 668  668          return (0);
 669  669  }
 670  670  
 671  671  void
 672  672  dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 673  673  {
 674  674          ASSERT(dmu_tx_is_syncing(tx));
 675  675          ASSERT(dp->dp_origin_snap != NULL);
 676  676  
 677      -        VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
      677 +        VERIFY0(dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
 678  678              tx, DS_FIND_CHILDREN));
 679  679  }
 680  680  
 681  681  /* ARGSUSED */
 682  682  static int
 683  683  upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 684  684  {
 685  685          dmu_tx_t *tx = arg;
 686  686          dsl_dataset_t *ds;
 687  687          dsl_pool_t *dp = spa_get_dsl(spa);
 688  688          objset_t *mos = dp->dp_meta_objset;
 689  689  
 690      -        VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
      690 +        VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 691  691  
 692  692          if (ds->ds_dir->dd_phys->dd_origin_obj) {
 693  693                  dsl_dataset_t *origin;
 694  694  
 695      -                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
      695 +                VERIFY0(dsl_dataset_hold_obj(dp,
 696  696                      ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 697  697  
 698  698                  if (origin->ds_dir->dd_phys->dd_clones == 0) {
 699  699                          dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 700  700                          origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
 701  701                              DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 702  702                  }
 703  703  
 704      -                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
      704 +                VERIFY0(zap_add_int(dp->dp_meta_objset,
 705  705                      origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 706  706  
 707  707                  dsl_dataset_rele(origin, FTAG);
 708  708          }
 709  709  
 710  710          dsl_dataset_rele(ds, FTAG);
 711  711          return (0);
 712  712  }
 713  713  
 714  714  void
 715  715  dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 716  716  {
 717  717          ASSERT(dmu_tx_is_syncing(tx));
 718  718          uint64_t obj;
 719  719  
 720  720          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
  
 721  721          VERIFY(0 == dsl_pool_open_special_dir(dp,
 722  722              FREE_DIR_NAME, &dp->dp_free_dir));
 723  723  
 724  724          /*
 725  725           * We can't use bpobj_alloc(), because spa_version() still
 726  726           * returns the old version, and we need a new-version bpobj with
 727  727           * subobj support.  So call dmu_object_alloc() directly.
 728  728           */
 729  729          obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 730  730              SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 731      -        VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
      731 +        VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 732  732              DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 733      -        VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
      733 +        VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 734  734              dp->dp_meta_objset, obj));
 735  735  
 736      -        VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
      736 +        VERIFY0(dmu_objset_find_spa(dp->dp_spa, NULL,
 737  737              upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 738  738  }
 739  739  
 740  740  void
 741  741  dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 742  742  {
 743  743          uint64_t dsobj;
 744  744          dsl_dataset_t *ds;
 745  745  
 746  746          ASSERT(dmu_tx_is_syncing(tx));
 747  747          ASSERT(dp->dp_origin_snap == NULL);
 748  748  
 749  749          /* create the origin dir, ds, & snap-ds */
 750  750          rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 751  751          dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 752  752              NULL, 0, kcred, tx);
 753  753          VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 754  754          dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
 755  755          VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 756  756              dp, &dp->dp_origin_snap));
 757  757          dsl_dataset_rele(ds, FTAG);
 758  758          rw_exit(&dp->dp_config_rwlock);
 759  759  }
 760  760  
 761  761  taskq_t *
 762  762  dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 763  763  {
 764  764          return (dp->dp_vnrele_taskq);
 765  765  }
 766  766  
 767  767  /*
 768  768   * Walk through the pool-wide zap object of temporary snapshot user holds
 769  769   * and release them.
 770  770   */
 771  771  void
 772  772  dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 773  773  {
 774  774          zap_attribute_t za;
 775  775          zap_cursor_t zc;
 776  776          objset_t *mos = dp->dp_meta_objset;
 777  777          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 778  778  
 779  779          if (zapobj == 0)
 780  780                  return;
 781  781          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 782  782  
 783  783          for (zap_cursor_init(&zc, mos, zapobj);
 784  784              zap_cursor_retrieve(&zc, &za) == 0;
 785  785              zap_cursor_advance(&zc)) {
 786  786                  char *htag;
 787  787                  uint64_t dsobj;
 788  788  
 789  789                  htag = strchr(za.za_name, '-');
 790  790                  *htag = '\0';
 791  791                  ++htag;
 792  792                  dsobj = strtonum(za.za_name, NULL);
 793  793                  (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
 794  794          }
 795  795          zap_cursor_fini(&zc);
 796  796  }
 797  797  
 798  798  /*
 799  799   * Create the pool-wide zap object for storing temporary snapshot holds.
 800  800   */
 801  801  void
 802  802  dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 803  803  {
 804  804          objset_t *mos = dp->dp_meta_objset;
 805  805  
 806  806          ASSERT(dp->dp_tmp_userrefs_obj == 0);
 807  807          ASSERT(dmu_tx_is_syncing(tx));
 808  808  
 809  809          dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 810  810              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 811  811  }
 812  812  
 813  813  static int
 814  814  dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 815  815      const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
 816  816  {
 817  817          objset_t *mos = dp->dp_meta_objset;
 818  818          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 819  819          char *name;
 820  820          int error;
 821  821  
 822  822          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 823  823          ASSERT(dmu_tx_is_syncing(tx));
 824  824  
 825  825          /*
 826  826           * If the pool was created prior to SPA_VERSION_USERREFS, the
 827  827           * zap object for temporary holds might not exist yet.
 828  828           */
 829  829          if (zapobj == 0) {
 830  830                  if (holding) {
 831  831                          dsl_pool_user_hold_create_obj(dp, tx);
 832  832                          zapobj = dp->dp_tmp_userrefs_obj;
 833  833                  } else {
 834  834                          return (ENOENT);
 835  835                  }
 836  836          }
 837  837  
 838  838          name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 839  839          if (holding)
 840  840                  error = zap_add(mos, zapobj, name, 8, 1, now, tx);
 841  841          else
 842  842                  error = zap_remove(mos, zapobj, name, tx);
 843  843          strfree(name);
 844  844  
 845  845          return (error);
 846  846  }
 847  847  
 848  848  /*
 849  849   * Add a temporary hold for the given dataset object and tag.
 850  850   */
 851  851  int
 852  852  dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 853  853      uint64_t *now, dmu_tx_t *tx)
 854  854  {
 855  855          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 856  856  }
 857  857  
 858  858  /*
 859  859   * Release a temporary hold for the given dataset object and tag.
 860  860   */
 861  861  int
 862  862  dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 863  863      dmu_tx_t *tx)
 864  864  {
 865  865          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
 866  866              tx, B_FALSE));
 867  867  }
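
For reference, the transformation applied at each changed line is
mechanical; a representative before/after pair taken from the hunks
above:

        /* Before: the zero constant is threaded through the generic form. */
        ASSERT3U(err, ==, 0);
        VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

        /* After: the dedicated zero check states the intent directly. */
        ASSERT0(err);
        VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));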
  