FITS: generating send-streams in portable format
This commit adds the command 'zfs fits-send', analogous to 'zfs send'. The
generated send stream is compatible with the stream format produced by
'btrfs send' and can in principle be received into any filesystem.
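As a rough illustration, intended usage would look something like the sketch
below. This is a hypothetical invocation: the argument syntax is assumed to
mirror 'zfs send', and piping into 'btrfs receive' is only a plausible consumer
of the stream; neither is verified against this commit.

    # Hypothetical usage (argument syntax assumed to mirror 'zfs send'):
    zfs fits-send tank/home@snap1 > /tmp/stream.fits
    # Because the stream uses the 'btrfs send' format, a receiver such as
    # 'btrfs receive' could in principle apply it on another filesystem:
    zfs fits-send tank/home@snap1 | btrfs receive /mnt/backup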
--- old/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ new/usr/src/uts/common/fs/zfs/dsl_dataset.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 #include <sys/dmu_objset.h>
28 28 #include <sys/dsl_dataset.h>
29 29 #include <sys/dsl_dir.h>
30 30 #include <sys/dsl_prop.h>
31 31 #include <sys/dsl_synctask.h>
32 32 #include <sys/dmu_traverse.h>
33 33 #include <sys/dmu_impl.h>
34 34 #include <sys/dmu_tx.h>
35 35 #include <sys/arc.h>
36 36 #include <sys/zio.h>
37 37 #include <sys/zap.h>
38 38 #include <sys/zfeature.h>
39 39 #include <sys/unique.h>
40 40 #include <sys/zfs_context.h>
41 41 #include <sys/zfs_ioctl.h>
42 42 #include <sys/spa.h>
43 43 #include <sys/zfs_znode.h>
44 44 #include <sys/zfs_onexit.h>
45 45 #include <sys/zvol.h>
46 46 #include <sys/dsl_scan.h>
47 47 #include <sys/dsl_deadlist.h>
48 48
49 49 static char *dsl_reaper = "the grim reaper";
50 50
51 51 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
52 52 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
53 53 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
54 54
55 55 #define SWITCH64(x, y) \
56 56 { \
57 57 uint64_t __tmp = (x); \
58 58 (x) = (y); \
59 59 (y) = __tmp; \
60 60 }
61 61
62 62 #define DS_REF_MAX (1ULL << 62)
63 63
64 64 #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
65 65
66 66 #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)
67 67
68 68
69 69 /*
70 70  * Figure out how much of this delta should be propagated to the dsl_dir
71 71 * layer. If there's a refreservation, that space has already been
72 72 * partially accounted for in our ancestors.
73 73 */
74 74 static int64_t
75 75 parent_delta(dsl_dataset_t *ds, int64_t delta)
76 76 {
77 77 uint64_t old_bytes, new_bytes;
78 78
79 79 if (ds->ds_reserved == 0)
80 80 return (delta);
81 81
82 82 old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
83 83 new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
84 84
85 85 ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
86 86 return (new_bytes - old_bytes);
87 87 }
88 88
89 89 void
90 90 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
91 91 {
92 92 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
93 93 int compressed = BP_GET_PSIZE(bp);
94 94 int uncompressed = BP_GET_UCSIZE(bp);
95 95 int64_t delta;
96 96
97 97 dprintf_bp(bp, "ds=%p", ds);
98 98
99 99 ASSERT(dmu_tx_is_syncing(tx));
100 100 /* It could have been compressed away to nothing */
101 101 if (BP_IS_HOLE(bp))
102 102 return;
103 103 ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
104 104 ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
105 105 if (ds == NULL) {
106 106 dsl_pool_mos_diduse_space(tx->tx_pool,
107 107 used, compressed, uncompressed);
108 108 return;
109 109 }
110 110 dmu_buf_will_dirty(ds->ds_dbuf, tx);
111 111
112 112 mutex_enter(&ds->ds_dir->dd_lock);
113 113 mutex_enter(&ds->ds_lock);
114 114 delta = parent_delta(ds, used);
115 115 ds->ds_phys->ds_referenced_bytes += used;
116 116 ds->ds_phys->ds_compressed_bytes += compressed;
117 117 ds->ds_phys->ds_uncompressed_bytes += uncompressed;
118 118 ds->ds_phys->ds_unique_bytes += used;
119 119 mutex_exit(&ds->ds_lock);
120 120 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
121 121 compressed, uncompressed, tx);
122 122 dsl_dir_transfer_space(ds->ds_dir, used - delta,
123 123 DD_USED_REFRSRV, DD_USED_HEAD, tx);
124 124 mutex_exit(&ds->ds_dir->dd_lock);
125 125 }
126 126
127 127 int
128 128 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
129 129 boolean_t async)
130 130 {
131 131 if (BP_IS_HOLE(bp))
132 132 return (0);
133 133
134 134 ASSERT(dmu_tx_is_syncing(tx));
135 135 ASSERT(bp->blk_birth <= tx->tx_txg);
136 136
137 137 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
138 138 int compressed = BP_GET_PSIZE(bp);
139 139 int uncompressed = BP_GET_UCSIZE(bp);
140 140
141 141 ASSERT(used > 0);
142 142 if (ds == NULL) {
143 143 dsl_free(tx->tx_pool, tx->tx_txg, bp);
144 144 dsl_pool_mos_diduse_space(tx->tx_pool,
145 145 -used, -compressed, -uncompressed);
146 146 return (used);
147 147 }
148 148 ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
149 149
150 150 ASSERT(!dsl_dataset_is_snapshot(ds));
151 151 dmu_buf_will_dirty(ds->ds_dbuf, tx);
152 152
153 153 if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
154 154 int64_t delta;
155 155
156 156 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
157 157 dsl_free(tx->tx_pool, tx->tx_txg, bp);
158 158
159 159 mutex_enter(&ds->ds_dir->dd_lock);
160 160 mutex_enter(&ds->ds_lock);
161 161 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
162 162 !DS_UNIQUE_IS_ACCURATE(ds));
163 163 delta = parent_delta(ds, -used);
164 164 ds->ds_phys->ds_unique_bytes -= used;
165 165 mutex_exit(&ds->ds_lock);
166 166 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
167 167 delta, -compressed, -uncompressed, tx);
168 168 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
169 169 DD_USED_REFRSRV, DD_USED_HEAD, tx);
170 170 mutex_exit(&ds->ds_dir->dd_lock);
171 171 } else {
172 172 dprintf_bp(bp, "putting on dead list: %s", "");
173 173 if (async) {
174 174 /*
175 175 * We are here as part of zio's write done callback,
176 176 * which means we're a zio interrupt thread. We can't
177 177 * call dsl_deadlist_insert() now because it may block
178 178 * waiting for I/O. Instead, put bp on the deferred
179 179 * queue and let dsl_pool_sync() finish the job.
180 180 */
181 181 bplist_append(&ds->ds_pending_deadlist, bp);
182 182 } else {
183 183 dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
184 184 }
185 185 ASSERT3U(ds->ds_prev->ds_object, ==,
186 186 ds->ds_phys->ds_prev_snap_obj);
187 187 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
188 188 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
189 189 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
190 190 ds->ds_object && bp->blk_birth >
191 191 ds->ds_prev->ds_phys->ds_prev_snap_txg) {
192 192 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
193 193 mutex_enter(&ds->ds_prev->ds_lock);
194 194 ds->ds_prev->ds_phys->ds_unique_bytes += used;
195 195 mutex_exit(&ds->ds_prev->ds_lock);
196 196 }
197 197 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
198 198 dsl_dir_transfer_space(ds->ds_dir, used,
199 199 DD_USED_HEAD, DD_USED_SNAP, tx);
200 200 }
201 201 }
202 202 mutex_enter(&ds->ds_lock);
203 203 ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
204 204 ds->ds_phys->ds_referenced_bytes -= used;
205 205 ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
206 206 ds->ds_phys->ds_compressed_bytes -= compressed;
207 207 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
208 208 ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
209 209 mutex_exit(&ds->ds_lock);
210 210
211 211 return (used);
212 212 }
213 213
214 214 uint64_t
215 215 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
216 216 {
217 217 uint64_t trysnap = 0;
218 218
219 219 if (ds == NULL)
220 220 return (0);
221 221 /*
222 222 * The snapshot creation could fail, but that would cause an
223 223 * incorrect FALSE return, which would only result in an
224 224 * overestimation of the amount of space that an operation would
225 225 * consume, which is OK.
226 226 *
227 227 * There's also a small window where we could miss a pending
228 228 * snapshot, because we could set the sync task in the quiescing
229 229 * phase. So this should only be used as a guess.
230 230 */
231 231 if (ds->ds_trysnap_txg >
232 232 spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
233 233 trysnap = ds->ds_trysnap_txg;
234 234 return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
235 235 }
236 236
237 237 boolean_t
238 238 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
239 239 uint64_t blk_birth)
240 240 {
241 241 if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
242 242 return (B_FALSE);
243 243
244 244 ddt_prefetch(dsl_dataset_get_spa(ds), bp);
245 245
246 246 return (B_TRUE);
247 247 }
248 248
249 249 /* ARGSUSED */
250 250 static void
251 251 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
252 252 {
253 253 dsl_dataset_t *ds = dsv;
254 254
255 255 ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
256 256
257 257 unique_remove(ds->ds_fsid_guid);
258 258
259 259 if (ds->ds_objset != NULL)
260 260 dmu_objset_evict(ds->ds_objset);
261 261
262 262 if (ds->ds_prev) {
263 263 dsl_dataset_drop_ref(ds->ds_prev, ds);
264 264 ds->ds_prev = NULL;
265 265 }
266 266
267 267 bplist_destroy(&ds->ds_pending_deadlist);
268 268 if (db != NULL) {
269 269 dsl_deadlist_close(&ds->ds_deadlist);
270 270 } else {
271 271 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
272 272 ASSERT(!ds->ds_deadlist.dl_oldfmt);
273 273 }
274 274 if (ds->ds_dir)
275 275 dsl_dir_close(ds->ds_dir, ds);
276 276
277 277 ASSERT(!list_link_active(&ds->ds_synced_link));
278 278
279 279 mutex_destroy(&ds->ds_lock);
280 280 mutex_destroy(&ds->ds_recvlock);
281 281 mutex_destroy(&ds->ds_opening_lock);
282 282 rw_destroy(&ds->ds_rwlock);
283 283 cv_destroy(&ds->ds_exclusive_cv);
284 284
285 285 kmem_free(ds, sizeof (dsl_dataset_t));
286 286 }
287 287
288 288 static int
289 289 dsl_dataset_get_snapname(dsl_dataset_t *ds)
290 290 {
291 291 dsl_dataset_phys_t *headphys;
292 292 int err;
293 293 dmu_buf_t *headdbuf;
294 294 dsl_pool_t *dp = ds->ds_dir->dd_pool;
295 295 objset_t *mos = dp->dp_meta_objset;
296 296
297 297 if (ds->ds_snapname[0])
298 298 return (0);
299 299 if (ds->ds_phys->ds_next_snap_obj == 0)
300 300 return (0);
301 301
302 302 err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
303 303 FTAG, &headdbuf);
304 304 if (err)
305 305 return (err);
306 306 headphys = headdbuf->db_data;
307 307 err = zap_value_search(dp->dp_meta_objset,
308 308 headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
309 309 dmu_buf_rele(headdbuf, FTAG);
310 310 return (err);
311 311 }
312 312
313 313 static int
314 314 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
315 315 {
316 316 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
317 317 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
318 318 matchtype_t mt;
319 319 int err;
320 320
321 321 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
322 322 mt = MT_FIRST;
323 323 else
324 324 mt = MT_EXACT;
325 325
326 326 err = zap_lookup_norm(mos, snapobj, name, 8, 1,
327 327 value, mt, NULL, 0, NULL);
328 328 if (err == ENOTSUP && mt == MT_FIRST)
329 329 err = zap_lookup(mos, snapobj, name, 8, 1, value);
330 330 return (err);
331 331 }
332 332
333 333 static int
334 334 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
335 335 {
336 336 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
337 337 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
338 338 matchtype_t mt;
339 339 int err;
340 340
341 341 dsl_dir_snap_cmtime_update(ds->ds_dir);
342 342
343 343 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
344 344 mt = MT_FIRST;
345 345 else
346 346 mt = MT_EXACT;
347 347
348 348 err = zap_remove_norm(mos, snapobj, name, mt, tx);
349 349 if (err == ENOTSUP && mt == MT_FIRST)
350 350 err = zap_remove(mos, snapobj, name, tx);
351 351 return (err);
352 352 }
353 353
354 354 static int
355 355 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
356 356 dsl_dataset_t **dsp)
357 357 {
358 358 objset_t *mos = dp->dp_meta_objset;
359 359 dmu_buf_t *dbuf;
360 360 dsl_dataset_t *ds;
361 361 int err;
362 362 dmu_object_info_t doi;
363 363
364 364 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
365 365 dsl_pool_sync_context(dp));
366 366
367 367 err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
368 368 if (err)
369 369 return (err);
370 370
371 371 /* Make sure dsobj has the correct object type. */
372 372 dmu_object_info_from_db(dbuf, &doi);
373 373 if (doi.doi_type != DMU_OT_DSL_DATASET)
374 374 return (EINVAL);
375 375
376 376 ds = dmu_buf_get_user(dbuf);
377 377 if (ds == NULL) {
378 378 dsl_dataset_t *winner;
379 379
380 380 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
381 381 ds->ds_dbuf = dbuf;
382 382 ds->ds_object = dsobj;
383 383 ds->ds_phys = dbuf->db_data;
384 384
385 385 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
386 386 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
387 387 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
388 388 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
389 389
390 390 rw_init(&ds->ds_rwlock, 0, 0, 0);
391 391 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
392 392
393 393 bplist_create(&ds->ds_pending_deadlist);
394 394 dsl_deadlist_open(&ds->ds_deadlist,
395 395 mos, ds->ds_phys->ds_deadlist_obj);
396 396
397 397 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
398 398 offsetof(dmu_sendarg_t, dsa_link));
399 399
400 400 if (err == 0) {
401 401 err = dsl_dir_open_obj(dp,
402 402 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
403 403 }
404 404 if (err) {
405 405 mutex_destroy(&ds->ds_lock);
406 406 mutex_destroy(&ds->ds_recvlock);
407 407 mutex_destroy(&ds->ds_opening_lock);
408 408 rw_destroy(&ds->ds_rwlock);
409 409 cv_destroy(&ds->ds_exclusive_cv);
410 410 bplist_destroy(&ds->ds_pending_deadlist);
411 411 dsl_deadlist_close(&ds->ds_deadlist);
412 412 kmem_free(ds, sizeof (dsl_dataset_t));
413 413 dmu_buf_rele(dbuf, tag);
414 414 return (err);
415 415 }
416 416
417 417 if (!dsl_dataset_is_snapshot(ds)) {
418 418 ds->ds_snapname[0] = '\0';
419 419 if (ds->ds_phys->ds_prev_snap_obj) {
420 420 err = dsl_dataset_get_ref(dp,
421 421 ds->ds_phys->ds_prev_snap_obj,
422 422 ds, &ds->ds_prev);
423 423 }
424 424 } else {
425 425 if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
426 426 err = dsl_dataset_get_snapname(ds);
427 427 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
428 428 err = zap_count(
429 429 ds->ds_dir->dd_pool->dp_meta_objset,
430 430 ds->ds_phys->ds_userrefs_obj,
431 431 &ds->ds_userrefs);
432 432 }
433 433 }
434 434
435 435 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
436 436 /*
437 437 * In sync context, we're called with either no lock
438 438 * or with the write lock. If we're not syncing,
439 439 * we're always called with the read lock held.
440 440 */
441 441 boolean_t need_lock =
442 442 !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
443 443 dsl_pool_sync_context(dp);
444 444
445 445 if (need_lock)
446 446 rw_enter(&dp->dp_config_rwlock, RW_READER);
447 447
448 448 err = dsl_prop_get_ds(ds,
449 449 "refreservation", sizeof (uint64_t), 1,
450 450 &ds->ds_reserved, NULL);
451 451 if (err == 0) {
452 452 err = dsl_prop_get_ds(ds,
453 453 "refquota", sizeof (uint64_t), 1,
454 454 &ds->ds_quota, NULL);
455 455 }
456 456
457 457 if (need_lock)
458 458 rw_exit(&dp->dp_config_rwlock);
459 459 } else {
460 460 ds->ds_reserved = ds->ds_quota = 0;
461 461 }
462 462
463 463 if (err == 0) {
464 464 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
465 465 dsl_dataset_evict);
466 466 }
467 467 if (err || winner) {
468 468 bplist_destroy(&ds->ds_pending_deadlist);
469 469 dsl_deadlist_close(&ds->ds_deadlist);
470 470 if (ds->ds_prev)
471 471 dsl_dataset_drop_ref(ds->ds_prev, ds);
472 472 dsl_dir_close(ds->ds_dir, ds);
473 473 mutex_destroy(&ds->ds_lock);
474 474 mutex_destroy(&ds->ds_recvlock);
475 475 mutex_destroy(&ds->ds_opening_lock);
476 476 rw_destroy(&ds->ds_rwlock);
477 477 cv_destroy(&ds->ds_exclusive_cv);
478 478 kmem_free(ds, sizeof (dsl_dataset_t));
479 479 if (err) {
480 480 dmu_buf_rele(dbuf, tag);
481 481 return (err);
482 482 }
483 483 ds = winner;
484 484 } else {
485 485 ds->ds_fsid_guid =
486 486 unique_insert(ds->ds_phys->ds_fsid_guid);
487 487 }
488 488 }
489 489 ASSERT3P(ds->ds_dbuf, ==, dbuf);
490 490 ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
491 491 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
492 492 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
493 493 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
494 494 mutex_enter(&ds->ds_lock);
495 495 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
496 496 mutex_exit(&ds->ds_lock);
497 497 dmu_buf_rele(ds->ds_dbuf, tag);
498 498 return (ENOENT);
499 499 }
500 500 mutex_exit(&ds->ds_lock);
501 501 *dsp = ds;
502 502 return (0);
503 503 }
504 504
505 505 static int
506 506 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
507 507 {
508 508 dsl_pool_t *dp = ds->ds_dir->dd_pool;
509 509
510 510 /*
511 511 * In syncing context we don't want the rwlock lock: there
512 512 * may be an existing writer waiting for sync phase to
513 513 * finish. We don't need to worry about such writers, since
514 514 * sync phase is single-threaded, so the writer can't be
515 515 * doing anything while we are active.
516 516 */
517 517 if (dsl_pool_sync_context(dp)) {
518 518 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
519 519 return (0);
520 520 }
521 521
522 522 /*
523 523 * Normal users will hold the ds_rwlock as a READER until they
524 524 * are finished (i.e., call dsl_dataset_rele()). "Owners" will
525 525 * drop their READER lock after they set the ds_owner field.
526 526 *
527 527 * If the dataset is being destroyed, the destroy thread will
528 528 * obtain a WRITER lock for exclusive access after it's done its
529 529 * open-context work and then change the ds_owner to
530 530 * dsl_reaper once destruction is assured. So threads
531 531 * may block here temporarily, until the "destructability" of
532 532 * the dataset is determined.
533 533 */
534 534 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
535 535 mutex_enter(&ds->ds_lock);
536 536 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
537 537 rw_exit(&dp->dp_config_rwlock);
538 538 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
539 539 if (DSL_DATASET_IS_DESTROYED(ds)) {
540 540 mutex_exit(&ds->ds_lock);
541 541 dsl_dataset_drop_ref(ds, tag);
542 542 rw_enter(&dp->dp_config_rwlock, RW_READER);
543 543 return (ENOENT);
544 544 }
545 545 /*
546 546 * The dp_config_rwlock lives above the ds_lock. And
547 547 * we need to check DSL_DATASET_IS_DESTROYED() while
548 548 * holding the ds_lock, so we have to drop and reacquire
549 549 * the ds_lock here.
550 550 */
551 551 mutex_exit(&ds->ds_lock);
552 552 rw_enter(&dp->dp_config_rwlock, RW_READER);
553 553 mutex_enter(&ds->ds_lock);
554 554 }
555 555 mutex_exit(&ds->ds_lock);
556 556 return (0);
557 557 }
558 558
559 559 int
560 560 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
561 561 dsl_dataset_t **dsp)
562 562 {
563 563 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
564 564
565 565 if (err)
566 566 return (err);
567 567 return (dsl_dataset_hold_ref(*dsp, tag));
568 568 }
569 569
570 570 int
571 571 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
572 572 void *tag, dsl_dataset_t **dsp)
573 573 {
574 574 int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
575 575 if (err)
576 576 return (err);
577 577 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
578 578 dsl_dataset_rele(*dsp, tag);
579 579 *dsp = NULL;
580 580 return (EBUSY);
581 581 }
582 582 return (0);
583 583 }
584 584
585 585 int
586 586 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
587 587 {
588 588 dsl_dir_t *dd;
589 589 dsl_pool_t *dp;
590 590 const char *snapname;
591 591 uint64_t obj;
592 592 int err = 0;
593 593
594 594 err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
595 595 if (err)
596 596 return (err);
597 597
598 598 dp = dd->dd_pool;
599 599 obj = dd->dd_phys->dd_head_dataset_obj;
600 600 rw_enter(&dp->dp_config_rwlock, RW_READER);
601 601 if (obj)
602 602 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
603 603 else
604 604 err = ENOENT;
605 605 if (err)
606 606 goto out;
607 607
608 608 err = dsl_dataset_hold_ref(*dsp, tag);
609 609
610 610 /* we may be looking for a snapshot */
611 611 if (err == 0 && snapname != NULL) {
612 612 dsl_dataset_t *ds = NULL;
613 613
614 614 if (*snapname++ != '@') {
615 615 dsl_dataset_rele(*dsp, tag);
616 616 err = ENOENT;
617 617 goto out;
618 618 }
619 619
620 620 dprintf("looking for snapshot '%s'\n", snapname);
621 621 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
622 622 if (err == 0)
623 623 err = dsl_dataset_get_ref(dp, obj, tag, &ds);
624 624 dsl_dataset_rele(*dsp, tag);
625 625
626 626 ASSERT3U((err == 0), ==, (ds != NULL));
627 627
628 628 if (ds) {
629 629 mutex_enter(&ds->ds_lock);
630 630 if (ds->ds_snapname[0] == 0)
631 631 (void) strlcpy(ds->ds_snapname, snapname,
632 632 sizeof (ds->ds_snapname));
633 633 mutex_exit(&ds->ds_lock);
634 634 err = dsl_dataset_hold_ref(ds, tag);
635 635 *dsp = err ? NULL : ds;
636 636 }
637 637 }
638 638 out:
639 639 rw_exit(&dp->dp_config_rwlock);
640 640 dsl_dir_close(dd, FTAG);
641 641 return (err);
642 642 }
643 643
644 644 int
645 645 dsl_dataset_own(const char *name, boolean_t inconsistentok,
646 646 void *tag, dsl_dataset_t **dsp)
647 647 {
648 648 int err = dsl_dataset_hold(name, tag, dsp);
649 649 if (err)
650 650 return (err);
651 651 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
652 652 dsl_dataset_rele(*dsp, tag);
653 653 return (EBUSY);
654 654 }
655 655 return (0);
656 656 }
657 657
658 658 void
659 659 dsl_dataset_name(dsl_dataset_t *ds, char *name)
660 660 {
661 661 if (ds == NULL) {
662 662 (void) strcpy(name, "mos");
663 663 } else {
664 664 dsl_dir_name(ds->ds_dir, name);
665 665 VERIFY(0 == dsl_dataset_get_snapname(ds));
666 666 if (ds->ds_snapname[0]) {
667 667 (void) strcat(name, "@");
668 668 /*
669 669 * We use a "recursive" mutex so that we
670 670 * can call dprintf_ds() with ds_lock held.
671 671 */
672 672 if (!MUTEX_HELD(&ds->ds_lock)) {
673 673 mutex_enter(&ds->ds_lock);
674 674 (void) strcat(name, ds->ds_snapname);
675 675 mutex_exit(&ds->ds_lock);
676 676 } else {
677 677 (void) strcat(name, ds->ds_snapname);
678 678 }
679 679 }
680 680 }
681 681 }
682 682
683 -static int
683 +int
684 684 dsl_dataset_namelen(dsl_dataset_t *ds)
685 685 {
686 686 int result;
687 687
688 688 if (ds == NULL) {
689 689 result = 3; /* "mos" */
690 690 } else {
691 691 result = dsl_dir_namelen(ds->ds_dir);
692 692 VERIFY(0 == dsl_dataset_get_snapname(ds));
693 693 if (ds->ds_snapname[0]) {
694 694 ++result; /* adding one for the @-sign */
695 695 if (!MUTEX_HELD(&ds->ds_lock)) {
696 696 mutex_enter(&ds->ds_lock);
697 697 result += strlen(ds->ds_snapname);
698 698 mutex_exit(&ds->ds_lock);
699 699 } else {
700 700 result += strlen(ds->ds_snapname);
701 701 }
702 702 }
703 703 }
704 704
705 705 return (result);
706 706 }
707 707
708 708 void
709 709 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
710 710 {
711 711 dmu_buf_rele(ds->ds_dbuf, tag);
712 712 }
713 713
714 714 void
715 715 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
716 716 {
717 717 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
718 718 rw_exit(&ds->ds_rwlock);
719 719 }
720 720 dsl_dataset_drop_ref(ds, tag);
721 721 }
722 722
723 723 void
724 724 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
725 725 {
726 726 ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
727 727 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
728 728
729 729 mutex_enter(&ds->ds_lock);
730 730 ds->ds_owner = NULL;
731 731 if (RW_WRITE_HELD(&ds->ds_rwlock)) {
732 732 rw_exit(&ds->ds_rwlock);
733 733 cv_broadcast(&ds->ds_exclusive_cv);
734 734 }
735 735 mutex_exit(&ds->ds_lock);
736 736 if (ds->ds_dbuf)
737 737 dsl_dataset_drop_ref(ds, tag);
738 738 else
739 739 dsl_dataset_evict(NULL, ds);
740 740 }
741 741
742 742 boolean_t
743 743 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
744 744 {
745 745 boolean_t gotit = FALSE;
746 746
747 747 mutex_enter(&ds->ds_lock);
748 748 if (ds->ds_owner == NULL &&
749 749 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
750 750 ds->ds_owner = tag;
751 751 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
752 752 rw_exit(&ds->ds_rwlock);
753 753 gotit = TRUE;
754 754 }
755 755 mutex_exit(&ds->ds_lock);
756 756 return (gotit);
757 757 }
758 758
759 759 void
760 760 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
761 761 {
762 762 ASSERT3P(owner, ==, ds->ds_owner);
763 763 if (!RW_WRITE_HELD(&ds->ds_rwlock))
764 764 rw_enter(&ds->ds_rwlock, RW_WRITER);
765 765 }
766 766
767 767 uint64_t
768 768 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
769 769 uint64_t flags, dmu_tx_t *tx)
770 770 {
771 771 dsl_pool_t *dp = dd->dd_pool;
772 772 dmu_buf_t *dbuf;
773 773 dsl_dataset_phys_t *dsphys;
774 774 uint64_t dsobj;
775 775 objset_t *mos = dp->dp_meta_objset;
776 776
777 777 if (origin == NULL)
778 778 origin = dp->dp_origin_snap;
779 779
780 780 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
781 781 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
782 782 ASSERT(dmu_tx_is_syncing(tx));
783 783 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
784 784
785 785 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
786 786 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
787 787 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
788 788 dmu_buf_will_dirty(dbuf, tx);
789 789 dsphys = dbuf->db_data;
790 790 bzero(dsphys, sizeof (dsl_dataset_phys_t));
791 791 dsphys->ds_dir_obj = dd->dd_object;
792 792 dsphys->ds_flags = flags;
793 793 dsphys->ds_fsid_guid = unique_create();
794 794 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
795 795 sizeof (dsphys->ds_guid));
796 796 dsphys->ds_snapnames_zapobj =
797 797 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
798 798 DMU_OT_NONE, 0, tx);
799 799 dsphys->ds_creation_time = gethrestime_sec();
800 800 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
801 801
802 802 if (origin == NULL) {
803 803 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
804 804 } else {
805 805 dsl_dataset_t *ohds;
806 806
807 807 dsphys->ds_prev_snap_obj = origin->ds_object;
808 808 dsphys->ds_prev_snap_txg =
809 809 origin->ds_phys->ds_creation_txg;
810 810 dsphys->ds_referenced_bytes =
811 811 origin->ds_phys->ds_referenced_bytes;
812 812 dsphys->ds_compressed_bytes =
813 813 origin->ds_phys->ds_compressed_bytes;
814 814 dsphys->ds_uncompressed_bytes =
815 815 origin->ds_phys->ds_uncompressed_bytes;
816 816 dsphys->ds_bp = origin->ds_phys->ds_bp;
817 817 dsphys->ds_flags |= origin->ds_phys->ds_flags;
818 818
819 819 dmu_buf_will_dirty(origin->ds_dbuf, tx);
820 820 origin->ds_phys->ds_num_children++;
821 821
822 822 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
823 823 origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
824 824 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
825 825 dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
826 826 dsl_dataset_rele(ohds, FTAG);
827 827
828 828 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
829 829 if (origin->ds_phys->ds_next_clones_obj == 0) {
830 830 origin->ds_phys->ds_next_clones_obj =
831 831 zap_create(mos,
832 832 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
833 833 }
834 834 VERIFY(0 == zap_add_int(mos,
835 835 origin->ds_phys->ds_next_clones_obj,
836 836 dsobj, tx));
837 837 }
838 838
839 839 dmu_buf_will_dirty(dd->dd_dbuf, tx);
840 840 dd->dd_phys->dd_origin_obj = origin->ds_object;
841 841 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
842 842 if (origin->ds_dir->dd_phys->dd_clones == 0) {
843 843 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
844 844 origin->ds_dir->dd_phys->dd_clones =
845 845 zap_create(mos,
846 846 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
847 847 }
848 848 VERIFY3U(0, ==, zap_add_int(mos,
849 849 origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
850 850 }
851 851 }
852 852
853 853 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
854 854 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
855 855
856 856 dmu_buf_rele(dbuf, FTAG);
857 857
858 858 dmu_buf_will_dirty(dd->dd_dbuf, tx);
859 859 dd->dd_phys->dd_head_dataset_obj = dsobj;
860 860
861 861 return (dsobj);
862 862 }
863 863
864 864 uint64_t
865 865 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
866 866 dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
867 867 {
868 868 dsl_pool_t *dp = pdd->dd_pool;
869 869 uint64_t dsobj, ddobj;
870 870 dsl_dir_t *dd;
871 871
872 872 ASSERT(lastname[0] != '@');
873 873
874 874 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
875 875 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
876 876
877 877 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
878 878
879 879 dsl_deleg_set_create_perms(dd, tx, cr);
880 880
881 881 dsl_dir_close(dd, FTAG);
882 882
883 883 /*
884 884 * If we are creating a clone, make sure we zero out any stale
885 885  * data from the origin snapshot's zil header.
886 886 */
887 887 if (origin != NULL) {
888 888 dsl_dataset_t *ds;
889 889 objset_t *os;
890 890
891 891 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
892 892 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
893 893 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
894 894 dsl_dataset_dirty(ds, tx);
895 895 dsl_dataset_rele(ds, FTAG);
896 896 }
897 897
898 898 return (dsobj);
899 899 }
900 900
901 901 /*
902 902 * The snapshots must all be in the same pool.
903 903 */
904 904 int
905 905 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
906 906 nvlist_t *errlist)
907 907 {
908 908 int err;
909 909 dsl_sync_task_t *dst;
910 910 spa_t *spa;
911 911 nvpair_t *pair;
912 912 dsl_sync_task_group_t *dstg;
913 913
914 914 pair = nvlist_next_nvpair(snaps, NULL);
915 915 if (pair == NULL)
916 916 return (0);
917 917
918 918 err = spa_open(nvpair_name(pair), &spa, FTAG);
919 919 if (err)
920 920 return (err);
921 921 dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
922 922
923 923 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
924 924 pair = nvlist_next_nvpair(snaps, pair)) {
925 925 dsl_dataset_t *ds;
926 926
927 927 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
928 928 if (err == 0) {
929 929 struct dsl_ds_destroyarg *dsda;
930 930
931 931 dsl_dataset_make_exclusive(ds, dstg);
932 932 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
933 933 KM_SLEEP);
934 934 dsda->ds = ds;
935 935 dsda->defer = defer;
936 936 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
937 937 dsl_dataset_destroy_sync, dsda, dstg, 0);
938 938 } else if (err == ENOENT) {
939 939 err = 0;
940 940 } else {
941 941 fnvlist_add_int32(errlist, nvpair_name(pair), err);
942 942 break;
943 943 }
944 944 }
945 945
946 946 if (err == 0)
947 947 err = dsl_sync_task_group_wait(dstg);
948 948
949 949 for (dst = list_head(&dstg->dstg_tasks); dst;
950 950 dst = list_next(&dstg->dstg_tasks, dst)) {
951 951 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
952 952 dsl_dataset_t *ds = dsda->ds;
953 953
954 954 /*
955 955 * Return the snapshots that triggered the error.
956 956 */
957 957 if (dst->dst_err != 0) {
958 958 char name[ZFS_MAXNAMELEN];
959 959 dsl_dataset_name(ds, name);
960 960 fnvlist_add_int32(errlist, name, dst->dst_err);
961 961 }
962 962 ASSERT3P(dsda->rm_origin, ==, NULL);
963 963 dsl_dataset_disown(ds, dstg);
964 964 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
965 965 }
966 966
967 967 dsl_sync_task_group_destroy(dstg);
968 968 spa_close(spa, FTAG);
969 969 return (err);
970 970
971 971 }
972 972
973 973 static boolean_t
974 974 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
975 975 {
976 976 boolean_t might_destroy = B_FALSE;
977 977
978 978 mutex_enter(&ds->ds_lock);
979 979 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
980 980 DS_IS_DEFER_DESTROY(ds))
981 981 might_destroy = B_TRUE;
982 982 mutex_exit(&ds->ds_lock);
983 983
984 984 return (might_destroy);
985 985 }
986 986
987 987 /*
988 988 * If we're removing a clone, and these three conditions are true:
989 989 * 1) the clone's origin has no other children
990 990 * 2) the clone's origin has no user references
991 991 * 3) the clone's origin has been marked for deferred destruction
992 992 * Then, prepare to remove the origin as part of this sync task group.
993 993 */
994 994 static int
995 995 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
996 996 {
997 997 dsl_dataset_t *ds = dsda->ds;
998 998 dsl_dataset_t *origin = ds->ds_prev;
999 999
1000 1000 if (dsl_dataset_might_destroy_origin(origin)) {
1001 1001 char *name;
1002 1002 int namelen;
1003 1003 int error;
1004 1004
1005 1005 namelen = dsl_dataset_namelen(origin) + 1;
1006 1006 name = kmem_alloc(namelen, KM_SLEEP);
1007 1007 dsl_dataset_name(origin, name);
1008 1008 #ifdef _KERNEL
1009 1009 error = zfs_unmount_snap(name, NULL);
1010 1010 if (error) {
1011 1011 kmem_free(name, namelen);
1012 1012 return (error);
1013 1013 }
1014 1014 #endif
1015 1015 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1016 1016 kmem_free(name, namelen);
1017 1017 if (error)
1018 1018 return (error);
1019 1019 dsda->rm_origin = origin;
1020 1020 dsl_dataset_make_exclusive(origin, tag);
1021 1021 }
1022 1022
1023 1023 return (0);
1024 1024 }
1025 1025
1026 1026 /*
1027 1027 * ds must be opened as OWNER. On return (whether successful or not),
1028 1028 * ds will be closed and caller can no longer dereference it.
1029 1029 */
1030 1030 int
1031 1031 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
1032 1032 {
1033 1033 int err;
1034 1034 dsl_sync_task_group_t *dstg;
1035 1035 objset_t *os;
1036 1036 dsl_dir_t *dd;
1037 1037 uint64_t obj;
1038 1038 struct dsl_ds_destroyarg dsda = { 0 };
1039 1039
1040 1040 dsda.ds = ds;
1041 1041
1042 1042 if (dsl_dataset_is_snapshot(ds)) {
1043 1043 /* Destroying a snapshot is simpler */
1044 1044 dsl_dataset_make_exclusive(ds, tag);
1045 1045
1046 1046 dsda.defer = defer;
1047 1047 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1048 1048 dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
1049 1049 &dsda, tag, 0);
1050 1050 ASSERT3P(dsda.rm_origin, ==, NULL);
1051 1051 goto out;
1052 1052 } else if (defer) {
1053 1053 err = EINVAL;
1054 1054 goto out;
1055 1055 }
1056 1056
1057 1057 dd = ds->ds_dir;
1058 1058
1059 1059 if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
1060 1060 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1061 1061 /*
1062 1062 * Check for errors and mark this ds as inconsistent, in
1063 1063 * case we crash while freeing the objects.
1064 1064 */
1065 1065 err = dsl_sync_task_do(dd->dd_pool,
1066 1066 dsl_dataset_destroy_begin_check,
1067 1067 dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1068 1068 if (err)
1069 1069 goto out;
1070 1070
1071 1071 err = dmu_objset_from_ds(ds, &os);
1072 1072 if (err)
1073 1073 goto out;
1074 1074
1075 1075 /*
1076 1076 * Remove all objects while in the open context so that
1077 1077 * there is less work to do in the syncing context.
1078 1078 */
1079 1079 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1080 1080 ds->ds_phys->ds_prev_snap_txg)) {
1081 1081 /*
1082 1082 * Ignore errors, if there is not enough disk space
1083 1083 * we will deal with it in dsl_dataset_destroy_sync().
1084 1084 */
1085 1085 (void) dmu_free_object(os, obj);
1086 1086 }
1087 1087 if (err != ESRCH)
1088 1088 goto out;
1089 1089
1090 1090 /*
1091 1091 * Sync out all in-flight IO.
1092 1092 */
1093 1093 txg_wait_synced(dd->dd_pool, 0);
1094 1094
1095 1095 /*
1096 1096 * If we managed to free all the objects in open
1097 1097 * context, the user space accounting should be zero.
1098 1098 */
1099 1099 if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1100 1100 dmu_objset_userused_enabled(os)) {
1101 1101 uint64_t count;
1102 1102
1103 1103 ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
1104 1104 &count) != 0 || count == 0);
1105 1105 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
1106 1106 &count) != 0 || count == 0);
1107 1107 }
1108 1108 }
1109 1109
1110 1110 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
1111 1111 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
1112 1112 rw_exit(&dd->dd_pool->dp_config_rwlock);
1113 1113
1114 1114 if (err)
1115 1115 goto out;
1116 1116
1117 1117 /*
1118 1118 * Blow away the dsl_dir + head dataset.
1119 1119 */
1120 1120 dsl_dataset_make_exclusive(ds, tag);
1121 1121 /*
1122 1122 * If we're removing a clone, we might also need to remove its
1123 1123 * origin.
1124 1124 */
1125 1125 do {
1126 1126 dsda.need_prep = B_FALSE;
1127 1127 if (dsl_dir_is_clone(dd)) {
1128 1128 err = dsl_dataset_origin_rm_prep(&dsda, tag);
1129 1129 if (err) {
1130 1130 dsl_dir_close(dd, FTAG);
1131 1131 goto out;
1132 1132 }
1133 1133 }
1134 1134
1135 1135 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
1136 1136 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
1137 1137 dsl_dataset_destroy_sync, &dsda, tag, 0);
1138 1138 dsl_sync_task_create(dstg, dsl_dir_destroy_check,
1139 1139 dsl_dir_destroy_sync, dd, FTAG, 0);
1140 1140 err = dsl_sync_task_group_wait(dstg);
1141 1141 dsl_sync_task_group_destroy(dstg);
1142 1142
1143 1143 /*
1144 1144 * We could be racing against 'zfs release' or 'zfs destroy -d'
1145 1145 * on the origin snap, in which case we can get EBUSY if we
1146 1146 * needed to destroy the origin snap but were not ready to
1147 1147 * do so.
1148 1148 */
1149 1149 if (dsda.need_prep) {
1150 1150 ASSERT(err == EBUSY);
1151 1151 ASSERT(dsl_dir_is_clone(dd));
1152 1152 ASSERT(dsda.rm_origin == NULL);
1153 1153 }
1154 1154 } while (dsda.need_prep);
1155 1155
1156 1156 if (dsda.rm_origin != NULL)
1157 1157 dsl_dataset_disown(dsda.rm_origin, tag);
1158 1158
1159 1159 /* if it is successful, dsl_dir_destroy_sync will close the dd */
1160 1160 if (err)
1161 1161 dsl_dir_close(dd, FTAG);
1162 1162 out:
1163 1163 dsl_dataset_disown(ds, tag);
1164 1164 return (err);
1165 1165 }
1166 1166
1167 1167 blkptr_t *
1168 1168 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1169 1169 {
1170 1170 return (&ds->ds_phys->ds_bp);
1171 1171 }
1172 1172
1173 1173 void
1174 1174 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1175 1175 {
1176 1176 ASSERT(dmu_tx_is_syncing(tx));
1177 1177 /* If it's the meta-objset, set dp_meta_rootbp */
1178 1178 if (ds == NULL) {
1179 1179 tx->tx_pool->dp_meta_rootbp = *bp;
1180 1180 } else {
1181 1181 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1182 1182 ds->ds_phys->ds_bp = *bp;
1183 1183 }
1184 1184 }
1185 1185
1186 1186 spa_t *
1187 1187 dsl_dataset_get_spa(dsl_dataset_t *ds)
1188 1188 {
1189 1189 return (ds->ds_dir->dd_pool->dp_spa);
1190 1190 }
1191 1191
1192 1192 void
1193 1193 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1194 1194 {
1195 1195 dsl_pool_t *dp;
1196 1196
1197 1197 if (ds == NULL) /* this is the meta-objset */
1198 1198 return;
1199 1199
1200 1200 ASSERT(ds->ds_objset != NULL);
1201 1201
1202 1202 if (ds->ds_phys->ds_next_snap_obj != 0)
1203 1203 panic("dirtying snapshot!");
1204 1204
1205 1205 dp = ds->ds_dir->dd_pool;
1206 1206
1207 1207 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1208 1208 /* up the hold count until we can be written out */
1209 1209 dmu_buf_add_ref(ds->ds_dbuf, ds);
1210 1210 }
1211 1211 }
1212 1212
1213 1213 boolean_t
1214 1214 dsl_dataset_is_dirty(dsl_dataset_t *ds)
1215 1215 {
1216 1216 for (int t = 0; t < TXG_SIZE; t++) {
1217 1217 if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1218 1218 ds, t))
1219 1219 return (B_TRUE);
1220 1220 }
1221 1221 return (B_FALSE);
1222 1222 }
1223 1223
1224 1224 /*
1225 1225 * The unique space in the head dataset can be calculated by subtracting
1226 1226 * the space used in the most recent snapshot, that is still being used
1227 1227 * in this file system, from the space currently in use. To figure out
1228 1228 * the space in the most recent snapshot still in use, we need to take
1229 1229 * the total space used in the snapshot and subtract out the space that
1230 1230 * has been freed up since the snapshot was taken.
1231 1231 */
1232 1232 static void
1233 1233 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1234 1234 {
1235 1235 uint64_t mrs_used;
1236 1236 uint64_t dlused, dlcomp, dluncomp;
1237 1237
1238 1238 ASSERT(!dsl_dataset_is_snapshot(ds));
1239 1239
1240 1240 if (ds->ds_phys->ds_prev_snap_obj != 0)
1241 1241 mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
1242 1242 else
1243 1243 mrs_used = 0;
1244 1244
1245 1245 dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1246 1246
1247 1247 ASSERT3U(dlused, <=, mrs_used);
1248 1248 ds->ds_phys->ds_unique_bytes =
1249 1249 ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
1250 1250
1251 1251 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1252 1252 SPA_VERSION_UNIQUE_ACCURATE)
1253 1253 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1254 1254 }
1255 1255
1256 1256 struct killarg {
1257 1257 dsl_dataset_t *ds;
1258 1258 dmu_tx_t *tx;
1259 1259 };
1260 1260
1261 1261 /* ARGSUSED */
1262 1262 static int
1263 1263 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1264 1264 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1265 1265 {
1266 1266 struct killarg *ka = arg;
1267 1267 dmu_tx_t *tx = ka->tx;
1268 1268
1269 1269 if (bp == NULL)
1270 1270 return (0);
1271 1271
1272 1272 if (zb->zb_level == ZB_ZIL_LEVEL) {
1273 1273 ASSERT(zilog != NULL);
1274 1274 /*
1275 1275 * It's a block in the intent log. It has no
1276 1276 * accounting, so just free it.
1277 1277 */
1278 1278 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1279 1279 } else {
1280 1280 ASSERT(zilog == NULL);
1281 1281 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1282 1282 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1283 1283 }
1284 1284
1285 1285 return (0);
1286 1286 }
1287 1287
1288 1288 /* ARGSUSED */
1289 1289 static int
1290 1290 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1291 1291 {
1292 1292 dsl_dataset_t *ds = arg1;
1293 1293 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1294 1294 uint64_t count;
1295 1295 int err;
1296 1296
1297 1297 /*
1298 1298 * Can't delete a head dataset if there are snapshots of it.
1299 1299 * (Except if the only snapshots are from the branch we cloned
1300 1300 * from.)
1301 1301 */
1302 1302 if (ds->ds_prev != NULL &&
1303 1303 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1304 1304 return (EBUSY);
1305 1305
1306 1306 /*
1307 1307 * This is really a dsl_dir thing, but check it here so that
1308 1308 * we'll be less likely to leave this dataset inconsistent &
1309 1309 * nearly destroyed.
1310 1310 */
1311 1311 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1312 1312 if (err)
1313 1313 return (err);
1314 1314 if (count != 0)
1315 1315 return (EEXIST);
1316 1316
1317 1317 return (0);
1318 1318 }
1319 1319
1320 1320 /* ARGSUSED */
1321 1321 static void
1322 1322 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1323 1323 {
1324 1324 dsl_dataset_t *ds = arg1;
1325 1325
1326 1326 /* Mark it as inconsistent on-disk, in case we crash */
1327 1327 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1328 1328 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1329 1329
1330 1330 spa_history_log_internal_ds(ds, "destroy begin", tx, "");
1331 1331 }
1332 1332
1333 1333 static int
1334 1334 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1335 1335 dmu_tx_t *tx)
1336 1336 {
1337 1337 dsl_dataset_t *ds = dsda->ds;
1338 1338 dsl_dataset_t *ds_prev = ds->ds_prev;
1339 1339
1340 1340 if (dsl_dataset_might_destroy_origin(ds_prev)) {
1341 1341 struct dsl_ds_destroyarg ndsda = {0};
1342 1342
1343 1343 /*
1344 1344 * If we're not prepared to remove the origin, don't remove
1345 1345 * the clone either.
1346 1346 */
1347 1347 if (dsda->rm_origin == NULL) {
1348 1348 dsda->need_prep = B_TRUE;
1349 1349 return (EBUSY);
1350 1350 }
1351 1351
1352 1352 ndsda.ds = ds_prev;
1353 1353 ndsda.is_origin_rm = B_TRUE;
1354 1354 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1355 1355 }
1356 1356
1357 1357 /*
1358 1358 * If we're not going to remove the origin after all,
1359 1359 * undo the open context setup.
1360 1360 */
1361 1361 if (dsda->rm_origin != NULL) {
1362 1362 dsl_dataset_disown(dsda->rm_origin, tag);
1363 1363 dsda->rm_origin = NULL;
1364 1364 }
1365 1365
1366 1366 return (0);
1367 1367 }
1368 1368
1369 1369 /*
1370 1370 * If you add new checks here, you may need to add
1371 1371 * additional checks to the "temporary" case in
1372 1372 * snapshot_check() in dmu_objset.c.
1373 1373 */
1374 1374 /* ARGSUSED */
1375 1375 int
1376 1376 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1377 1377 {
1378 1378 struct dsl_ds_destroyarg *dsda = arg1;
1379 1379 dsl_dataset_t *ds = dsda->ds;
1380 1380
1381 1381 	/* we have an owner hold, so no one else can destroy us */
1382 1382 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1383 1383
1384 1384 /*
1385 1385 * Only allow deferred destroy on pools that support it.
1386 1386 * NOTE: deferred destroy is only supported on snapshots.
1387 1387 */
1388 1388 if (dsda->defer) {
1389 1389 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
1390 1390 SPA_VERSION_USERREFS)
1391 1391 return (ENOTSUP);
1392 1392 ASSERT(dsl_dataset_is_snapshot(ds));
1393 1393 return (0);
1394 1394 }
1395 1395
1396 1396 /*
1397 1397 * Can't delete a head dataset if there are snapshots of it.
1398 1398 * (Except if the only snapshots are from the branch we cloned
1399 1399 * from.)
1400 1400 */
1401 1401 if (ds->ds_prev != NULL &&
1402 1402 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1403 1403 return (EBUSY);
1404 1404
1405 1405 /*
1406 1406 * If we made changes this txg, traverse_dsl_dataset won't find
1407 1407 * them. Try again.
1408 1408 */
1409 1409 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1410 1410 return (EAGAIN);
1411 1411
1412 1412 if (dsl_dataset_is_snapshot(ds)) {
1413 1413 /*
1414 1414 * If this snapshot has an elevated user reference count,
1415 1415 * we can't destroy it yet.
1416 1416 */
1417 1417 if (ds->ds_userrefs > 0 && !dsda->releasing)
1418 1418 return (EBUSY);
1419 1419
1420 1420 mutex_enter(&ds->ds_lock);
1421 1421 /*
1422 1422 * Can't delete a branch point. However, if we're destroying
1423 1423 * a clone and removing its origin due to it having a user
1424 1424 * hold count of 0 and having been marked for deferred destroy,
1425 1425 * it's OK for the origin to have a single clone.
1426 1426 */
1427 1427 if (ds->ds_phys->ds_num_children >
1428 1428 (dsda->is_origin_rm ? 2 : 1)) {
1429 1429 mutex_exit(&ds->ds_lock);
1430 1430 return (EEXIST);
1431 1431 }
1432 1432 mutex_exit(&ds->ds_lock);
1433 1433 } else if (dsl_dir_is_clone(ds->ds_dir)) {
1434 1434 return (dsl_dataset_origin_check(dsda, arg2, tx));
1435 1435 }
1436 1436
1437 1437 /* XXX we should do some i/o error checking... */
1438 1438 return (0);
1439 1439 }
1440 1440
1441 1441 struct refsarg {
1442 1442 kmutex_t lock;
1443 1443 boolean_t gone;
1444 1444 kcondvar_t cv;
1445 1445 };
1446 1446
1447 1447 /* ARGSUSED */
1448 1448 static void
1449 1449 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1450 1450 {
1451 1451 struct refsarg *arg = argv;
1452 1452
1453 1453 mutex_enter(&arg->lock);
1454 1454 arg->gone = TRUE;
1455 1455 cv_signal(&arg->cv);
1456 1456 mutex_exit(&arg->lock);
1457 1457 }
1458 1458
1459 1459 static void
1460 1460 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
1461 1461 {
1462 1462 struct refsarg arg;
1463 1463
1464 1464 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
1465 1465 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
1466 1466 arg.gone = FALSE;
1467 1467 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
1468 1468 dsl_dataset_refs_gone);
1469 1469 dmu_buf_rele(ds->ds_dbuf, tag);
1470 1470 mutex_enter(&arg.lock);
1471 1471 while (!arg.gone)
1472 1472 cv_wait(&arg.cv, &arg.lock);
1473 1473 ASSERT(arg.gone);
1474 1474 mutex_exit(&arg.lock);
1475 1475 ds->ds_dbuf = NULL;
1476 1476 ds->ds_phys = NULL;
1477 1477 mutex_destroy(&arg.lock);
1478 1478 cv_destroy(&arg.cv);
1479 1479 }
1480 1480
1481 1481 static void
1482 1482 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
1483 1483 {
1484 1484 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1485 1485 uint64_t count;
1486 1486 int err;
1487 1487
1488 1488 ASSERT(ds->ds_phys->ds_num_children >= 2);
1489 1489 err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
1490 1490 /*
1491 1491 * The err should not be ENOENT, but a bug in a previous version
1492 1492 * of the code could cause upgrade_clones_cb() to not set
1493 1493 * ds_next_snap_obj when it should, leading to a missing entry.
1494 1494 * If we knew that the pool was created after
1495 1495 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1496 1496 * ENOENT. However, at least we can check that we don't have
1497 1497 * too many entries in the next_clones_obj even after failing to
1498 1498 * remove this one.
1499 1499 */
1500 1500 if (err != ENOENT) {
1501 1501 VERIFY0(err);
1502 1502 }
1503 1503 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
1504 1504 &count));
1505 1505 ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
1506 1506 }
1507 1507
1508 1508 static void
1509 1509 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1510 1510 {
1511 1511 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1512 1512 zap_cursor_t zc;
1513 1513 zap_attribute_t za;
1514 1514
1515 1515 /*
1516 1516 * If it is the old version, dd_clones doesn't exist so we can't
1517 1517 * find the clones, but deadlist_remove_key() is a no-op so it
1518 1518 * doesn't matter.
1519 1519 */
1520 1520 if (ds->ds_dir->dd_phys->dd_clones == 0)
1521 1521 return;
1522 1522
1523 1523 for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1524 1524 zap_cursor_retrieve(&zc, &za) == 0;
1525 1525 zap_cursor_advance(&zc)) {
1526 1526 dsl_dataset_t *clone;
1527 1527
1528 1528 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1529 1529 za.za_first_integer, FTAG, &clone));
1530 1530 if (clone->ds_dir->dd_origin_txg > mintxg) {
1531 1531 dsl_deadlist_remove_key(&clone->ds_deadlist,
1532 1532 mintxg, tx);
1533 1533 dsl_dataset_remove_clones_key(clone, mintxg, tx);
1534 1534 }
1535 1535 dsl_dataset_rele(clone, FTAG);
1536 1536 }
1537 1537 zap_cursor_fini(&zc);
1538 1538 }
1539 1539
1540 1540 struct process_old_arg {
1541 1541 dsl_dataset_t *ds;
1542 1542 dsl_dataset_t *ds_prev;
1543 1543 boolean_t after_branch_point;
1544 1544 zio_t *pio;
1545 1545 uint64_t used, comp, uncomp;
1546 1546 };
1547 1547
1548 1548 static int
1549 1549 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1550 1550 {
1551 1551 struct process_old_arg *poa = arg;
1552 1552 dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1553 1553
1554 1554 if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1555 1555 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1556 1556 if (poa->ds_prev && !poa->after_branch_point &&
1557 1557 bp->blk_birth >
1558 1558 poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1559 1559 poa->ds_prev->ds_phys->ds_unique_bytes +=
1560 1560 bp_get_dsize_sync(dp->dp_spa, bp);
1561 1561 }
1562 1562 } else {
1563 1563 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1564 1564 poa->comp += BP_GET_PSIZE(bp);
1565 1565 poa->uncomp += BP_GET_UCSIZE(bp);
1566 1566 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1567 1567 }
1568 1568 return (0);
1569 1569 }
1570 1570
1571 1571 static void
1572 1572 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
1573 1573 dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
1574 1574 {
1575 1575 struct process_old_arg poa = { 0 };
1576 1576 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1577 1577 objset_t *mos = dp->dp_meta_objset;
1578 1578
1579 1579 ASSERT(ds->ds_deadlist.dl_oldfmt);
1580 1580 ASSERT(ds_next->ds_deadlist.dl_oldfmt);
1581 1581
1582 1582 poa.ds = ds;
1583 1583 poa.ds_prev = ds_prev;
1584 1584 poa.after_branch_point = after_branch_point;
1585 1585 poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1586 1586 VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1587 1587 process_old_cb, &poa, tx));
1588 1588 VERIFY0(zio_wait(poa.pio));
1589 1589 ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1590 1590
1591 1591 /* change snapused */
1592 1592 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1593 1593 -poa.used, -poa.comp, -poa.uncomp, tx);
1594 1594
1595 1595 /* swap next's deadlist to our deadlist */
1596 1596 dsl_deadlist_close(&ds->ds_deadlist);
1597 1597 dsl_deadlist_close(&ds_next->ds_deadlist);
1598 1598 SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1599 1599 ds->ds_phys->ds_deadlist_obj);
1600 1600 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1601 1601 dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1602 1602 ds_next->ds_phys->ds_deadlist_obj);
1603 1603 }
1604 1604
1605 1605 static int
1606 1606 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
1607 1607 {
1608 1608 int err;
1609 1609 struct killarg ka;
1610 1610
1611 1611 /*
1612 1612 * Free everything that we point to (that's born after
1613 1613 * the previous snapshot, if we are a clone)
1614 1614 *
1615 1615 * NB: this should be very quick, because we already
1616 1616 * freed all the objects in open context.
1617 1617 */
1618 1618 ka.ds = ds;
1619 1619 ka.tx = tx;
1620 1620 err = traverse_dataset(ds,
1621 1621 ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
1622 1622 kill_blkptr, &ka);
1623 1623 ASSERT0(err);
1624 1624 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
1625 1625
1626 1626 return (err);
1627 1627 }
1628 1628
1629 1629 void
1630 1630 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1631 1631 {
1632 1632 struct dsl_ds_destroyarg *dsda = arg1;
1633 1633 dsl_dataset_t *ds = dsda->ds;
1634 1634 int err;
1635 1635 int after_branch_point = FALSE;
1636 1636 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1637 1637 objset_t *mos = dp->dp_meta_objset;
1638 1638 dsl_dataset_t *ds_prev = NULL;
1639 1639 boolean_t wont_destroy;
1640 1640 uint64_t obj;
1641 1641
1642 1642 wont_destroy = (dsda->defer &&
1643 1643 (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1644 1644
1645 1645 ASSERT(ds->ds_owner || wont_destroy);
1646 1646 ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1647 1647 ASSERT(ds->ds_prev == NULL ||
1648 1648 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1649 1649 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1650 1650
1651 1651 if (wont_destroy) {
1652 1652 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1653 1653 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1654 1654 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1655 1655 spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
1656 1656 return;
1657 1657 }
1658 1658
1659 1659 /* We need to log before removing it from the namespace. */
1660 1660 spa_history_log_internal_ds(ds, "destroy", tx, "");
1661 1661
1662 1662 /* signal any waiters that this dataset is going away */
1663 1663 mutex_enter(&ds->ds_lock);
1664 1664 ds->ds_owner = dsl_reaper;
1665 1665 cv_broadcast(&ds->ds_exclusive_cv);
1666 1666 mutex_exit(&ds->ds_lock);
1667 1667
1668 1668 /* Remove our reservation */
1669 1669 if (ds->ds_reserved != 0) {
1670 1670 dsl_prop_setarg_t psa;
1671 1671 uint64_t value = 0;
1672 1672
1673 1673 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1674 1674 (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1675 1675 &value);
1676 1676 psa.psa_effective_value = 0; /* predict default value */
1677 1677
1678 1678 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1679 1679 ASSERT0(ds->ds_reserved);
1680 1680 }
1681 1681
1682 1682 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1683 1683
1684 1684 dsl_scan_ds_destroyed(ds, tx);
1685 1685
1686 1686 obj = ds->ds_object;
1687 1687
1688 1688 if (ds->ds_phys->ds_prev_snap_obj != 0) {
1689 1689 if (ds->ds_prev) {
1690 1690 ds_prev = ds->ds_prev;
1691 1691 } else {
1692 1692 VERIFY(0 == dsl_dataset_hold_obj(dp,
1693 1693 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1694 1694 }
1695 1695 after_branch_point =
1696 1696 (ds_prev->ds_phys->ds_next_snap_obj != obj);
1697 1697
1698 1698 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1699 1699 if (after_branch_point &&
1700 1700 ds_prev->ds_phys->ds_next_clones_obj != 0) {
1701 1701 remove_from_next_clones(ds_prev, obj, tx);
1702 1702 if (ds->ds_phys->ds_next_snap_obj != 0) {
1703 1703 VERIFY(0 == zap_add_int(mos,
1704 1704 ds_prev->ds_phys->ds_next_clones_obj,
1705 1705 ds->ds_phys->ds_next_snap_obj, tx));
1706 1706 }
1707 1707 }
1708 1708 if (after_branch_point &&
1709 1709 ds->ds_phys->ds_next_snap_obj == 0) {
1710 1710 /* This clone is toast. */
1711 1711 ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1712 1712 ds_prev->ds_phys->ds_num_children--;
1713 1713
1714 1714 /*
1715 1715 * If the clone's origin has no other clones, no
1716 1716 * user holds, and has been marked for deferred
1717 1717 * deletion, then we should have done the necessary
1718 1718 * destroy setup for it.
1719 1719 */
1720 1720 if (ds_prev->ds_phys->ds_num_children == 1 &&
1721 1721 ds_prev->ds_userrefs == 0 &&
1722 1722 DS_IS_DEFER_DESTROY(ds_prev)) {
1723 1723 ASSERT3P(dsda->rm_origin, !=, NULL);
1724 1724 } else {
1725 1725 ASSERT3P(dsda->rm_origin, ==, NULL);
1726 1726 }
1727 1727 } else if (!after_branch_point) {
1728 1728 ds_prev->ds_phys->ds_next_snap_obj =
1729 1729 ds->ds_phys->ds_next_snap_obj;
1730 1730 }
1731 1731 }
1732 1732
1733 1733 if (dsl_dataset_is_snapshot(ds)) {
1734 1734 dsl_dataset_t *ds_next;
1735 1735 uint64_t old_unique;
1736 1736 uint64_t used = 0, comp = 0, uncomp = 0;
1737 1737
1738 1738 VERIFY(0 == dsl_dataset_hold_obj(dp,
1739 1739 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1740 1740 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1741 1741
1742 1742 old_unique = ds_next->ds_phys->ds_unique_bytes;
1743 1743
1744 1744 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1745 1745 ds_next->ds_phys->ds_prev_snap_obj =
1746 1746 ds->ds_phys->ds_prev_snap_obj;
1747 1747 ds_next->ds_phys->ds_prev_snap_txg =
1748 1748 ds->ds_phys->ds_prev_snap_txg;
1749 1749 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1750 1750 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1751 1751
1752 1752
1753 1753 if (ds_next->ds_deadlist.dl_oldfmt) {
1754 1754 process_old_deadlist(ds, ds_prev, ds_next,
1755 1755 after_branch_point, tx);
1756 1756 } else {
1757 1757 /* Adjust prev's unique space. */
1758 1758 if (ds_prev && !after_branch_point) {
1759 1759 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1760 1760 ds_prev->ds_phys->ds_prev_snap_txg,
1761 1761 ds->ds_phys->ds_prev_snap_txg,
1762 1762 &used, &comp, &uncomp);
1763 1763 ds_prev->ds_phys->ds_unique_bytes += used;
1764 1764 }
1765 1765
1766 1766 /* Adjust snapused. */
1767 1767 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1768 1768 ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1769 1769 &used, &comp, &uncomp);
1770 1770 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1771 1771 -used, -comp, -uncomp, tx);
1772 1772
1773 1773 /* Move blocks to be freed to pool's free list. */
1774 1774 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1775 1775 &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1776 1776 tx);
1777 1777 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1778 1778 DD_USED_HEAD, used, comp, uncomp, tx);
1779 1779
1780 1780 /* Merge our deadlist into next's and free it. */
1781 1781 dsl_deadlist_merge(&ds_next->ds_deadlist,
1782 1782 ds->ds_phys->ds_deadlist_obj, tx);
1783 1783 }
1784 1784 dsl_deadlist_close(&ds->ds_deadlist);
1785 1785 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1786 1786
1787 1787 /* Collapse range in clone heads */
1788 1788 dsl_dataset_remove_clones_key(ds,
1789 1789 ds->ds_phys->ds_creation_txg, tx);
1790 1790
1791 1791 if (dsl_dataset_is_snapshot(ds_next)) {
1792 1792 dsl_dataset_t *ds_nextnext;
1793 1793
1794 1794 /*
1795 1795 * Update next's unique to include blocks which
1796 1796 * were previously shared by only this snapshot
1797 1797 * and it. Those blocks will be born after the
1798 1798 * prev snap and before this snap, and will have
1799 1799 * died after the next snap and before the one
1800 1800 * after that (ie. be on the snap after next's
1801 1801 			 * after that (i.e. be on the snap-after-next's
1802 1802 */
1803 1803 VERIFY(0 == dsl_dataset_hold_obj(dp,
1804 1804 ds_next->ds_phys->ds_next_snap_obj,
1805 1805 FTAG, &ds_nextnext));
1806 1806 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1807 1807 ds->ds_phys->ds_prev_snap_txg,
1808 1808 ds->ds_phys->ds_creation_txg,
1809 1809 &used, &comp, &uncomp);
1810 1810 ds_next->ds_phys->ds_unique_bytes += used;
1811 1811 dsl_dataset_rele(ds_nextnext, FTAG);
1812 1812 ASSERT3P(ds_next->ds_prev, ==, NULL);
1813 1813
1814 1814 /* Collapse range in this head. */
1815 1815 dsl_dataset_t *hds;
1816 1816 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
1817 1817 ds->ds_dir->dd_phys->dd_head_dataset_obj,
1818 1818 FTAG, &hds));
1819 1819 dsl_deadlist_remove_key(&hds->ds_deadlist,
1820 1820 ds->ds_phys->ds_creation_txg, tx);
1821 1821 dsl_dataset_rele(hds, FTAG);
1822 1822
1823 1823 } else {
1824 1824 ASSERT3P(ds_next->ds_prev, ==, ds);
1825 1825 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1826 1826 ds_next->ds_prev = NULL;
1827 1827 if (ds_prev) {
1828 1828 VERIFY(0 == dsl_dataset_get_ref(dp,
1829 1829 ds->ds_phys->ds_prev_snap_obj,
1830 1830 ds_next, &ds_next->ds_prev));
1831 1831 }
1832 1832
1833 1833 dsl_dataset_recalc_head_uniq(ds_next);
1834 1834
1835 1835 /*
1836 1836 			 * Reduce the amount of our unconsumed refreservation
1837 1837 * being charged to our parent by the amount of
1838 1838 * new unique data we have gained.
1839 1839 */
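			/*
			 * Illustrative example (hypothetical figures): with a
			 * refreservation of 5G, old_unique = 1G and a new
			 * unique size of 3G, mrsdelta = MIN(3G - 1G, 5G - 1G)
			 * = 2G, so the parent's DD_USED_REFRSRV charge below
			 * shrinks by 2G.
			 */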
1840 1840 if (old_unique < ds_next->ds_reserved) {
1841 1841 int64_t mrsdelta;
1842 1842 uint64_t new_unique =
1843 1843 ds_next->ds_phys->ds_unique_bytes;
1844 1844
1845 1845 ASSERT(old_unique <= new_unique);
1846 1846 mrsdelta = MIN(new_unique - old_unique,
1847 1847 ds_next->ds_reserved - old_unique);
1848 1848 dsl_dir_diduse_space(ds->ds_dir,
1849 1849 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1850 1850 }
1851 1851 }
1852 1852 dsl_dataset_rele(ds_next, FTAG);
1853 1853 } else {
1854 1854 zfeature_info_t *async_destroy =
1855 1855 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1856 1856 objset_t *os;
1857 1857
1858 1858 /*
1859 1859 * There's no next snapshot, so this is a head dataset.
1860 1860 * Destroy the deadlist. Unless it's a clone, the
1861 1861 * deadlist should be empty. (If it's a clone, it's
1862 1862 * safe to ignore the deadlist contents.)
1863 1863 */
1864 1864 dsl_deadlist_close(&ds->ds_deadlist);
1865 1865 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1866 1866 ds->ds_phys->ds_deadlist_obj = 0;
1867 1867
1868 1868 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
1869 1869
1870 1870 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1871 1871 err = old_synchronous_dataset_destroy(ds, tx);
1872 1872 } else {
1873 1873 /*
1874 1874 * Move the bptree into the pool's list of trees to
1875 1875 * clean up and update space accounting information.
1876 1876 */
1877 1877 uint64_t used, comp, uncomp;
1878 1878
1879 1879 zil_destroy_sync(dmu_objset_zil(os), tx);
1880 1880
1881 1881 if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1882 1882 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1883 1883 dp->dp_bptree_obj = bptree_alloc(mos, tx);
1884 1884 VERIFY(zap_add(mos,
1885 1885 DMU_POOL_DIRECTORY_OBJECT,
1886 1886 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1887 1887 &dp->dp_bptree_obj, tx) == 0);
1888 1888 }
1889 1889
1890 1890 used = ds->ds_dir->dd_phys->dd_used_bytes;
1891 1891 comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1892 1892 uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1893 1893
1894 1894 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1895 1895 ds->ds_phys->ds_unique_bytes == used);
1896 1896
1897 1897 bptree_add(mos, dp->dp_bptree_obj,
1898 1898 &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1899 1899 used, comp, uncomp, tx);
1900 1900 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1901 1901 -used, -comp, -uncomp, tx);
1902 1902 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1903 1903 used, comp, uncomp, tx);
1904 1904 }
1905 1905
1906 1906 if (ds->ds_prev != NULL) {
1907 1907 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1908 1908 VERIFY3U(0, ==, zap_remove_int(mos,
1909 1909 ds->ds_prev->ds_dir->dd_phys->dd_clones,
1910 1910 ds->ds_object, tx));
1911 1911 }
1912 1912 dsl_dataset_rele(ds->ds_prev, ds);
1913 1913 ds->ds_prev = ds_prev = NULL;
1914 1914 }
1915 1915 }
1916 1916
1917 1917 /*
1918 1918 * This must be done after the dsl_traverse(), because it will
1919 1919 * re-open the objset.
1920 1920 */
1921 1921 if (ds->ds_objset) {
1922 1922 dmu_objset_evict(ds->ds_objset);
1923 1923 ds->ds_objset = NULL;
1924 1924 }
1925 1925
1926 1926 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1927 1927 /* Erase the link in the dir */
1928 1928 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1929 1929 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1930 1930 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1931 1931 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1932 1932 ASSERT(err == 0);
1933 1933 } else {
1934 1934 /* remove from snapshot namespace */
1935 1935 dsl_dataset_t *ds_head;
1936 1936 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1937 1937 VERIFY(0 == dsl_dataset_hold_obj(dp,
1938 1938 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1939 1939 VERIFY(0 == dsl_dataset_get_snapname(ds));
1940 1940 #ifdef ZFS_DEBUG
1941 1941 {
1942 1942 uint64_t val;
1943 1943
1944 1944 err = dsl_dataset_snap_lookup(ds_head,
1945 1945 ds->ds_snapname, &val);
1946 1946 ASSERT0(err);
1947 1947 ASSERT3U(val, ==, obj);
1948 1948 }
1949 1949 #endif
1950 1950 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1951 1951 ASSERT(err == 0);
1952 1952 dsl_dataset_rele(ds_head, FTAG);
1953 1953 }
1954 1954
1955 1955 if (ds_prev && ds->ds_prev != ds_prev)
1956 1956 dsl_dataset_rele(ds_prev, FTAG);
1957 1957
1958 1958 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1959 1959
1960 1960 if (ds->ds_phys->ds_next_clones_obj != 0) {
1961 1961 uint64_t count;
1962 1962 ASSERT(0 == zap_count(mos,
1963 1963 ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1964 1964 VERIFY(0 == dmu_object_free(mos,
1965 1965 ds->ds_phys->ds_next_clones_obj, tx));
1966 1966 }
1967 1967 if (ds->ds_phys->ds_props_obj != 0)
1968 1968 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1969 1969 if (ds->ds_phys->ds_userrefs_obj != 0)
1970 1970 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1971 1971 dsl_dir_close(ds->ds_dir, ds);
1972 1972 ds->ds_dir = NULL;
1973 1973 dsl_dataset_drain_refs(ds, tag);
1974 1974 VERIFY(0 == dmu_object_free(mos, obj, tx));
1975 1975
1976 1976 if (dsda->rm_origin) {
1977 1977 /*
1978 1978 * Remove the origin of the clone we just destroyed.
1979 1979 */
1980 1980 struct dsl_ds_destroyarg ndsda = {0};
1981 1981
1982 1982 ndsda.ds = dsda->rm_origin;
1983 1983 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1984 1984 }
1985 1985 }
1986 1986
1987 1987 static int
1988 1988 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1989 1989 {
1990 1990 uint64_t asize;
1991 1991
1992 1992 if (!dmu_tx_is_syncing(tx))
1993 1993 return (0);
1994 1994
1995 1995 /*
1996 1996 * If there's an fs-only reservation, any blocks that might become
1997 1997 * owned by the snapshot dataset must be accommodated by space
1998 1998 * outside of the reservation.
1999 1999 */
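	/*
	 * Illustrative example (hypothetical figures): with a refreservation
	 * of 10G and ds_unique_bytes = 4G, the snapshot will pin
	 * asize = MIN(4G, 10G) = 4G, and that 4G must be available outside
	 * the reservation or the check below fails with ENOSPC.
	 */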
2000 2000 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2001 2001 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2002 2002 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2003 2003 return (ENOSPC);
2004 2004
2005 2005 /*
2006 2006 * Propagate any reserved space for this snapshot to other
2007 2007 * snapshot checks in this sync group.
2008 2008 */
2009 2009 if (asize > 0)
2010 2010 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2011 2011
2012 2012 return (0);
2013 2013 }
2014 2014
2015 2015 int
2016 2016 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2017 2017 dmu_tx_t *tx)
2018 2018 {
2019 2019 int err;
2020 2020 uint64_t value;
2021 2021
2022 2022 /*
2023 2023 * We don't allow multiple snapshots of the same txg. If there
2024 2024 * is already one, try again.
2025 2025 */
2026 2026 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2027 2027 return (EAGAIN);
2028 2028
2029 2029 /*
2030 2030 * Check for conflicting snapshot name.
2031 2031 */
2032 2032 err = dsl_dataset_snap_lookup(ds, snapname, &value);
2033 2033 if (err == 0)
2034 2034 return (EEXIST);
2035 2035 if (err != ENOENT)
2036 2036 return (err);
2037 2037
2038 2038 /*
2039 2039 * Check that the dataset's name is not too long. Name consists
2040 2040 	 * Check that the dataset's name is not too long. The name consists of
2041 2041 	 * the dataset name's length + 1 for the @-sign + the snapshot name's length.
2042 2042 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2043 2043 return (ENAMETOOLONG);
2044 2044
2045 2045 err = dsl_dataset_snapshot_reserve_space(ds, tx);
2046 2046 if (err)
2047 2047 return (err);
2048 2048
2049 2049 ds->ds_trysnap_txg = tx->tx_txg;
2050 2050 return (0);
2051 2051 }
2052 2052
2053 2053 void
2054 2054 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2055 2055 dmu_tx_t *tx)
2056 2056 {
2057 2057 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2058 2058 dmu_buf_t *dbuf;
2059 2059 dsl_dataset_phys_t *dsphys;
2060 2060 uint64_t dsobj, crtxg;
2061 2061 objset_t *mos = dp->dp_meta_objset;
2062 2062 int err;
2063 2063
2064 2064 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2065 2065
2066 2066 /*
2067 2067 * The origin's ds_creation_txg has to be < TXG_INITIAL
2068 2068 */
2069 2069 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2070 2070 crtxg = 1;
2071 2071 else
2072 2072 crtxg = tx->tx_txg;
2073 2073
2074 2074 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2075 2075 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2076 2076 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2077 2077 dmu_buf_will_dirty(dbuf, tx);
2078 2078 dsphys = dbuf->db_data;
2079 2079 bzero(dsphys, sizeof (dsl_dataset_phys_t));
2080 2080 dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2081 2081 dsphys->ds_fsid_guid = unique_create();
2082 2082 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2083 2083 sizeof (dsphys->ds_guid));
2084 2084 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2085 2085 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2086 2086 dsphys->ds_next_snap_obj = ds->ds_object;
2087 2087 dsphys->ds_num_children = 1;
2088 2088 dsphys->ds_creation_time = gethrestime_sec();
2089 2089 dsphys->ds_creation_txg = crtxg;
2090 2090 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2091 2091 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2092 2092 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2093 2093 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2094 2094 dsphys->ds_flags = ds->ds_phys->ds_flags;
2095 2095 dsphys->ds_bp = ds->ds_phys->ds_bp;
2096 2096 dmu_buf_rele(dbuf, FTAG);
2097 2097
2098 2098 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2099 2099 if (ds->ds_prev) {
2100 2100 uint64_t next_clones_obj =
2101 2101 ds->ds_prev->ds_phys->ds_next_clones_obj;
2102 2102 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2103 2103 ds->ds_object ||
2104 2104 ds->ds_prev->ds_phys->ds_num_children > 1);
2105 2105 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2106 2106 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2107 2107 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2108 2108 ds->ds_prev->ds_phys->ds_creation_txg);
2109 2109 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2110 2110 } else if (next_clones_obj != 0) {
2111 2111 remove_from_next_clones(ds->ds_prev,
2112 2112 dsphys->ds_next_snap_obj, tx);
2113 2113 VERIFY3U(0, ==, zap_add_int(mos,
2114 2114 next_clones_obj, dsobj, tx));
2115 2115 }
2116 2116 }
2117 2117
2118 2118 /*
2119 2119 * If we have a reference-reservation on this dataset, we will
2120 2120 * need to increase the amount of refreservation being charged
2121 2121 * since our unique space is going to zero.
2122 2122 */
2123 2123 if (ds->ds_reserved) {
2124 2124 int64_t delta;
2125 2125 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2126 2126 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2127 2127 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2128 2128 delta, 0, 0, tx);
2129 2129 }
2130 2130
2131 2131 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2132 2132 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2133 2133 ds->ds_dir->dd_myname, snapname, dsobj,
2134 2134 ds->ds_phys->ds_prev_snap_txg);
2135 2135 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2136 2136 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2137 2137 dsl_deadlist_close(&ds->ds_deadlist);
2138 2138 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2139 2139 dsl_deadlist_add_key(&ds->ds_deadlist,
2140 2140 ds->ds_phys->ds_prev_snap_txg, tx);
2141 2141
2142 2142 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2143 2143 ds->ds_phys->ds_prev_snap_obj = dsobj;
2144 2144 ds->ds_phys->ds_prev_snap_txg = crtxg;
2145 2145 ds->ds_phys->ds_unique_bytes = 0;
2146 2146 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2147 2147 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2148 2148
2149 2149 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2150 2150 snapname, 8, 1, &dsobj, tx);
2151 2151 ASSERT(err == 0);
2152 2152
2153 2153 if (ds->ds_prev)
2154 2154 dsl_dataset_drop_ref(ds->ds_prev, ds);
2155 2155 VERIFY(0 == dsl_dataset_get_ref(dp,
2156 2156 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2157 2157
2158 2158 dsl_scan_ds_snapshotted(ds, tx);
2159 2159
2160 2160 dsl_dir_snap_cmtime_update(ds->ds_dir);
2161 2161
2162 2162 spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2163 2163 }
2164 2164
2165 2165 void
2166 2166 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2167 2167 {
2168 2168 ASSERT(dmu_tx_is_syncing(tx));
2169 2169 ASSERT(ds->ds_objset != NULL);
2170 2170 ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2171 2171
2172 2172 /*
2173 2173 * in case we had to change ds_fsid_guid when we opened it,
2174 2174 * sync it out now.
2175 2175 */
2176 2176 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2177 2177 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2178 2178
2179 2179 dmu_objset_sync(ds->ds_objset, zio, tx);
2180 2180 }
2181 2181
2182 2182 static void
2183 2183 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2184 2184 {
2185 2185 uint64_t count = 0;
2186 2186 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2187 2187 zap_cursor_t zc;
2188 2188 zap_attribute_t za;
2189 2189 nvlist_t *propval;
2190 2190 nvlist_t *val;
2191 2191
2192 2192 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2193 2193 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2194 2194 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2195 2195
2196 2196 /*
2197 2197 	 * There may be missing entries in ds_next_clones_obj
2198 2198 * due to a bug in a previous version of the code.
2199 2199 * Only trust it if it has the right number of entries.
2200 2200 */
2201 2201 if (ds->ds_phys->ds_next_clones_obj != 0) {
2202 2202 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2203 2203 &count));
2204 2204 }
2205 2205 if (count != ds->ds_phys->ds_num_children - 1) {
2206 2206 goto fail;
2207 2207 }
2208 2208 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2209 2209 zap_cursor_retrieve(&zc, &za) == 0;
2210 2210 zap_cursor_advance(&zc)) {
2211 2211 dsl_dataset_t *clone;
2212 2212 char buf[ZFS_MAXNAMELEN];
2213 2213 /*
2214 2214 * Even though we hold the dp_config_rwlock, the dataset
2215 2215 * may fail to open, returning ENOENT. If there is a
2216 2216 * thread concurrently attempting to destroy this
2217 2217 * dataset, it will have the ds_rwlock held for
2218 2218 * RW_WRITER. Our call to dsl_dataset_hold_obj() ->
2219 2219 * dsl_dataset_hold_ref() will fail its
2220 2220 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
2221 2221 		 * dp_config_rwlock, and wait for the destroy to progress
2222 2222 * and signal ds_exclusive_cv. If the destroy was
2223 2223 * successful, we will see that
2224 2224 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2225 2225 */
2226 2226 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2227 2227 za.za_first_integer, FTAG, &clone) != 0)
2228 2228 continue;
2229 2229 dsl_dir_name(clone->ds_dir, buf);
2230 2230 VERIFY(nvlist_add_boolean(val, buf) == 0);
2231 2231 dsl_dataset_rele(clone, FTAG);
2232 2232 }
2233 2233 zap_cursor_fini(&zc);
2234 2234 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2235 2235 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2236 2236 propval) == 0);
2237 2237 fail:
2238 2238 nvlist_free(val);
2239 2239 nvlist_free(propval);
2240 2240 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2241 2241 }
2242 2242
2243 2243 void
2244 2244 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2245 2245 {
2246 2246 uint64_t refd, avail, uobjs, aobjs, ratio;
2247 2247
2248 2248 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2249 2249 (ds->ds_phys->ds_uncompressed_bytes * 100 /
2250 2250 ds->ds_phys->ds_compressed_bytes);
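	/*
	 * Illustrative example (hypothetical figures): 3 GB of uncompressed
	 * data stored as 1 GB compressed yields ratio = 300, i.e. a 3.00x
	 * refratio; a dataset with no compressed bytes reports 100 (1.00x).
	 */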
2251 2251
2252 2252 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2253 2253
2254 2254 if (dsl_dataset_is_snapshot(ds)) {
2255 2255 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2256 2256 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2257 2257 ds->ds_phys->ds_unique_bytes);
2258 2258 get_clones_stat(ds, nv);
2259 2259 } else {
2260 2260 dsl_dir_stats(ds->ds_dir, nv);
2261 2261 }
2262 2262
2263 2263 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2264 2264 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2265 2265 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2266 2266
2267 2267 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2268 2268 ds->ds_phys->ds_creation_time);
2269 2269 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2270 2270 ds->ds_phys->ds_creation_txg);
2271 2271 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2272 2272 ds->ds_quota);
2273 2273 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2274 2274 ds->ds_reserved);
2275 2275 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2276 2276 ds->ds_phys->ds_guid);
2277 2277 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2278 2278 ds->ds_phys->ds_unique_bytes);
2279 2279 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2280 2280 ds->ds_object);
2281 2281 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2282 2282 ds->ds_userrefs);
2283 2283 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2284 2284 DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2285 2285
2286 2286 if (ds->ds_phys->ds_prev_snap_obj != 0) {
2287 2287 uint64_t written, comp, uncomp;
2288 2288 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2289 2289 dsl_dataset_t *prev;
2290 2290
2291 2291 rw_enter(&dp->dp_config_rwlock, RW_READER);
2292 2292 int err = dsl_dataset_hold_obj(dp,
2293 2293 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2294 2294 rw_exit(&dp->dp_config_rwlock);
2295 2295 if (err == 0) {
2296 2296 err = dsl_dataset_space_written(prev, ds, &written,
2297 2297 &comp, &uncomp);
2298 2298 dsl_dataset_rele(prev, FTAG);
2299 2299 if (err == 0) {
2300 2300 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2301 2301 written);
2302 2302 }
2303 2303 }
2304 2304 }
2305 2305 }
2306 2306
2307 2307 void
2308 2308 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2309 2309 {
2310 2310 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2311 2311 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2312 2312 stat->dds_guid = ds->ds_phys->ds_guid;
2313 2313 stat->dds_origin[0] = '\0';
2314 2314 if (dsl_dataset_is_snapshot(ds)) {
2315 2315 stat->dds_is_snapshot = B_TRUE;
2316 2316 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2317 2317 } else {
2318 2318 stat->dds_is_snapshot = B_FALSE;
2319 2319 stat->dds_num_clones = 0;
2320 2320
2321 2321 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2322 2322 if (dsl_dir_is_clone(ds->ds_dir)) {
2323 2323 dsl_dataset_t *ods;
2324 2324
2325 2325 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2326 2326 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2327 2327 dsl_dataset_name(ods, stat->dds_origin);
2328 2328 dsl_dataset_drop_ref(ods, FTAG);
2329 2329 }
2330 2330 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2331 2331 }
2332 2332 }
2333 2333
2334 2334 uint64_t
2335 2335 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2336 2336 {
2337 2337 return (ds->ds_fsid_guid);
2338 2338 }
2339 2339
2340 2340 void
2341 2341 dsl_dataset_space(dsl_dataset_t *ds,
2342 2342 uint64_t *refdbytesp, uint64_t *availbytesp,
2343 2343 uint64_t *usedobjsp, uint64_t *availobjsp)
2344 2344 {
2345 2345 *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2346 2346 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2347 2347 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2348 2348 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2349 2349 if (ds->ds_quota != 0) {
2350 2350 /*
2351 2351 * Adjust available bytes according to refquota
2352 2352 */
2353 2353 if (*refdbytesp < ds->ds_quota)
2354 2354 *availbytesp = MIN(*availbytesp,
2355 2355 ds->ds_quota - *refdbytesp);
2356 2356 else
2357 2357 *availbytesp = 0;
2358 2358 }
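	/*
	 * Illustrative example (hypothetical figures): with refquota = 10G
	 * and 8G already referenced, available space is clamped to at most
	 * 2G regardless of pool free space; once the referenced space
	 * reaches the quota, available drops to 0.
	 */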
2359 2359 *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2360 2360 *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2361 2361 }
2362 2362
2363 2363 boolean_t
2364 2364 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2365 2365 {
2366 2366 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2367 2367
2368 2368 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2369 2369 dsl_pool_sync_context(dp));
2370 2370 if (ds->ds_prev == NULL)
2371 2371 return (B_FALSE);
2372 2372 if (ds->ds_phys->ds_bp.blk_birth >
2373 2373 ds->ds_prev->ds_phys->ds_creation_txg) {
2374 2374 objset_t *os, *os_prev;
2375 2375 /*
2376 2376 * It may be that only the ZIL differs, because it was
2377 2377 * reset in the head. Don't count that as being
2378 2378 * modified.
2379 2379 */
2380 2380 if (dmu_objset_from_ds(ds, &os) != 0)
2381 2381 return (B_TRUE);
2382 2382 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2383 2383 return (B_TRUE);
2384 2384 return (bcmp(&os->os_phys->os_meta_dnode,
2385 2385 &os_prev->os_phys->os_meta_dnode,
2386 2386 sizeof (os->os_phys->os_meta_dnode)) != 0);
2387 2387 }
2388 2388 return (B_FALSE);
2389 2389 }
2390 2390
2391 2391 /* ARGSUSED */
2392 2392 static int
2393 2393 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2394 2394 {
2395 2395 dsl_dataset_t *ds = arg1;
2396 2396 char *newsnapname = arg2;
2397 2397 dsl_dir_t *dd = ds->ds_dir;
2398 2398 dsl_dataset_t *hds;
2399 2399 uint64_t val;
2400 2400 int err;
2401 2401
2402 2402 err = dsl_dataset_hold_obj(dd->dd_pool,
2403 2403 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2404 2404 if (err)
2405 2405 return (err);
2406 2406
2407 2407 /* new name better not be in use */
2408 2408 err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2409 2409 dsl_dataset_rele(hds, FTAG);
2410 2410
2411 2411 if (err == 0)
2412 2412 err = EEXIST;
2413 2413 else if (err == ENOENT)
2414 2414 err = 0;
2415 2415
2416 2416 /* dataset name + 1 for the "@" + the new snapshot name must fit */
2417 2417 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2418 2418 err = ENAMETOOLONG;
2419 2419
2420 2420 return (err);
2421 2421 }
2422 2422
2423 2423 static void
2424 2424 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2425 2425 {
2426 2426 dsl_dataset_t *ds = arg1;
2427 2427 const char *newsnapname = arg2;
2428 2428 dsl_dir_t *dd = ds->ds_dir;
2429 2429 objset_t *mos = dd->dd_pool->dp_meta_objset;
2430 2430 dsl_dataset_t *hds;
2431 2431 int err;
2432 2432
2433 2433 ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2434 2434
2435 2435 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2436 2436 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2437 2437
2438 2438 VERIFY(0 == dsl_dataset_get_snapname(ds));
2439 2439 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2440 2440 ASSERT0(err);
2441 2441 mutex_enter(&ds->ds_lock);
2442 2442 (void) strcpy(ds->ds_snapname, newsnapname);
2443 2443 mutex_exit(&ds->ds_lock);
2444 2444 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2445 2445 ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2446 2446 ASSERT0(err);
2447 2447
2448 2448 spa_history_log_internal_ds(ds, "rename", tx,
2449 2449 "-> @%s", newsnapname);
2450 2450 dsl_dataset_rele(hds, FTAG);
2451 2451 }
2452 2452
2453 2453 struct renamesnaparg {
2454 2454 dsl_sync_task_group_t *dstg;
2455 2455 char failed[MAXPATHLEN];
2456 2456 char *oldsnap;
2457 2457 char *newsnap;
2458 2458 };
2459 2459
2460 2460 static int
2461 2461 dsl_snapshot_rename_one(const char *name, void *arg)
2462 2462 {
2463 2463 struct renamesnaparg *ra = arg;
2464 2464 dsl_dataset_t *ds = NULL;
2465 2465 char *snapname;
2466 2466 int err;
2467 2467
2468 2468 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2469 2469 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2470 2470
2471 2471 /*
2472 2472 * For recursive snapshot renames the parent won't be changing
2473 2473 	 * so we just pass name for both the to/from arguments.
2474 2474 */
2475 2475 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2476 2476 if (err != 0) {
2477 2477 strfree(snapname);
2478 2478 return (err == ENOENT ? 0 : err);
2479 2479 }
2480 2480
2481 2481 #ifdef _KERNEL
2482 2482 /*
2483 2483 	 * For all filesystems undergoing rename, we'll need to unmount them.
2484 2484 */
2485 2485 (void) zfs_unmount_snap(snapname, NULL);
2486 2486 #endif
2487 2487 err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2488 2488 strfree(snapname);
2489 2489 if (err != 0)
2490 2490 return (err == ENOENT ? 0 : err);
2491 2491
2492 2492 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2493 2493 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2494 2494
2495 2495 return (0);
2496 2496 }
2497 2497
2498 2498 static int
2499 2499 dsl_recursive_rename(char *oldname, const char *newname)
2500 2500 {
2501 2501 int err;
2502 2502 struct renamesnaparg *ra;
2503 2503 dsl_sync_task_t *dst;
2504 2504 spa_t *spa;
2505 2505 char *cp, *fsname = spa_strdup(oldname);
2506 2506 int len = strlen(oldname) + 1;
2507 2507
2508 2508 /* truncate the snapshot name to get the fsname */
2509 2509 cp = strchr(fsname, '@');
2510 2510 *cp = '\0';
2511 2511
2512 2512 err = spa_open(fsname, &spa, FTAG);
2513 2513 if (err) {
2514 2514 kmem_free(fsname, len);
2515 2515 return (err);
2516 2516 }
2517 2517 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2518 2518 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2519 2519
2520 2520 ra->oldsnap = strchr(oldname, '@') + 1;
2521 2521 ra->newsnap = strchr(newname, '@') + 1;
2522 2522 *ra->failed = '\0';
2523 2523
2524 2524 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2525 2525 DS_FIND_CHILDREN);
2526 2526 kmem_free(fsname, len);
2527 2527
2528 2528 if (err == 0) {
2529 2529 err = dsl_sync_task_group_wait(ra->dstg);
2530 2530 }
2531 2531
2532 2532 for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2533 2533 dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2534 2534 dsl_dataset_t *ds = dst->dst_arg1;
2535 2535 if (dst->dst_err) {
2536 2536 dsl_dir_name(ds->ds_dir, ra->failed);
2537 2537 (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2538 2538 (void) strlcat(ra->failed, ra->newsnap,
2539 2539 sizeof (ra->failed));
2540 2540 }
2541 2541 dsl_dataset_rele(ds, ra->dstg);
2542 2542 }
2543 2543
2544 2544 if (err)
2545 2545 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2546 2546
2547 2547 dsl_sync_task_group_destroy(ra->dstg);
2548 2548 kmem_free(ra, sizeof (struct renamesnaparg));
2549 2549 spa_close(spa, FTAG);
2550 2550 return (err);
2551 2551 }
2552 2552
2553 2553 static int
2554 2554 dsl_valid_rename(const char *oldname, void *arg)
2555 2555 {
2556 2556 int delta = *(int *)arg;
2557 2557
2558 2558 if (strlen(oldname) + delta >= MAXNAMELEN)
2559 2559 return (ENAMETOOLONG);
2560 2560
2561 2561 return (0);
2562 2562 }
2563 2563
2564 2564 #pragma weak dmu_objset_rename = dsl_dataset_rename
2565 2565 int
2566 2566 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2567 2567 {
2568 2568 dsl_dir_t *dd;
2569 2569 dsl_dataset_t *ds;
2570 2570 const char *tail;
2571 2571 int err;
2572 2572
2573 2573 err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2574 2574 if (err)
2575 2575 return (err);
2576 2576
2577 2577 if (tail == NULL) {
2578 2578 int delta = strlen(newname) - strlen(oldname);
2579 2579
2580 2580 /* if we're growing, validate child name lengths */
2581 2581 if (delta > 0)
2582 2582 err = dmu_objset_find(oldname, dsl_valid_rename,
2583 2583 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2584 2584
2585 2585 if (err == 0)
2586 2586 err = dsl_dir_rename(dd, newname);
2587 2587 dsl_dir_close(dd, FTAG);
2588 2588 return (err);
2589 2589 }
2590 2590
2591 2591 if (tail[0] != '@') {
2592 2592 /* the name ended in a nonexistent component */
2593 2593 dsl_dir_close(dd, FTAG);
2594 2594 return (ENOENT);
2595 2595 }
2596 2596
2597 2597 dsl_dir_close(dd, FTAG);
2598 2598
2599 2599 /* new name must be snapshot in same filesystem */
2600 2600 tail = strchr(newname, '@');
2601 2601 if (tail == NULL)
2602 2602 return (EINVAL);
2603 2603 tail++;
2604 2604 if (strncmp(oldname, newname, tail - newname) != 0)
2605 2605 return (EXDEV);
2606 2606
2607 2607 if (recursive) {
2608 2608 err = dsl_recursive_rename(oldname, newname);
2609 2609 } else {
2610 2610 err = dsl_dataset_hold(oldname, FTAG, &ds);
2611 2611 if (err)
2612 2612 return (err);
2613 2613
2614 2614 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2615 2615 dsl_dataset_snapshot_rename_check,
2616 2616 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2617 2617
2618 2618 dsl_dataset_rele(ds, FTAG);
2619 2619 }
2620 2620
2621 2621 return (err);
2622 2622 }
2623 2623
2624 2624 struct promotenode {
2625 2625 list_node_t link;
2626 2626 dsl_dataset_t *ds;
2627 2627 };
2628 2628
2629 2629 struct promotearg {
2630 2630 list_t shared_snaps, origin_snaps, clone_snaps;
2631 2631 dsl_dataset_t *origin_origin;
2632 2632 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2633 2633 char *err_ds;
2634 2634 };
2635 2635
2636 2636 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2637 2637 static boolean_t snaplist_unstable(list_t *l);
2638 2638
2639 2639 static int
2640 2640 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
2641 2641 {
2642 2642 dsl_dataset_t *hds = arg1;
2643 2643 struct promotearg *pa = arg2;
2644 2644 struct promotenode *snap = list_head(&pa->shared_snaps);
2645 2645 dsl_dataset_t *origin_ds = snap->ds;
2646 2646 int err;
2647 2647 uint64_t unused;
2648 2648
2649 2649 /* Check that it is a real clone */
2650 2650 if (!dsl_dir_is_clone(hds->ds_dir))
2651 2651 return (EINVAL);
2652 2652
2653 2653 /* Since this is so expensive, don't do the preliminary check */
2654 2654 if (!dmu_tx_is_syncing(tx))
2655 2655 return (0);
2656 2656
2657 2657 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
2658 2658 return (EXDEV);
2659 2659
2660 2660 /* compute origin's new unique space */
2661 2661 snap = list_tail(&pa->clone_snaps);
2662 2662 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2663 2663 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2664 2664 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2665 2665 &pa->unique, &unused, &unused);
2666 2666
2667 2667 /*
2668 2668 * Walk the snapshots that we are moving
2669 2669 *
2670 2670 * Compute space to transfer. Consider the incremental changes
2671 2671 * to used for each snapshot:
2672 2672 * (my used) = (prev's used) + (blocks born) - (blocks killed)
2673 2673 * So each snapshot gave birth to:
2674 2674 * (blocks born) = (my used) - (prev's used) + (blocks killed)
2675 2675 * So a sequence would look like:
2676 2676 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2677 2677 * Which simplifies to:
2678 2678 * uN + kN + kN-1 + ... + k1 + k0
2679 2679 	 * Note, however, that if we stop before we reach the ORIGIN we get:
2680 2680 * uN + kN + kN-1 + ... + kM - uM-1
2681 2681 */
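	/*
	 * Worked example (hypothetical figures): three snapshots with
	 * used u0=10, u1=12, u2=15 and killed k0=2, k1=3, k2=1 give
	 * (15-12+1) + (12-10+3) + (10-0+2) = 21, matching the simplified
	 * form u2 + k2 + k1 + k0 = 15 + 1 + 3 + 2 = 21.
	 */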
2682 2682 pa->used = origin_ds->ds_phys->ds_referenced_bytes;
2683 2683 pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2684 2684 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2685 2685 for (snap = list_head(&pa->shared_snaps); snap;
2686 2686 snap = list_next(&pa->shared_snaps, snap)) {
2687 2687 uint64_t val, dlused, dlcomp, dluncomp;
2688 2688 dsl_dataset_t *ds = snap->ds;
2689 2689
2690 2690 /* Check that the snapshot name does not conflict */
2691 2691 VERIFY(0 == dsl_dataset_get_snapname(ds));
2692 2692 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2693 2693 if (err == 0) {
2694 2694 err = EEXIST;
2695 2695 goto out;
2696 2696 }
2697 2697 if (err != ENOENT)
2698 2698 goto out;
2699 2699
2700 2700 /* The very first snapshot does not have a deadlist */
2701 2701 if (ds->ds_phys->ds_prev_snap_obj == 0)
2702 2702 continue;
2703 2703
2704 2704 dsl_deadlist_space(&ds->ds_deadlist,
2705 2705 &dlused, &dlcomp, &dluncomp);
2706 2706 pa->used += dlused;
2707 2707 pa->comp += dlcomp;
2708 2708 pa->uncomp += dluncomp;
2709 2709 }
2710 2710
2711 2711 /*
2712 2712 * If we are a clone of a clone then we never reached ORIGIN,
2713 2713 * so we need to subtract out the clone origin's used space.
2714 2714 */
2715 2715 if (pa->origin_origin) {
2716 2716 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
2717 2717 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2718 2718 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2719 2719 }
2720 2720
2721 2721 /* Check that there is enough space here */
2722 2722 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2723 2723 pa->used);
2724 2724 if (err)
2725 2725 return (err);
2726 2726
2727 2727 /*
2728 2728 * Compute the amounts of space that will be used by snapshots
2729 2729 * after the promotion (for both origin and clone). For each,
2730 2730 * it is the amount of space that will be on all of their
2731 2731 * deadlists (that was not born before their new origin).
2732 2732 */
2733 2733 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2734 2734 uint64_t space;
2735 2735
2736 2736 /*
2737 2737 * Note, typically this will not be a clone of a clone,
2738 2738 * so dd_origin_txg will be < TXG_INITIAL, so
2739 2739 * these snaplist_space() -> dsl_deadlist_space_range()
2740 2740 * calls will be fast because they do not have to
2741 2741 * iterate over all bps.
2742 2742 */
2743 2743 snap = list_head(&pa->origin_snaps);
2744 2744 err = snaplist_space(&pa->shared_snaps,
2745 2745 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
2746 2746 if (err)
2747 2747 return (err);
2748 2748
2749 2749 err = snaplist_space(&pa->clone_snaps,
2750 2750 snap->ds->ds_dir->dd_origin_txg, &space);
2751 2751 if (err)
2752 2752 return (err);
2753 2753 pa->cloneusedsnap += space;
2754 2754 }
2755 2755 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2756 2756 err = snaplist_space(&pa->origin_snaps,
2757 2757 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
2758 2758 if (err)
2759 2759 return (err);
2760 2760 }
2761 2761
2762 2762 return (0);
2763 2763 out:
2764 2764 pa->err_ds = snap->ds->ds_snapname;
2765 2765 return (err);
2766 2766 }
2767 2767
2768 2768 static void
2769 2769 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2770 2770 {
2771 2771 dsl_dataset_t *hds = arg1;
2772 2772 struct promotearg *pa = arg2;
2773 2773 struct promotenode *snap = list_head(&pa->shared_snaps);
2774 2774 dsl_dataset_t *origin_ds = snap->ds;
2775 2775 dsl_dataset_t *origin_head;
2776 2776 dsl_dir_t *dd = hds->ds_dir;
2777 2777 dsl_pool_t *dp = hds->ds_dir->dd_pool;
2778 2778 dsl_dir_t *odd = NULL;
2779 2779 uint64_t oldnext_obj;
2780 2780 int64_t delta;
2781 2781
2782 2782 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
2783 2783
2784 2784 snap = list_head(&pa->origin_snaps);
2785 2785 origin_head = snap->ds;
2786 2786
2787 2787 /*
2788 2788 * We need to explicitly open odd, since origin_ds's dd will be
2789 2789 * changing.
2790 2790 */
2791 2791 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
2792 2792 NULL, FTAG, &odd));
2793 2793
2794 2794 /* change origin's next snap */
2795 2795 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2796 2796 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
2797 2797 snap = list_tail(&pa->clone_snaps);
2798 2798 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2799 2799 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
2800 2800
2801 2801 /* change the origin's next clone */
2802 2802 if (origin_ds->ds_phys->ds_next_clones_obj) {
2803 2803 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
2804 2804 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2805 2805 origin_ds->ds_phys->ds_next_clones_obj,
2806 2806 oldnext_obj, tx));
2807 2807 }
2808 2808
2809 2809 /* change origin */
2810 2810 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2811 2811 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
2812 2812 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
2813 2813 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2814 2814 dmu_buf_will_dirty(odd->dd_dbuf, tx);
2815 2815 odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
2816 2816 origin_head->ds_dir->dd_origin_txg =
2817 2817 origin_ds->ds_phys->ds_creation_txg;
2818 2818
2819 2819 /* change dd_clone entries */
2820 2820 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2821 2821 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2822 2822 odd->dd_phys->dd_clones, hds->ds_object, tx));
2823 2823 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2824 2824 pa->origin_origin->ds_dir->dd_phys->dd_clones,
2825 2825 hds->ds_object, tx));
2826 2826
2827 2827 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2828 2828 pa->origin_origin->ds_dir->dd_phys->dd_clones,
2829 2829 origin_head->ds_object, tx));
2830 2830 if (dd->dd_phys->dd_clones == 0) {
2831 2831 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
2832 2832 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
2833 2833 }
2834 2834 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2835 2835 dd->dd_phys->dd_clones, origin_head->ds_object, tx));
2836 2836
2837 2837 }
2838 2838
2839 2839 /* move snapshots to this dir */
2840 2840 for (snap = list_head(&pa->shared_snaps); snap;
2841 2841 snap = list_next(&pa->shared_snaps, snap)) {
2842 2842 dsl_dataset_t *ds = snap->ds;
2843 2843
2844 2844 /* unregister props as dsl_dir is changing */
2845 2845 if (ds->ds_objset) {
2846 2846 dmu_objset_evict(ds->ds_objset);
2847 2847 ds->ds_objset = NULL;
2848 2848 }
2849 2849 /* move snap name entry */
2850 2850 VERIFY(0 == dsl_dataset_get_snapname(ds));
2851 2851 VERIFY(0 == dsl_dataset_snap_remove(origin_head,
2852 2852 ds->ds_snapname, tx));
2853 2853 VERIFY(0 == zap_add(dp->dp_meta_objset,
2854 2854 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
2855 2855 8, 1, &ds->ds_object, tx));
2856 2856
2857 2857 /* change containing dsl_dir */
2858 2858 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2859 2859 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
2860 2860 ds->ds_phys->ds_dir_obj = dd->dd_object;
2861 2861 ASSERT3P(ds->ds_dir, ==, odd);
2862 2862 dsl_dir_close(ds->ds_dir, ds);
2863 2863 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
2864 2864 NULL, ds, &ds->ds_dir));
2865 2865
2866 2866 /* move any clone references */
2867 2867 if (ds->ds_phys->ds_next_clones_obj &&
2868 2868 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2869 2869 zap_cursor_t zc;
2870 2870 zap_attribute_t za;
2871 2871
2872 2872 for (zap_cursor_init(&zc, dp->dp_meta_objset,
2873 2873 ds->ds_phys->ds_next_clones_obj);
2874 2874 zap_cursor_retrieve(&zc, &za) == 0;
2875 2875 zap_cursor_advance(&zc)) {
2876 2876 dsl_dataset_t *cnds;
2877 2877 uint64_t o;
2878 2878
2879 2879 if (za.za_first_integer == oldnext_obj) {
2880 2880 /*
2881 2881 * We've already moved the
2882 2882 * origin's reference.
2883 2883 */
2884 2884 continue;
2885 2885 }
2886 2886
2887 2887 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
2888 2888 za.za_first_integer, FTAG, &cnds));
2889 2889 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
2890 2890
2891 2891 VERIFY3U(zap_remove_int(dp->dp_meta_objset,
2892 2892 odd->dd_phys->dd_clones, o, tx), ==, 0);
2893 2893 VERIFY3U(zap_add_int(dp->dp_meta_objset,
2894 2894 dd->dd_phys->dd_clones, o, tx), ==, 0);
2895 2895 dsl_dataset_rele(cnds, FTAG);
2896 2896 }
2897 2897 zap_cursor_fini(&zc);
2898 2898 }
2899 2899
2900 2900 ASSERT0(dsl_prop_numcb(ds));
2901 2901 }
2902 2902
2903 2903 /*
2904 2904 * Change space accounting.
2905 2905 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2906 2906 * both be valid, or both be 0 (resulting in delta == 0). This
2907 2907 * is true for each of {clone,origin} independently.
2908 2908 */
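	/*
	 * Illustrative example (hypothetical figures, assuming the clone's
	 * DD_USED_SNAP breakdown starts at 0): if pa->used = 100M of which
	 * pa->cloneusedsnap = 30M will sit on the moved snapshots, the
	 * clone's dir gains 30M of DD_USED_SNAP plus 70M of DD_USED_HEAD,
	 * while the origin's dir gives up the same 100M in total below.
	 */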
2909 2909
2910 2910 delta = pa->cloneusedsnap -
2911 2911 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2912 2912 ASSERT3S(delta, >=, 0);
2913 2913 ASSERT3U(pa->used, >=, delta);
2914 2914 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2915 2915 dsl_dir_diduse_space(dd, DD_USED_HEAD,
2916 2916 pa->used - delta, pa->comp, pa->uncomp, tx);
2917 2917
2918 2918 delta = pa->originusedsnap -
2919 2919 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2920 2920 ASSERT3S(delta, <=, 0);
2921 2921 ASSERT3U(pa->used, >=, -delta);
2922 2922 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2923 2923 dsl_dir_diduse_space(odd, DD_USED_HEAD,
2924 2924 -pa->used - delta, -pa->comp, -pa->uncomp, tx);
2925 2925
2926 2926 origin_ds->ds_phys->ds_unique_bytes = pa->unique;
2927 2927
2928 2928 /* log history record */
2929 2929 spa_history_log_internal_ds(hds, "promote", tx, "");
2930 2930
2931 2931 dsl_dir_close(odd, FTAG);
2932 2932 }
2933 2933
2934 2934 static char *snaplist_tag = "snaplist";
2935 2935 /*
2936 2936 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2937 2937 * (exclusive) and last_obj (inclusive). The list will be in reverse
2938 2938 * order (last_obj will be the list_head()). If first_obj == 0, do all
2939 2939 * snapshots back to this dataset's origin.
2940 2940 */
2941 2941 static int
2942 2942 snaplist_make(dsl_pool_t *dp, boolean_t own,
2943 2943 uint64_t first_obj, uint64_t last_obj, list_t *l)
2944 2944 {
2945 2945 uint64_t obj = last_obj;
2946 2946
2947 2947 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2948 2948
2949 2949 list_create(l, sizeof (struct promotenode),
2950 2950 offsetof(struct promotenode, link));
2951 2951
2952 2952 while (obj != first_obj) {
2953 2953 dsl_dataset_t *ds;
2954 2954 struct promotenode *snap;
2955 2955 int err;
2956 2956
2957 2957 if (own) {
2958 2958 err = dsl_dataset_own_obj(dp, obj,
2959 2959 0, snaplist_tag, &ds);
2960 2960 if (err == 0)
2961 2961 dsl_dataset_make_exclusive(ds, snaplist_tag);
2962 2962 } else {
2963 2963 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2964 2964 }
2965 2965 if (err == ENOENT) {
2966 2966 /* lost race with snapshot destroy */
2967 2967 struct promotenode *last = list_tail(l);
2968 2968 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2969 2969 obj = last->ds->ds_phys->ds_prev_snap_obj;
2970 2970 continue;
2971 2971 } else if (err) {
2972 2972 return (err);
2973 2973 }
2974 2974
2975 2975 if (first_obj == 0)
2976 2976 first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2977 2977
2978 2978 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2979 2979 snap->ds = ds;
2980 2980 list_insert_tail(l, snap);
2981 2981 obj = ds->ds_phys->ds_prev_snap_obj;
2982 2982 }
2983 2983
2984 2984 return (0);
2985 2985 }
2986 2986
2987 2987 static int
2988 2988 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2989 2989 {
2990 2990 struct promotenode *snap;
2991 2991
2992 2992 *spacep = 0;
2993 2993 for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2994 2994 uint64_t used, comp, uncomp;
2995 2995 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2996 2996 mintxg, UINT64_MAX, &used, &comp, &uncomp);
2997 2997 *spacep += used;
2998 2998 }
2999 2999 return (0);
3000 3000 }
3001 3001
3002 3002 static void
3003 3003 snaplist_destroy(list_t *l, boolean_t own)
3004 3004 {
3005 3005 struct promotenode *snap;
3006 3006
3007 3007 if (!l || !list_link_active(&l->list_head))
3008 3008 return;
3009 3009
3010 3010 while ((snap = list_tail(l)) != NULL) {
3011 3011 list_remove(l, snap);
3012 3012 if (own)
3013 3013 dsl_dataset_disown(snap->ds, snaplist_tag);
3014 3014 else
3015 3015 dsl_dataset_rele(snap->ds, snaplist_tag);
3016 3016 kmem_free(snap, sizeof (struct promotenode));
3017 3017 }
3018 3018 list_destroy(l);
3019 3019 }
3020 3020
3021 3021 /*
3022 3022 * Promote a clone. Nomenclature note:
3023 3023 * "clone" or "cds": the original clone which is being promoted
3024 3024 	 * "origin" or "ods": the snapshot which is originally the clone's origin
3025 3025 * "origin head" or "ohds": the dataset which is the head
3026 3026 * (filesystem/volume) for the origin
3027 3027 * "origin origin": the origin of the origin's filesystem (typically
3028 3028 * NULL, indicating that the clone is not a clone of a clone).
3029 3029 */
3030 3030 int
3031 3031 dsl_dataset_promote(const char *name, char *conflsnap)
3032 3032 {
3033 3033 dsl_dataset_t *ds;
3034 3034 dsl_dir_t *dd;
3035 3035 dsl_pool_t *dp;
3036 3036 dmu_object_info_t doi;
3037 3037 struct promotearg pa = { 0 };
3038 3038 struct promotenode *snap;
3039 3039 int err;
3040 3040
3041 3041 err = dsl_dataset_hold(name, FTAG, &ds);
3042 3042 if (err)
3043 3043 return (err);
3044 3044 dd = ds->ds_dir;
3045 3045 dp = dd->dd_pool;
3046 3046
3047 3047 err = dmu_object_info(dp->dp_meta_objset,
3048 3048 ds->ds_phys->ds_snapnames_zapobj, &doi);
3049 3049 if (err) {
3050 3050 dsl_dataset_rele(ds, FTAG);
3051 3051 return (err);
3052 3052 }
3053 3053
3054 3054 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3055 3055 dsl_dataset_rele(ds, FTAG);
3056 3056 return (EINVAL);
3057 3057 }
3058 3058
3059 3059 /*
3060 3060 * We are going to inherit all the snapshots taken before our
3061 3061 * origin (i.e., our new origin will be our parent's origin).
3062 3062 * Take ownership of them so that we can rename them into our
3063 3063 * namespace.
3064 3064 */
3065 3065 rw_enter(&dp->dp_config_rwlock, RW_READER);
3066 3066
3067 3067 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3068 3068 &pa.shared_snaps);
3069 3069 if (err != 0)
3070 3070 goto out;
3071 3071
3072 3072 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3073 3073 if (err != 0)
3074 3074 goto out;
3075 3075
3076 3076 snap = list_head(&pa.shared_snaps);
3077 3077 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3078 3078 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3079 3079 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3080 3080 if (err != 0)
3081 3081 goto out;
3082 3082
3083 3083 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3084 3084 err = dsl_dataset_hold_obj(dp,
3085 3085 snap->ds->ds_dir->dd_phys->dd_origin_obj,
3086 3086 FTAG, &pa.origin_origin);
3087 3087 if (err != 0)
3088 3088 goto out;
3089 3089 }
3090 3090
3091 3091 out:
3092 3092 rw_exit(&dp->dp_config_rwlock);
3093 3093
3094 3094 /*
3095 3095 * Add in 128x the snapnames zapobj size, since we will be moving
3096 3096 * a bunch of snapnames to the promoted ds, and dirtying their
3097 3097 * bonus buffers.
3098 3098 */
3099 3099 if (err == 0) {
3100 3100 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3101 3101 dsl_dataset_promote_sync, ds, &pa,
3102 3102 2 + 2 * doi.doi_physical_blocks_512);
3103 3103 if (err && pa.err_ds && conflsnap)
3104 3104 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
3105 3105 }
3106 3106
3107 3107 snaplist_destroy(&pa.shared_snaps, B_TRUE);
3108 3108 snaplist_destroy(&pa.clone_snaps, B_FALSE);
3109 3109 snaplist_destroy(&pa.origin_snaps, B_FALSE);
3110 3110 if (pa.origin_origin)
3111 3111 dsl_dataset_rele(pa.origin_origin, FTAG);
3112 3112 dsl_dataset_rele(ds, FTAG);
3113 3113 return (err);
3114 3114 }
3115 3115
3116 3116 struct cloneswaparg {
3117 3117 dsl_dataset_t *cds; /* clone dataset */
3118 3118 dsl_dataset_t *ohds; /* origin's head dataset */
3119 3119 boolean_t force;
3120 3120 int64_t unused_refres_delta; /* change in unconsumed refreservation */
3121 3121 };
3122 3122
3123 3123 /* ARGSUSED */
3124 3124 static int
3125 3125 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3126 3126 {
3127 3127 struct cloneswaparg *csa = arg1;
3128 3128
3129 3129 /* they should both be heads */
3130 3130 if (dsl_dataset_is_snapshot(csa->cds) ||
3131 3131 dsl_dataset_is_snapshot(csa->ohds))
3132 3132 return (EINVAL);
3133 3133
3134 3134 /* the branch point should be just before them */
3135 3135 if (csa->cds->ds_prev != csa->ohds->ds_prev)
3136 3136 return (EINVAL);
3137 3137
3138 3138 /* cds should be the clone (unless they are unrelated) */
3139 3139 if (csa->cds->ds_prev != NULL &&
3140 3140 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3141 3141 csa->ohds->ds_object !=
3142 3142 csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3143 3143 return (EINVAL);
3144 3144
3145 3145 /* the clone should be a child of the origin */
3146 3146 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3147 3147 return (EINVAL);
3148 3148
3149 3149 /* ohds shouldn't be modified unless 'force' */
3150 3150 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3151 3151 return (ETXTBSY);
3152 3152
3153 3153 /* adjust amount of any unconsumed refreservation */
3154 3154 csa->unused_refres_delta =
3155 3155 (int64_t)MIN(csa->ohds->ds_reserved,
3156 3156 csa->ohds->ds_phys->ds_unique_bytes) -
3157 3157 (int64_t)MIN(csa->ohds->ds_reserved,
3158 3158 csa->cds->ds_phys->ds_unique_bytes);
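	/*
	 * Illustrative example (hypothetical figures): with an ohds
	 * refreservation of 10G, ohds unique = 2G and cds unique = 6G,
	 * the delta is MIN(10G, 2G) - MIN(10G, 6G) = -4G, i.e. the swap
	 * frees 4G of refreservation charge; only a positive delta needs
	 * the ENOSPC check below.
	 */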
3159 3159
3160 3160 if (csa->unused_refres_delta > 0 &&
3161 3161 csa->unused_refres_delta >
3162 3162 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3163 3163 return (ENOSPC);
3164 3164
3165 3165 if (csa->ohds->ds_quota != 0 &&
3166 3166 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3167 3167 return (EDQUOT);
3168 3168
3169 3169 return (0);
3170 3170 }
3171 3171
3172 3172 /* ARGSUSED */
3173 3173 static void
3174 3174 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3175 3175 {
3176 3176 struct cloneswaparg *csa = arg1;
3177 3177 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
3178 3178
3179 3179 ASSERT(csa->cds->ds_reserved == 0);
3180 3180 ASSERT(csa->ohds->ds_quota == 0 ||
3181 3181 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
3182 3182
3183 3183 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
3184 3184 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
3185 3185
3186 3186 if (csa->cds->ds_objset != NULL) {
3187 3187 dmu_objset_evict(csa->cds->ds_objset);
3188 3188 csa->cds->ds_objset = NULL;
3189 3189 }
3190 3190
3191 3191 if (csa->ohds->ds_objset != NULL) {
3192 3192 dmu_objset_evict(csa->ohds->ds_objset);
3193 3193 csa->ohds->ds_objset = NULL;
3194 3194 }
3195 3195
3196 3196 /*
3197 3197 * Reset origin's unique bytes, if it exists.
3198 3198 */
3199 3199 if (csa->cds->ds_prev) {
3200 3200 dsl_dataset_t *origin = csa->cds->ds_prev;
3201 3201 uint64_t comp, uncomp;
3202 3202
3203 3203 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3204 3204 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3205 3205 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3206 3206 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
3207 3207 }
3208 3208
3209 3209 /* swap blkptrs */
3210 3210 {
3211 3211 blkptr_t tmp;
3212 3212 tmp = csa->ohds->ds_phys->ds_bp;
3213 3213 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3214 3214 csa->cds->ds_phys->ds_bp = tmp;
3215 3215 }
3216 3216
3217 3217 /* set dd_*_bytes */
3218 3218 {
3219 3219 int64_t dused, dcomp, duncomp;
3220 3220 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3221 3221 uint64_t odl_used, odl_comp, odl_uncomp;
3222 3222
3223 3223 ASSERT3U(csa->cds->ds_dir->dd_phys->
3224 3224 dd_used_breakdown[DD_USED_SNAP], ==, 0);
3225 3225
3226 3226 dsl_deadlist_space(&csa->cds->ds_deadlist,
3227 3227 &cdl_used, &cdl_comp, &cdl_uncomp);
3228 3228 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3229 3229 &odl_used, &odl_comp, &odl_uncomp);
3230 3230
3231 3231 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
3232 3232 (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
3233 3233 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3234 3234 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3235 3235 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3236 3236 cdl_uncomp -
3237 3237 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3238 3238
3239 3239 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3240 3240 dused, dcomp, duncomp, tx);
3241 3241 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3242 3242 -dused, -dcomp, -duncomp, tx);
3243 3243
3244 3244 /*
3245 3245 * The difference in the space used by snapshots is the
3246 3246 * difference in snapshot space due to the head's
3247 3247 * deadlist (since that's the only thing that's
3248 3248 * changing that affects the snapused).
3249 3249 */
3250 3250 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3251 3251 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3252 3252 &cdl_used, &cdl_comp, &cdl_uncomp);
3253 3253 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3254 3254 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3255 3255 &odl_used, &odl_comp, &odl_uncomp);
3256 3256 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3257 3257 DD_USED_HEAD, DD_USED_SNAP, tx);
3258 3258 }
3259 3259
3260 3260 /* swap ds_*_bytes */
3261 3261 SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
3262 3262 csa->cds->ds_phys->ds_referenced_bytes);
3263 3263 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3264 3264 csa->cds->ds_phys->ds_compressed_bytes);
3265 3265 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3266 3266 csa->cds->ds_phys->ds_uncompressed_bytes);
3267 3267 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3268 3268 csa->cds->ds_phys->ds_unique_bytes);
3269 3269
3270 3270 /* apply any parent delta for change in unconsumed refreservation */
3271 3271 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3272 3272 csa->unused_refres_delta, 0, 0, tx);
3273 3273
3274 3274 /*
3275 3275 * Swap deadlists.
3276 3276 */
3277 3277 dsl_deadlist_close(&csa->cds->ds_deadlist);
3278 3278 dsl_deadlist_close(&csa->ohds->ds_deadlist);
3279 3279 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3280 3280 csa->cds->ds_phys->ds_deadlist_obj);
3281 3281 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3282 3282 csa->cds->ds_phys->ds_deadlist_obj);
3283 3283 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
3284 3284 csa->ohds->ds_phys->ds_deadlist_obj);
3285 3285
3286 3286 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
3287 3287
3288 3288 spa_history_log_internal_ds(csa->cds, "clone swap", tx,
3289 3289 "parent=%s", csa->ohds->ds_dir->dd_myname);
3290 3290 }
3291 3291
3292 3292 /*
3293 3293 * Swap 'clone' with its origin head datasets. Used at the end of "zfs
3294 3294 * recv" into an existing fs to swizzle the file system to the new
3295 3295 * version, and by "zfs rollback". Can also be used to swap two
3296 3296 * independent head datasets if neither has any snapshots.
3297 3297 */
3298 3298 int
3299 3299 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3300 3300 boolean_t force)
3301 3301 {
3302 3302 struct cloneswaparg csa;
3303 3303 int error;
3304 3304
3305 3305 ASSERT(clone->ds_owner);
3306 3306 ASSERT(origin_head->ds_owner);
3307 3307 retry:
3308 3308 /*
3309 3309 * Need exclusive access for the swap. If we're swapping these
3310 3310 * datasets back after an error, we already hold the locks.
3311 3311 */
3312 3312 if (!RW_WRITE_HELD(&clone->ds_rwlock))
3313 3313 rw_enter(&clone->ds_rwlock, RW_WRITER);
3314 3314 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3315 3315 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3316 3316 rw_exit(&clone->ds_rwlock);
3317 3317 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3318 3318 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3319 3319 rw_exit(&origin_head->ds_rwlock);
3320 3320 goto retry;
3321 3321 }
3322 3322 }
3323 3323 csa.cds = clone;
3324 3324 csa.ohds = origin_head;
3325 3325 csa.force = force;
3326 3326 error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3327 3327 dsl_dataset_clone_swap_check,
3328 3328 dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3329 3329 return (error);
3330 3330 }
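A minimal usage sketch may help here; it is not part of this change, and the dataset names and error handling are purely illustrative. A rollback-style caller owns both heads and lets the sync task swizzle them:

dsl_dataset_t *clone, *head;
int error;

/* own both datasets; dsl_dataset_clone_swap() asserts ownership */
VERIFY(0 == dsl_dataset_own("tank/fs/%tmpclone", B_TRUE, FTAG, &clone));
VERIFY(0 == dsl_dataset_own("tank/fs", B_TRUE, FTAG, &head));

error = dsl_dataset_clone_swap(clone, head, B_TRUE /* force */);

dsl_dataset_disown(head, FTAG);
dsl_dataset_disown(clone, FTAG);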
3331 3331
3332 3332 /*
3333 3333 * Given a pool name and a dataset object number in that pool,
3334 3334 * return the name of that dataset.
3335 3335 */
3336 3336 int
3337 3337 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3338 3338 {
3339 3339 spa_t *spa;
3340 3340 dsl_pool_t *dp;
3341 3341 dsl_dataset_t *ds;
3342 3342 int error;
3343 3343
3344 3344 if ((error = spa_open(pname, &spa, FTAG)) != 0)
3345 3345 return (error);
3346 3346 dp = spa_get_dsl(spa);
3347 3347 rw_enter(&dp->dp_config_rwlock, RW_READER);
3348 3348 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3349 3349 dsl_dataset_name(ds, buf);
3350 3350 dsl_dataset_rele(ds, FTAG);
3351 3351 }
3352 3352 rw_exit(&dp->dp_config_rwlock);
3353 3353 spa_close(spa, FTAG);
3354 3354
3355 3355 return (error);
3356 3356 }
3357 3357
3358 3358 int
3359 3359 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3360 3360 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3361 3361 {
3362 3362 int error = 0;
3363 3363
3364 3364 ASSERT3S(asize, >, 0);
3365 3365
3366 3366 /*
3367 3367 * *ref_rsrv is the portion of asize that will come from any
3368 3368 * unconsumed refreservation space.
3369 3369 */
3370 3370 *ref_rsrv = 0;
3371 3371
3372 3372 mutex_enter(&ds->ds_lock);
3373 3373 /*
3374 3374 * Make a space adjustment for reserved bytes.
3375 3375 */
3376 3376 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3377 3377 ASSERT3U(*used, >=,
3378 3378 ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3379 3379 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3380 3380 *ref_rsrv =
3381 3381 asize - MIN(asize, parent_delta(ds, asize + inflight));
3382 3382 }
3383 3383
3384 3384 if (!check_quota || ds->ds_quota == 0) {
3385 3385 mutex_exit(&ds->ds_lock);
3386 3386 return (0);
3387 3387 }
3388 3388 /*
3389 3389 * If they are requesting more space, and our current estimate
3390 3390 * is over quota, they get to try again unless the actual
3391 3391 * on-disk is over quota and there are no pending changes (which
3392 3392 * may free up space for us).
3393 3393 */
3394 3394 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3395 3395 if (inflight > 0 ||
3396 3396 ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3397 3397 error = ERESTART;
3398 3398 else
3399 3399 error = EDQUOT;
3400 3400 }
3401 3401 mutex_exit(&ds->ds_lock);
3402 3402
3403 3403 return (error);
3404 3404 }
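A worked example of the quota decision above (numbers are hypothetical):

/*
 *   ds_quota            = 10G
 *   ds_referenced_bytes =  9G   (on disk)
 *   inflight            =  2G   (dirty data not yet synced)
 *
 *   9G + 2G >= 10G, but inflight > 0 and 9G < 10G, so the caller
 *   gets ERESTART and retries once pending frees have synced.
 *   EDQUOT is returned only when the on-disk referenced bytes have
 *   already reached the quota and nothing is in flight that could
 *   free space.
 */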
3405 3405
3406 3406 /* ARGSUSED */
3407 3407 static int
3408 3408 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3409 3409 {
3410 3410 dsl_dataset_t *ds = arg1;
3411 3411 dsl_prop_setarg_t *psa = arg2;
3412 3412 int err;
3413 3413
3414 3414 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3415 3415 return (ENOTSUP);
3416 3416
3417 3417 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3418 3418 return (err);
3419 3419
3420 3420 if (psa->psa_effective_value == 0)
3421 3421 return (0);
3422 3422
3423 3423 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3424 3424 psa->psa_effective_value < ds->ds_reserved)
3425 3425 return (ENOSPC);
3426 3426
3427 3427 return (0);
3428 3428 }
3429 3429
3430 3430 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3431 3431
3432 3432 void
3433 3433 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3434 3434 {
3435 3435 dsl_dataset_t *ds = arg1;
3436 3436 dsl_prop_setarg_t *psa = arg2;
3437 3437 uint64_t effective_value = psa->psa_effective_value;
3438 3438
3439 3439 dsl_prop_set_sync(ds, psa, tx);
3440 3440 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3441 3441
3442 3442 if (ds->ds_quota != effective_value) {
3443 3443 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3444 3444 ds->ds_quota = effective_value;
3445 3445 }
3446 3446 }
3447 3447
3448 3448 int
3449 3449 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3450 3450 {
3451 3451 dsl_dataset_t *ds;
3452 3452 dsl_prop_setarg_t psa;
3453 3453 int err;
3454 3454
3455 3455 	dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3456 3456
3457 3457 err = dsl_dataset_hold(dsname, FTAG, &ds);
3458 3458 if (err)
3459 3459 return (err);
3460 3460
3461 3461 /*
3462 3462 * If someone removes a file, then tries to set the quota, we
3463 3463 * want to make sure the file freeing takes effect.
3464 3464 */
3465 3465 txg_wait_open(ds->ds_dir->dd_pool, 0);
3466 3466
3467 3467 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3468 3468 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3469 3469 ds, &psa, 0);
3470 3470
3471 3471 dsl_dataset_rele(ds, FTAG);
3472 3472 return (err);
3473 3473 }
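An illustrative caller (dataset name and value are hypothetical, not taken from this change):

/* set refquota=10G as a local property value on tank/home */
int error = dsl_dataset_set_quota("tank/home", ZPROP_SRC_LOCAL,
    10ULL << 30);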
3474 3474
3475 3475 static int
3476 3476 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3477 3477 {
3478 3478 dsl_dataset_t *ds = arg1;
3479 3479 dsl_prop_setarg_t *psa = arg2;
3480 3480 uint64_t effective_value;
3481 3481 uint64_t unique;
3482 3482 int err;
3483 3483
3484 3484 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3485 3485 SPA_VERSION_REFRESERVATION)
3486 3486 return (ENOTSUP);
3487 3487
3488 3488 if (dsl_dataset_is_snapshot(ds))
3489 3489 return (EINVAL);
3490 3490
3491 3491 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3492 3492 return (err);
3493 3493
3494 3494 effective_value = psa->psa_effective_value;
3495 3495
3496 3496 /*
3497 3497 * If we are doing the preliminary check in open context, the
3498 3498 * space estimates may be inaccurate.
3499 3499 */
3500 3500 if (!dmu_tx_is_syncing(tx))
3501 3501 return (0);
3502 3502
3503 3503 mutex_enter(&ds->ds_lock);
3504 3504 if (!DS_UNIQUE_IS_ACCURATE(ds))
3505 3505 dsl_dataset_recalc_head_uniq(ds);
3506 3506 unique = ds->ds_phys->ds_unique_bytes;
3507 3507 mutex_exit(&ds->ds_lock);
3508 3508
3509 3509 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3510 3510 uint64_t delta = MAX(unique, effective_value) -
3511 3511 MAX(unique, ds->ds_reserved);
3512 3512
3513 3513 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3514 3514 return (ENOSPC);
3515 3515 if (ds->ds_quota > 0 &&
3516 3516 effective_value > ds->ds_quota)
3517 3517 return (ENOSPC);
3518 3518 }
3519 3519
3520 3520 return (0);
3521 3521 }
3522 3522
3523 3523 static void
3524 3524 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3525 3525 {
3526 3526 dsl_dataset_t *ds = arg1;
3527 3527 dsl_prop_setarg_t *psa = arg2;
3528 3528 uint64_t effective_value = psa->psa_effective_value;
3529 3529 uint64_t unique;
3530 3530 int64_t delta;
3531 3531
3532 3532 dsl_prop_set_sync(ds, psa, tx);
3533 3533 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3534 3534
3535 3535 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3536 3536
3537 3537 mutex_enter(&ds->ds_dir->dd_lock);
3538 3538 mutex_enter(&ds->ds_lock);
3539 3539 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3540 3540 unique = ds->ds_phys->ds_unique_bytes;
3541 3541 delta = MAX(0, (int64_t)(effective_value - unique)) -
3542 3542 MAX(0, (int64_t)(ds->ds_reserved - unique));
3543 3543 ds->ds_reserved = effective_value;
3544 3544 mutex_exit(&ds->ds_lock);
3545 3545
3546 3546 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3547 3547 mutex_exit(&ds->ds_dir->dd_lock);
3548 3548 }
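A worked example of the delta computation above (hypothetical numbers):

/*
 *   unique bytes        = 2G
 *   old refreservation  = 5G
 *   new effective_value = 8G
 *
 *   delta = MAX(0, 8G - 2G) - MAX(0, 5G - 2G) = 6G - 3G = +3G
 *
 * 3G more is charged to DD_USED_REFRSRV, because 2G of either
 * reservation is already covered by the dataset's unique bytes.
 */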
3549 3549
3550 3550 int
3551 3551 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3552 3552 uint64_t reservation)
3553 3553 {
3554 3554 dsl_dataset_t *ds;
3555 3555 dsl_prop_setarg_t psa;
3556 3556 int err;
3557 3557
3558 3558 dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3559 3559 &reservation);
3560 3560
3561 3561 err = dsl_dataset_hold(dsname, FTAG, &ds);
3562 3562 if (err)
3563 3563 return (err);
3564 3564
3565 3565 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3566 3566 dsl_dataset_set_reservation_check,
3567 3567 dsl_dataset_set_reservation_sync, ds, &psa, 0);
3568 3568
3569 3569 dsl_dataset_rele(ds, FTAG);
3570 3570 return (err);
3571 3571 }
3572 3572
3573 3573 typedef struct zfs_hold_cleanup_arg {
3574 3574 dsl_pool_t *dp;
3575 3575 uint64_t dsobj;
3576 3576 char htag[MAXNAMELEN];
3577 3577 } zfs_hold_cleanup_arg_t;
3578 3578
3579 3579 static void
3580 3580 dsl_dataset_user_release_onexit(void *arg)
3581 3581 {
3582 3582 zfs_hold_cleanup_arg_t *ca = arg;
3583 3583
3584 3584 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3585 3585 B_TRUE);
3586 3586 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3587 3587 }
3588 3588
3589 3589 void
3590 3590 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3591 3591 minor_t minor)
3592 3592 {
3593 3593 zfs_hold_cleanup_arg_t *ca;
3594 3594
3595 3595 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3596 3596 ca->dp = ds->ds_dir->dd_pool;
3597 3597 ca->dsobj = ds->ds_object;
3598 3598 (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3599 3599 VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3600 3600 dsl_dataset_user_release_onexit, ca, NULL));
3601 3601 }
3602 3602
3603 3603 /*
3604 3604 * If you add new checks here, you may need to add
3605 3605 * additional checks to the "temporary" case in
3606 3606 * snapshot_check() in dmu_objset.c.
3607 3607 */
3608 3608 static int
3609 3609 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3610 3610 {
3611 3611 dsl_dataset_t *ds = arg1;
3612 3612 struct dsl_ds_holdarg *ha = arg2;
3613 3613 const char *htag = ha->htag;
3614 3614 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3615 3615 int error = 0;
3616 3616
3617 3617 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3618 3618 return (ENOTSUP);
3619 3619
3620 3620 if (!dsl_dataset_is_snapshot(ds))
3621 3621 return (EINVAL);
3622 3622
3623 3623 /* tags must be unique */
3624 3624 mutex_enter(&ds->ds_lock);
3625 3625 if (ds->ds_phys->ds_userrefs_obj) {
3626 3626 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3627 3627 8, 1, tx);
3628 3628 if (error == 0)
3629 3629 error = EEXIST;
3630 3630 else if (error == ENOENT)
3631 3631 error = 0;
3632 3632 }
3633 3633 mutex_exit(&ds->ds_lock);
3634 3634
3635 3635 if (error == 0 && ha->temphold &&
3636 3636 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3637 3637 error = E2BIG;
3638 3638
3639 3639 return (error);
3640 3640 }
3641 3641
3642 3642 void
3643 3643 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3644 3644 {
3645 3645 dsl_dataset_t *ds = arg1;
3646 3646 struct dsl_ds_holdarg *ha = arg2;
3647 3647 const char *htag = ha->htag;
3648 3648 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3649 3649 objset_t *mos = dp->dp_meta_objset;
3650 3650 uint64_t now = gethrestime_sec();
3651 3651 uint64_t zapobj;
3652 3652
3653 3653 mutex_enter(&ds->ds_lock);
3654 3654 if (ds->ds_phys->ds_userrefs_obj == 0) {
3655 3655 /*
3656 3656 * This is the first user hold for this dataset. Create
3657 3657 * the userrefs zap object.
3658 3658 */
3659 3659 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3660 3660 zapobj = ds->ds_phys->ds_userrefs_obj =
3661 3661 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3662 3662 } else {
3663 3663 zapobj = ds->ds_phys->ds_userrefs_obj;
3664 3664 }
3665 3665 ds->ds_userrefs++;
3666 3666 mutex_exit(&ds->ds_lock);
3667 3667
3668 3668 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3669 3669
3670 3670 if (ha->temphold) {
3671 3671 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3672 3672 htag, &now, tx));
3673 3673 }
3674 3674
3675 3675 spa_history_log_internal_ds(ds, "hold", tx,
3676 3676 "tag = %s temp = %d holds now = %llu",
3677 3677 htag, (int)ha->temphold, ds->ds_userrefs);
3678 3678 }
3679 3679
3680 3680 static int
3681 3681 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3682 3682 {
3683 3683 struct dsl_ds_holdarg *ha = arg;
3684 3684 dsl_dataset_t *ds;
3685 3685 int error;
3686 3686 char *name;
3687 3687
3688 3688 /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3689 3689 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3690 3690 error = dsl_dataset_hold(name, ha->dstg, &ds);
3691 3691 strfree(name);
3692 3692 if (error == 0) {
3693 3693 ha->gotone = B_TRUE;
3694 3694 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3695 3695 dsl_dataset_user_hold_sync, ds, ha, 0);
3696 3696 } else if (error == ENOENT && ha->recursive) {
3697 3697 error = 0;
3698 3698 } else {
3699 3699 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3700 3700 }
3701 3701 return (error);
3702 3702 }
3703 3703
3704 3704 int
3705 3705 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3706 3706 boolean_t temphold)
3707 3707 {
3708 3708 struct dsl_ds_holdarg *ha;
3709 3709 int error;
3710 3710
3711 3711 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3712 3712 ha->htag = htag;
3713 3713 ha->temphold = temphold;
3714 3714 error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3715 3715 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3716 3716 ds, ha, 0);
3717 3717 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3718 3718
3719 3719 return (error);
3720 3720 }
3721 3721
3722 3722 int
3723 3723 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3724 3724 boolean_t recursive, boolean_t temphold, int cleanup_fd)
3725 3725 {
3726 3726 struct dsl_ds_holdarg *ha;
3727 3727 dsl_sync_task_t *dst;
3728 3728 spa_t *spa;
3729 3729 int error;
3730 3730 minor_t minor = 0;
3731 3731
3732 3732 if (cleanup_fd != -1) {
3733 3733 /* Currently we only support cleanup-on-exit of tempholds. */
3734 3734 if (!temphold)
3735 3735 return (EINVAL);
3736 3736 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3737 3737 if (error)
3738 3738 return (error);
3739 3739 }
3740 3740
3741 3741 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3742 3742
3743 3743 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3744 3744
3745 3745 error = spa_open(dsname, &spa, FTAG);
3746 3746 if (error) {
3747 3747 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3748 3748 if (cleanup_fd != -1)
3749 3749 zfs_onexit_fd_rele(cleanup_fd);
3750 3750 return (error);
3751 3751 }
3752 3752
3753 3753 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3754 3754 ha->htag = htag;
3755 3755 ha->snapname = snapname;
3756 3756 ha->recursive = recursive;
3757 3757 ha->temphold = temphold;
3758 3758
3759 3759 if (recursive) {
3760 3760 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3761 3761 ha, DS_FIND_CHILDREN);
3762 3762 } else {
3763 3763 error = dsl_dataset_user_hold_one(dsname, ha);
3764 3764 }
3765 3765 if (error == 0)
3766 3766 error = dsl_sync_task_group_wait(ha->dstg);
3767 3767
3768 3768 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3769 3769 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3770 3770 dsl_dataset_t *ds = dst->dst_arg1;
3771 3771
3772 3772 if (dst->dst_err) {
3773 3773 dsl_dataset_name(ds, ha->failed);
3774 3774 *strchr(ha->failed, '@') = '\0';
3775 3775 } else if (error == 0 && minor != 0 && temphold) {
3776 3776 /*
3777 3777 * If this hold is to be released upon process exit,
3778 3778 * register that action now.
3779 3779 */
3780 3780 dsl_register_onexit_hold_cleanup(ds, htag, minor);
3781 3781 }
3782 3782 dsl_dataset_rele(ds, ha->dstg);
3783 3783 }
3784 3784
3785 3785 if (error == 0 && recursive && !ha->gotone)
3786 3786 error = ENOENT;
3787 3787
3788 3788 if (error)
3789 3789 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3790 3790
3791 3791 dsl_sync_task_group_destroy(ha->dstg);
3792 3792
3793 3793 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3794 3794 spa_close(spa, FTAG);
3795 3795 if (cleanup_fd != -1)
3796 3796 zfs_onexit_fd_rele(cleanup_fd);
3797 3797 return (error);
3798 3798 }
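An illustrative call (dataset name, snapshot name, tag and cleanup_fd are hypothetical): place a temporary, recursive hold on every descendant's @monday snapshot; the hold is released automatically when cleanup_fd is closed.

char dsname[MAXNAMELEN] = "tank/home";	/* may be overwritten on error */
int error;

error = dsl_dataset_user_hold(dsname, "monday", ".send-1234",
    B_TRUE /* recursive */, B_TRUE /* temphold */, cleanup_fd);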
3799 3799
3800 3800 struct dsl_ds_releasearg {
3801 3801 dsl_dataset_t *ds;
3802 3802 const char *htag;
3803 3803 boolean_t own; /* do we own or just hold ds? */
3804 3804 };
3805 3805
3806 3806 static int
3807 3807 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3808 3808 boolean_t *might_destroy)
3809 3809 {
3810 3810 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3811 3811 uint64_t zapobj;
3812 3812 uint64_t tmp;
3813 3813 int error;
3814 3814
3815 3815 *might_destroy = B_FALSE;
3816 3816
3817 3817 mutex_enter(&ds->ds_lock);
3818 3818 zapobj = ds->ds_phys->ds_userrefs_obj;
3819 3819 if (zapobj == 0) {
3820 3820 /* The tag can't possibly exist */
3821 3821 mutex_exit(&ds->ds_lock);
3822 3822 return (ESRCH);
3823 3823 }
3824 3824
3825 3825 /* Make sure the tag exists */
3826 3826 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3827 3827 if (error) {
3828 3828 mutex_exit(&ds->ds_lock);
3829 3829 if (error == ENOENT)
3830 3830 error = ESRCH;
3831 3831 return (error);
3832 3832 }
3833 3833
3834 3834 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3835 3835 DS_IS_DEFER_DESTROY(ds))
3836 3836 *might_destroy = B_TRUE;
3837 3837
3838 3838 mutex_exit(&ds->ds_lock);
3839 3839 return (0);
3840 3840 }
3841 3841
3842 3842 static int
3843 3843 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3844 3844 {
3845 3845 struct dsl_ds_releasearg *ra = arg1;
3846 3846 dsl_dataset_t *ds = ra->ds;
3847 3847 boolean_t might_destroy;
3848 3848 int error;
3849 3849
3850 3850 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3851 3851 return (ENOTSUP);
3852 3852
3853 3853 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3854 3854 if (error)
3855 3855 return (error);
3856 3856
3857 3857 if (might_destroy) {
3858 3858 struct dsl_ds_destroyarg dsda = {0};
3859 3859
3860 3860 if (dmu_tx_is_syncing(tx)) {
3861 3861 /*
3862 3862 * If we're not prepared to remove the snapshot,
3863 3863 * we can't allow the release to happen right now.
3864 3864 */
3865 3865 if (!ra->own)
3866 3866 return (EBUSY);
3867 3867 }
3868 3868 dsda.ds = ds;
3869 3869 dsda.releasing = B_TRUE;
3870 3870 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3871 3871 }
3872 3872
3873 3873 return (0);
3874 3874 }
3875 3875
3876 3876 static void
3877 3877 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3878 3878 {
3879 3879 struct dsl_ds_releasearg *ra = arg1;
3880 3880 dsl_dataset_t *ds = ra->ds;
3881 3881 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3882 3882 objset_t *mos = dp->dp_meta_objset;
3883 3883 uint64_t zapobj;
3884 3884 uint64_t refs;
3885 3885 int error;
3886 3886
3887 3887 mutex_enter(&ds->ds_lock);
3888 3888 ds->ds_userrefs--;
3889 3889 refs = ds->ds_userrefs;
3890 3890 mutex_exit(&ds->ds_lock);
3891 3891 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3892 3892 VERIFY(error == 0 || error == ENOENT);
3893 3893 zapobj = ds->ds_phys->ds_userrefs_obj;
3894 3894 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3895 3895
3896 3896 spa_history_log_internal_ds(ds, "release", tx,
3897 3897 "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
3898 3898
3899 3899 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3900 3900 DS_IS_DEFER_DESTROY(ds)) {
3901 3901 struct dsl_ds_destroyarg dsda = {0};
3902 3902
3903 3903 ASSERT(ra->own);
3904 3904 dsda.ds = ds;
3905 3905 dsda.releasing = B_TRUE;
3906 3906 /* We already did the destroy_check */
3907 3907 dsl_dataset_destroy_sync(&dsda, tag, tx);
3908 3908 }
3909 3909 }
3910 3910
3911 3911 static int
3912 3912 dsl_dataset_user_release_one(const char *dsname, void *arg)
3913 3913 {
3914 3914 struct dsl_ds_holdarg *ha = arg;
3915 3915 struct dsl_ds_releasearg *ra;
3916 3916 dsl_dataset_t *ds;
3917 3917 int error;
3918 3918 void *dtag = ha->dstg;
3919 3919 char *name;
3920 3920 boolean_t own = B_FALSE;
3921 3921 boolean_t might_destroy;
3922 3922
3923 3923 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3924 3924 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3925 3925 error = dsl_dataset_hold(name, dtag, &ds);
3926 3926 strfree(name);
3927 3927 if (error == ENOENT && ha->recursive)
3928 3928 return (0);
3929 3929 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3930 3930 if (error)
3931 3931 return (error);
3932 3932
3933 3933 ha->gotone = B_TRUE;
3934 3934
3935 3935 ASSERT(dsl_dataset_is_snapshot(ds));
3936 3936
3937 3937 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3938 3938 if (error) {
3939 3939 dsl_dataset_rele(ds, dtag);
3940 3940 return (error);
3941 3941 }
3942 3942
3943 3943 if (might_destroy) {
3944 3944 #ifdef _KERNEL
3945 3945 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3946 3946 error = zfs_unmount_snap(name, NULL);
3947 3947 strfree(name);
3948 3948 if (error) {
3949 3949 dsl_dataset_rele(ds, dtag);
3950 3950 return (error);
3951 3951 }
3952 3952 #endif
3953 3953 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3954 3954 dsl_dataset_rele(ds, dtag);
3955 3955 return (EBUSY);
3956 3956 } else {
3957 3957 own = B_TRUE;
3958 3958 dsl_dataset_make_exclusive(ds, dtag);
3959 3959 }
3960 3960 }
3961 3961
3962 3962 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3963 3963 ra->ds = ds;
3964 3964 ra->htag = ha->htag;
3965 3965 ra->own = own;
3966 3966 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3967 3967 dsl_dataset_user_release_sync, ra, dtag, 0);
3968 3968
3969 3969 return (0);
3970 3970 }
3971 3971
3972 3972 int
3973 3973 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3974 3974 boolean_t recursive)
3975 3975 {
3976 3976 struct dsl_ds_holdarg *ha;
3977 3977 dsl_sync_task_t *dst;
3978 3978 spa_t *spa;
3979 3979 int error;
3980 3980
3981 3981 top:
3982 3982 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3983 3983
3984 3984 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3985 3985
3986 3986 error = spa_open(dsname, &spa, FTAG);
3987 3987 if (error) {
3988 3988 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3989 3989 return (error);
3990 3990 }
3991 3991
3992 3992 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3993 3993 ha->htag = htag;
3994 3994 ha->snapname = snapname;
3995 3995 ha->recursive = recursive;
3996 3996 if (recursive) {
3997 3997 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
3998 3998 ha, DS_FIND_CHILDREN);
3999 3999 } else {
4000 4000 error = dsl_dataset_user_release_one(dsname, ha);
4001 4001 }
4002 4002 if (error == 0)
4003 4003 error = dsl_sync_task_group_wait(ha->dstg);
4004 4004
4005 4005 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4006 4006 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4007 4007 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4008 4008 dsl_dataset_t *ds = ra->ds;
4009 4009
4010 4010 if (dst->dst_err)
4011 4011 dsl_dataset_name(ds, ha->failed);
4012 4012
4013 4013 if (ra->own)
4014 4014 dsl_dataset_disown(ds, ha->dstg);
4015 4015 else
4016 4016 dsl_dataset_rele(ds, ha->dstg);
4017 4017
4018 4018 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4019 4019 }
4020 4020
4021 4021 if (error == 0 && recursive && !ha->gotone)
4022 4022 error = ENOENT;
4023 4023
4024 4024 if (error && error != EBUSY)
4025 4025 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4026 4026
4027 4027 dsl_sync_task_group_destroy(ha->dstg);
4028 4028 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4029 4029 spa_close(spa, FTAG);
4030 4030
4031 4031 /*
4032 4032 * We can get EBUSY if we were racing with deferred destroy and
4033 4033 * dsl_dataset_user_release_check() hadn't done the necessary
4034 4034 * open context setup. We can also get EBUSY if we're racing
4035 4035 * with destroy and that thread is the ds_owner. Either way
4036 4036 * the busy condition should be transient, and we should retry
4037 4037 * the release operation.
4038 4038 */
4039 4039 if (error == EBUSY)
4040 4040 goto top;
4041 4041
4042 4042 return (error);
4043 4043 }
4044 4044
4045 4045 /*
4046 4046 * Called at spa_load time (with retry == B_FALSE) to release a stale
4047 4047 * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4048 4048 */
4049 4049 int
4050 4050 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4051 4051 boolean_t retry)
4052 4052 {
4053 4053 dsl_dataset_t *ds;
4054 4054 char *snap;
4055 4055 char *name;
4056 4056 int namelen;
4057 4057 int error;
4058 4058
4059 4059 do {
4060 4060 rw_enter(&dp->dp_config_rwlock, RW_READER);
4061 4061 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4062 4062 rw_exit(&dp->dp_config_rwlock);
4063 4063 if (error)
4064 4064 return (error);
4065 4065 namelen = dsl_dataset_namelen(ds)+1;
4066 4066 name = kmem_alloc(namelen, KM_SLEEP);
4067 4067 dsl_dataset_name(ds, name);
4068 4068 dsl_dataset_rele(ds, FTAG);
4069 4069
4070 4070 snap = strchr(name, '@');
4071 4071 *snap = '\0';
4072 4072 ++snap;
4073 4073 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4074 4074 kmem_free(name, namelen);
4075 4075
4076 4076 /*
4077 4077 * The object can't have been destroyed because we have a hold,
4078 4078 * but it might have been renamed, resulting in ENOENT. Retry
4079 4079 * if we've been requested to do so.
4080 4080 *
4081 4081 * It would be nice if we could use the dsobj all the way
4082 4082 * through and avoid ENOENT entirely. But we might need to
4083 4083 * unmount the snapshot, and there's currently no way to lookup
4084 4084 * a vfsp using a ZFS object id.
4085 4085 */
4086 4086 } while ((error == ENOENT) && retry);
4087 4087
4088 4088 return (error);
4089 4089 }
4090 4090
4091 4091 int
4092 4092 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4093 4093 {
4094 4094 dsl_dataset_t *ds;
4095 4095 int err;
4096 4096
4097 4097 err = dsl_dataset_hold(dsname, FTAG, &ds);
4098 4098 if (err)
4099 4099 return (err);
4100 4100
4101 4101 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4102 4102 if (ds->ds_phys->ds_userrefs_obj != 0) {
4103 4103 zap_attribute_t *za;
4104 4104 zap_cursor_t zc;
4105 4105
4106 4106 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4107 4107 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4108 4108 ds->ds_phys->ds_userrefs_obj);
4109 4109 zap_cursor_retrieve(&zc, za) == 0;
4110 4110 zap_cursor_advance(&zc)) {
4111 4111 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4112 4112 za->za_first_integer));
4113 4113 }
4114 4114 zap_cursor_fini(&zc);
4115 4115 kmem_free(za, sizeof (zap_attribute_t));
4116 4116 }
4117 4117 dsl_dataset_rele(ds, FTAG);
4118 4118 return (0);
4119 4119 }
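An illustrative consumer of the returned nvlist (the dataset name is hypothetical); each entry maps a hold tag to its creation time in seconds, as stored by dsl_dataset_user_hold_sync() above:

nvlist_t *holds;
nvpair_t *pair;

if (dsl_dataset_get_holds("tank/home@monday", &holds) == 0) {
	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(holds, pair)) {
		uint64_t when;

		VERIFY(0 == nvpair_value_uint64(pair, &when));
		cmn_err(CE_NOTE, "hold %s since %llu",
		    nvpair_name(pair), (u_longlong_t)when);
	}
	nvlist_free(holds);
}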
4120 4120
4121 4121 /*
4122 4122 * Note, this function is used as the callback for dmu_objset_find(). We
4123 4123 * always return 0 so that we will continue to find and process
4124 4124 * inconsistent datasets, even if we encounter an error trying to
4125 4125 * process one of them.
4126 4126 */
4127 4127 /* ARGSUSED */
4128 4128 int
4129 4129 dsl_destroy_inconsistent(const char *dsname, void *arg)
4130 4130 {
4131 4131 dsl_dataset_t *ds;
4132 4132
4133 4133 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4134 4134 if (DS_IS_INCONSISTENT(ds))
4135 4135 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4136 4136 else
4137 4137 dsl_dataset_disown(ds, FTAG);
4138 4138 }
4139 4139 return (0);
4140 4140 }
4141 4141
4142 4142 /*
4143 4143 * Return (in *usedp) the amount of space written in new that is not
4144 4144 * present in oldsnap. New may be a snapshot or the head. Old must be
4145 4145 * a snapshot before new, in new's filesystem (or its origin). If not then
4146 4146 * fail and return EINVAL.
4147 4147 *
4148 4148 * The written space is calculated by considering two components: First, we
4149 4149 * ignore any freed space, and calculate the written as new's used space
4150 4150 * minus old's used space. Next, we add in the amount of space that was freed
4151 4151 * between the two snapshots, thus reducing new's used space relative to old's.
4152 4152 * Specifically, this is the space that was born before old->ds_creation_txg,
4153 4153 * and freed before new (ie. on new's deadlist or a previous deadlist).
4154 4154 *
4155 4155 * space freed [---------------------]
4156 4156 * snapshots ---O-------O--------O-------O------
4157 4157 * oldsnap new
4158 4158 */
4159 4159 int
4160 4160 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4161 4161 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4162 4162 {
4163 4163 int err = 0;
4164 4164 uint64_t snapobj;
4165 4165 dsl_pool_t *dp = new->ds_dir->dd_pool;
4166 4166
4167 4167 *usedp = 0;
4168 4168 *usedp += new->ds_phys->ds_referenced_bytes;
4169 4169 *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4170 4170
4171 4171 *compp = 0;
4172 4172 *compp += new->ds_phys->ds_compressed_bytes;
4173 4173 *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4174 4174
4175 4175 *uncompp = 0;
4176 4176 *uncompp += new->ds_phys->ds_uncompressed_bytes;
4177 4177 *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4178 4178
4179 4179 rw_enter(&dp->dp_config_rwlock, RW_READER);
4180 4180 snapobj = new->ds_object;
4181 4181 while (snapobj != oldsnap->ds_object) {
4182 4182 dsl_dataset_t *snap;
4183 4183 uint64_t used, comp, uncomp;
4184 4184
4185 4185 if (snapobj == new->ds_object) {
4186 4186 snap = new;
4187 4187 } else {
4188 4188 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4189 4189 if (err != 0)
4190 4190 break;
4191 4191 }
4192 4192
4193 4193 if (snap->ds_phys->ds_prev_snap_txg ==
4194 4194 oldsnap->ds_phys->ds_creation_txg) {
4195 4195 /*
4196 4196 * The blocks in the deadlist can not be born after
4197 4197 * ds_prev_snap_txg, so get the whole deadlist space,
4198 4198 * which is more efficient (especially for old-format
4199 4199 * deadlists). Unfortunately the deadlist code
4200 4200 * doesn't have enough information to make this
4201 4201 * optimization itself.
4202 4202 */
4203 4203 dsl_deadlist_space(&snap->ds_deadlist,
4204 4204 &used, &comp, &uncomp);
4205 4205 } else {
4206 4206 dsl_deadlist_space_range(&snap->ds_deadlist,
4207 4207 0, oldsnap->ds_phys->ds_creation_txg,
4208 4208 &used, &comp, &uncomp);
4209 4209 }
4210 4210 *usedp += used;
4211 4211 *compp += comp;
4212 4212 *uncompp += uncomp;
4213 4213
4214 4214 /*
4215 4215 * If we get to the beginning of the chain of snapshots
4216 4216 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4217 4217 * was not a snapshot of/before new.
4218 4218 */
4219 4219 snapobj = snap->ds_phys->ds_prev_snap_obj;
4220 4220 if (snap != new)
4221 4221 dsl_dataset_rele(snap, FTAG);
4222 4222 if (snapobj == 0) {
4223 4223 err = EINVAL;
4224 4224 break;
4225 4225 }
4226 4226
4227 4227 }
4228 4228 rw_exit(&dp->dp_config_rwlock);
4229 4229 return (err);
4230 4230 }
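A worked example of the computation above (numbers are hypothetical):

/*
 *   oldsnap referenced = 8G, new referenced = 10G
 *   1G of data born before oldsnap was freed between oldsnap and new
 *   (it sits on new's deadlist or an earlier one)
 *
 *   written = (10G - 8G) + 1G = 3G
 */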
4231 4231
4232 4232 /*
4233 4233 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4234 4234 * lastsnap, and all snapshots in between are deleted.
4235 4235 *
4236 4236 * blocks that would be freed [---------------------------]
4237 4237 * snapshots ---O-------O--------O-------O--------O
4238 4238 * firstsnap lastsnap
4239 4239 *
4240 4240 * This is the set of blocks that were born after the snap before firstsnap,
4241 4241 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
4242 4242 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4243 4243 * We calculate this by iterating over the relevant deadlists (from the snap
4244 4244 * after lastsnap, backward to the snap after firstsnap), summing up the
4245 4245 * space on the deadlist that was born after the snap before firstsnap.
4246 4246 */
4247 4247 int
4248 4248 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
4249 4249 dsl_dataset_t *lastsnap,
4250 4250 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4251 4251 {
4252 4252 int err = 0;
4253 4253 uint64_t snapobj;
4254 4254 dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4255 4255
4256 4256 ASSERT(dsl_dataset_is_snapshot(firstsnap));
4257 4257 ASSERT(dsl_dataset_is_snapshot(lastsnap));
4258 4258
4259 4259 /*
4260 4260 * Check that the snapshots are in the same dsl_dir, and firstsnap
4261 4261 * is before lastsnap.
4262 4262 */
4263 4263 if (firstsnap->ds_dir != lastsnap->ds_dir ||
4264 4264 firstsnap->ds_phys->ds_creation_txg >
4265 4265 lastsnap->ds_phys->ds_creation_txg)
4266 4266 return (EINVAL);
4267 4267
4268 4268 *usedp = *compp = *uncompp = 0;
4269 4269
4270 4270 rw_enter(&dp->dp_config_rwlock, RW_READER);
4271 4271 snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4272 4272 while (snapobj != firstsnap->ds_object) {
4273 4273 dsl_dataset_t *ds;
4274 4274 uint64_t used, comp, uncomp;
4275 4275
4276 4276 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4277 4277 if (err != 0)
4278 4278 break;
4279 4279
4280 4280 dsl_deadlist_space_range(&ds->ds_deadlist,
4281 4281 firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4282 4282 &used, &comp, &uncomp);
4283 4283 *usedp += used;
4284 4284 *compp += comp;
4285 4285 *uncompp += uncomp;
4286 4286
4287 4287 snapobj = ds->ds_phys->ds_prev_snap_obj;
4288 4288 ASSERT3U(snapobj, !=, 0);
4289 4289 dsl_dataset_rele(ds, FTAG);
4290 4290 }
4291 4291 rw_exit(&dp->dp_config_rwlock);
4292 4292 return (err);
4293 4293 }
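A worked example for the loop above (numbers are hypothetical):

/*
 * The loop walks the deadlists from the snapshot after lastsnap back
 * to the snapshot after firstsnap.  If those deadlists hold 2G, 3G
 * and 1G of blocks born after firstsnap's previous snapshot, then
 * deleting firstsnap..lastsnap would reclaim *usedp = 6G.
 */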
3600 lines elided