dlpx-os-diff Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

3955 ztest failure: assertion refcount_count(&tx->tx_space_written) + delta <= tx->tx_space_towrite
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
           /*
            * Function-pointer type for a routine that takes a transaction, a
            * dnode, and two 64-bit arguments and returns nothing.
            */
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
           /*
            * Allocate a zero-filled transaction and bind it to the DSL directory
            * dd, which may be NULL.  When dd is non-NULL the transaction also takes
            * its pool from dd->dd_pool.
            *
            * The hold list and the callback list are initialised empty; under
            * ZFS_DEBUG the tx_space_written and tx_space_freed refcounts are
            * created as well.
            *
            * Returns the new transaction (the allocation uses KM_SLEEP).
            */
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57  #ifdef ZFS_DEBUG
  58   58          refcount_create(&tx->tx_space_written);
  59   59          refcount_create(&tx->tx_space_freed);
  60   60  #endif
  61   61          return (tx);
  62   62  }
  63   63  
           /*
            * Create a transaction for the object set os.  The transaction is built
            * by dmu_tx_create_dd() on the DSL directory of os's dataset, then
            * records os itself and the value dsl_dataset_prev_snap_txg() returns
            * for that dataset (stored in tx_lastsnap_txg).
            */
  64   64  dmu_tx_t *
  65   65  dmu_tx_create(objset_t *os)
  66   66  {
  67   67          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  68   68          tx->tx_objset = os;
  69   69          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  70   70          return (tx);
  71   71  }
  72   72  
           /*
            * Create a transaction that is already bound to transaction group txg
            * of pool dp.  It has no DSL directory (dmu_tx_create_dd(NULL)) and has
            * tx_anyobj set to TRUE.
            *
            * txg must not be newer than the pool's currently open txg (asserted).
            */
  73   73  dmu_tx_t *
  74   74  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  75   75  {
  76   76          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  77   77  
  78   78          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  79   79          tx->tx_pool = dp;
  80   80          tx->tx_txg = txg;
  81   81          tx->tx_anyobj = TRUE;
  82   82  
  83   83          return (tx);
  84   84  }
  85   85  
           /*
            * Report whether tx is a syncing transaction.  The answer is simply the
            * tx_anyobj flag; in the code visible here that flag is set only by
            * dmu_tx_create_assigned().
            */
  86   86  int
  87   87  dmu_tx_is_syncing(dmu_tx_t *tx)
  88   88  {
  89   89          return (tx->tx_anyobj);
  90   90  }
  91   91  
           /*
            * Returns the tx_anyobj flag of tx, i.e. the same value as
            * dmu_tx_is_syncing().
            */
  92   92  int
  93   93  dmu_tx_private_ok(dmu_tx_t *tx)
  94   94  {
  95   95          return (tx->tx_anyobj);
  96   96  }
  97   97  
           /*
            * Create a hold record for `object` in `os` and append it to the tail of
            * tx's hold list.
            *
            * Unless object is DMU_NEW_OBJECT, the dnode is held first, with tx as
            * the hold tag.  If that fails the error is stored in tx->tx_err and
            * NULL is returned.  If tx already has a txg (tx_txg != 0), the dnode is
            * marked as assigned to that txg and gains a dn_tx_holds reference,
            * both under dn_mtx.
            *
            * type, arg1 and arg2 are recorded in the hold only under ZFS_DEBUG.
            *
            * Returns the new hold; its txh_dnode is NULL for DMU_NEW_OBJECT.
            */
  98   98  static dmu_tx_hold_t *
  99   99  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 100  100      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 101  101  {
 102  102          dmu_tx_hold_t *txh;
 103  103          dnode_t *dn = NULL;
 104  104          int err;
 105  105  
 106  106          if (object != DMU_NEW_OBJECT) {
 107  107                  err = dnode_hold(os, object, tx, &dn);
 108  108                  if (err) {
 109  109                          tx->tx_err = err;
 110  110                          return (NULL);
 111  111                  }
 112  112  
                           /* err is always 0 here; the test on it is redundant. */
 113  113                  if (err == 0 && tx->tx_txg != 0) {
 114  114                          mutex_enter(&dn->dn_mtx);
 115  115                          /*
 116  116                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 117  117                           * problem, but there's no way for it to happen (for
 118  118                           * now, at least).
 119  119                           */
 120  120                          ASSERT(dn->dn_assigned_txg == 0);
 121  121                          dn->dn_assigned_txg = tx->tx_txg;
 122  122                          (void) refcount_add(&dn->dn_tx_holds, tx);
 123  123                          mutex_exit(&dn->dn_mtx);
 124  124                  }
 125  125          }
 126  126  
 127  127          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 128  128          txh->txh_tx = tx;
 129  129          txh->txh_dnode = dn;
 130  130  #ifdef ZFS_DEBUG
 131  131          txh->txh_type = type;
 132  132          txh->txh_arg1 = arg1;
 133  133          txh->txh_arg2 = arg2;
 134  134  #endif
 135  135          list_insert_tail(&tx->tx_holds, txh);
 136  136  
 137  137          return (txh);
 138  138  }
 139  139  
           /*
            * Add a THT_NEWOBJECT hold on `object` in `os` to tx, unless tx is a
            * syncing transaction, in which case nothing is done.  The hold returned
            * by dmu_tx_hold_object_impl() is deliberately discarded.
            */
 140  140  void
 141  141  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 142  142  {
 143  143          /*
 144  144           * If we're syncing, they can manipulate any object anyhow, and
 145  145           * the hold on the dnode_t can cause problems.
 146  146           */
 147  147          if (!dmu_tx_is_syncing(tx)) {
 148  148                  (void) dmu_tx_hold_object_impl(tx, os,
 149  149                      object, THT_NEWOBJECT, 0, 0);
 150  150          }
 151  151  }
 152  152  
           /*
            * Read block `blkid` at indirection `level` of dn so that an I/O error
            * on it is detected now.
            *
            * The dbuf is held while dn_struct_rwlock is taken as reader, read with
            * DB_RF_CANFAIL | DB_RF_NOPREFETCH (passing zio through to dbuf_read(),
            * NULL is accepted by callers in this file), and released again.
            *
            * Returns EIO if the dbuf could not be held, otherwise whatever
            * dbuf_read() returned.
            */
 153  153  static int
 154  154  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 155  155  {
 156  156          int err;
 157  157          dmu_buf_impl_t *db;
 158  158  
 159  159          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 160  160          db = dbuf_hold_level(dn, level, blkid, FTAG);
 161  161          rw_exit(&dn->dn_struct_rwlock);
 162  162          if (db == NULL)
 163  163                  return (SET_ERROR(EIO));
 164  164          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 165  165          dbuf_rele(db, FTAG);
 166  166          return (err);
 167  167  }
 168  168  
           /*
            * Charge the hold txh for block (level, blkid) of dn, then recurse to
            * the parent block one level up (blkid >> epbs).
            *
            * history[] holds, per level, the blkid most recently charged.  A block
            * that matches its history entry is not charged again, and the recursion
            * also stops once level reaches dn->dn_nlevels.
            *
            * db is the dbuf for this block.  When it is NULL or is the dnode's own
            * dbuf, no block pointer is examined; that is asserted to happen only
            * above level 0.  freeable carries the verdict reached for the child
            * block (B_FALSE from the initial caller in this file).
            */
 169  169  static void
 170  170  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 171  171      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 172  172  {
 173  173          objset_t *os = dn->dn_objset;
 174  174          dsl_dataset_t *ds = os->os_dsl_dataset;
 175  175          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 176  176          dmu_buf_impl_t *parent = NULL;
 177  177          blkptr_t *bp = NULL;
 178  178          uint64_t space;
 179  179  
 180  180          if (level >= dn->dn_nlevels || history[level] == blkid)
 181  181                  return;
 182  182  
 183  183          history[level] = blkid;
 184  184  
                   /* Data blocks use the data block size, indirects the indirect size. */
 185  185          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 186  186  
 187  187          if (db == NULL || db == dn->dn_dbuf) {
 188  188                  ASSERT(level != 0);
 189  189                  db = NULL;
 190  190          } else {
 191  191                  ASSERT(DB_DNODE(db) == dn);
 192  192                  ASSERT(db->db_level == level);
 193  193                  ASSERT(db->db.db_size == space);
 194  194                  ASSERT(db->db_blkid == blkid);
 195  195                  bp = db->db_blkptr;
 196  196                  parent = db->db_parent;
 197  197          }
 198  198  
                   /*
                    * Without a block pointer the block is never treated as freeable.
                    * With one, it is freeable if the child was, or if the dataset
                    * says the block itself is.
                    */
 199  199          freeable = (bp && (freeable ||
 200  200              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 201  201  
 202  202          if (freeable)
 203  203                  txh->txh_space_tooverwrite += space;
 204  204          else
 205  205                  txh->txh_space_towrite += space;
 206  206          if (bp)
 207  207                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 208  208  
 209  209          dmu_tx_count_twig(txh, dn, parent, level + 1,
 210  210              blkid >> epbs, freeable, history);
 211  211  }
 212  212  
           /*
            * Estimate the space needed to write len bytes at offset off of the
            * hold's dnode and add it to the hold's counters: txh_space_towrite and
            * txh_fudge directly, and txh_space_tooverwrite / txh_space_tounref
            * through dmu_tx_count_twig().  A zero len is a no-op.
            *
            * When the hold has a dnode, the first and last level-0 blocks (if only
            * partially covered) and the level-1 blocks strictly between the first
            * and last are read so that I/O errors surface here.  Blocks up to
            * dn_maxblkid are then charged one at a time.  Any part of the range
            * beyond dn_maxblkid -- or the whole range when there is no dnode -- is
            * charged with a worst-case estimate over the possible data and
            * indirect block sizes.
            *
            * Errors are reported through txh->txh_tx->tx_err.  EFBIG is reported
            * when towrite + tooverwrite exceeds 2 * DMU_MAX_ACCESS.
            *
            * NOTE(review): the "goto out" error paths in the multi-block branch
            * leave without calling zio_wait() on the root zio obtained from
            * zio_root() -- confirm whether that zio is leaked.
            */
 213  213  /* ARGSUSED */
 214  214  static void
 215  215  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 216  216  {
 217  217          dnode_t *dn = txh->txh_dnode;
 218  218          uint64_t start, end, i;
 219  219          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 220  220          int err = 0;
 221  221  
 222  222          if (len == 0)
 223  223                  return;
 224  224  
                   /* Start from the widest possible range of block sizes. */
 225  225          min_bs = SPA_MINBLOCKSHIFT;
 226  226          max_bs = SPA_MAXBLOCKSHIFT;
 227  227          min_ibs = DN_MIN_INDBLKSHIFT;
 228  228          max_ibs = DN_MAX_INDBLKSHIFT;
 229  229  
 230  230          if (dn) {
 231  231                  uint64_t history[DN_MAX_LEVELS];
 232  232                  int nlvls = dn->dn_nlevels;
 233  233                  int delta;
 234  234  
 235  235                  /*
 236  236                   * For i/o error checking, read the first and last level-0
 237  237                   * blocks (if they are not aligned), and all the level-1 blocks.
 238  238                   */
 239  239                  if (dn->dn_maxblkid == 0) {
                                   /*
                                    * Single-block object: start/end are 0 when the
                                    * corresponding edge of the write falls inside
                                    * block 0, and 1 otherwise.
                                    */
 240  240                          delta = dn->dn_datablksz;
 241  241                          start = (off < dn->dn_datablksz) ? 0 : 1;
 242  242                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 243  243                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 244  244                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 245  245                                  if (err)
 246  246                                          goto out;
 247  247                                  delta -= off;
 248  248                          }
 249  249                  } else {
 250  250                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 251  251                              NULL, NULL, ZIO_FLAG_CANFAIL);
 252  252  
 253  253                          /* first level-0 block */
 254  254                          start = off >> dn->dn_datablkshift;
 255  255                          if (P2PHASE(off, dn->dn_datablksz) ||
 256  256                              len < dn->dn_datablksz) {
 257  257                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 258  258                                  if (err)
 259  259                                          goto out;
 260  260                          }
 261  261  
 262  262                          /* last level-0 block */
 263  263                          end = (off+len-1) >> dn->dn_datablkshift;
 264  264                          if (end != start && end <= dn->dn_maxblkid &&
 265  265                              P2PHASE(off+len, dn->dn_datablksz)) {
 266  266                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 267  267                                  if (err)
 268  268                                          goto out;
 269  269                          }
 270  270  
 271  271                          /* level-1 blocks */
 272  272                          if (nlvls > 1) {
 273  273                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 274  274                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 275  275                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 276  276                                          if (err)
 277  277                                                  goto out;
 278  278                                  }
 279  279                          }
 280  280  
 281  281                          err = zio_wait(zio);
 282  282                          if (err)
 283  283                                  goto out;
                                   /* Bytes from off up to the next data-block boundary. */
 284  284                          delta = P2NPHASE(off, dn->dn_datablksz);
 285  285                  }
 286  286  
 287  287                  min_ibs = max_ibs = dn->dn_indblkshift;
 288  288                  if (dn->dn_maxblkid > 0) {
 289  289                          /*
 290  290                           * The blocksize can't change,
 291  291                           * so we can make a more precise estimate.
 292  292                           */
 293  293                          ASSERT(dn->dn_datablkshift != 0);
 294  294                          min_bs = max_bs = dn->dn_datablkshift;
 295  295                  }
 296  296  
 297  297                  /*
 298  298                   * If this write is not off the end of the file
 299  299                   * we need to account for overwrites/unref.
 300  300                   */
 301  301                  if (start <= dn->dn_maxblkid) {
                                   /* -1ULL marks "no block charged yet" at each level. */
 302  302                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 303  303                                  history[l] = -1ULL;
 304  304                  }
 305  305                  while (start <= dn->dn_maxblkid) {
 306  306                          dmu_buf_impl_t *db;
 307  307  
 308  308                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 309  309                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 310  310                          rw_exit(&dn->dn_struct_rwlock);
 311  311  
 312  312                          if (err) {
                                           /* Returns directly, skipping the EFBIG check at "out". */
 313  313                                  txh->txh_tx->tx_err = err;
 314  314                                  return;
 315  315                          }
 316  316  
 317  317                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 318  318                              history);
 319  319                          dbuf_rele(db, FTAG);
 320  320                          if (++start > end) {
 321  321                                  /*
 322  322                                   * Account for new indirects appearing
 323  323                                   * before this IO gets assigned into a txg.
 324  324                                   */
 325  325                                  bits = 64 - min_bs;
 326  326                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 327  327                                  for (bits -= epbs * (nlvls - 1);
 328  328                                      bits >= 0; bits -= epbs)
 329  329                                          txh->txh_fudge += 1ULL << max_ibs;
 330  330                                  goto out;
 331  331                          }
                                   /*
                                    * Advance past the block just charged; after the
                                    * first iteration every step is one full data block.
                                    */
 332  332                          off += delta;
 333  333                          if (len >= delta)
 334  334                                  len -= delta;
 335  335                          delta = dn->dn_datablksz;
 336  336                  }
 337  337          }
 338  338  
 339  339          /*
 340  340           * 'end' is the last thing we will access, not one past.
 341  341           * This way we won't overflow when accessing the last byte.
 342  342           */
 343  343          start = P2ALIGN(off, 1ULL << max_bs);
 344  344          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 345  345          txh->txh_space_towrite += end - start + 1;
 346  346  
 347  347          start >>= min_bs;
 348  348          end >>= min_bs;
 349  349  
 350  350          epbs = min_ibs - SPA_BLKPTRSHIFT;
 351  351  
 352  352          /*
 353  353           * The object contains at most 2^(64 - min_bs) blocks,
 354  354           * and each indirect level maps 2^epbs.
 355  355           */
 356  356          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 357  357                  start >>= epbs;
 358  358                  end >>= epbs;
 359  359                  ASSERT3U(end, >=, start);
 360  360                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 361  361                  if (start != 0) {
 362  362                          /*
 363  363                           * We also need a new blkid=0 indirect block
 364  364                           * to reference any existing file data.
 365  365                           */
 366  366                          txh->txh_space_towrite += 1ULL << max_ibs;
 367  367                  }
 368  368          }
 369  369  
 370  370  out:
 371  371          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 372  372              2 * DMU_MAX_ACCESS)
 373  373                  err = SET_ERROR(EFBIG);
 374  374  
 375  375          if (err)
 376  376                  txh->txh_tx->tx_err = err;
 377  377  }
 378  378  
           /*
            * Charge the hold for updating the dnode itself.  The amount is one
            * meta-dnode data block plus one indirect block for each meta-dnode
            * level above the first.
            *
            * If the hold has a dnode whose dbuf has a block pointer that
            * dsl_dataset_block_freeable() accepts, the amount is charged as an
            * overwrite and as unreferenced space.  Otherwise it is charged as a new
            * write, and as unreferenced space only when a block pointer exists.
            */
 379  379  static void
 380  380  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 381  381  {
 382  382          dnode_t *dn = txh->txh_dnode;
 383  383          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 384  384          uint64_t space = mdn->dn_datablksz +
 385  385              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 386  386  
 387  387          if (dn && dn->dn_dbuf->db_blkptr &&
 388  388              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 389  389              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 390  390                  txh->txh_space_tooverwrite += space;
 391  391                  txh->txh_space_tounref += space;
 392  392          } else {
 393  393                  txh->txh_space_towrite += space;
 394  394                  if (dn && dn->dn_dbuf->db_blkptr)
 395  395                          txh->txh_space_tounref += space;
 396  396          }
 397  397  }
 398  398  
           /*
            * Declare that tx will write len bytes at offset off of `object` in the
            * transaction's object set.  A THT_WRITE hold is added and charged for
            * both the data write and the dnode update.
            *
            * Asserted preconditions: tx has no txg yet, len is below
            * DMU_MAX_ACCESS, and off + len - 1 does not overflow 64 bits.
            *
            * If the hold cannot be created the function returns silently;
            * dmu_tx_hold_object_impl() has already stored the error in
            * tx->tx_err.
            */
 399  399  void
 400  400  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 401  401  {
 402  402          dmu_tx_hold_t *txh;
 403  403  
 404  404          ASSERT(tx->tx_txg == 0);
 405  405          ASSERT(len < DMU_MAX_ACCESS);
 406  406          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 407  407  
 408  408          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 409  409              object, THT_WRITE, off, len);
 410  410          if (txh == NULL)
 411  411                  return;
 412  412  
 413  413          dmu_tx_count_write(txh, off, len);
 414  414          dmu_tx_count_dnode(txh);
 415  415  }
 416  416  
 417  417  static void
 418  418  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 419  419  {
 420  420          uint64_t blkid, nblks, lastblk;
 421  421          uint64_t space = 0, unref = 0, skipped = 0;
 422  422          dnode_t *dn = txh->txh_dnode;
 423  423          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 424  424          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 425  425          int epbs;
 426  426          uint64_t l0span = 0, nl1blks = 0;
 427  427  
 428  428          if (dn->dn_nlevels == 0)
 429  429                  return;
 430  430  
 431  431          /*
 432  432           * The struct_rwlock protects us against dn_nlevels
 433  433           * changing, in case (against all odds) we manage to dirty &
 434  434           * sync out the changes after we check for being dirty.
 435  435           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 436  436           */
 437  437          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 438  438          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 439  439          if (dn->dn_maxblkid == 0) {
 440  440                  if (off == 0 && len >= dn->dn_datablksz) {

↓ open down ↓

440 lines elided

↑ open up ↑

 441  441                          blkid = 0;
 442  442                          nblks = 1;
 443  443                  } else {
 444  444                          rw_exit(&dn->dn_struct_rwlock);
 445  445                          return;
 446  446                  }
 447  447          } else {
 448  448                  blkid = off >> dn->dn_datablkshift;
 449  449                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 450  450  
 451      -                if (blkid >= dn->dn_maxblkid) {
      451 +                if (blkid > dn->dn_maxblkid) {
 452  452                          rw_exit(&dn->dn_struct_rwlock);
 453  453                          return;
 454  454                  }
 455  455                  if (blkid + nblks > dn->dn_maxblkid)
 456      -                        nblks = dn->dn_maxblkid - blkid;
      456 +                        nblks = dn->dn_maxblkid - blkid + 1;
 457  457  
 458  458          }
 459  459          l0span = nblks;    /* save for later use to calc level > 1 overhead */
 460  460          if (dn->dn_nlevels == 1) {
 461  461                  int i;
 462  462                  for (i = 0; i < nblks; i++) {
 463  463                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 464  464                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 465  465                          bp += blkid + i;
 466  466                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {

 467  467                                  dprintf_bp(bp, "can free old%s", "");
 468  468                                  space += bp_get_dsize(spa, bp);
 469  469                          }
 470  470                          unref += BP_GET_ASIZE(bp);
 471  471                  }
 472  472                  nl1blks = 1;
 473  473                  nblks = 0;
 474  474          }
 475  475  
 476  476          lastblk = blkid + nblks - 1;
 477  477          while (nblks) {
 478  478                  dmu_buf_impl_t *dbuf;
 479  479                  uint64_t ibyte, new_blkid;
 480  480                  int epb = 1 << epbs;
 481  481                  int err, i, blkoff, tochk;
 482  482                  blkptr_t *bp;
 483  483  
 484  484                  ibyte = blkid << dn->dn_datablkshift;
 485  485                  err = dnode_next_offset(dn,
 486  486                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 487  487                  new_blkid = ibyte >> dn->dn_datablkshift;
 488  488                  if (err == ESRCH) {
 489  489                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 490  490                          break;
 491  491                  }
 492  492                  if (err) {
 493  493                          txh->txh_tx->tx_err = err;
 494  494                          break;
 495  495                  }
 496  496                  if (new_blkid > lastblk) {
 497  497                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 498  498                          break;
 499  499                  }
 500  500  
 501  501                  if (new_blkid > blkid) {
 502  502                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 503  503                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 504  504                          nblks -= new_blkid - blkid;
 505  505                          blkid = new_blkid;
 506  506                  }
 507  507                  blkoff = P2PHASE(blkid, epb);
 508  508                  tochk = MIN(epb - blkoff, nblks);
 509  509  
 510  510                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 511  511                  if (err) {
 512  512                          txh->txh_tx->tx_err = err;
 513  513                          break;
 514  514                  }
 515  515  
 516  516                  txh->txh_memory_tohold += dbuf->db.db_size;
 517  517  
 518  518                  /*
 519  519                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 520  520                   * memory_tohold is an over-estimation (especially the >L1
 521  521                   * indirect blocks), so it could fail.  Callers should have
 522  522                   * already verified that they will not be holding too much
 523  523                   * memory.
 524  524                   */
 525  525  
 526  526                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 527  527                  if (err != 0) {
 528  528                          txh->txh_tx->tx_err = err;
 529  529                          dbuf_rele(dbuf, FTAG);
 530  530                          break;
 531  531                  }
 532  532  
 533  533                  bp = dbuf->db.db_data;
 534  534                  bp += blkoff;
 535  535  
 536  536                  for (i = 0; i < tochk; i++) {
 537  537                          if (dsl_dataset_block_freeable(ds, &bp[i],
 538  538                              bp[i].blk_birth)) {
 539  539                                  dprintf_bp(&bp[i], "can free old%s", "");
 540  540                                  space += bp_get_dsize(spa, &bp[i]);
 541  541                          }
 542  542                          unref += BP_GET_ASIZE(bp);
 543  543                  }
 544  544                  dbuf_rele(dbuf, FTAG);
 545  545  
 546  546                  ++nl1blks;
 547  547                  blkid += tochk;
 548  548                  nblks -= tochk;
 549  549          }
 550  550          rw_exit(&dn->dn_struct_rwlock);
 551  551  
 552  552          /*
 553  553           * Add in memory requirements of higher-level indirects.
 554  554           * This assumes a worst-possible scenario for dn_nlevels and a
 555  555           * worst-possible distribution of l1-blocks over the region to free.
 556  556           */
 557  557          {
 558  558                  uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 559  559                  int level = 2;
 560  560                  /*
 561  561                   * Here we don't use DN_MAX_LEVEL, but calculate it with the
 562  562                   * given datablkshift and indblkshift. This makes the
 563  563                   * difference between 19 and 8 on large files.
 564  564                   */
 565  565                  int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 566  566                      (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 567  567  
 568  568                  while (level++ < maxlevel) {
 569  569                          txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 570  570                              << dn->dn_indblkshift;
 571  571                          blkcnt = 1 + (blkcnt >> epbs);
 572  572                  }
 573  573          }
 574  574  
 575  575          /* account for new level 1 indirect blocks that might show up */
 576  576          if (skipped > 0) {
 577  577                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 578  578                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 579  579                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 580  580          }
 581  581          txh->txh_space_tofree += space;
 582  582          txh->txh_space_tounref += unref;
 583  583  }
 584  584  
           /*
            * Declare that tx will free the byte range [off, off + len) of `object`
            * in the transaction's object set.  len may be DMU_OBJECT_END, meaning
            * "through the end of the object".
            *
            * A THT_FREE hold is added.  Nothing more is done if off lies at or
            * beyond the end of the object, computed as
            * (dn_maxblkid + 1) * dn_datablksz.  Otherwise the hold is charged for
            * the dnode update, for rewriting the partially freed first/last blocks,
            * and -- through dmu_tx_count_free() -- for the blocks being freed.
            * Level-1 indirect blocks in the range are read to surface I/O errors.
            *
            * Errors are reported through tx->tx_err.
            *
            * NOTE(review): dn is dereferenced without a NULL check, although
            * dmu_tx_hold_object_impl() leaves txh_dnode NULL for DMU_NEW_OBJECT;
            * presumably callers never pass that here -- confirm.
            *
            * NOTE(review): the early error returns inside the level-1 loop leave
            * without calling zio_wait() on the root zio obtained from zio_root()
            * -- confirm whether that zio is leaked.
            */
 585  585  void
 586  586  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 587  587  {
 588  588          dmu_tx_hold_t *txh;
 589  589          dnode_t *dn;
 590  590          int err;
 591  591          zio_t *zio;
 592  592  
 593  593          ASSERT(tx->tx_txg == 0);
 594  594  
 595  595          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 596  596              object, THT_FREE, off, len);
 597  597          if (txh == NULL)
 598  598                  return;
 599  599          dn = txh->txh_dnode;
 600  600  
 601  601          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 602  602                  return;
 603  603          if (len == DMU_OBJECT_END)
 604  604                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 605  605  
 606  606          dmu_tx_count_dnode(txh);
 607  607  
 608  608          /*
 609  609           * For i/o error checking, we read the first and last level-0
 610  610           * blocks if they are not aligned, and all the level-1 blocks.
 611  611           *
 612  612           * Note:  dbuf_free_range() assumes that we have not instantiated
 613  613           * any level-0 dbufs that will be completely freed.  Therefore we must
 614  614           * exercise care to not read or count the first and last blocks
 615  615           * if they are blocksize-aligned.
 616  616           */
 617  617          if (dn->dn_datablkshift == 0) {
                           /* No block shift recorded: count the whole range as a write. */
 618  618                  dmu_tx_count_write(txh, off, len);
 619  619          } else {
 620  620                  /* first block will be modified if it is not aligned */
 621  621                  if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 622  622                          dmu_tx_count_write(txh, off, 1);
 623  623                  /* last block will be modified if it is not aligned */
 624  624                  if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 625  625                          dmu_tx_count_write(txh, off+len, 1);
 626  626          }
 627  627  
 628  628          /*
 629  629           * Check level-1 blocks.
 630  630           */
 631  631          if (dn->dn_nlevels > 1) {
                           /* shift converts a byte offset into a level-1 block id. */
 632  632                  int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 633  633                      SPA_BLKPTRSHIFT;
 634  634                  uint64_t start = off >> shift;
 635  635                  uint64_t end = (off + len) >> shift;
 636  636  
 637  637                  ASSERT(dn->dn_datablkshift != 0);
 638  638                  ASSERT(dn->dn_indblkshift != 0);
 639  639  
 640  640                  zio = zio_root(tx->tx_pool->dp_spa,
 641  641                      NULL, NULL, ZIO_FLAG_CANFAIL);
 642  642                  for (uint64_t i = start; i <= end; i++) {
 643  643                          uint64_t ibyte = i << shift;
                                   /*
                                    * dnode_next_offset() may move ibyte forward; the
                                    * loop index is resynchronised to it.  ESRCH ends
                                    * the scan without error.
                                    */
 644  644                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 645  645                          i = ibyte >> shift;
 646  646                          if (err == ESRCH)
 647  647                                  break;
 648  648                          if (err) {
 649  649                                  tx->tx_err = err;
 650  650                                  return;
 651  651                          }
 652  652  
 653  653                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 654  654                          if (err) {
 655  655                                  tx->tx_err = err;
 656  656                                  return;
 657  657                          }
 658  658                  }
 659  659                  err = zio_wait(zio);
 660  660                  if (err) {
 661  661                          tx->tx_err = err;
 662  662                          return;
 663  663                  }
 664  664          }
 665  665  
 666  666          dmu_tx_count_free(txh, off, len);
 667  667  }
 668  668  
 669  669  void
 670  670  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 671  671  {
 672  672          dmu_tx_hold_t *txh;
 673  673          dnode_t *dn;
 674  674          uint64_t nblocks;
 675  675          int epbs, err;
 676  676  
 677  677          ASSERT(tx->tx_txg == 0);
 678  678  
 679  679          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 680  680              object, THT_ZAP, add, (uintptr_t)name);
 681  681          if (txh == NULL)
 682  682                  return;
 683  683          dn = txh->txh_dnode;
 684  684  
 685  685          dmu_tx_count_dnode(txh);
 686  686  
 687  687          if (dn == NULL) {
 688  688                  /*
 689  689                   * We will be able to fit a new object's entries into one leaf
 690  690                   * block.  So there will be at most 2 blocks total,
 691  691                   * including the header block.
 692  692                   */
 693  693                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 694  694                  return;
 695  695          }
 696  696  
 697  697          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 698  698  
 699  699          if (dn->dn_maxblkid == 0 && !add) {
 700  700                  blkptr_t *bp;
 701  701  
 702  702                  /*
 703  703                   * If there is only one block  (i.e. this is a micro-zap)
 704  704                   * and we are not adding anything, the accounting is simple.
 705  705                   */
 706  706                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 707  707                  if (err) {
 708  708                          tx->tx_err = err;
 709  709                          return;
 710  710                  }
 711  711  
 712  712                  /*
 713  713                   * Use max block size here, since we don't know how much
 714  714                   * the size will change between now and the dbuf dirty call.
 715  715                   */
 716  716                  bp = &dn->dn_phys->dn_blkptr[0];
 717  717                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 718  718                      bp, bp->blk_birth))
 719  719                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 720  720                  else
 721  721                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 722  722                  if (!BP_IS_HOLE(bp))
 723  723                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 724  724                  return;
 725  725          }
 726  726  
 727  727          if (dn->dn_maxblkid > 0 && name) {
 728  728                  /*
 729  729                   * access the name in this fat-zap so that we'll check
 730  730                   * for i/o errors to the leaf blocks, etc.
 731  731                   */
 732  732                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 733  733                      8, 0, NULL);
 734  734                  if (err == EIO) {
 735  735                          tx->tx_err = err;
 736  736                          return;
 737  737                  }
 738  738          }
 739  739  
 740  740          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 741  741              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 742  742  
 743  743          /*
 744  744           * If the modified blocks are scattered to the four winds,
 745  745           * we'll have to modify an indirect twig for each.
 746  746           */
 747  747          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 748  748          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 749  749                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 750  750                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 751  751                  else
 752  752                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 753  753  }
 754  754  
 755  755  void
 756  756  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 757  757  {
 758  758          dmu_tx_hold_t *txh;
 759  759  
 760  760          ASSERT(tx->tx_txg == 0);
 761  761  
 762  762          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 763  763              object, THT_BONUS, 0, 0);
 764  764          if (txh)
 765  765                  dmu_tx_count_dnode(txh);
 766  766  }
 767  767  
 768  768  void
 769  769  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 770  770  {
 771  771          dmu_tx_hold_t *txh;
 772  772          ASSERT(tx->tx_txg == 0);
 773  773  
 774  774          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 775  775              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 776  776  
 777  777          txh->txh_space_towrite += space;
 778  778  }
 779  779  
 780  780  int
 781  781  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 782  782  {
 783  783          dmu_tx_hold_t *txh;
 784  784          int holds = 0;
 785  785  
 786  786          /*
 787  787           * By asserting that the tx is assigned, we're counting the
 788  788           * number of dn_tx_holds, which is the same as the number of
 789  789           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 790  790           * dn_tx_holds could be 0.
 791  791           */
 792  792          ASSERT(tx->tx_txg != 0);
 793  793  
 794  794          /* if (tx->tx_anyobj == TRUE) */
 795  795                  /* return (0); */
 796  796  
 797  797          for (txh = list_head(&tx->tx_holds); txh;
 798  798              txh = list_next(&tx->tx_holds, txh)) {
 799  799                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 800  800                          holds++;
 801  801          }
 802  802  
 803  803          return (holds);
 804  804  }
 805  805  
 806  806  #ifdef ZFS_DEBUG
 807  807  void
 808  808  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 809  809  {
 810  810          dmu_tx_hold_t *txh;
 811  811          int match_object = FALSE, match_offset = FALSE;
 812  812          dnode_t *dn;
 813  813  
 814  814          DB_DNODE_ENTER(db);
 815  815          dn = DB_DNODE(db);
 816  816          ASSERT(tx->tx_txg != 0);
 817  817          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 818  818          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 819  819  
 820  820          if (tx->tx_anyobj) {
 821  821                  DB_DNODE_EXIT(db);
 822  822                  return;
 823  823          }
 824  824  
 825  825          /* XXX No checking on the meta dnode for now */
 826  826          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 827  827                  DB_DNODE_EXIT(db);
 828  828                  return;
 829  829          }
 830  830  
 831  831          for (txh = list_head(&tx->tx_holds); txh;
 832  832              txh = list_next(&tx->tx_holds, txh)) {
 833  833                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 834  834                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 835  835                          match_object = TRUE;
 836  836                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 837  837                          int datablkshift = dn->dn_datablkshift ?
 838  838                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 839  839                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 840  840                          int shift = datablkshift + epbs * db->db_level;
 841  841                          uint64_t beginblk = shift >= 64 ? 0 :
 842  842                              (txh->txh_arg1 >> shift);
 843  843                          uint64_t endblk = shift >= 64 ? 0 :
 844  844                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 845  845                          uint64_t blkid = db->db_blkid;
 846  846  
 847  847                          /* XXX txh_arg2 better not be zero... */
 848  848  
 849  849                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 850  850                              txh->txh_type, beginblk, endblk);
 851  851  
 852  852                          switch (txh->txh_type) {
 853  853                          case THT_WRITE:
 854  854                                  if (blkid >= beginblk && blkid <= endblk)
 855  855                                          match_offset = TRUE;
 856  856                                  /*
 857  857                                   * We will let this hold work for the bonus
 858  858                                   * or spill buffer so that we don't need to
 859  859                                   * hold it when creating a new object.
 860  860                                   */
 861  861                                  if (blkid == DMU_BONUS_BLKID ||
 862  862                                      blkid == DMU_SPILL_BLKID)
 863  863                                          match_offset = TRUE;
 864  864                                  /*
 865  865                                   * They might have to increase nlevels,
 866  866                                   * thus dirtying the new TLIBs.  Or the
 867  867                                   * might have to change the block size,
 868  868                                   * thus dirying the new lvl=0 blk=0.
 869  869                                   */
 870  870                                  if (blkid == 0)
 871  871                                          match_offset = TRUE;
 872  872                                  break;
 873  873                          case THT_FREE:
 874  874                                  /*
 875  875                                   * We will dirty all the level 1 blocks in
 876  876                                   * the free range and perhaps the first and
 877  877                                   * last level 0 block.
 878  878                                   */
 879  879                                  if (blkid >= beginblk && (blkid <= endblk ||
 880  880                                      txh->txh_arg2 == DMU_OBJECT_END))
 881  881                                          match_offset = TRUE;
 882  882                                  break;
 883  883                          case THT_SPILL:
 884  884                                  if (blkid == DMU_SPILL_BLKID)
 885  885                                          match_offset = TRUE;
 886  886                                  break;
 887  887                          case THT_BONUS:
 888  888                                  if (blkid == DMU_BONUS_BLKID)
 889  889                                          match_offset = TRUE;
 890  890                                  break;
 891  891                          case THT_ZAP:
 892  892                                  match_offset = TRUE;
 893  893                                  break;
 894  894                          case THT_NEWOBJECT:
 895  895                                  match_object = TRUE;
 896  896                                  break;
 897  897                          default:
 898  898                                  ASSERT(!"bad txh_type");
 899  899                          }
 900  900                  }
 901  901                  if (match_object && match_offset) {
 902  902                          DB_DNODE_EXIT(db);
 903  903                          return;
 904  904                  }
 905  905          }
 906  906          DB_DNODE_EXIT(db);
 907  907          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 908  908              (u_longlong_t)db->db.db_object, db->db_level,
 909  909              (u_longlong_t)db->db_blkid);
 910  910  }
 911  911  #endif
 912  912  
 913  913  static int
 914  914  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 915  915  {
 916  916          dmu_tx_hold_t *txh;
 917  917          spa_t *spa = tx->tx_pool->dp_spa;
 918  918          uint64_t memory, asize, fsize, usize;
 919  919          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 920  920  
 921  921          ASSERT0(tx->tx_txg);
 922  922  
 923  923          if (tx->tx_err)
 924  924                  return (tx->tx_err);
 925  925  
 926  926          if (spa_suspended(spa)) {
 927  927                  /*
 928  928                   * If the user has indicated a blocking failure mode
 929  929                   * then return ERESTART which will block in dmu_tx_wait().
 930  930                   * Otherwise, return EIO so that an error can get
 931  931                   * propagated back to the VOP calls.
 932  932                   *
 933  933                   * Note that we always honor the txg_how flag regardless
 934  934                   * of the failuremode setting.
 935  935                   */
 936  936                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 937  937                      txg_how != TXG_WAIT)
 938  938                          return (SET_ERROR(EIO));
 939  939  
 940  940                  return (SET_ERROR(ERESTART));
 941  941          }
 942  942  
 943  943          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 944  944          tx->tx_needassign_txh = NULL;
 945  945  
 946  946          /*
 947  947           * NB: No error returns are allowed after txg_hold_open, but
 948  948           * before processing the dnode holds, due to the
 949  949           * dmu_tx_unassign() logic.
 950  950           */
 951  951  
 952  952          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 953  953          for (txh = list_head(&tx->tx_holds); txh;
 954  954              txh = list_next(&tx->tx_holds, txh)) {
 955  955                  dnode_t *dn = txh->txh_dnode;
 956  956                  if (dn != NULL) {
 957  957                          mutex_enter(&dn->dn_mtx);
 958  958                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 959  959                                  mutex_exit(&dn->dn_mtx);
 960  960                                  tx->tx_needassign_txh = txh;
 961  961                                  return (SET_ERROR(ERESTART));
 962  962                          }
 963  963                          if (dn->dn_assigned_txg == 0)
 964  964                                  dn->dn_assigned_txg = tx->tx_txg;
 965  965                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 966  966                          (void) refcount_add(&dn->dn_tx_holds, tx);
 967  967                          mutex_exit(&dn->dn_mtx);
 968  968                  }
 969  969                  towrite += txh->txh_space_towrite;
 970  970                  tofree += txh->txh_space_tofree;
 971  971                  tooverwrite += txh->txh_space_tooverwrite;
 972  972                  tounref += txh->txh_space_tounref;
 973  973                  tohold += txh->txh_memory_tohold;
 974  974                  fudge += txh->txh_fudge;
 975  975          }
 976  976  
 977  977          /*
 978  978           * If a snapshot has been taken since we made our estimates,
 979  979           * assume that we won't be able to free or overwrite anything.
 980  980           */
 981  981          if (tx->tx_objset &&
 982  982              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 983  983              tx->tx_lastsnap_txg) {
 984  984                  towrite += tooverwrite;
 985  985                  tooverwrite = tofree = 0;
 986  986          }
 987  987  
 988  988          /* needed allocation: worst-case estimate of write space */
 989  989          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 990  990          /* freed space estimate: worst-case overwrite + free estimate */
 991  991          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 992  992          /* convert unrefd space to worst-case estimate */
 993  993          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 994  994          /* calculate memory footprint estimate */
 995  995          memory = towrite + tooverwrite + tohold;
 996  996  
 997  997  #ifdef ZFS_DEBUG
 998  998          /*
 999  999           * Add in 'tohold' to account for our dirty holds on this memory
1000 1000           * XXX - the "fudge" factor is to account for skipped blocks that
1001 1001           * we missed because dnode_next_offset() misses in-core-only blocks.
1002 1002           */
1003 1003          tx->tx_space_towrite = asize +
1004 1004              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1005 1005          tx->tx_space_tofree = tofree;
1006 1006          tx->tx_space_tooverwrite = tooverwrite;
1007 1007          tx->tx_space_tounref = tounref;
1008 1008  #endif
1009 1009  
1010 1010          if (tx->tx_dir && asize != 0) {
1011 1011                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1012 1012                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1013 1013                  if (err)
1014 1014                          return (err);
1015 1015          }
1016 1016  
1017 1017          return (0);
1018 1018  }
1019 1019  
1020 1020  static void
1021 1021  dmu_tx_unassign(dmu_tx_t *tx)
1022 1022  {
1023 1023          dmu_tx_hold_t *txh;
1024 1024  
1025 1025          if (tx->tx_txg == 0)
1026 1026                  return;
1027 1027  
1028 1028          txg_rele_to_quiesce(&tx->tx_txgh);
1029 1029  
1030 1030          /*
1031 1031           * Walk the transaction's hold list, removing the hold on the
1032 1032           * associated dnode, and notifying waiters if the refcount drops to 0.
1033 1033           */
1034 1034          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1035 1035              txh = list_next(&tx->tx_holds, txh)) {
1036 1036                  dnode_t *dn = txh->txh_dnode;
1037 1037  
1038 1038                  if (dn == NULL)
1039 1039                          continue;
1040 1040                  mutex_enter(&dn->dn_mtx);
1041 1041                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1042 1042  
1043 1043                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1044 1044                          dn->dn_assigned_txg = 0;
1045 1045                          cv_broadcast(&dn->dn_notxholds);
1046 1046                  }
1047 1047                  mutex_exit(&dn->dn_mtx);
1048 1048          }
1049 1049  
1050 1050          txg_rele_to_sync(&tx->tx_txgh);
1051 1051  
1052 1052          tx->tx_lasttried_txg = tx->tx_txg;
1053 1053          tx->tx_txg = 0;
1054 1054  }
1055 1055  
1056 1056  /*
1057 1057   * Assign tx to a transaction group.  txg_how can be one of:
1058 1058   *
1059 1059   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1060 1060   *      a new one.  This should be used when you're not holding locks.
1061 1061   *      It will only fail if we're truly out of space (or over quota).
1062 1062   *
1063 1063   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1064 1064   *      blocking, returns immediately with ERESTART.  This should be used
1065 1065   *      whenever you're holding locks.  On an ERESTART error, the caller
1066 1066   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1067 1067   */
1068 1068  int
1069 1069  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1070 1070  {
1071 1071          int err;
1072 1072  
1073 1073          ASSERT(tx->tx_txg == 0);
1074 1074          ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
1075 1075          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1076 1076  
1077 1077          /* If we might wait, we must not hold the config lock. */
1078 1078          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1079 1079  
1080 1080          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1081 1081                  dmu_tx_unassign(tx);
1082 1082  
1083 1083                  if (err != ERESTART || txg_how != TXG_WAIT)
1084 1084                          return (err);
1085 1085  
1086 1086                  dmu_tx_wait(tx);
1087 1087          }
1088 1088  
1089 1089          txg_rele_to_quiesce(&tx->tx_txgh);
1090 1090  
1091 1091          return (0);
1092 1092  }
1093 1093  
1094 1094  void
1095 1095  dmu_tx_wait(dmu_tx_t *tx)
1096 1096  {
1097 1097          spa_t *spa = tx->tx_pool->dp_spa;
1098 1098  
1099 1099          ASSERT(tx->tx_txg == 0);
1100 1100          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1101 1101  
1102 1102          /*
1103 1103           * It's possible that the pool has become active after this thread
1104 1104           * has tried to obtain a tx. If that's the case then his
1105 1105           * tx_lasttried_txg would not have been assigned.
1106 1106           */
1107 1107          if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1108 1108                  txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
1109 1109          } else if (tx->tx_needassign_txh) {
1110 1110                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1111 1111  
1112 1112                  mutex_enter(&dn->dn_mtx);
1113 1113                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1114 1114                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1115 1115                  mutex_exit(&dn->dn_mtx);
1116 1116                  tx->tx_needassign_txh = NULL;
1117 1117          } else {
1118 1118                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1119 1119          }
1120 1120  }
1121 1121  
1122 1122  void
1123 1123  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1124 1124  {
1125 1125  #ifdef ZFS_DEBUG
1126 1126          if (tx->tx_dir == NULL || delta == 0)
1127 1127                  return;
1128 1128  
1129 1129          if (delta > 0) {
1130 1130                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1131 1131                      tx->tx_space_towrite);
1132 1132                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1133 1133          } else {
1134 1134                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1135 1135          }
1136 1136  #endif
1137 1137  }
1138 1138  
1139 1139  void
1140 1140  dmu_tx_commit(dmu_tx_t *tx)
1141 1141  {
1142 1142          dmu_tx_hold_t *txh;
1143 1143  
1144 1144          ASSERT(tx->tx_txg != 0);
1145 1145  
1146 1146          /*
1147 1147           * Go through the transaction's hold list and remove holds on
1148 1148           * associated dnodes, notifying waiters if no holds remain.
1149 1149           */
1150 1150          while (txh = list_head(&tx->tx_holds)) {
1151 1151                  dnode_t *dn = txh->txh_dnode;
1152 1152  
1153 1153                  list_remove(&tx->tx_holds, txh);
1154 1154                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1155 1155                  if (dn == NULL)
1156 1156                          continue;
1157 1157                  mutex_enter(&dn->dn_mtx);
1158 1158                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1159 1159  
1160 1160                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1161 1161                          dn->dn_assigned_txg = 0;
1162 1162                          cv_broadcast(&dn->dn_notxholds);
1163 1163                  }
1164 1164                  mutex_exit(&dn->dn_mtx);
1165 1165                  dnode_rele(dn, tx);
1166 1166          }
1167 1167  
1168 1168          if (tx->tx_tempreserve_cookie)
1169 1169                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1170 1170  
1171 1171          if (!list_is_empty(&tx->tx_callbacks))
1172 1172                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1173 1173  
1174 1174          if (tx->tx_anyobj == FALSE)
1175 1175                  txg_rele_to_sync(&tx->tx_txgh);
1176 1176  
1177 1177          list_destroy(&tx->tx_callbacks);
1178 1178          list_destroy(&tx->tx_holds);
1179 1179  #ifdef ZFS_DEBUG
1180 1180          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1181 1181              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1182 1182              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1183 1183          refcount_destroy_many(&tx->tx_space_written,
1184 1184              refcount_count(&tx->tx_space_written));
1185 1185          refcount_destroy_many(&tx->tx_space_freed,
1186 1186              refcount_count(&tx->tx_space_freed));
1187 1187  #endif
1188 1188          kmem_free(tx, sizeof (dmu_tx_t));
1189 1189  }
1190 1190  
1191 1191  void
1192 1192  dmu_tx_abort(dmu_tx_t *tx)
1193 1193  {
1194 1194          dmu_tx_hold_t *txh;
1195 1195  
1196 1196          ASSERT(tx->tx_txg == 0);
1197 1197  
1198 1198          while (txh = list_head(&tx->tx_holds)) {
1199 1199                  dnode_t *dn = txh->txh_dnode;
1200 1200  
1201 1201                  list_remove(&tx->tx_holds, txh);
1202 1202                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1203 1203                  if (dn != NULL)
1204 1204                          dnode_rele(dn, tx);
1205 1205          }
1206 1206  
1207 1207          /*
1208 1208           * Call any registered callbacks with an error code.
1209 1209           */
1210 1210          if (!list_is_empty(&tx->tx_callbacks))
1211 1211                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1212 1212  
1213 1213          list_destroy(&tx->tx_callbacks);
1214 1214          list_destroy(&tx->tx_holds);
1215 1215  #ifdef ZFS_DEBUG
1216 1216          refcount_destroy_many(&tx->tx_space_written,
1217 1217              refcount_count(&tx->tx_space_written));
1218 1218          refcount_destroy_many(&tx->tx_space_freed,
1219 1219              refcount_count(&tx->tx_space_freed));
1220 1220  #endif
1221 1221          kmem_free(tx, sizeof (dmu_tx_t));
1222 1222  }
1223 1223  
1224 1224  uint64_t
1225 1225  dmu_tx_get_txg(dmu_tx_t *tx)
1226 1226  {
1227 1227          ASSERT(tx->tx_txg != 0);
1228 1228          return (tx->tx_txg);
1229 1229  }
1230 1230  
1231 1231  dsl_pool_t *
1232 1232  dmu_tx_pool(dmu_tx_t *tx)
1233 1233  {
1234 1234          ASSERT(tx->tx_pool != NULL);
1235 1235          return (tx->tx_pool);
1236 1236  }
1237 1237  
1238 1238  
1239 1239  void
1240 1240  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1241 1241  {
1242 1242          dmu_tx_callback_t *dcb;
1243 1243  
1244 1244          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1245 1245  
1246 1246          dcb->dcb_func = func;
1247 1247          dcb->dcb_data = data;
1248 1248  
1249 1249          list_insert_tail(&tx->tx_callbacks, dcb);
1250 1250  }
1251 1251  
1252 1252  /*
1253 1253   * Call all the commit callbacks on a list, with a given error code.
1254 1254   */
1255 1255  void
1256 1256  dmu_tx_do_callbacks(list_t *cb_list, int error)
1257 1257  {
1258 1258          dmu_tx_callback_t *dcb;
1259 1259  
1260 1260          while (dcb = list_head(cb_list)) {
1261 1261                  list_remove(cb_list, dcb);
1262 1262                  dcb->dcb_func(dcb->dcb_data, error);
1263 1263                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1264 1264          }
1265 1265  }
1266 1266  
/*
 * dmu_tx_hold_sa_create(): interface to hold a group of attributes; used
 * when creating new files.  attrsize is the total size of all attributes
 * to be added during object creation.
 *
 * To update or add a single attribute, use dmu_tx_hold_sa() instead.
 */
1275 1275  
1276 1276  /*
1277 1277   * hold necessary attribute name for attribute registration.
1278 1278   * should be a very rare case where this is needed.  If it does
1279 1279   * happen it would only happen on the first write to the file system.
1280 1280   */
1281 1281  static void
1282 1282  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1283 1283  {
1284 1284          int i;
1285 1285  
1286 1286          if (!sa->sa_need_attr_registration)
1287 1287                  return;
1288 1288  
1289 1289          for (i = 0; i != sa->sa_num_attrs; i++) {
1290 1290                  if (!sa->sa_attr_table[i].sa_registered) {
1291 1291                          if (sa->sa_reg_attr_obj)
1292 1292                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1293 1293                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1294 1294                          else
1295 1295                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1296 1296                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1297 1297                  }
1298 1298          }
1299 1299  }
1300 1300  
1301 1301  
1302 1302  void
1303 1303  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1304 1304  {
1305 1305          dnode_t *dn;
1306 1306          dmu_tx_hold_t *txh;
1307 1307  
1308 1308          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1309 1309              THT_SPILL, 0, 0);
1310 1310  
1311 1311          dn = txh->txh_dnode;
1312 1312  
1313 1313          if (dn == NULL)
1314 1314                  return;
1315 1315  
1316 1316          /* If blkptr doesn't exist then add space to towrite */
1317 1317          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1318 1318                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1319 1319          } else {
1320 1320                  blkptr_t *bp;
1321 1321  
1322 1322                  bp = &dn->dn_phys->dn_spill;
1323 1323                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1324 1324                      bp, bp->blk_birth))
1325 1325                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1326 1326                  else
1327 1327                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1328 1328                  if (!BP_IS_HOLE(bp))
1329 1329                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1330 1330          }
1331 1331  }
1332 1332  
1333 1333  void
1334 1334  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1335 1335  {
1336 1336          sa_os_t *sa = tx->tx_objset->os_sa;
1337 1337  
1338 1338          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1339 1339  
1340 1340          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1341 1341                  return;
1342 1342  
1343 1343          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1344 1344                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1345 1345          else {
1346 1346                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1347 1347                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1348 1348                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1349 1349                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1350 1350          }
1351 1351  
1352 1352          dmu_tx_sa_registration_hold(sa, tx);
1353 1353  
1354 1354          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1355 1355                  return;
1356 1356  
1357 1357          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1358 1358              THT_SPILL, 0, 0);
1359 1359  }
1360 1360  
1361 1361  /*
1362 1362   * Hold SA attribute
1363 1363   *
1364 1364   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1365 1365   *
1366 1366   * variable_size is the total size of all variable sized attributes
1367 1367   * passed to this function.  It is not the total size of all
1368 1368   * variable size attributes that *may* exist on this object.
1369 1369   */
1370 1370  void
1371 1371  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1372 1372  {
1373 1373          uint64_t object;
1374 1374          sa_os_t *sa = tx->tx_objset->os_sa;
1375 1375  
1376 1376          ASSERT(hdl != NULL);
1377 1377  
1378 1378          object = sa_handle_object(hdl);
1379 1379  
1380 1380          dmu_tx_hold_bonus(tx, object);
1381 1381  
1382 1382          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1383 1383                  return;
1384 1384  
1385 1385          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1386 1386              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1387 1387                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1388 1388                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1389 1389                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1390 1390                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1391 1391          }
1392 1392  
1393 1393          dmu_tx_sa_registration_hold(sa, tx);
1394 1394  
1395 1395          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1396 1396                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1397 1397  
1398 1398          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1399 1399                  ASSERT(tx->tx_txg == 0);
1400 1400                  dmu_tx_hold_spill(tx, object);
1401 1401          } else {
1402 1402                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1403 1403                  dnode_t *dn;
1404 1404  
1405 1405                  DB_DNODE_ENTER(db);
1406 1406                  dn = DB_DNODE(db);
1407 1407                  if (dn->dn_have_spill) {
1408 1408                          ASSERT(tx->tx_txg == 0);
1409 1409                          dmu_tx_hold_spill(tx, object);
1410 1410                  }
1411 1411                  DB_DNODE_EXIT(db);
1412 1412          }
1413 1413  }

↓ open down ↓

947 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX