illumos-3741 Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
   /*
    * Function-pointer type for a routine that is handed a transaction, a
    * dnode and two 64-bit arguments and returns nothing.
    * NOTE(review): no user of this typedef is visible in this part of the
    * file -- confirm it is still needed.
    */
   42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
   43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57  #ifdef ZFS_DEBUG
  58   58          refcount_create(&tx->tx_space_written);
  59   59          refcount_create(&tx->tx_space_freed);
  60   60  #endif
  61   61          return (tx);
  62   62  }
  63   63  
  64   64  dmu_tx_t *
  65   65  dmu_tx_create(objset_t *os)
  66   66  {
  67   67          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  68   68          tx->tx_objset = os;
  69   69          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  70   70          return (tx);
  71   71  }
  72   72  
  73   73  dmu_tx_t *
  74   74  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  75   75  {
  76   76          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  77   77  
  78   78          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  79   79          tx->tx_pool = dp;
  80   80          tx->tx_txg = txg;
  81   81          tx->tx_anyobj = TRUE;
  82   82  
  83   83          return (tx);
  84   84  }
  85   85  
  86   86  int
  87   87  dmu_tx_is_syncing(dmu_tx_t *tx)
  88   88  {
  89   89          return (tx->tx_anyobj);
  90   90  }
  91   91  
  92   92  int
  93   93  dmu_tx_private_ok(dmu_tx_t *tx)
  94   94  {
  95   95          return (tx->tx_anyobj);
  96   96  }
  97   97  
  98   98  static dmu_tx_hold_t *
  99   99  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 100  100      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 101  101  {
 102  102          dmu_tx_hold_t *txh;
 103  103          dnode_t *dn = NULL;
 104  104          int err;
 105  105  
 106  106          if (object != DMU_NEW_OBJECT) {
 107  107                  err = dnode_hold(os, object, tx, &dn);
 108  108                  if (err) {
 109  109                          tx->tx_err = err;
 110  110                          return (NULL);
 111  111                  }
 112  112  
 113  113                  if (err == 0 && tx->tx_txg != 0) {
 114  114                          mutex_enter(&dn->dn_mtx);
 115  115                          /*
 116  116                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 117  117                           * problem, but there's no way for it to happen (for
 118  118                           * now, at least).
 119  119                           */
 120  120                          ASSERT(dn->dn_assigned_txg == 0);
 121  121                          dn->dn_assigned_txg = tx->tx_txg;
 122  122                          (void) refcount_add(&dn->dn_tx_holds, tx);
 123  123                          mutex_exit(&dn->dn_mtx);
 124  124                  }
 125  125          }
 126  126  
 127  127          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 128  128          txh->txh_tx = tx;
 129  129          txh->txh_dnode = dn;
 130  130  #ifdef ZFS_DEBUG
 131  131          txh->txh_type = type;
 132  132          txh->txh_arg1 = arg1;
 133  133          txh->txh_arg2 = arg2;
 134  134  #endif
 135  135          list_insert_tail(&tx->tx_holds, txh);
 136  136  
 137  137          return (txh);
 138  138  }
 139  139  
 140  140  void
 141  141  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 142  142  {
 143  143          /*
 144  144           * If we're syncing, they can manipulate any object anyhow, and
 145  145           * the hold on the dnode_t can cause problems.
 146  146           */
 147  147          if (!dmu_tx_is_syncing(tx)) {
 148  148                  (void) dmu_tx_hold_object_impl(tx, os,
 149  149                      object, THT_NEWOBJECT, 0, 0);
 150  150          }
 151  151  }
 152  152  
 153  153  static int
 154  154  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 155  155  {
 156  156          int err;
 157  157          dmu_buf_impl_t *db;
 158  158  
 159  159          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 160  160          db = dbuf_hold_level(dn, level, blkid, FTAG);
 161  161          rw_exit(&dn->dn_struct_rwlock);
 162  162          if (db == NULL)
 163  163                  return (SET_ERROR(EIO));
 164  164          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 165  165          dbuf_rele(db, FTAG);
 166  166          return (err);
 167  167  }
 168  168  
 169  169  static void
 170  170  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 171  171      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 172  172  {
 173  173          objset_t *os = dn->dn_objset;
 174  174          dsl_dataset_t *ds = os->os_dsl_dataset;
 175  175          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 176  176          dmu_buf_impl_t *parent = NULL;
 177  177          blkptr_t *bp = NULL;
 178  178          uint64_t space;
 179  179  
 180  180          if (level >= dn->dn_nlevels || history[level] == blkid)
 181  181                  return;
 182  182  
 183  183          history[level] = blkid;
 184  184  
 185  185          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 186  186  
 187  187          if (db == NULL || db == dn->dn_dbuf) {
 188  188                  ASSERT(level != 0);
 189  189                  db = NULL;
 190  190          } else {
 191  191                  ASSERT(DB_DNODE(db) == dn);
 192  192                  ASSERT(db->db_level == level);
 193  193                  ASSERT(db->db.db_size == space);
 194  194                  ASSERT(db->db_blkid == blkid);
 195  195                  bp = db->db_blkptr;
 196  196                  parent = db->db_parent;
 197  197          }
 198  198  
 199  199          freeable = (bp && (freeable ||
 200  200              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 201  201  
 202  202          if (freeable)
 203  203                  txh->txh_space_tooverwrite += space;
 204  204          else
 205  205                  txh->txh_space_towrite += space;
 206  206          if (bp)
 207  207                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 208  208  
 209  209          dmu_tx_count_twig(txh, dn, parent, level + 1,
 210  210              blkid >> epbs, freeable, history);
 211  211  }
 212  212  
  213  213  /* ARGSUSED */
            /*
             * Estimate the space needed to write "len" bytes at offset "off" of
             * the dnode held by txh, and add it to the hold's counters:
             *
             *  - blocks that already exist are charged through
             *    dmu_tx_count_twig() (txh_space_tooverwrite / txh_space_towrite /
             *    txh_space_tounref);
             *  - the part of the range beyond the existing blocks, or the whole
             *    range when there is no dnode, is charged to txh_space_towrite
             *    together with the indirect blocks needed to map it;
             *  - txh_fudge gets an allowance for indirect blocks that may appear
             *    before the transaction is assigned.
             *
             * Along the way the first/last level-0 blocks and the level-1 blocks
             * are read to detect i/o errors.  Any error, or EFBIG when the
             * estimate exceeds 2 * DMU_MAX_ACCESS, is stored in
             * txh->txh_tx->tx_err.  A zero-length write is a no-op.
             */
  214  214  static void
  215  215  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
  216  216  {
  217  217          dnode_t *dn = txh->txh_dnode;
  218  218          uint64_t start, end, i;
  219  219          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
  220  220          int err = 0;
  221  221  
  222  222          if (len == 0)
  223  223                  return;
  224  224  
                    /*
                     * Start with the widest possible range of data and indirect
                     * block shifts; they are narrowed below when a dnode is known.
                     */
  225  225          min_bs = SPA_MINBLOCKSHIFT;
  226  226          max_bs = SPA_MAXBLOCKSHIFT;
  227  227          min_ibs = DN_MIN_INDBLKSHIFT;
  228  228          max_ibs = DN_MAX_INDBLKSHIFT;
  229  229  
  230  230          if (dn) {
  231  231                  uint64_t history[DN_MAX_LEVELS];
  232  232                  int nlvls = dn->dn_nlevels;
  233  233                  int delta;
  234  234  
  235  235                  /*
  236  236                   * For i/o error checking, read the first and last level-0
  237  237                   * blocks (if they are not aligned), and all the level-1 blocks.
  238  238                   */
  239  239                  if (dn->dn_maxblkid == 0) {
                                    /*
                                     * Single-block object: start/end are 0 when the
                                     * range begins/ends inside block 0, else 1.
                                     */
  240  240                          delta = dn->dn_datablksz;
  241  241                          start = (off < dn->dn_datablksz) ? 0 : 1;
  242  242                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
  243  243                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
  244  244                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
  245  245                                  if (err)
  246  246                                          goto out;
  247  247                                  delta -= off;
  248  248                          }
  249  249                  } else {
                                    /*
                                     * NOTE(review): every "goto out" in this branch
                                     * leaves without calling zio_wait() on the root
                                     * zio created here -- confirm that this cannot
                                     * leak the zio.
                                     */
  250  250                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
  251  251                              NULL, NULL, ZIO_FLAG_CANFAIL);
  252  252  
  253  253                          /* first level-0 block */
  254  254                          start = off >> dn->dn_datablkshift;
  255  255                          if (P2PHASE(off, dn->dn_datablksz) ||
  256  256                              len < dn->dn_datablksz) {
  257  257                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
  258  258                                  if (err)
  259  259                                          goto out;
  260  260                          }
  261  261  
  262  262                          /* last level-0 block */
  263  263                          end = (off+len-1) >> dn->dn_datablkshift;
  264  264                          if (end != start && end <= dn->dn_maxblkid &&
  265  265                              P2PHASE(off+len, dn->dn_datablksz)) {
  266  266                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
  267  267                                  if (err)
  268  268                                          goto out;
  269  269                          }
  270  270  
  271  271                          /* level-1 blocks */
  272  272                          if (nlvls > 1) {
  273  273                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
  274  274                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
  275  275                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
  276  276                                          if (err)
  277  277                                                  goto out;
  278  278                                  }
  279  279                          }
  280  280  
  281  281                          err = zio_wait(zio);
  282  282                          if (err)
  283  283                                  goto out;
  284  284                          delta = P2NPHASE(off, dn->dn_datablksz);
  285  285                  }
  286  286  
                            /* The indirect block size of an existing dnode is known. */
  287  287                  min_ibs = max_ibs = dn->dn_indblkshift;
  288  288                  if (dn->dn_maxblkid > 0) {
  289  289                          /*
  290  290                           * The blocksize can't change,
  291  291                           * so we can make a more precise estimate.
  292  292                           */
  293  293                          ASSERT(dn->dn_datablkshift != 0);
  294  294                          min_bs = max_bs = dn->dn_datablkshift;
  295  295                  }
  296  296  
  297  297                  /*
  298  298                   * If this write is not off the end of the file
  299  299                   * we need to account for overwrites/unref.
  300  300                   */
  301  301                  if (start <= dn->dn_maxblkid) {
                                    /* -1 marks "no block charged yet" at each level. */
  302  302                          for (int l = 0; l < DN_MAX_LEVELS; l++)
  303  303                                  history[l] = -1ULL;
  304  304                  }
                            /*
                             * Charge each existing level-0 block in the range, and its
                             * ancestors, advancing off/len past the blocks handled.
                             */
  305  305                  while (start <= dn->dn_maxblkid) {
  306  306                          dmu_buf_impl_t *db;
  307  307  
  308  308                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
  309  309                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
  310  310                          rw_exit(&dn->dn_struct_rwlock);
  311  311  
                                    /*
                                     * This path returns directly, so the EFBIG check
                                     * at "out" is not performed.
                                     */
  312  312                          if (err) {
  313  313                                  txh->txh_tx->tx_err = err;
  314  314                                  return;
  315  315                          }
  316  316  
  317  317                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
  318  318                              history);
  319  319                          dbuf_rele(db, FTAG);
  320  320                          if (++start > end) {
  321  321                                  /*
  322  322                                   * Account for new indirects appearing
  323  323                                   * before this IO gets assigned into a txg.
  324  324                                   */
  325  325                                  bits = 64 - min_bs;
  326  326                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
  327  327                                  for (bits -= epbs * (nlvls - 1);
  328  328                                      bits >= 0; bits -= epbs)
  329  329                                          txh->txh_fudge += 1ULL << max_ibs;
  330  330                                  goto out;
  331  331                          }
  332  332                          off += delta;
  333  333                          if (len >= delta)
  334  334                                  len -= delta;
  335  335                          delta = dn->dn_datablksz;
  336  336                  }
  337  337          }
  338  338  
                    /*
                     * Reached when there is no dnode, or when the range extends
                     * past dn_maxblkid: charge what is left as newly written data.
                     */
  339  339          /*
  340  340           * 'end' is the last thing we will access, not one past.
  341  341           * This way we won't overflow when accessing the last byte.
  342  342           */
  343  343          start = P2ALIGN(off, 1ULL << max_bs);
  344  344          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
  345  345          txh->txh_space_towrite += end - start + 1;
  346  346  
                    /* Convert the byte range into (smallest possible) block ids. */
  347  347          start >>= min_bs;
  348  348          end >>= min_bs;
  349  349  
  350  350          epbs = min_ibs - SPA_BLKPTRSHIFT;
  351  351  
  352  352          /*
  353  353           * The object contains at most 2^(64 - min_bs) blocks,
  354  354           * and each indirect level maps 2^epbs.
  355  355           */
  356  356          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
  357  357                  start >>= epbs;
  358  358                  end >>= epbs;
  359  359                  ASSERT3U(end, >=, start);
  360  360                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
  361  361                  if (start != 0) {
  362  362                          /*
  363  363                           * We also need a new blkid=0 indirect block
  364  364                           * to reference any existing file data.
  365  365                           */
  366  366                          txh->txh_space_towrite += 1ULL << max_ibs;
  367  367                  }
  368  368          }
  369  369  
  370  370  out:
                    /* Refuse holds whose write + overwrite estimate is too large. */
  371  371          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
  372  372              2 * DMU_MAX_ACCESS)
  373  373                  err = SET_ERROR(EFBIG);
  374  374  
  375  375          if (err)
  376  376                  txh->txh_tx->tx_err = err;
  377  377  }
 378  378  
 379  379  static void
 380  380  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 381  381  {
 382  382          dnode_t *dn = txh->txh_dnode;
 383  383          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 384  384          uint64_t space = mdn->dn_datablksz +
 385  385              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 386  386  
 387  387          if (dn && dn->dn_dbuf->db_blkptr &&
 388  388              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 389  389              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 390  390                  txh->txh_space_tooverwrite += space;
 391  391                  txh->txh_space_tounref += space;
 392  392          } else {
 393  393                  txh->txh_space_towrite += space;
 394  394                  if (dn && dn->dn_dbuf->db_blkptr)
 395  395                          txh->txh_space_tounref += space;
 396  396          }
 397  397  }
 398  398  
 399  399  void
 400  400  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 401  401  {
 402  402          dmu_tx_hold_t *txh;
 403  403  
 404  404          ASSERT(tx->tx_txg == 0);
 405  405          ASSERT(len < DMU_MAX_ACCESS);
 406  406          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 407  407  
 408  408          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 409  409              object, THT_WRITE, off, len);
 410  410          if (txh == NULL)
 411  411                  return;
 412  412  
 413  413          dmu_tx_count_write(txh, off, len);
 414  414          dmu_tx_count_dnode(txh);
 415  415  }
 416  416  
 417  417  static void
 418  418  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 419  419  {
 420  420          uint64_t blkid, nblks, lastblk;
 421  421          uint64_t space = 0, unref = 0, skipped = 0;
 422  422          dnode_t *dn = txh->txh_dnode;
 423  423          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 424  424          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 425  425          int epbs;
 426  426          uint64_t l0span = 0, nl1blks = 0;
 427  427  
 428  428          if (dn->dn_nlevels == 0)
 429  429                  return;
 430  430  
 431  431          /*
 432  432           * The struct_rwlock protects us against dn_nlevels
 433  433           * changing, in case (against all odds) we manage to dirty &
 434  434           * sync out the changes after we check for being dirty.
 435  435           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 436  436           */
 437  437          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 438  438          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 439  439          if (dn->dn_maxblkid == 0) {
 440  440                  if (off == 0 && len >= dn->dn_datablksz) {
 441  441                          blkid = 0;
 442  442                          nblks = 1;
 443  443                  } else {
 444  444                          rw_exit(&dn->dn_struct_rwlock);
 445  445                          return;
 446  446                  }
 447  447          } else {
 448  448                  blkid = off >> dn->dn_datablkshift;
 449  449                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 450  450  
 451  451                  if (blkid >= dn->dn_maxblkid) {
 452  452                          rw_exit(&dn->dn_struct_rwlock);
 453  453                          return;
 454  454                  }
 455  455                  if (blkid + nblks > dn->dn_maxblkid)
 456  456                          nblks = dn->dn_maxblkid - blkid;
 457  457  
 458  458          }
 459  459          l0span = nblks;    /* save for later use to calc level > 1 overhead */
 460  460          if (dn->dn_nlevels == 1) {
 461  461                  int i;
 462  462                  for (i = 0; i < nblks; i++) {
 463  463                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 464  464                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 465  465                          bp += blkid + i;
 466  466                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 467  467                                  dprintf_bp(bp, "can free old%s", "");
 468  468                                  space += bp_get_dsize(spa, bp);
 469  469                          }
 470  470                          unref += BP_GET_ASIZE(bp);
 471  471                  }
 472  472                  nl1blks = 1;
 473  473                  nblks = 0;
 474  474          }
 475  475  
 476  476          lastblk = blkid + nblks - 1;
 477  477          while (nblks) {
 478  478                  dmu_buf_impl_t *dbuf;
 479  479                  uint64_t ibyte, new_blkid;
 480  480                  int epb = 1 << epbs;
 481  481                  int err, i, blkoff, tochk;
 482  482                  blkptr_t *bp;
 483  483  
 484  484                  ibyte = blkid << dn->dn_datablkshift;
 485  485                  err = dnode_next_offset(dn,
 486  486                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 487  487                  new_blkid = ibyte >> dn->dn_datablkshift;
 488  488                  if (err == ESRCH) {
 489  489                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 490  490                          break;
 491  491                  }
 492  492                  if (err) {
 493  493                          txh->txh_tx->tx_err = err;
 494  494                          break;
 495  495                  }
 496  496                  if (new_blkid > lastblk) {
 497  497                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 498  498                          break;
 499  499                  }
 500  500  
 501  501                  if (new_blkid > blkid) {
 502  502                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 503  503                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 504  504                          nblks -= new_blkid - blkid;
 505  505                          blkid = new_blkid;
 506  506                  }
 507  507                  blkoff = P2PHASE(blkid, epb);
 508  508                  tochk = MIN(epb - blkoff, nblks);
 509  509  
 510  510                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 511  511                  if (err) {
 512  512                          txh->txh_tx->tx_err = err;
 513  513                          break;
 514  514                  }
 515  515  
 516  516                  txh->txh_memory_tohold += dbuf->db.db_size;
 517  517  
 518  518                  /*
 519  519                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 520  520                   * memory_tohold is an over-estimation (especially the >L1
 521  521                   * indirect blocks), so it could fail.  Callers should have
 522  522                   * already verified that they will not be holding too much
 523  523                   * memory.
 524  524                   */
 525  525  
 526  526                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 527  527                  if (err != 0) {
 528  528                          txh->txh_tx->tx_err = err;
 529  529                          dbuf_rele(dbuf, FTAG);
 530  530                          break;
 531  531                  }
 532  532  
 533  533                  bp = dbuf->db.db_data;
 534  534                  bp += blkoff;
 535  535  
 536  536                  for (i = 0; i < tochk; i++) {
 537  537                          if (dsl_dataset_block_freeable(ds, &bp[i],
 538  538                              bp[i].blk_birth)) {
 539  539                                  dprintf_bp(&bp[i], "can free old%s", "");
 540  540                                  space += bp_get_dsize(spa, &bp[i]);
 541  541                          }
 542  542                          unref += BP_GET_ASIZE(bp);
 543  543                  }
 544  544                  dbuf_rele(dbuf, FTAG);
 545  545  
 546  546                  ++nl1blks;
 547  547                  blkid += tochk;
 548  548                  nblks -= tochk;
 549  549          }
 550  550          rw_exit(&dn->dn_struct_rwlock);
 551  551  
 552  552          /*
 553  553           * Add in memory requirements of higher-level indirects.
 554  554           * This assumes a worst-possible scenario for dn_nlevels and a
 555  555           * worst-possible distribution of l1-blocks over the region to free.
 556  556           */
 557  557          {
 558  558                  uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 559  559                  int level = 2;
 560  560                  /*
 561  561                   * Here we don't use DN_MAX_LEVEL, but calculate it with the
 562  562                   * given datablkshift and indblkshift. This makes the
 563  563                   * difference between 19 and 8 on large files.
 564  564                   */
 565  565                  int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 566  566                      (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 567  567  
 568  568                  while (level++ < maxlevel) {
 569  569                          txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 570  570                              << dn->dn_indblkshift;
 571  571                          blkcnt = 1 + (blkcnt >> epbs);
 572  572                  }
 573  573          }
 574  574  
 575  575          /* account for new level 1 indirect blocks that might show up */
 576  576          if (skipped > 0) {
 577  577                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 578  578                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 579  579                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 580  580          }
 581  581          txh->txh_space_tofree += space;
 582  582          txh->txh_space_tounref += unref;
 583  583  }
 584  584  
 585  585  void
 586  586  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 587  587  {
 588  588          dmu_tx_hold_t *txh;
 589  589          dnode_t *dn;
 590  590          uint64_t start, end, i;
 591  591          int err, shift;
 592  592          zio_t *zio;
 593  593  
 594  594          ASSERT(tx->tx_txg == 0);
 595  595  
 596  596          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 597  597              object, THT_FREE, off, len);
 598  598          if (txh == NULL)
 599  599                  return;
 600  600          dn = txh->txh_dnode;
 601  601  
 602  602          /* first block */
 603  603          if (off != 0)
 604  604                  dmu_tx_count_write(txh, off, 1);
 605  605          /* last block */
 606  606          if (len != DMU_OBJECT_END)
 607  607                  dmu_tx_count_write(txh, off+len, 1);
 608  608  
 609  609          dmu_tx_count_dnode(txh);
 610  610  
 611  611          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 612  612                  return;
 613  613          if (len == DMU_OBJECT_END)
 614  614                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 615  615  
 616  616          /*
 617  617           * For i/o error checking, read the first and last level-0
 618  618           * blocks, and all the level-1 blocks.  The above count_write's
 619  619           * have already taken care of the level-0 blocks.
 620  620           */
 621  621          if (dn->dn_nlevels > 1) {
 622  622                  shift = dn->dn_datablkshift + dn->dn_indblkshift -
 623  623                      SPA_BLKPTRSHIFT;
 624  624                  start = off >> shift;
 625  625                  end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
 626  626  
 627  627                  zio = zio_root(tx->tx_pool->dp_spa,
 628  628                      NULL, NULL, ZIO_FLAG_CANFAIL);
 629  629                  for (i = start; i <= end; i++) {
 630  630                          uint64_t ibyte = i << shift;
 631  631                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 632  632                          i = ibyte >> shift;
 633  633                          if (err == ESRCH)
 634  634                                  break;
 635  635                          if (err) {
 636  636                                  tx->tx_err = err;
 637  637                                  return;
 638  638                          }
 639  639  
 640  640                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 641  641                          if (err) {
 642  642                                  tx->tx_err = err;
 643  643                                  return;
 644  644                          }
 645  645                  }
 646  646                  err = zio_wait(zio);
 647  647                  if (err) {
 648  648                          tx->tx_err = err;
 649  649                          return;
 650  650                  }
 651  651          }
 652  652  
 653  653          dmu_tx_count_free(txh, off, len);
 654  654  }
 655  655  
           /*
            * Register a THT_ZAP hold on 'object' and accumulate the space
            * estimates for modifying it.  The tx must not have been assigned yet
            * (tx_txg == 0).  'add' and 'name' are handed to
            * dmu_tx_hold_object_impl() and to zap_count_write(); when 'name' is
            * non-NULL and the object has more than one block it is also looked up
            * so that an EIO is noticed here.  Nothing is returned: errors are
            * recorded in tx->tx_err.
            */
 656  656  void
 657  657  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 658  658  {
 659  659          dmu_tx_hold_t *txh;
 660  660          dnode_t *dn;
 661  661          uint64_t nblocks;
 662  662          int epbs, err;
 663  663  
 664  664          ASSERT(tx->tx_txg == 0);
 665  665  
 666  666          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 667  667              object, THT_ZAP, add, (uintptr_t)name);
 668  668          if (txh == NULL)
 669  669                  return;
 670  670          dn = txh->txh_dnode;
 671  671  
 672  672          dmu_tx_count_dnode(txh);
 673  673  
 674  674          if (dn == NULL) {
 675  675                  /*
 676  676                   * We will be able to fit a new object's entries into one leaf
 677  677                   * block.  So there will be at most 2 blocks total,
 678  678                   * including the header block.
 679  679                   */
 680  680                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 681  681                  return;
 682  682          }
 683  683  
 684  684          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 685  685  
 686  686          if (dn->dn_maxblkid == 0 && !add) {
 687  687                  blkptr_t *bp;
 688  688  
 689  689                  /*
 690  690                   * If there is only one block (i.e. this is a micro-zap)
 691  691                   * and we are not adding anything, the accounting is simple.
 692  692                   */
 693  693                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 694  694                  if (err) {
 695  695                          tx->tx_err = err;
 696  696                          return;
 697  697                  }
 698  698  
 699  699                  /*
 700  700                   * Use max block size here, since we don't know how much
 701  701                   * the size will change between now and the dbuf dirty call.
 702  702                   */
 703  703                  bp = &dn->dn_phys->dn_blkptr[0];
 704  704                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 705  705                      bp, bp->blk_birth))
 706  706                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 707  707                  else
 708  708                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 709  709                  if (!BP_IS_HOLE(bp))
 710  710                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 711  711                  return;
 712  712          }
 713  713  
 714  714          if (dn->dn_maxblkid > 0 && name) {
 715  715                  /*
 716  716                   * access the name in this fat-zap so that we'll check
 717  717                   * for i/o errors to the leaf blocks, etc.
 718  718                   */
 719  719                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 720  720                      8, 0, NULL);
                           /* Only EIO aborts the hold; other lookup results are ignored. */
 721  721                  if (err == EIO) {
 722  722                          tx->tx_err = err;
 723  723                          return;
 724  724                  }
 725  725          }
 726  726  
                   /*
                    * NOTE(review): the zap_count_write() result is stored in err but
                    * is not examined before the function returns.
                    */
 727  727          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 728  728              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 729  729  
 730  730          /*
 731  731           * If the modified blocks are scattered to the four winds,
 732  732           * we'll have to modify an indirect twig for each.
 733  733           */
 734  734          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
                   /*
                    * One iteration per shift of dn_maxblkid by epbs; each charges three
                    * indirect-block sizes to towrite when the dataset has a previous
                    * snapshot object, otherwise to tooverwrite.
                    */
 735  735          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 736  736                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 737  737                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 738  738                  else
 739  739                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 740  740  }
 741  741  
           /*
            * Register a THT_BONUS hold on 'object' for a tx that has not yet been
            * assigned.  If dmu_tx_hold_object_impl() yields a hold,
            * dmu_tx_count_dnode() is applied to it; otherwise nothing more is done.
            */
 742  742  void
 743  743  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 744  744  {
 745  745          dmu_tx_hold_t *txh;
 746  746  
 747  747          ASSERT(tx->tx_txg == 0);
 748  748  
 749  749          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 750  750              object, THT_BONUS, 0, 0);
 751  751          if (txh)
 752  752                  dmu_tx_count_dnode(txh);
 753  753  }
 754  754  
           /*
            * Register a THT_SPACE hold against DMU_NEW_OBJECT and add 'space' to
            * that hold's txh_space_towrite.  The tx must not be assigned yet.
            *
            * NOTE(review): txh is dereferenced without the NULL check that
            * dmu_tx_hold_zap() and dmu_tx_hold_bonus() perform -- confirm that
            * dmu_tx_hold_object_impl() cannot return NULL for DMU_NEW_OBJECT.
            */
 755  755  void
 756  756  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 757  757  {
 758  758          dmu_tx_hold_t *txh;
 759  759          ASSERT(tx->tx_txg == 0);
 760  760  
 761  761          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 762  762              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 763  763  
 764  764          txh->txh_space_towrite += space;
 765  765  }
 766  766  
           /*
            * Return the number of entries on tx->tx_holds whose dnode has object
            * number 'object'.  Holds without a dnode are skipped.  The tx must
            * already be assigned (tx_txg != 0).
            */
 767  767  int
 768  768  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 769  769  {
 770  770          dmu_tx_hold_t *txh;
 771  771          int holds = 0;
 772  772  
 773  773          /*
 774  774           * By asserting that the tx is assigned, we're counting the
 775  775           * number of dn_tx_holds, which is the same as the number of
 776  776           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 777  777           * dn_tx_holds could be 0.
 778  778           */
 779  779          ASSERT(tx->tx_txg != 0);
 780  780  
 781  781          /* if (tx->tx_anyobj == TRUE) */
 782  782                  /* return (0); */
 783  783  
 784  784          for (txh = list_head(&tx->tx_holds); txh;
 785  785              txh = list_next(&tx->tx_holds, txh)) {
 786  786                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 787  787                          holds++;
 788  788          }
 789  789  
 790  790          return (holds);
 791  791  }
 792  792  
 793  793  #ifdef ZFS_DEBUG
           /*
            * Debug-only check that dbuf 'db' is covered by a hold in 'tx' before it
            * is dirtied.  Walks tx->tx_holds and returns once both a matching object
            * and a matching offset have been seen.  Returns without checking when
            * tx_anyobj is set or when db belongs to the meta dnode.  If the walk
            * finishes without a match the function panics.
            */
 794  794  void
 795  795  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 796  796  {
 797  797          dmu_tx_hold_t *txh;
 798  798          int match_object = FALSE, match_offset = FALSE;
 799  799          dnode_t *dn;
 800  800  
 801  801          DB_DNODE_ENTER(db);
 802  802          dn = DB_DNODE(db);
 803  803          ASSERT(tx->tx_txg != 0);
 804  804          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 805  805          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 806  806  
 807  807          if (tx->tx_anyobj) {
 808  808                  DB_DNODE_EXIT(db);
 809  809                  return;
 810  810          }
 811  811  
 812  812          /* XXX No checking on the meta dnode for now */
 813  813          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 814  814                  DB_DNODE_EXIT(db);
 815  815                  return;
 816  816          }
 817  817  
 818  818          for (txh = list_head(&tx->tx_holds); txh;
 819  819              txh = list_next(&tx->tx_holds, txh)) {
 820  820                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 821  821                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 822  822                          match_object = TRUE;
 823  823                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
                                   /*
                                    * Convert the hold's range to block ids at db's
                                    * level, treating txh_arg1 as a starting offset and
                                    * txh_arg2 as a length.  A shift of 64 or more is
                                    * not performed; both ends become 0 instead.
                                    */
 824  824                          int datablkshift = dn->dn_datablkshift ?
 825  825                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 826  826                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 827  827                          int shift = datablkshift + epbs * db->db_level;
 828  828                          uint64_t beginblk = shift >= 64 ? 0 :
 829  829                              (txh->txh_arg1 >> shift);
 830  830                          uint64_t endblk = shift >= 64 ? 0 :
 831  831                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 832  832                          uint64_t blkid = db->db_blkid;
 833  833  
 834  834                          /* XXX txh_arg2 better not be zero... */
 835  835  
 836  836                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 837  837                              txh->txh_type, beginblk, endblk);
 838  838  
 839  839                          switch (txh->txh_type) {
 840  840                          case THT_WRITE:
 841  841                                  if (blkid >= beginblk && blkid <= endblk)
 842  842                                          match_offset = TRUE;
 843  843                                  /*
 844  844                                   * We will let this hold work for the bonus
 845  845                                   * or spill buffer so that we don't need to
 846  846                                   * hold it when creating a new object.
 847  847                                   */
 848  848                                  if (blkid == DMU_BONUS_BLKID ||
 849  849                                      blkid == DMU_SPILL_BLKID)
 850  850                                          match_offset = TRUE;
 851  851                                  /*
 852  852                                   * They might have to increase nlevels,
 853  853                                   * thus dirtying the new TLIBs.  Or they
 854  854                                   * might have to change the block size,
 855  855                                   * thus dirtying the new lvl=0 blk=0.
 856  856                                   */
 857  857                                  if (blkid == 0)
 858  858                                          match_offset = TRUE;
 859  859                                  break;
 860  860                          case THT_FREE:
 861  861                                  /*
 862  862                                   * We will dirty all the level 1 blocks in
 863  863                                   * the free range and perhaps the first and
 864  864                                   * last level 0 block.
 865  865                                   */
 866  866                                  if (blkid >= beginblk && (blkid <= endblk ||
 867  867                                      txh->txh_arg2 == DMU_OBJECT_END))
 868  868                                          match_offset = TRUE;
 869  869                                  break;
 870  870                          case THT_SPILL:
 871  871                                  if (blkid == DMU_SPILL_BLKID)
 872  872                                          match_offset = TRUE;
 873  873                                  break;
 874  874                          case THT_BONUS:
 875  875                                  if (blkid == DMU_BONUS_BLKID)
 876  876                                          match_offset = TRUE;
 877  877                                  break;
 878  878                          case THT_ZAP:
                                           /* A ZAP hold matches any block id. */
 879  879                                  match_offset = TRUE;
 880  880                                  break;
 881  881                          case THT_NEWOBJECT:
 882  882                                  match_object = TRUE;
 883  883                                  break;
 884  884                          default:
 885  885                                  ASSERT(!"bad txh_type");
 886  886                          }
 887  887                  }
 888  888                  if (match_object && match_offset) {
 889  889                          DB_DNODE_EXIT(db);
 890  890                          return;
 891  891                  }
 892  892          }
 893  893          DB_DNODE_EXIT(db);
 894  894          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 895  895              (u_longlong_t)db->db.db_object, db->db_level,
 896  896              (u_longlong_t)db->db_blkid);
 897  897  }
 898  898  #endif
 899  899  
           /*
            * Make one attempt to assign 'tx' to the currently open txg.
            *
            * Returns:
            *   - tx->tx_err, if an earlier hold recorded an error;
            *   - EIO or ERESTART if the pool is suspended (see below);
            *   - ERESTART, with tx_needassign_txh pointing at the blocking hold, if
            *     a held dnode is still assigned to the previous txg;
            *   - the error from dsl_dir_tempreserve_space(), if it fails;
            *   - 0 on success.
            *
            * Once txg_hold_open() has run, tx->tx_txg stays set even on the error
            * returns; dmu_tx_assign() calls dmu_tx_unassign() after a failure.
            */
 900  900  static int
 901  901  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 902  902  {
 903  903          dmu_tx_hold_t *txh;
 904  904          spa_t *spa = tx->tx_pool->dp_spa;
 905  905          uint64_t memory, asize, fsize, usize;
 906  906          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 907  907  
 908  908          ASSERT0(tx->tx_txg);
 909  909  
 910  910          if (tx->tx_err)
 911  911                  return (tx->tx_err);
 912  912  
 913  913          if (spa_suspended(spa)) {
 914  914                  /*
 915  915                   * If the user has indicated a blocking failure mode
 916  916                   * then return ERESTART which will block in dmu_tx_wait().
 917  917                   * Otherwise, return EIO so that an error can get
 918  918                   * propagated back to the VOP calls.
 919  919                   *
 920  920                   * Note that we always honor the txg_how flag regardless
 921  921                   * of the failuremode setting.
 922  922                   */
 923  923                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 924  924                      txg_how != TXG_WAIT)
 925  925                          return (SET_ERROR(EIO));
 926  926  
 927  927                  return (SET_ERROR(ERESTART));
 928  928          }
 929  929  
 930  930          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 931  931          tx->tx_needassign_txh = NULL;
 932  932  
 933  933          /*
 934  934           * NB: No error returns are allowed after txg_hold_open, but
 935  935           * before processing the dnode holds, due to the
 936  936           * dmu_tx_unassign() logic.
 937  937           */
 938  938  
 939  939          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 940  940          for (txh = list_head(&tx->tx_holds); txh;
 941  941              txh = list_next(&tx->tx_holds, txh)) {
 942  942                  dnode_t *dn = txh->txh_dnode;
 943  943                  if (dn != NULL) {
 944  944                          mutex_enter(&dn->dn_mtx);
                                   /*
                                    * The dnode is still assigned to the previous txg:
                                    * remember which hold blocked us and bail out.
                                    */
 945  945                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 946  946                                  mutex_exit(&dn->dn_mtx);
 947  947                                  tx->tx_needassign_txh = txh;
 948  948                                  return (SET_ERROR(ERESTART));
 949  949                          }
 950  950                          if (dn->dn_assigned_txg == 0)
 951  951                                  dn->dn_assigned_txg = tx->tx_txg;
 952  952                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 953  953                          (void) refcount_add(&dn->dn_tx_holds, tx);
 954  954                          mutex_exit(&dn->dn_mtx);
 955  955                  }
                           /* Sum the per-hold estimates into the per-tx totals. */
 956  956                  towrite += txh->txh_space_towrite;
 957  957                  tofree += txh->txh_space_tofree;
 958  958                  tooverwrite += txh->txh_space_tooverwrite;
 959  959                  tounref += txh->txh_space_tounref;
 960  960                  tohold += txh->txh_memory_tohold;
 961  961                  fudge += txh->txh_fudge;
 962  962          }
 963  963  
 964  964          /*
 965  965           * If a snapshot has been taken since we made our estimates,
 966  966           * assume that we won't be able to free or overwrite anything.
 967  967           */
 968  968          if (tx->tx_objset &&
 969  969              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 970  970              tx->tx_lastsnap_txg) {
 971  971                  towrite += tooverwrite;
 972  972                  tooverwrite = tofree = 0;
 973  973          }
 974  974  
 975  975          /* needed allocation: worst-case estimate of write space */
 976  976          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 977  977          /* freed space estimate: worst-case overwrite + free estimate */
 978  978          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 979  979          /* convert unrefd space to worst-case estimate */
 980  980          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 981  981          /* calculate memory footprint estimate */
 982  982          memory = towrite + tooverwrite + tohold;
 983  983  
 984  984  #ifdef ZFS_DEBUG
 985  985          /*
 986  986           * Add in 'tohold' to account for our dirty holds on this memory
 987  987           * XXX - the "fudge" factor is to account for skipped blocks that
 988  988           * we missed because dnode_next_offset() misses in-core-only blocks.
 989  989           */
 990  990          tx->tx_space_towrite = asize +
 991  991              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
 992  992          tx->tx_space_tofree = tofree;
 993  993          tx->tx_space_tooverwrite = tooverwrite;
 994  994          tx->tx_space_tounref = tounref;
 995  995  #endif
 996  996  
                   /* The reservation is skipped when there is no tx_dir or asize is 0. */
 997  997          if (tx->tx_dir && asize != 0) {
 998  998                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
 999  999                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1000 1000                  if (err)
1001 1001                          return (err);
1002 1002          }
1003 1003  
1004 1004          return (0);
1005 1005  }
1006 1006

↓ open down ↓

1006 lines elided

↑ open up ↑

1007 1007  static void
1008 1008  dmu_tx_unassign(dmu_tx_t *tx)
1009 1009  {
                   /*
                    * Back out of the txg taken by dmu_tx_try_assign(); dmu_tx_assign()
                    * calls this after a failed attempt.  Does nothing if the tx has
                    * no txg.  Otherwise it drops the dnode tx holds, releases the txg
                    * handle, saves tx_txg in tx_lasttried_txg and clears tx_txg.
                    */
1010 1010          dmu_tx_hold_t *txh;
1011 1011  
1012 1012          if (tx->tx_txg == 0)
1013 1013                  return;
1014 1014  
1015 1015          txg_rele_to_quiesce(&tx->tx_txgh);
1016 1016  
     1017 +        /*
     1018 +         * Walk the transaction's hold list, removing the hold on the
     1019 +         * associated dnode, and notifying waiters if the refcount drops to 0.
                    * The walk stops at tx_needassign_txh, the hold on which
                    * dmu_tx_try_assign() gave up; when that is NULL the walk
                    * presumably covers the whole list.
     1020 +         */
1017 1021          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1018 1022              txh = list_next(&tx->tx_holds, txh)) {
1019 1023                  dnode_t *dn = txh->txh_dnode;
1020 1024  
1021 1025                  if (dn == NULL)
1022 1026                          continue;
1023 1027                  mutex_enter(&dn->dn_mtx);
1024 1028                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1025 1029  
1026 1030                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {

1027 1031                          dn->dn_assigned_txg = 0;
1028 1032                          cv_broadcast(&dn->dn_notxholds);
1029 1033                  }
1030 1034                  mutex_exit(&dn->dn_mtx);
1031 1035          }
1032 1036  
1033 1037          txg_rele_to_sync(&tx->tx_txgh);
1034 1038  
1035 1039          tx->tx_lasttried_txg = tx->tx_txg;
1036 1040          tx->tx_txg = 0;
1037 1041  }
1038 1042  
1039 1043  /*
1040 1044   * Assign tx to a transaction group.  txg_how can be one of:
1041 1045   *
1042 1046   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1043 1047   *      a new one.  This should be used when you're not holding locks.
1044 1048   *      It will only fail if we're truly out of space (or over quota).
1045 1049   *
1046 1050   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1047 1051   *      blocking, returns immediately with ERESTART.  This should be used
1048 1052   *      whenever you're holding locks.  On an ERESTART error, the caller
1049 1053   *      should drop locks, do a dmu_tx_wait(tx), and try again.
            *
            * Returns 0 once dmu_tx_try_assign() succeeds.  Otherwise returns that
            * function's error; this happens for any error other than ERESTART,
            * and for ERESTART too when txg_how is not TXG_WAIT.  The tx is
            * unassigned before an error is returned.
1050 1054   */
1051 1055  int
1052 1056  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1053 1057  {
1054 1058          int err;
1055 1059  
1056 1060          ASSERT(tx->tx_txg == 0);
1057 1061          ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
1058 1062          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1059 1063  
1060 1064          /* If we might wait, we must not hold the config lock. */
1061 1065          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1062 1066  
                   /* Retry until assignment succeeds or an error must be returned. */
1063 1067          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1064 1068                  dmu_tx_unassign(tx);
1065 1069  
1066 1070                  if (err != ERESTART || txg_how != TXG_WAIT)
1067 1071                          return (err);
1068 1072  
1069 1073                  dmu_tx_wait(tx);
1070 1074          }
1071 1075  
1072 1076          txg_rele_to_quiesce(&tx->tx_txgh);
1073 1077  
1074 1078          return (0);
1075 1079  }
1076 1080  
           /*
            * Block until it is worth retrying dmu_tx_assign() on an unassigned tx.
            * Three cases are handled:
            *   - the pool is suspended, or no assignment has been tried yet: wait
            *     for the txg after the last synced one to sync;
            *   - a specific hold blocked the last attempt (tx_needassign_txh): wait
            *     on that dnode's dn_notxholds until it is no longer assigned to the
            *     txg before the one last tried;
            *   - otherwise: wait for the txg after the one last tried to open.
            * The config lock must not be held.
            */
1077 1081  void
1078 1082  dmu_tx_wait(dmu_tx_t *tx)
1079 1083  {
1080 1084          spa_t *spa = tx->tx_pool->dp_spa;
1081 1085  
1082 1086          ASSERT(tx->tx_txg == 0);
1083 1087          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1084 1088  
1085 1089          /*
1086 1090           * It's possible that the pool has become active after this thread
1087 1091           * has tried to obtain a tx. If that's the case then its
1088 1092           * tx_lasttried_txg would not have been assigned.
1089 1093           */
1090 1094          if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1091 1095                  txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
1092 1096          } else if (tx->tx_needassign_txh) {
1093 1097                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1094 1098  
1095 1099                  mutex_enter(&dn->dn_mtx);
1096 1100                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1097 1101                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1098 1102                  mutex_exit(&dn->dn_mtx);
1099 1103                  tx->tx_needassign_txh = NULL;
1100 1104          } else {
1101 1105                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1102 1106          }
1103 1107  }
1104 1108  
           /*
            * Debug-only space accounting; the body is compiled only under
            * ZFS_DEBUG.  A positive 'delta' is added to tx_space_written after
            * asserting that the total stays within tx_space_towrite.  A negative
            * 'delta' is negated and added to tx_space_freed.  Nothing is done when
            * the tx has no tx_dir or 'delta' is zero.
            */
1105 1109  void
1106 1110  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1107 1111  {
1108 1112  #ifdef ZFS_DEBUG
1109 1113          if (tx->tx_dir == NULL || delta == 0)
1110 1114                  return;
1111 1115  
1112 1116          if (delta > 0) {
1113 1117                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1114 1118                      tx->tx_space_towrite);
1115 1119                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1116 1120          } else {
1117 1121                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1118 1122          }

↓ open down ↓

92 lines elided

↑ open up ↑

1119 1123  #endif
1120 1124  }
1121 1125  
           /*
            * Finish an assigned tx and free it.  Every hold is removed from the
            * list and freed, and the tx hold and dnode hold on its dnode are
            * dropped.  Any temporary reservation is cleared, pending callbacks are
            * handed to the txg, and the txg handle is released unless tx_anyobj is
            * set.  'tx' must not be used after this returns.
            */
1122 1126  void
1123 1127  dmu_tx_commit(dmu_tx_t *tx)
1124 1128  {
1125 1129          dmu_tx_hold_t *txh;
1126 1130  
1127 1131          ASSERT(tx->tx_txg != 0);
1128 1132  
     1133 +        /*
     1134 +         * Go through the transaction's hold list and remove holds on
     1135 +         * associated dnodes, notifying waiters if no holds remain.
     1136 +         */
1129 1137          while (txh = list_head(&tx->tx_holds)) {
1130 1138                  dnode_t *dn = txh->txh_dnode;
1131 1139  
                           /* dn was saved above, so the hold record can be freed now. */
1132 1140                  list_remove(&tx->tx_holds, txh);
1133 1141                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1134 1142                  if (dn == NULL)
1135 1143                          continue;
1136 1144                  mutex_enter(&dn->dn_mtx);
1137 1145                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1138 1146

1139 1147                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1140 1148                          dn->dn_assigned_txg = 0;
1141 1149                          cv_broadcast(&dn->dn_notxholds);
1142 1150                  }
1143 1151                  mutex_exit(&dn->dn_mtx);
1144 1152                  dnode_rele(dn, tx);
1145 1153          }
1146 1154  
1147 1155          if (tx->tx_tempreserve_cookie)
1148 1156                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1149 1157  
1150 1158          if (!list_is_empty(&tx->tx_callbacks))
1151 1159                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1152 1160  
1153 1161          if (tx->tx_anyobj == FALSE)
1154 1162                  txg_rele_to_sync(&tx->tx_txgh);
1155 1163  
1156 1164          list_destroy(&tx->tx_callbacks);
1157 1165          list_destroy(&tx->tx_holds);
1158 1166  #ifdef ZFS_DEBUG
1159 1167          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1160 1168              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1161 1169              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1162 1170          refcount_destroy_many(&tx->tx_space_written,
1163 1171              refcount_count(&tx->tx_space_written));
1164 1172          refcount_destroy_many(&tx->tx_space_freed,
1165 1173              refcount_count(&tx->tx_space_freed));
1166 1174  #endif
1167 1175          kmem_free(tx, sizeof (dmu_tx_t));
1168 1176  }
1169 1177  
           /*
            * Discard a tx that is not assigned (tx_txg == 0) and free it.  Every
            * hold is removed and freed, releasing its dnode if it has one, and any
            * registered callbacks are invoked with ECANCELED.  'tx' must not be used
            * after this returns.
            */
1170 1178  void
1171 1179  dmu_tx_abort(dmu_tx_t *tx)
1172 1180  {
1173 1181          dmu_tx_hold_t *txh;
1174 1182  
1175 1183          ASSERT(tx->tx_txg == 0);
1176 1184  
1177 1185          while (txh = list_head(&tx->tx_holds)) {
1178 1186                  dnode_t *dn = txh->txh_dnode;
1179 1187  
1180 1188                  list_remove(&tx->tx_holds, txh);
1181 1189                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1182 1190                  if (dn != NULL)
1183 1191                          dnode_rele(dn, tx);
1184 1192          }
1185 1193  
1186 1194          /*
1187 1195           * Call any registered callbacks with an error code.
1188 1196           */
1189 1197          if (!list_is_empty(&tx->tx_callbacks))
1190 1198                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1191 1199  
1192 1200          list_destroy(&tx->tx_callbacks);
1193 1201          list_destroy(&tx->tx_holds);
1194 1202  #ifdef ZFS_DEBUG
1195 1203          refcount_destroy_many(&tx->tx_space_written,
1196 1204              refcount_count(&tx->tx_space_written));
1197 1205          refcount_destroy_many(&tx->tx_space_freed,
1198 1206              refcount_count(&tx->tx_space_freed));
1199 1207  #endif
1200 1208          kmem_free(tx, sizeof (dmu_tx_t));
1201 1209  }
1202 1210  
           /*
            * Return the txg number stored in the tx.  The tx must be assigned
            * (tx_txg != 0).
            */
1203 1211  uint64_t
1204 1212  dmu_tx_get_txg(dmu_tx_t *tx)
1205 1213  {
1206 1214          ASSERT(tx->tx_txg != 0);
1207 1215          return (tx->tx_txg);
1208 1216  }
1209 1217  
           /*
            * Return the dsl_pool_t recorded in the tx, which is asserted to be
            * non-NULL.
            */
1210 1218  dsl_pool_t *
1211 1219  dmu_tx_pool(dmu_tx_t *tx)
1212 1220  {
1213 1221          ASSERT(tx->tx_pool != NULL);
1214 1222          return (tx->tx_pool);
1215 1223  }
1216 1224  
1217 1225  
           /*
            * Allocate a callback record holding 'func' and 'data' (KM_SLEEP) and
            * append it to the tail of tx->tx_callbacks.  The records on that list
            * are later handed to the txg by dmu_tx_commit() or run with ECANCELED by
            * dmu_tx_abort().
            */
1218 1226  void
1219 1227  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1220 1228  {
1221 1229          dmu_tx_callback_t *dcb;
1222 1230  
1223 1231          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1224 1232  
1225 1233          dcb->dcb_func = func;
1226 1234          dcb->dcb_data = data;
1227 1235  
1228 1236          list_insert_tail(&tx->tx_callbacks, dcb);
1229 1237  }
1230 1238  
1231 1239  /*
1232 1240   * Call all the commit callbacks on a list, with a given error code.
            * Each record is removed from 'cb_list' before its function is invoked
            * with (dcb_data, error), and is freed afterwards.
1233 1241   */
1234 1242  void
1235 1243  dmu_tx_do_callbacks(list_t *cb_list, int error)
1236 1244  {
1237 1245          dmu_tx_callback_t *dcb;
1238 1246  
1239 1247          while (dcb = list_head(cb_list)) {
1240 1248                  list_remove(cb_list, dcb);
1241 1249                  dcb->dcb_func(dcb->dcb_data, error);
1242 1250                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1243 1251          }
1244 1252  }
1245 1253  
1246 1254  /*
1247 1255   * Interface to hold a bunch of attributes;
1248 1256   * used for creating new files.
1249 1257   * attrsize is the total size of all attributes
1250 1258   * to be added during object creation.
1251 1259   *
1252 1260   * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1253 1261   */
1254 1262  
1255 1263  /*
1256 1264   * Hold the necessary attribute names for attribute registration.
1257 1265   * This should be a very rare case.  If it does happen, it would
1258 1266   * only happen on the first write to the file system.
            *
            * Does nothing unless sa->sa_need_attr_registration is set.  Otherwise
            * takes one ZAP "add" hold per attribute that is not yet registered, on
            * sa_reg_attr_obj if that object exists or on DMU_NEW_OBJECT if not.
1259 1267   */
1260 1268  static void
1261 1269  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1262 1270  {
1263 1271          int i;
1264 1272  
1265 1273          if (!sa->sa_need_attr_registration)
1266 1274                  return;
1267 1275  
1268 1276          for (i = 0; i != sa->sa_num_attrs; i++) {
1269 1277                  if (!sa->sa_attr_table[i].sa_registered) {
1270 1278                          if (sa->sa_reg_attr_obj)
1271 1279                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1272 1280                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1273 1281                          else
1274 1282                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1275 1283                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1276 1284                  }
1277 1285          }
1278 1286  }
1279 1287  
1280 1288  
           /*
            * Register a THT_SPILL hold on 'object' and charge one SPA_MAXBLOCKSIZE
            * for its spill block.  Nothing is charged when the hold has no dnode.
            *
            * NOTE(review): txh is dereferenced without the NULL check that
            * dmu_tx_hold_zap() and dmu_tx_hold_bonus() perform -- confirm whether
            * dmu_tx_hold_object_impl() can return NULL for this call.
            */
1281 1289  void
1282 1290  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1283 1291  {
1284 1292          dnode_t *dn;
1285 1293          dmu_tx_hold_t *txh;
1286 1294  
1287 1295          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1288 1296              THT_SPILL, 0, 0);
1289 1297  
1290 1298          dn = txh->txh_dnode;
1291 1299  
1292 1300          if (dn == NULL)
1293 1301                  return;
1294 1302  
1295 1303          /* If blkptr doesn't exist then add space to towrite */
1296 1304          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1297 1305                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1298 1306          } else {
1299 1307                  blkptr_t *bp;
1300 1308  
                           /*
                            * A spill blkptr exists: charge tooverwrite if the block is
                            * freeable, towrite otherwise, and tounref unless it is a
                            * hole.
                            */
1301 1309                  bp = &dn->dn_phys->dn_spill;
1302 1310                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1303 1311                      bp, bp->blk_birth))
1304 1312                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1305 1313                  else
1306 1314                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1307 1315                  if (!BP_IS_HOLE(bp))
1308 1316                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1309 1317          }
1310 1318  }
1311 1319  
           /*
            * Take the holds used when creating an object that carries 'attrsize'
            * bytes of system attributes:
            *   - a bonus hold on DMU_NEW_OBJECT, always;
            *   - nothing further if the objset has no SA master object;
            *   - a ZAP hold on the layout object if it exists, otherwise ZAP holds on
            *     the master object (SA_LAYOUTS, SA_REGISTRY) and on two new objects;
            *   - attribute-registration holds;
            *   - a THT_SPILL hold on DMU_NEW_OBJECT when 'attrsize' exceeds
            *     DN_MAX_BONUSLEN or sa_force_spill is set.
            */
1312 1320  void
1313 1321  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1314 1322  {
1315 1323          sa_os_t *sa = tx->tx_objset->os_sa;
1316 1324  
1317 1325          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1318 1326  
1319 1327          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1320 1328                  return;
1321 1329  
1322 1330          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1323 1331                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1324 1332          else {
1325 1333                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1326 1334                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1327 1335                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1328 1336                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1329 1337          }
1330 1338  
1331 1339          dmu_tx_sa_registration_hold(sa, tx);
1332 1340  
                   /* No spill hold when the attributes fit and spill is not forced. */
1333 1341          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1334 1342                  return;
1335 1343  
1336 1344          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1337 1345              THT_SPILL, 0, 0);
1338 1346  }
1339 1347  
1340 1348  /*
1341 1349   * Hold SA attribute
1342 1350   *
1343 1351   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1344 1352   *
1345 1353   * Takes a bonus hold on the object behind 'hdl' and, if the objset has an
1346 1354   * SA master object, the ZAP holds needed for layout and attribute
1347 1355   * registration.  A spill hold is added when sa_force_spill, 'may_grow' or
            * hdl->sa_spill is set, or when the object's dnode already has a spill.
            *
            * NOTE(review): this header previously described 'attribute', 'add',
            * 'size' and 'variable_size' arguments that are not in the signature.
1348 1356   */
1349 1357  void
1350 1358  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1351 1359  {
1352 1360          uint64_t object;
1353 1361          sa_os_t *sa = tx->tx_objset->os_sa;
1354 1362  
1355 1363          ASSERT(hdl != NULL);
1356 1364  
1357 1365          object = sa_handle_object(hdl);
1358 1366  
1359 1367          dmu_tx_hold_bonus(tx, object);
1360 1368  
1361 1369          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1362 1370                  return;
1363 1371  
                   /*
                    * If either the registry or the layout object is missing, hold the
                    * master object's entries and two new objects.
                    */
1364 1372          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1365 1373              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1366 1374                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1367 1375                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1368 1376                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1369 1377                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1370 1378          }
1371 1379  
1372 1380          dmu_tx_sa_registration_hold(sa, tx);
1373 1381  
1374 1382          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1375 1383                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1376 1384  
1377 1385          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1378 1386                  ASSERT(tx->tx_txg == 0);
1379 1387                  dmu_tx_hold_spill(tx, object);
1380 1388          } else {
1381 1389                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1382 1390                  dnode_t *dn;
1383 1391  
                           /* Otherwise hold the spill only if the dnode has one. */
1384 1392                  DB_DNODE_ENTER(db);
1385 1393                  dn = DB_DNODE(db);
1386 1394                  if (dn->dn_have_spill) {
1387 1395                          ASSERT(tx->tx_txg == 0);
1388 1396                          dmu_tx_hold_spill(tx, object);
1389 1397                  }
1390 1398                  DB_DNODE_EXIT(db);
1391 1399          }
1392 1400  }

↓ open down ↓

254 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX