dlpx-os-diff Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

4082 zfs receive gets EFBIG from dmu_tx_hold_free()
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
           /*
            * Pointer to a function that takes a transaction, a dnode and two
            * 64-bit arguments and returns nothing.
            * NOTE(review): no use of this typedef is visible in this part of the
            * file -- confirm it is still referenced elsewhere.
            */
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
           /*
            * Allocate a zero-filled transaction associated with the dsl_dir 'dd'.
            *
            * 'dd' may be NULL, in which case tx_pool stays zeroed here;
            * dmu_tx_create_assigned() passes NULL and sets tx_pool itself.
            * The hold list and the callback list are created empty and the
            * creation time is stored in tx_start.  Returns the new transaction.
            */
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57          tx->tx_start = gethrtime();
                   /* These two refcounts are only set up in ZFS_DEBUG builds. */
  58   58  #ifdef ZFS_DEBUG
  59   59          refcount_create(&tx->tx_space_written);
  60   60          refcount_create(&tx->tx_space_freed);
  61   61  #endif
  62   62          return (tx);
  63   63  }
  64   64  
           /*
            * Create an unassigned transaction for the objset 'os'.
            *
            * The transaction is built on the dsl_dir of the objset's dataset,
            * remembers the objset in tx_objset, and stores in tx_lastsnap_txg the
            * value returned by dsl_dataset_prev_snap_txg() for that dataset.
            * 'os' and its os_dsl_dataset are dereferenced unconditionally.
            */
  65   65  dmu_tx_t *
  66   66  dmu_tx_create(objset_t *os)
  67   67  {
  68   68          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  69   69          tx->tx_objset = os;
  70   70          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  71   71          return (tx);
  72   72  }
  73   73  
           /*
            * Create a transaction that is already bound to pool 'dp' and
            * transaction group 'txg'.
            *
            * No dsl_dir is attached (tx_dir is NULL).  'txg' must not be later
            * than the pool's currently open txg.  tx_anyobj is set, which is what
            * dmu_tx_is_syncing() and dmu_tx_private_ok() report.
            */
  74   74  dmu_tx_t *
  75   75  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  76   76  {
  77   77          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  78   78  
  79   79          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  80   80          tx->tx_pool = dp;
  81   81          tx->tx_txg = txg;
  82   82          tx->tx_anyobj = TRUE;
  83   83  
  84   84          return (tx);
  85   85  }
  86   86  
           /*
            * Return the transaction's tx_anyobj flag.  In the code visible here
            * that flag is only set by dmu_tx_create_assigned().
            */
  87   87  int
  88   88  dmu_tx_is_syncing(dmu_tx_t *tx)
  89   89  {
  90   90          return (tx->tx_anyobj);
  91   91  }
  92   92  
           /*
            * Return the transaction's tx_anyobj flag; this is the same value that
            * dmu_tx_is_syncing() returns.
            */
  93   93  int
  94   94  dmu_tx_private_ok(dmu_tx_t *tx)
  95   95  {
  96   96          return (tx->tx_anyobj);
  97   97  }
  98   98  
           /*
            * Create a hold record for 'object' in objset 'os' and append it to the
            * transaction's list of holds.
            *
            * Unless 'object' is DMU_NEW_OBJECT, the dnode is held with dnode_hold()
            * using 'tx' as the tag; for DMU_NEW_OBJECT the hold's txh_dnode is
            * NULL.  If the transaction already has a txg, the dnode is marked as
            * assigned to that txg and a dn_tx_holds reference is added.
            *
            * 'type', 'arg1' and 'arg2' are only recorded in ZFS_DEBUG builds.
            *
            * Returns the new hold, or NULL after storing the dnode_hold() error
            * in tx->tx_err.
            */
  99   99  static dmu_tx_hold_t *
 100  100  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 101  101      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 102  102  {
 103  103          dmu_tx_hold_t *txh;
 104  104          dnode_t *dn = NULL;
 105  105          int err;
 106  106  
 107  107          if (object != DMU_NEW_OBJECT) {
 108  108                  err = dnode_hold(os, object, tx, &dn);
 109  109                  if (err) {
 110  110                          tx->tx_err = err;
 111  111                          return (NULL);
 112  112                  }
 113  113  
                           /*
                            * err is always 0 here (the error case returned above),
                            * so only the tx_txg test is significant.
                            */
 114  114                  if (err == 0 && tx->tx_txg != 0) {
 115  115                          mutex_enter(&dn->dn_mtx);
 116  116                          /*
 117  117                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 118  118                           * problem, but there's no way for it to happen (for
 119  119                           * now, at least).
 120  120                           */
 121  121                          ASSERT(dn->dn_assigned_txg == 0);
 122  122                          dn->dn_assigned_txg = tx->tx_txg;
 123  123                          (void) refcount_add(&dn->dn_tx_holds, tx);
 124  124                          mutex_exit(&dn->dn_mtx);
 125  125                  }
 126  126          }
 127  127  
 128  128          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 129  129          txh->txh_tx = tx;
 130  130          txh->txh_dnode = dn;
 131  131  #ifdef ZFS_DEBUG
 132  132          txh->txh_type = type;
 133  133          txh->txh_arg1 = arg1;
 134  134          txh->txh_arg2 = arg2;
 135  135  #endif
 136  136          list_insert_tail(&tx->tx_holds, txh);
 137  137  
 138  138          return (txh);
 139  139  }
 140  140  
           /*
            * Add a THT_NEWOBJECT hold on 'object' to the transaction, unless
            * dmu_tx_is_syncing() reports true for it.  The result of
            * dmu_tx_hold_object_impl() is discarded; a failure there is still
            * recorded in tx->tx_err by that function.
            */
 141  141  void
 142  142  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 143  143  {
 144  144          /*
 145  145           * If we're syncing, they can manipulate any object anyhow, and
 146  146           * the hold on the dnode_t can cause problems.
 147  147           */
 148  148          if (!dmu_tx_is_syncing(tx)) {
 149  149                  (void) dmu_tx_hold_object_impl(tx, os,
 150  150                      object, THT_NEWOBJECT, 0, 0);
 151  151          }
 152  152  }
 153  153  
           /*
            * Read block 'blkid' at indirection 'level' of dnode 'dn' so that an
            * I/O error is detected now.
            *
            * The dbuf is looked up while holding dn_struct_rwlock as reader and
            * is released again before returning.  'zio' is handed to dbuf_read();
            * callers in this file pass either NULL or a root zio they wait on
            * later.
            *
            * Returns EIO if no dbuf could be held, otherwise the result of
            * dbuf_read().
            */
 154  154  static int
 155  155  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 156  156  {
 157  157          int err;
 158  158          dmu_buf_impl_t *db;
 159  159  
 160  160          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 161  161          db = dbuf_hold_level(dn, level, blkid, FTAG);
 162  162          rw_exit(&dn->dn_struct_rwlock);
 163  163          if (db == NULL)
 164  164                  return (SET_ERROR(EIO));
 165  165          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 166  166          dbuf_rele(db, FTAG);
 167  167          return (err);
 168  168  }
 169  169  
           /*
            * Account for block 'blkid' at 'level' of 'dn' and then, recursively,
            * for each of its ancestors up to the top level of the dnode.
            *
            * 'db' is the dbuf for that block, or NULL / the dnode's own dbuf when
            * there is none (only allowed above level 0).  'freeable' carries down
            * whether the child block was judged freeable.  'history' holds, per
            * level, the last blkid already counted so that a parent shared by
            * consecutive calls is only counted once.
            *
            * The block's size is added to txh_space_tooverwrite if it is
            * considered freeable and to txh_space_towrite otherwise; if a block
            * pointer is available its dsize is added to txh_space_tounref.
            */
 170  170  static void
 171  171  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 172  172      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 173  173  {
 174  174          objset_t *os = dn->dn_objset;
 175  175          dsl_dataset_t *ds = os->os_dsl_dataset;
 176  176          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 177  177          dmu_buf_impl_t *parent = NULL;
 178  178          blkptr_t *bp = NULL;
 179  179          uint64_t space;
 180  180  
                   /* Stop above the top level or when this block was just counted. */
 181  181          if (level >= dn->dn_nlevels || history[level] == blkid)
 182  182                  return;
 183  183  
 184  184          history[level] = blkid;
 185  185  
                   /* Level 0 uses the data block size, indirects the indirect size. */
 186  186          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 187  187  
 188  188          if (db == NULL || db == dn->dn_dbuf) {
 189  189                  ASSERT(level != 0);
 190  190                  db = NULL;
 191  191          } else {
 192  192                  ASSERT(DB_DNODE(db) == dn);
 193  193                  ASSERT(db->db_level == level);
 194  194                  ASSERT(db->db.db_size == space);
 195  195                  ASSERT(db->db_blkid == blkid);
 196  196                  bp = db->db_blkptr;
 197  197                  parent = db->db_parent;
 198  198          }
 199  199  
                   /*
                    * Without a block pointer nothing can be freed; otherwise the
                    * block is freeable if the child was, or if the dataset says so.
                    */
 200  200          freeable = (bp && (freeable ||
 201  201              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 202  202  
 203  203          if (freeable)
 204  204                  txh->txh_space_tooverwrite += space;
 205  205          else
 206  206                  txh->txh_space_towrite += space;
 207  207          if (bp)
 208  208                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 209  209  
                   /* The parent's blkid is this blkid with the low epbs bits dropped. */
 210  210          dmu_tx_count_twig(txh, dn, parent, level + 1,
 211  211              blkid >> epbs, freeable, history);
 212  212  }
 213  213  
           /*
            * Estimate the space consumed by writing 'len' bytes at offset 'off' to
            * the hold's dnode and add it to the hold's counters.
            *
            * With a dnode, the first and last level-0 blocks (when the range is
            * not block-aligned) and the level-1 blocks in between are read to
            * detect I/O errors, and blocks that already exist are accounted
            * through dmu_tx_count_twig().  Whatever part of the range lies beyond
            * dn_maxblkid, or the whole range when there is no dnode, is charged
            * to txh_space_towrite using the block-shift bounds computed below,
            * including one indirect block per possible level.
            *
            * Any error, including EFBIG when towrite + tooverwrite exceeds
            * 2 * DMU_MAX_ACCESS, is stored in txh->txh_tx->tx_err.
            * A zero 'len' is a no-op.
            */
 214  214  /* ARGSUSED */
 215  215  static void
 216  216  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 217  217  {
 218  218          dnode_t *dn = txh->txh_dnode;
 219  219          uint64_t start, end, i;
 220  220          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 221  221          int err = 0;
 222  222  
 223  223          if (len == 0)
 224  224                  return;
 225  225  
                   /* Widest possible bounds; narrowed below when a dnode is known. */
 226  226          min_bs = SPA_MINBLOCKSHIFT;
 227  227          max_bs = SPA_MAXBLOCKSHIFT;
 228  228          min_ibs = DN_MIN_INDBLKSHIFT;
 229  229          max_ibs = DN_MAX_INDBLKSHIFT;
 230  230  
 231  231          if (dn) {
 232  232                  uint64_t history[DN_MAX_LEVELS];
 233  233                  int nlvls = dn->dn_nlevels;
 234  234                  int delta;
 235  235  
 236  236                  /*
 237  237                   * For i/o error checking, read the first and last level-0
 238  238                   * blocks (if they are not aligned), and all the level-1 blocks.
 239  239                   */
 240  240                  if (dn->dn_maxblkid == 0) {
                                   /*
                                    * Single-block object: start/end are 0 when the
                                    * offset / end of range fall inside that block and
                                    * 1 when they fall beyond it.
                                    */
 241  241                          delta = dn->dn_datablksz;
 242  242                          start = (off < dn->dn_datablksz) ? 0 : 1;
 243  243                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 244  244                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 245  245                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 246  246                                  if (err)
 247  247                                          goto out;
 248  248                                  delta -= off;
 249  249                          }
 250  250                  } else {
                                   /*
                                    * NOTE(review): every "goto out" in this branch
                                    * jumps past the zio_wait() below, so on those
                                    * error paths the root zio created here is never
                                    * waited on -- confirm whether that leaks the zio.
                                    */
 251  251                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 252  252                              NULL, NULL, ZIO_FLAG_CANFAIL);
 253  253  
 254  254                          /* first level-0 block */
 255  255                          start = off >> dn->dn_datablkshift;
 256  256                          if (P2PHASE(off, dn->dn_datablksz) ||
 257  257                              len < dn->dn_datablksz) {
 258  258                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 259  259                                  if (err)
 260  260                                          goto out;
 261  261                          }
 262  262  
 263  263                          /* last level-0 block */
 264  264                          end = (off+len-1) >> dn->dn_datablkshift;
 265  265                          if (end != start && end <= dn->dn_maxblkid &&
 266  266                              P2PHASE(off+len, dn->dn_datablksz)) {
 267  267                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 268  268                                  if (err)
 269  269                                          goto out;
 270  270                          }
 271  271  
 272  272                          /* level-1 blocks */
 273  273                          if (nlvls > 1) {
 274  274                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
                                           /*
                                            * Only the level-1 blocks strictly between
                                            * the one covering 'start' and the one
                                            * covering 'end' are read here.
                                            */
 275  275                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 276  276                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 277  277                                          if (err)
 278  278                                                  goto out;
 279  279                                  }
 280  280                          }
 281  281  
 282  282                          err = zio_wait(zio);
 283  283                          if (err)
 284  284                                  goto out;
 285  285                          delta = P2NPHASE(off, dn->dn_datablksz);
 286  286                  }
 287  287  
 288  288                  min_ibs = max_ibs = dn->dn_indblkshift;
 289  289                  if (dn->dn_maxblkid > 0) {
 290  290                          /*
 291  291                           * The blocksize can't change,
 292  292                           * so we can make a more precise estimate.
 293  293                           */
 294  294                          ASSERT(dn->dn_datablkshift != 0);
 295  295                          min_bs = max_bs = dn->dn_datablkshift;
 296  296                  }
 297  297  
 298  298                  /*
 299  299                   * If this write is not off the end of the file
 300  300                   * we need to account for overwrites/unref.
 301  301                   */
 302  302                  if (start <= dn->dn_maxblkid) {
 303  303                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 304  304                                  history[l] = -1ULL;
 305  305                  }
 306  306                  while (start <= dn->dn_maxblkid) {
 307  307                          dmu_buf_impl_t *db;
 308  308  
 309  309                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 310  310                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 311  311                          rw_exit(&dn->dn_struct_rwlock);
 312  312  
                                   /*
                                    * This return bypasses the "out" label, so the
                                    * EFBIG check there is not performed on this path.
                                    */
 313  313                          if (err) {
 314  314                                  txh->txh_tx->tx_err = err;
 315  315                                  return;
 316  316                          }
 317  317  
 318  318                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 319  319                              history);
 320  320                          dbuf_rele(db, FTAG);
 321  321                          if (++start > end) {
 322  322                                  /*
 323  323                                   * Account for new indirects appearing
 324  324                                   * before this IO gets assigned into a txg.
 325  325                                   */
 326  326                                  bits = 64 - min_bs;
 327  327                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 328  328                                  for (bits -= epbs * (nlvls - 1);
 329  329                                      bits >= 0; bits -= epbs)
 330  330                                          txh->txh_fudge += 1ULL << max_ibs;
 331  331                                  goto out;
 332  332                          }
                                   /*
                                    * Advance to the next block boundary; 'delta' is
                                    * the distance to the end of the first block on
                                    * the first pass and a whole block afterwards.
                                    */
 333  333                          off += delta;
 334  334                          if (len >= delta)
 335  335                                  len -= delta;
 336  336                          delta = dn->dn_datablksz;
 337  337                  }
 338  338          }
 339  339  
 340  340          /*
 341  341           * 'end' is the last thing we will access, not one past.
 342  342           * This way we won't overflow when accessing the last byte.
 343  343           */
 344  344          start = P2ALIGN(off, 1ULL << max_bs);
 345  345          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 346  346          txh->txh_space_towrite += end - start + 1;
 347  347  
 348  348          start >>= min_bs;
 349  349          end >>= min_bs;
 350  350  
 351  351          epbs = min_ibs - SPA_BLKPTRSHIFT;
 352  352  
 353  353          /*
 354  354           * The object contains at most 2^(64 - min_bs) blocks,
 355  355           * and each indirect level maps 2^epbs.
 356  356           */
 357  357          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 358  358                  start >>= epbs;
 359  359                  end >>= epbs;
 360  360                  ASSERT3U(end, >=, start);
 361  361                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 362  362                  if (start != 0) {
 363  363                          /*
 364  364                           * We also need a new blkid=0 indirect block
 365  365                           * to reference any existing file data.
 366  366                           */
 367  367                          txh->txh_space_towrite += 1ULL << max_ibs;
 368  368                  }
 369  369          }
 370  370  
 371  371  out:
                   /* An over-large estimate replaces any earlier error with EFBIG. */
 372  372          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 373  373              2 * DMU_MAX_ACCESS)
 374  374                  err = SET_ERROR(EFBIG);
 375  375  
 376  376          if (err)
 377  377                  txh->txh_tx->tx_err = err;
 378  378  }
 379  379  
           /*
            * Account for rewriting the meta-dnode block that contains the hold's
            * dnode.
            *
            * The charge is one meta-dnode data block plus one indirect block for
            * each meta-dnode level above level 0.  If the dnode's containing
            * dbuf has a block pointer that the dataset reports as freeable, the
            * charge goes to txh_space_tooverwrite and txh_space_tounref.
            * Otherwise it goes to txh_space_towrite, and also to
            * txh_space_tounref when a block pointer exists.
            */
 380  380  static void
 381  381  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 382  382  {
 383  383          dnode_t *dn = txh->txh_dnode;
 384  384          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 385  385          uint64_t space = mdn->dn_datablksz +
 386  386              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 387  387  
 388  388          if (dn && dn->dn_dbuf->db_blkptr &&
 389  389              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 390  390              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 391  391                  txh->txh_space_tooverwrite += space;
 392  392                  txh->txh_space_tounref += space;
 393  393          } else {
 394  394                  txh->txh_space_towrite += space;
 395  395                  if (dn && dn->dn_dbuf->db_blkptr)
 396  396                          txh->txh_space_tounref += space;
 397  397          }
 398  398  }
 399  399  
           /*
            * Declare that the transaction will write 'len' bytes at offset 'off'
            * of 'object' in the transaction's objset.
            *
            * The transaction must not have a txg yet, 'len' must be below
            * DMU_MAX_ACCESS, and off + len - 1 must not exceed UINT64_MAX.
            * Failures are reported through tx->tx_err; if the hold itself cannot
            * be created no accounting is done.
            */
 400  400  void
 401  401  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 402  402  {
 403  403          dmu_tx_hold_t *txh;
 404  404  
 405  405          ASSERT(tx->tx_txg == 0);
 406  406          ASSERT(len < DMU_MAX_ACCESS);
 407  407          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 408  408  
 409  409          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 410  410              object, THT_WRITE, off, len);
 411  411          if (txh == NULL)
 412  412                  return;
 413  413  
                   /* Charge the data/indirect blocks, then the dnode's own block. */
 414  414          dmu_tx_count_write(txh, off, len);
 415  415          dmu_tx_count_dnode(txh);
 416  416  }
 417  417  
           /*
            * Estimate the effect of freeing 'len' bytes at offset 'off' of the
            * hold's dnode and add it to the hold's counters.
            *
            * The byte range is converted to a run of level-0 block ids, clamped
            * to dn_maxblkid.  For a single-block object nothing is counted unless
            * the range covers that block from offset 0.  The block pointers of
            * the run are then examined: directly in the dnode when it has one
            * level, otherwise by reading each level-1 indirect block that
            * dnode_next_offset() reports as present.  For every block pointer the
            * allocated size is added to 'unref', and its dsize is added to
            * 'space' when the dataset reports the block as freeable.
            *
            * Results: txh_space_tofree += space, txh_space_tounref += unref,
            * txh_memory_tohold grows by the indirect blocks that may be held, and
            * txh_fudge grows by one indirect block per skipped level-1 range.
            * Errors are stored in txh->txh_tx->tx_err.
            *
            * The dnode is dereferenced unconditionally, so the hold must have one.
            */
 418  418  static void
 419  419  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 420  420  {
 421  421          uint64_t blkid, nblks, lastblk;
 422  422          uint64_t space = 0, unref = 0, skipped = 0;
 423  423          dnode_t *dn = txh->txh_dnode;
 424  424          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 425  425          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 426  426          int epbs;
 427  427          uint64_t l0span = 0, nl1blks = 0;
 428  428  
 429  429          if (dn->dn_nlevels == 0)
 430  430                  return;
 431  431  
 432  432          /*
 433  433           * The struct_rwlock protects us against dn_nlevels
 434  434           * changing, in case (against all odds) we manage to dirty &
 435  435           * sync out the changes after we check for being dirty.
 436  436           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 437  437           */
 438  438          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 439  439          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 440  440          if (dn->dn_maxblkid == 0) {
                           /* Single block: only a free covering all of it counts. */
 441  441                  if (off == 0 && len >= dn->dn_datablksz) {
 442  442                          blkid = 0;
 443  443                          nblks = 1;
 444  444                  } else {
 445  445                          rw_exit(&dn->dn_struct_rwlock);
 446  446                          return;
 447  447                  }
 448  448          } else {
 449  449                  blkid = off >> dn->dn_datablkshift;
 450  450                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 451  451  
 452  452                  if (blkid > dn->dn_maxblkid) {
 453  453                          rw_exit(&dn->dn_struct_rwlock);
 454  454                          return;
 455  455                  }
 456  456                  if (blkid + nblks > dn->dn_maxblkid)
 457  457                          nblks = dn->dn_maxblkid - blkid + 1;
 458  458  
 459  459          }
 460  460          l0span = nblks;    /* save for later use to calc level > 1 overhead */
 461  461          if (dn->dn_nlevels == 1) {
                           /* The block pointers live in the dnode itself. */
 462  462                  int i;
 463  463                  for (i = 0; i < nblks; i++) {
 464  464                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 465  465                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 466  466                          bp += blkid + i;
 467  467                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 468  468                                  dprintf_bp(bp, "can free old%s", "");
 469  469                                  space += bp_get_dsize(spa, bp);
 470  470                          }
 471  471                          unref += BP_GET_ASIZE(bp);
 472  472                  }
 473  473                  nl1blks = 1;
 474  474                  nblks = 0;
 475  475          }
 476  476  
 477  477          lastblk = blkid + nblks - 1;
 478  478          while (nblks) {
 479  479                  dmu_buf_impl_t *dbuf;
 480  480                  uint64_t ibyte, new_blkid;
 481  481                  int epb = 1 << epbs;
 482  482                  int err, i, blkoff, tochk;
 483  483                  blkptr_t *bp;
 484  484  
                           /* Find the next level-1 block at or after blkid. */
 485  485                  ibyte = blkid << dn->dn_datablkshift;
 486  486                  err = dnode_next_offset(dn,
 487  487                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 488  488                  new_blkid = ibyte >> dn->dn_datablkshift;
 489  489                  if (err == ESRCH) {
 490  490                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 491  491                          break;
 492  492                  }
 493  493                  if (err) {
 494  494                          txh->txh_tx->tx_err = err;
 495  495                          break;
 496  496                  }
 497  497                  if (new_blkid > lastblk) {
 498  498                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 499  499                          break;
 500  500                  }
 501  501  
 502  502                  if (new_blkid > blkid) {
 503  503                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 504  504                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 505  505                          nblks -= new_blkid - blkid;
 506  506                          blkid = new_blkid;
 507  507                  }
                           /*
                            * blkoff is the index of blkid within its level-1 block;
                            * tochk is how many pointers of that block are in range.
                            */
 508  508                  blkoff = P2PHASE(blkid, epb);
 509  509                  tochk = MIN(epb - blkoff, nblks);
 510  510  
 511  511                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 512  512                  if (err) {
 513  513                          txh->txh_tx->tx_err = err;
 514  514                          break;
 515  515                  }
 516  516  
 517  517                  txh->txh_memory_tohold += dbuf->db.db_size;
 518  518  
 519  519                  /*
 520  520                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 521  521                   * memory_tohold is an over-estimation (especially the >L1
 522  522                   * indirect blocks), so it could fail.  Callers should have
 523  523                   * already verified that they will not be holding too much
 524  524                   * memory.
 525  525                   */
 526  526  
 527  527                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 528  528                  if (err != 0) {
 529  529                          txh->txh_tx->tx_err = err;
 530  530                          dbuf_rele(dbuf, FTAG);
 531  531                          break;
 532  532                  }
 533  533  
 534  534                  bp = dbuf->db.db_data;
 535  535                  bp += blkoff;
 536  536  
 537  537                  for (i = 0; i < tochk; i++) {
 538  538                          if (dsl_dataset_block_freeable(ds, &bp[i],
 539  539                              bp[i].blk_birth)) {
 540  540                                  dprintf_bp(&bp[i], "can free old%s", "");
 541  541                                  space += bp_get_dsize(spa, &bp[i]);
 542  542                          }
                                   /*
                                    * Use the pointer for this iteration; plain 'bp'
                                    * would add the first pointer's size every time.
                                    */
 543  543                          unref += BP_GET_ASIZE(&bp[i]);
 544  544                  }
 545  545                  dbuf_rele(dbuf, FTAG);
 546  546  
 547  547                  ++nl1blks;
 548  548                  blkid += tochk;
 549  549                  nblks -= tochk;
 550  550          }
 551  551          rw_exit(&dn->dn_struct_rwlock);
 552  552  
 553  553          /*
 554  554           * Add in memory requirements of higher-level indirects.
 555  555           * This assumes a worst-possible scenario for dn_nlevels and a
 556  556           * worst-possible distribution of l1-blocks over the region to free.
 557  557           */
 558  558          {
 559  559                  uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 560  560                  int level = 2;
 561  561                  /*
 562  562                   * Here we don't use DN_MAX_LEVEL, but calculate it with the
 563  563                   * given datablkshift and indblkshift. This makes the
 564  564                   * difference between 19 and 8 on large files.
 565  565                   */
 566  566                  int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 567  567                      (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 568  568  
 569  569                  while (level++ < maxlevel) {
 570  570                          txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 571  571                              << dn->dn_indblkshift;
 572  572                          blkcnt = 1 + (blkcnt >> epbs);
 573  573                  }
 574  574          }
 575  575  
 576  576          /* account for new level 1 indirect blocks that might show up */
 577  577          if (skipped > 0) {
 578  578                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 579  579                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 580  580                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 581  581          }
 582  582          txh->txh_space_tofree += space;
 583  583          txh->txh_space_tounref += unref;
 584  584  }
 585  585  
           /*
            * Declare that the transaction will free 'len' bytes at offset 'off' of
            * 'object'; a 'len' of DMU_OBJECT_END means "through the end of the
            * object".
            *
            * The dnode's own block is always charged.  Nothing more is done when
            * 'off' lies at or beyond the end of the last block.  Otherwise the
            * partially covered first/last blocks are charged as writes, the
            * level-1 indirect blocks spanning the range are read to detect I/O
            * errors, and dmu_tx_count_free() accounts for the freed space.
            * Errors are stored in tx->tx_err.
            *
            * NOTE(review): 'dn' is dereferenced without a NULL check, while
            * dmu_tx_hold_object_impl() leaves txh_dnode NULL for DMU_NEW_OBJECT --
            * presumably callers never pass that here; confirm.
            */
 586  586  void
 587  587  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 588  588  {
 589  589          dmu_tx_hold_t *txh;
 590  590          dnode_t *dn;
 591  591          int err;
 592  592          zio_t *zio;
 593  593  
 594  594          ASSERT(tx->tx_txg == 0);
 595  595  
 596  596          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 597  597              object, THT_FREE, off, len);

↓ open down ↓

597 lines elided

↑ open up ↑

 598  598          if (txh == NULL)
 599  599                  return;
 600  600          dn = txh->txh_dnode;
 601  601          dmu_tx_count_dnode(txh);
 602  602  
                   /* Nothing to free when the range starts past the last block. */
 603  603          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 604  604                  return;
 605  605          if (len == DMU_OBJECT_END)
 606  606                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 607  607  
 608      -
 609  608          /*
 610  609           * For i/o error checking, we read the first and last level-0
 611  610           * blocks if they are not aligned, and all the level-1 blocks.
 612  611           *
 613  612           * Note:  dbuf_free_range() assumes that we have not instantiated
 614  613           * any level-0 dbufs that will be completely freed.  Therefore we must
 615  614           * exercise care to not read or count the first and last blocks
 616  615           * if they are blocksize-aligned.
 617  616           */
 618  617          if (dn->dn_datablkshift == 0) {
 619  618                  if (off != 0 || len < dn->dn_datablksz)
 620      -                        dmu_tx_count_write(txh, off, len);
      619 +                        dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 621  620          } else {
 622  621                  /* first block will be modified if it is not aligned */
 623  622                  if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 624  623                          dmu_tx_count_write(txh, off, 1);
 625  624                  /* last block will be modified if it is not aligned */
 626  625                  if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 627  626                          dmu_tx_count_write(txh, off+len, 1);
 628  627          }
 629  628  
 630  629          /*

 631  630           * Check level-1 blocks.
 632  631           */
 633  632          if (dn->dn_nlevels > 1) {
                           /* 'shift' converts a byte offset to a level-1 block id. */
 634  633                  int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 635  634                      SPA_BLKPTRSHIFT;
 636  635                  uint64_t start = off >> shift;
 637  636                  uint64_t end = (off + len) >> shift;
 638  637  
 639  638                  ASSERT(dn->dn_datablkshift != 0);
 640  639                  ASSERT(dn->dn_indblkshift != 0);
 641  640  
                           /*
                            * NOTE(review): the two error returns inside the loop
                            * below leave without calling zio_wait() on this root
                            * zio -- confirm whether that leaks the zio.
                            */
 642  641                  zio = zio_root(tx->tx_pool->dp_spa,
 643  642                      NULL, NULL, ZIO_FLAG_CANFAIL);
 644  643                  for (uint64_t i = start; i <= end; i++) {
 645  644                          uint64_t ibyte = i << shift;
 646  645                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
                                   /* Jump ahead to the level-1 block that was found. */
 647  646                          i = ibyte >> shift;
 648  647                          if (err == ESRCH)
 649  648                                  break;
 650  649                          if (err) {
 651  650                                  tx->tx_err = err;
 652  651                                  return;
 653  652                          }
 654  653  
 655  654                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 656  655                          if (err) {
 657  656                                  tx->tx_err = err;
 658  657                                  return;
 659  658                          }
 660  659                  }
 661  660                  err = zio_wait(zio);
 662  661                  if (err) {
 663  662                          tx->tx_err = err;
 664  663                          return;
 665  664                  }
 666  665          }
 667  666  
 668  667          dmu_tx_count_free(txh, off, len);
 669  668  }
 670  669  
           /*
            * Hold the ZAP object 'object' in 'tx' (a THT_ZAP hold) and estimate
            * the space that modifying it may consume.
            *
            *   tx     - transaction; must not have been assigned yet
            *   object - object number of the ZAP
            *   add    - nonzero if an entry is going to be added
            *   name   - name of the entry being changed, or NULL
            *
            * Nothing is returned.  If the hold cannot be created the function
            * returns without charging anything.  I/O errors found while probing
            * the object are stored in tx->tx_err.
            */
 671  670  void
 672  671  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 673  672  {
 674  673          dmu_tx_hold_t *txh;
 675  674          dnode_t *dn;
 676  675          uint64_t nblocks;
 677  676          int epbs, err;
 678  677  
 679  678          ASSERT(tx->tx_txg == 0);
 680  679  
 681  680          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 682  681              object, THT_ZAP, add, (uintptr_t)name);
 683  682          if (txh == NULL)
 684  683                  return;
 685  684          dn = txh->txh_dnode;
 686  685  
 687  686          dmu_tx_count_dnode(txh);
 688  687  
 689  688          if (dn == NULL) {
 690  689                  /*
 691  690                   * We will be able to fit a new object's entries into one leaf
 692  691                   * block.  So there will be at most 2 blocks total,
 693  692                   * including the header block.
 694  693                   */
 695  694                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 696  695                  return;
 697  696          }
 698  697  
 699  698          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 700  699  
 701  700          if (dn->dn_maxblkid == 0 && !add) {
 702  701                  blkptr_t *bp;
 703  702  
 704  703                  /*
 705  704                   * If there is only one block  (i.e. this is a micro-zap)
 706  705                   * and we are not adding anything, the accounting is simple.
 707  706                   */
 708  707                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 709  708                  if (err) {
 710  709                          tx->tx_err = err;
 711  710                          return;
 712  711                  }
 713  712  
 714  713                  /*
 715  714                   * Use max block size here, since we don't know how much
 716  715                   * the size will change between now and the dbuf dirty call.
                            * The block is charged as an overwrite when
                            * dsl_dataset_block_freeable() says it can be freed,
                            * otherwise as a new write; a non-hole block is also
                            * counted as space to be unreferenced.
 717  716                   */
 718  717                  bp = &dn->dn_phys->dn_blkptr[0];
 719  718                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 720  719                      bp, bp->blk_birth))
 721  720                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 722  721                  else
 723  722                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 724  723                  if (!BP_IS_HOLE(bp))
 725  724                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 726  725                  return;
 727  726          }
 728  727  
 729  728          if (dn->dn_maxblkid > 0 && name) {
 730  729                  /*
 731  730                   * access the name in this fat-zap so that we'll check
 732  731                   * for i/o errors to the leaf blocks, etc.
                            * Only EIO is recorded in the tx; any other result of
                            * the lookup is ignored.
 733  732                   */
 734  733                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 735  734                      8, 0, NULL);
 736  735                  if (err == EIO) {
 737  736                          tx->tx_err = err;
 738  737                          return;
 739  738                  }
 740  739          }
 741  740  
                   /*
                    * NOTE(review): the value stored in err here is never examined
                    * before the function returns -- confirm that ignoring a
                    * zap_count_write() failure is intended.
                    */
 742  741          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 743  742              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 744  743  
 745  744          /*
 746  745           * If the modified blocks are scattered to the four winds,
 747  746           * we'll have to modify an indirect twig for each.
                    *
                    * The loop runs once per indirect level (nblocks is shifted
                    * right by epbs until it reaches zero).  Each level is charged
                    * as new writes when ds_prev_snap_obj is nonzero, otherwise as
                    * overwrites.
 748  747           */
 749  748          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 750  749          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 751  750                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 752  751                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 753  752                  else
 754  753                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 755  754  }
 756  755  
           /*
            * Add a THT_BONUS hold on 'object' to 'tx', which must not have been
            * assigned yet.  When the hold is created, dmu_tx_count_dnode() is
            * called on it; when it cannot be created, nothing is charged.
            */
 757  756  void
 758  757  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 759  758  {
 760  759          dmu_tx_hold_t *txh;
 761  760  
 762  761          ASSERT(tx->tx_txg == 0);
 763  762  
 764  763          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 765  764              object, THT_BONUS, 0, 0);
 766  765          if (txh)
 767  766                  dmu_tx_count_dnode(txh);
 768  767  }
 769  768  
           /*
            * Charge 'space' bytes of write space to 'tx' without tying them to an
            * existing object: a THT_SPACE hold is created against DMU_NEW_OBJECT
            * and 'space' is added to its txh_space_towrite.  The tx must not have
            * been assigned yet.
            */
 770  769  void
 771  770  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 772  771  {
 773  772          dmu_tx_hold_t *txh;
 774  773          ASSERT(tx->tx_txg == 0);
 775  774  
 776  775          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 777  776              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 778  777  
                   /*
                    * NOTE(review): unlike dmu_tx_hold_zap() and dmu_tx_hold_bonus(),
                    * txh is dereferenced without a NULL check.  Presumably the
                    * hold cannot fail for DMU_NEW_OBJECT -- confirm against
                    * dmu_tx_hold_object_impl().
                    */
 779  778          txh->txh_space_towrite += space;
 780  779  }
 781  780  
           /*
            * Return the number of holds in 'tx' whose dnode has object number
            * 'object'.  Holds without a dnode are not counted.  The tx must
            * already be assigned to a txg.
            */
 782  781  int
 783  782  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 784  783  {
 785  784          dmu_tx_hold_t *txh;
 786  785          int holds = 0;
 787  786  
 788  787          /*
 789  788           * By asserting that the tx is assigned, we're counting the
 790  789           * number of dn_tx_holds, which is the same as the number of
 791  790           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 792  791           * dn_tx_holds could be 0.
 793  792           */
 794  793          ASSERT(tx->tx_txg != 0);
 795  794  
 796  795          /* if (tx->tx_anyobj == TRUE) */
 797  796                  /* return (0); */
 798  797  
 799  798          for (txh = list_head(&tx->tx_holds); txh;
 800  799              txh = list_next(&tx->tx_holds, txh)) {
 801  800                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 802  801                          holds++;
 803  802          }
 804  803  
 805  804          return (holds);
 806  805  }
 807  806  
 808  807  #ifdef ZFS_DEBUG
           /*
            * Debug-only check that the dbuf 'db' being dirtied is covered by a
            * hold in the assigned transaction 'tx'; panics if it is not.
            *
            * The check is skipped when tx_anyobj is set and for the meta dnode.
            * Otherwise every hold is examined: match_object is set when a hold
            * refers to this dnode, and match_offset is set when the hold's type
            * and range (txh_arg1 used as a byte offset, txh_arg2 as a length)
            * cover db's block id at db's level.  Both flags are declared outside
            * the loop and never cleared, so they may be satisfied by different
            * holds.
            */
 809  808  void
 810  809  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 811  810  {
 812  811          dmu_tx_hold_t *txh;
 813  812          int match_object = FALSE, match_offset = FALSE;
 814  813          dnode_t *dn;
 815  814  
 816  815          DB_DNODE_ENTER(db);
 817  816          dn = DB_DNODE(db);
 818  817          ASSERT(tx->tx_txg != 0);
 819  818          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 820  819          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 821  820  
 822  821          if (tx->tx_anyobj) {
 823  822                  DB_DNODE_EXIT(db);
 824  823                  return;
 825  824          }
 826  825  
 827  826          /* XXX No checking on the meta dnode for now */
 828  827          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 829  828                  DB_DNODE_EXIT(db);
 830  829                  return;
 831  830          }
 832  831  
 833  832          for (txh = list_head(&tx->tx_holds); txh;
 834  833              txh = list_next(&tx->tx_holds, txh)) {
 835  834                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 836  835                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 837  836                          match_object = TRUE;
 838  837                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
                                   /*
                                    * Translate the hold's byte range into block ids
                                    * at db's level.  A shift of 64 or more would not
                                    * be a valid shift of a 64-bit value, so both
                                    * ends are taken as 0 in that case.
                                    */
 839  838                          int datablkshift = dn->dn_datablkshift ?
 840  839                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 841  840                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 842  841                          int shift = datablkshift + epbs * db->db_level;
 843  842                          uint64_t beginblk = shift >= 64 ? 0 :
 844  843                              (txh->txh_arg1 >> shift);
 845  844                          uint64_t endblk = shift >= 64 ? 0 :
 846  845                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 847  846                          uint64_t blkid = db->db_blkid;
 848  847  
 849  848                          /* XXX txh_arg2 better not be zero... */
 850  849  
 851  850                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 852  851                              txh->txh_type, beginblk, endblk);
 853  852  
 854  853                          switch (txh->txh_type) {
 855  854                          case THT_WRITE:
 856  855                                  if (blkid >= beginblk && blkid <= endblk)
 857  856                                          match_offset = TRUE;
 858  857                                  /*
 859  858                                   * We will let this hold work for the bonus
 860  859                                   * or spill buffer so that we don't need to
 861  860                                   * hold it when creating a new object.
 862  861                                   */
 863  862                                  if (blkid == DMU_BONUS_BLKID ||
 864  863                                      blkid == DMU_SPILL_BLKID)
 865  864                                          match_offset = TRUE;
 866  865                                  /*
 867  866                                   * They might have to increase nlevels,
 868  867                                   * thus dirtying the new TLIBs.  Or they
 869  868                                   * might have to change the block size,
 870  869                                   * thus dirtying the new lvl=0 blk=0.
 871  870                                   */
 872  871                                  if (blkid == 0)
 873  872                                          match_offset = TRUE;
 874  873                                  break;
 875  874                          case THT_FREE:
 876  875                                  /*
 877  876                                   * We will dirty all the level 1 blocks in
 878  877                                   * the free range and perhaps the first and
 879  878                                   * last level 0 block.
 880  879                                   */
 881  880                                  if (blkid >= beginblk && (blkid <= endblk ||
 882  881                                      txh->txh_arg2 == DMU_OBJECT_END))
 883  882                                          match_offset = TRUE;
 884  883                                  break;
 885  884                          case THT_SPILL:
 886  885                                  if (blkid == DMU_SPILL_BLKID)
 887  886                                          match_offset = TRUE;
 888  887                                  break;
 889  888                          case THT_BONUS:
 890  889                                  if (blkid == DMU_BONUS_BLKID)
 891  890                                          match_offset = TRUE;
 892  891                                  break;
 893  892                          case THT_ZAP:
 894  893                                  match_offset = TRUE;
 895  894                                  break;
 896  895                          case THT_NEWOBJECT:
 897  896                                  match_object = TRUE;
 898  897                                  break;
 899  898                          default:
 900  899                                  ASSERT(!"bad txh_type");
 901  900                          }
 902  901                  }
 903  902                  if (match_object && match_offset) {
 904  903                          DB_DNODE_EXIT(db);
 905  904                          return;
 906  905                  }
 907  906          }
                   /* No combination of holds covers this dbuf. */
 908  907          DB_DNODE_EXIT(db);
 909  908          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 910  909              (u_longlong_t)db->db.db_object, db->db_level,
 911  910              (u_longlong_t)db->db_blkid);
 912  911  }
 913  912  #endif
 914  913  
 915  914  /*
 916  915   * If we can't do 10 iops, something is wrong.  Let us go ahead
 917  916   * and hit zfs_dirty_data_max.
            *
            * zfs_delay_max_ns is the upper bound applied to the per-transaction
            * delay computed in dmu_tx_delay(); 100ms corresponds to the 10
            * operations per second mentioned above.  zfs_delay_resolution_ns is
            * the timer resolution dmu_tx_delay() passes to cv_timedwait_hires().
 918  917   */
 919  918  hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
 920  919  int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 921  920  
 922  921  /*
 923  922   * We delay transactions when we've determined that the backend storage
 924  923   * isn't able to accommodate the rate of incoming writes.
 925  924   *
 926  925   * If there is already a transaction waiting, we delay relative to when
 927  926   * that transaction finishes waiting.  This way the calculated min_time
 928  927   * is independent of the number of threads concurrently executing
 929  928   * transactions.
 930  929   *
 931  930   * If we are the only waiter, wait relative to when the transaction
 932  931   * started, rather than the current time.  This credits the transaction for
 933  932   * "time already served", e.g. reading indirect blocks.
 934  933   *
 935  934   * The minimum time for a transaction to take is calculated as:
 936  935   *     min_time = scale * (dirty - min) / (max - dirty)
 937  936   *     min_time is then capped at zfs_delay_max_ns.
 938  937   *
 939  938   * The delay has two degrees of freedom that can be adjusted via tunables.
 940  939   * The percentage of dirty data at which we start to delay is defined by
 941  940   * zfs_delay_min_dirty_percent. This should typically be at or above
 942  941   * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 943  942   * delay after writing at full speed has failed to keep up with the incoming
 944  943   * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
 945  944   * speaking, this variable determines the amount of delay at the midpoint of
 946  945   * the curve.
 947  946   *
 948  947   * delay
 949  948   *  10ms +-------------------------------------------------------------*+
 950  949   *       |                                                             *|
 951  950   *   9ms +                                                             *+
 952  951   *       |                                                             *|
 953  952   *   8ms +                                                             *+
 954  953   *       |                                                            * |
 955  954   *   7ms +                                                            * +
 956  955   *       |                                                            * |
 957  956   *   6ms +                                                            * +
 958  957   *       |                                                            * |
 959  958   *   5ms +                                                           *  +
 960  959   *       |                                                           *  |
 961  960   *   4ms +                                                           *  +
 962  961   *       |                                                           *  |
 963  962   *   3ms +                                                          *   +
 964  963   *       |                                                          *   |
 965  964   *   2ms +                                              (midpoint) *    +
 966  965   *       |                                                  |    **     |
 967  966   *   1ms +                                                  v ***       +
 968  967   *       |             zfs_delay_scale ---------->     ********         |
 969  968   *     0 +-------------------------------------*********----------------+
 970  969   *       0%                    <- zfs_dirty_data_max ->               100%
 971  970   *
 972  971   * Note that since the delay is added to the outstanding time remaining on the
 973  972   * most recent transaction, the delay is effectively the inverse of IOPS.
 974  973   * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 975  974   * was chosen such that small changes in the amount of accumulated dirty data
 976  975   * in the first 3/4 of the curve yield relatively small differences in the
 977  976   * amount of delay.
 978  977   *
 979  978   * The effects can be easier to understand when the amount of delay is
 980  979   * represented on a log scale:
 981  980   *
 982  981   * delay
 983  982   * 100ms +-------------------------------------------------------------++
 984  983   *       +                                                              +
 985  984   *       |                                                              |
 986  985   *       +                                                             *+
 987  986   *  10ms +                                                             *+
 988  987   *       +                                                           ** +
 989  988   *       |                                              (midpoint)  **  |
 990  989   *       +                                                  |     **    +
 991  990   *   1ms +                                                  v ****      +
 992  991   *       +             zfs_delay_scale ---------->        *****         +
 993  992   *       |                                             ****             |
 994  993   *       +                                          ****                +
 995  994   * 100us +                                        **                    +
 996  995   *       +                                       *                      +
 997  996   *       |                                      *                       |
 998  997   *       +                                     *                        +
 999  998   *  10us +                                     *                        +
1000  999   *       +                                                              +
1001 1000   *       |                                                              |
1002 1001   *       +                                                              +
1003 1002   *       +--------------------------------------------------------------+
1004 1003   *       0%                    <- zfs_dirty_data_max ->               100%
1005 1004   *
1006 1005   * Note here that only as the amount of dirty data approaches its limit does
1007 1006   * the delay start to increase rapidly. The goal of a properly tuned system
1008 1007   * should be to keep the amount of dirty data out of that range by first
1009 1008   * ensuring that the appropriate limits are set for the I/O scheduler to reach
1010 1009   * optimal throughput on the backend storage, and then by changing the value
1011 1010   * of zfs_delay_scale to increase the steepness of the curve.
1012 1011   */
1013 1012  static void
1014 1013  dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1015 1014  {
1016 1015          dsl_pool_t *dp = tx->tx_pool;
1017 1016          uint64_t delay_min_bytes =
1018 1017              zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1019 1018          hrtime_t wakeup, min_tx_time, now;
1020 1019  
                   /* Below the delay threshold there is nothing to do. */
1021 1020          if (dirty <= delay_min_bytes)
1022 1021                  return;
1023 1022  
1024 1023          /*
1025 1024           * The caller has already waited until we are under the max.
1026 1025           * We make them pass us the amount of dirty data so we don't
1027 1026           * have to handle the case of it being >= the max, which could
1028 1027           * cause a divide-by-zero if it's == the max.
1029 1028           */
1030 1029          ASSERT3U(dirty, <, zfs_dirty_data_max);
1031 1030  
1032 1031          now = gethrtime();
1033 1032          min_tx_time = zfs_delay_scale *
1034 1033              (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
                   /*
                    * The tx has already lived longer than the (uncapped) minimum
                    * time, so no further delay is imposed.
                    */
1035 1034          if (now > tx->tx_start + min_tx_time)
1036 1035                  return;
1037 1036  
1038 1037          min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1039 1038  
1040 1039          DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1041 1040              uint64_t, min_tx_time);
1042 1041  
                   /*
                    * Pick the wakeup time under dp_lock: the later of "tx start +
                    * min_tx_time" and "previous waiter's wakeup + min_tx_time",
                    * and publish it as the new dp_last_wakeup for the next waiter.
                    */
1043 1042          mutex_enter(&dp->dp_lock);
1044 1043          wakeup = MAX(tx->tx_start + min_tx_time,
1045 1044              dp->dp_last_wakeup + min_tx_time);
1046 1045          dp->dp_last_wakeup = wakeup;
1047 1046          mutex_exit(&dp->dp_lock);
1048 1047  
1049 1048  #ifdef _KERNEL
                   /*
                    * Sleep until the absolute time 'wakeup', retrying for as long
                    * as cv_timedwait_hires() returns a positive value (presumably
                    * a wakeup before the deadline -- confirm against its
                    * documentation).
                    */
1050 1049          mutex_enter(&curthread->t_delay_lock);
1051 1050          while (cv_timedwait_hires(&curthread->t_delay_cv,
1052 1051              &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
1053 1052              CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
1054 1053                  continue;
1055 1054          mutex_exit(&curthread->t_delay_lock);
1056 1055  #else
                   /*
                    * Userland build: sleep for the remaining interval.
                    * NOTE(review): delta looks like it can be zero or negative if
                    * 'wakeup' has already passed; the nanosleep() result is
                    * discarded either way -- confirm this is acceptable.
                    */
1057 1056          hrtime_t delta = wakeup - gethrtime();
1058 1057          struct timespec ts;
1059 1058          ts.tv_sec = delta / NANOSEC;
1060 1059          ts.tv_nsec = delta % NANOSEC;
1061 1060          (void) nanosleep(&ts, NULL);
1062 1061  #endif
1063 1062  }
1064 1063  
           /*
            * Make one attempt to assign 'tx' to the currently open txg.
            *
            * Returns 0 on success.  Otherwise returns:
            *   - tx->tx_err, if an earlier hold recorded an error;
            *   - EIO or ERESTART, if the pool is suspended (see below);
            *   - ERESTART with tx_wait_dirty set, if the tx has not waited yet
            *     and dsl_pool_need_dirty_delay() asks for a delay;
            *   - ERESTART with tx_needassign_txh set, if a held dnode is still
            *     assigned to the previous txg;
            *   - the error from dsl_dir_tempreserve_space().
            *
            * For the last two cases tx_txg is left set; the caller
            * (dmu_tx_assign()) undoes the partial assignment with
            * dmu_tx_unassign().
            */
1065 1064  static int
1066 1065  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1067 1066  {
1068 1067          dmu_tx_hold_t *txh;
1069 1068          spa_t *spa = tx->tx_pool->dp_spa;
1070 1069          uint64_t memory, asize, fsize, usize;
1071 1070          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1072 1071  
1073 1072          ASSERT0(tx->tx_txg);
1074 1073  
1075 1074          if (tx->tx_err)
1076 1075                  return (tx->tx_err);
1077 1076  
1078 1077          if (spa_suspended(spa)) {
1079 1078                  /*
1080 1079                   * If the user has indicated a blocking failure mode
1081 1080                   * then return ERESTART which will block in dmu_tx_wait().
1082 1081                   * Otherwise, return EIO so that an error can get
1083 1082                   * propagated back to the VOP calls.
1084 1083                   *
1085 1084                   * Note that we always honor the txg_how flag regardless
1086 1085                   * of the failuremode setting.
1087 1086                   */
1088 1087                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1089 1088                      txg_how != TXG_WAIT)
1090 1089                          return (SET_ERROR(EIO));
1091 1090  
1092 1091                  return (SET_ERROR(ERESTART));
1093 1092          }
1094 1093  
                   /*
                    * Ask the caller to go through dmu_tx_wait() first; tx_waited
                    * prevents asking the same tx twice.
                    */
1095 1094          if (!tx->tx_waited &&
1096 1095              dsl_pool_need_dirty_delay(tx->tx_pool)) {
1097 1096                  tx->tx_wait_dirty = B_TRUE;
1098 1097                  return (SET_ERROR(ERESTART));
1099 1098          }
1100 1099  
1101 1100          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1102 1101          tx->tx_needassign_txh = NULL;
1103 1102  
1104 1103          /*
1105 1104           * NB: No error returns are allowed after txg_hold_open, but
1106 1105           * before processing the dnode holds, due to the
1107 1106           * dmu_tx_unassign() logic.
1108 1107           */
1109 1108  
1110 1109          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1111 1110          for (txh = list_head(&tx->tx_holds); txh;
1112 1111              txh = list_next(&tx->tx_holds, txh)) {
1113 1112                  dnode_t *dn = txh->txh_dnode;
1114 1113                  if (dn != NULL) {
1115 1114                          mutex_enter(&dn->dn_mtx);
                                   /*
                                    * The dnode is still assigned to the previous
                                    * txg.  Remember which hold stopped us so that
                                    * dmu_tx_unassign() stops at it and
                                    * dmu_tx_wait() can wait on this dnode.
                                    */
1116 1115                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1117 1116                                  mutex_exit(&dn->dn_mtx);
1118 1117                                  tx->tx_needassign_txh = txh;
1119 1118                                  return (SET_ERROR(ERESTART));
1120 1119                          }
1121 1120                          if (dn->dn_assigned_txg == 0)
1122 1121                                  dn->dn_assigned_txg = tx->tx_txg;
1123 1122                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1124 1123                          (void) refcount_add(&dn->dn_tx_holds, tx);
1125 1124                          mutex_exit(&dn->dn_mtx);
1126 1125                  }
                           /* Sum the per-hold estimates into tx-wide totals. */
1127 1126                  towrite += txh->txh_space_towrite;
1128 1127                  tofree += txh->txh_space_tofree;
1129 1128                  tooverwrite += txh->txh_space_tooverwrite;
1130 1129                  tounref += txh->txh_space_tounref;
1131 1130                  tohold += txh->txh_memory_tohold;
1132 1131                  fudge += txh->txh_fudge;
1133 1132          }
1134 1133  
1135 1134          /*
1136 1135           * If a snapshot has been taken since we made our estimates,
1137 1136           * assume that we won't be able to free or overwrite anything.
1138 1137           */
1139 1138          if (tx->tx_objset &&
1140 1139              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
1141 1140              tx->tx_lastsnap_txg) {
1142 1141                  towrite += tooverwrite;
1143 1142                  tooverwrite = tofree = 0;
1144 1143          }
1145 1144  
1146 1145          /* needed allocation: worst-case estimate of write space */
1147 1146          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
1148 1147          /* freed space estimate: worst-case overwrite + free estimate */
1149 1148          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
1150 1149          /* convert unrefd space to worst-case estimate */
1151 1150          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
1152 1151          /* calculate memory footprint estimate */
1153 1152          memory = towrite + tooverwrite + tohold;
1154 1153  
1155 1154  #ifdef ZFS_DEBUG
1156 1155          /*
1157 1156           * Add in 'tohold' to account for our dirty holds on this memory
1158 1157           * XXX - the "fudge" factor is to account for skipped blocks that
1159 1158           * we missed because dnode_next_offset() misses in-core-only blocks.
1160 1159           */
1161 1160          tx->tx_space_towrite = asize +
1162 1161              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1163 1162          tx->tx_space_tofree = tofree;
1164 1163          tx->tx_space_tooverwrite = tooverwrite;
1165 1164          tx->tx_space_tounref = tounref;
1166 1165  #endif
1167 1166  
                   /*
                    * Reserve the space.  On failure tx_txg is still set and
                    * tx_needassign_txh is NULL, so the caller's dmu_tx_unassign()
                    * walks the whole hold list.
                    */
1168 1167          if (tx->tx_dir && asize != 0) {
1169 1168                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1170 1169                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1171 1170                  if (err)
1172 1171                          return (err);
1173 1172          }
1174 1173  
1175 1174          return (0);
1176 1175  }
1177 1176  
           /*
            * Undo a failed dmu_tx_try_assign().  Does nothing if the tx never
            * got a txg (tx_txg == 0).  Otherwise it releases the txg hold, drops
            * the dn_tx_holds reference taken on each dnode, records the txg in
            * tx_lasttried_txg and resets tx_txg to 0.
            */
1178 1177  static void
1179 1178  dmu_tx_unassign(dmu_tx_t *tx)
1180 1179  {
1181 1180          dmu_tx_hold_t *txh;
1182 1181  
1183 1182          if (tx->tx_txg == 0)
1184 1183                  return;
1185 1184  
1186 1185          txg_rele_to_quiesce(&tx->tx_txgh);
1187 1186  
1188 1187          /*
1189 1188           * Walk the transaction's hold list, removing the hold on the
1190 1189           * associated dnode, and notifying waiters if the refcount drops to 0.
                    *
                    * The walk stops at tx_needassign_txh, the hold at which
                    * dmu_tx_try_assign() gave up (no reference was taken for it or
                    * for any later hold).  When that field is NULL the whole list
                    * is walked.
1191 1190           */
1192 1191          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1193 1192              txh = list_next(&tx->tx_holds, txh)) {
1194 1193                  dnode_t *dn = txh->txh_dnode;
1195 1194  
1196 1195                  if (dn == NULL)
1197 1196                          continue;
1198 1197                  mutex_enter(&dn->dn_mtx);
1199 1198                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1200 1199  
1201 1200                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1202 1201                          dn->dn_assigned_txg = 0;
1203 1202                          cv_broadcast(&dn->dn_notxholds);
1204 1203                  }
1205 1204                  mutex_exit(&dn->dn_mtx);
1206 1205          }
1207 1206  
1208 1207          txg_rele_to_sync(&tx->tx_txgh);
1209 1208  
1210 1209          tx->tx_lasttried_txg = tx->tx_txg;
1211 1210          tx->tx_txg = 0;
1212 1211  }
1213 1212  
1214 1213  /*
1215 1214   * Assign tx to a transaction group.  txg_how can be one of:
1216 1215   *
1217 1216   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1218 1217   *      a new one.  This should be used when you're not holding locks.
1219 1218   *      It will only fail if we're truly out of space (or over quota).
1220 1219   *
1221 1220   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1222 1221   *      blocking, returns immediately with ERESTART.  This should be used
1223 1222   *      whenever you're holding locks.  On an ERESTART error, the caller
1224 1223   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1225 1224   *
1226 1225   * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1227 1226   *      has already been called on behalf of this operation (though
1228 1227   *      most likely on a different tx).
            *
            * Returns 0 once the tx is assigned, or the error from
            * dmu_tx_try_assign().  After every failed attempt the partial
            * assignment is undone with dmu_tx_unassign(); only ERESTART under
            * TXG_WAIT leads to dmu_tx_wait() and another attempt.
1229 1228   */
1230 1229  int
1231 1230  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1232 1231  {
1233 1232          int err;
1234 1233  
1235 1234          ASSERT(tx->tx_txg == 0);
1236 1235          ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1237 1236              txg_how == TXG_WAITED);
1238 1237          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1239 1238  
1240 1239          /* If we might wait, we must not hold the config lock. */
1241 1240          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1242 1241  
                   /* Setting tx_waited makes dmu_tx_try_assign() skip the dirty-delay check. */
1243 1242          if (txg_how == TXG_WAITED)
1244 1243                  tx->tx_waited = B_TRUE;
1245 1244  
1246 1245          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1247 1246                  dmu_tx_unassign(tx);
1248 1247  
1249 1248                  if (err != ERESTART || txg_how != TXG_WAIT)
1250 1249                          return (err);
1251 1250  
1252 1251                  dmu_tx_wait(tx);
1253 1252          }
1254 1253  
1255 1254          txg_rele_to_quiesce(&tx->tx_txgh);
1256 1255  
1257 1256          return (0);
1258 1257  }
1259 1258  
           /*
            * Block until it is worth retrying the assignment of 'tx'.  The tx must
            * be unassigned and the caller must not hold the pool config lock.
            * What is waited for depends on why the last attempt failed:
            *   - tx_wait_dirty set: wait for dirty data to drop below
            *     zfs_dirty_data_max, then apply dmu_tx_delay();
            *   - pool suspended, or no txg was ever tried: wait for the next
            *     txg to sync;
            *   - tx_needassign_txh set: wait for that dnode to leave the txg
            *     before the one last tried;
            *   - otherwise: wait for the txg after the last tried one to open.
            */
1260 1259  void
1261 1260  dmu_tx_wait(dmu_tx_t *tx)
1262 1261  {
1263 1262          spa_t *spa = tx->tx_pool->dp_spa;
1264 1263          dsl_pool_t *dp = tx->tx_pool;
1265 1264  
1266 1265          ASSERT(tx->tx_txg == 0);
1267 1266          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1268 1267  
1269 1268          if (tx->tx_wait_dirty) {
1270 1269                  /*
1271 1270                   * dmu_tx_try_assign() has determined that we need to wait
1272 1271                   * because we've consumed much or all of the dirty buffer
1273 1272                   * space.
1274 1273                   */
1275 1274                  mutex_enter(&dp->dp_lock);
1276 1275                  while (dp->dp_dirty_total >= zfs_dirty_data_max)
1277 1276                          cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
                           /* Sample the total under the lock; it is below the max here. */
1278 1277                  uint64_t dirty = dp->dp_dirty_total;
1279 1278                  mutex_exit(&dp->dp_lock);
1280 1279  
1281 1280                  dmu_tx_delay(tx, dirty);
1282 1281  
1283 1282                  tx->tx_wait_dirty = B_FALSE;
1284 1283  
1285 1284                  /*
1286 1285                   * Note: setting tx_waited only has effect if the caller
1287 1286                   * used TXG_WAIT.  Otherwise they are going to destroy
1288 1287                   * this tx and try again.  The common case, zfs_write(),
1289 1288                   * uses TXG_WAIT.
1290 1289                   */
1291 1290                  tx->tx_waited = B_TRUE;
1292 1291          } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1293 1292                  /*
1294 1293                   * If the pool is suspended we need to wait until it
1295 1294                   * is resumed.  Note that it's possible that the pool
1296 1295                   * has become active after this thread has tried to
1297 1296                   * obtain a tx.  If that's the case then tx_lasttried_txg
1298 1297                   * would not have been set.
1299 1298                   */
1300 1299                  txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1301 1300          } else if (tx->tx_needassign_txh) {
1302 1301                  /*
1303 1302                   * A dnode is assigned to the quiescing txg.  Wait for its
1304 1303                   * transaction to complete.
1305 1304                   */
1306 1305                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1307 1306  
1308 1307                  mutex_enter(&dn->dn_mtx);
1309 1308                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1310 1309                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1311 1310                  mutex_exit(&dn->dn_mtx);
1312 1311                  tx->tx_needassign_txh = NULL;
1313 1312          } else {
1314 1313                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1315 1314          }
1316 1315  }
1317 1316  
1318 1317  void
1319 1318  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1320 1319  {
1321 1320  #ifdef ZFS_DEBUG
1322 1321          if (tx->tx_dir == NULL || delta == 0)
1323 1322                  return;
1324 1323  
1325 1324          if (delta > 0) {
1326 1325                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1327 1326                      tx->tx_space_towrite);
1328 1327                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1329 1328          } else {
1330 1329                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1331 1330          }
1332 1331  #endif
1333 1332  }
1334 1333  
1335 1334  void
1336 1335  dmu_tx_commit(dmu_tx_t *tx)
1337 1336  {
1338 1337          dmu_tx_hold_t *txh;
1339 1338  
1340 1339          ASSERT(tx->tx_txg != 0);
1341 1340  
1342 1341          /*
1343 1342           * Go through the transaction's hold list and remove holds on
1344 1343           * associated dnodes, notifying waiters if no holds remain.
1345 1344           */
1346 1345          while (txh = list_head(&tx->tx_holds)) {
1347 1346                  dnode_t *dn = txh->txh_dnode;
1348 1347  
1349 1348                  list_remove(&tx->tx_holds, txh);
1350 1349                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1351 1350                  if (dn == NULL)
1352 1351                          continue;
1353 1352                  mutex_enter(&dn->dn_mtx);
1354 1353                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1355 1354  
1356 1355                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1357 1356                          dn->dn_assigned_txg = 0;
1358 1357                          cv_broadcast(&dn->dn_notxholds);
1359 1358                  }
1360 1359                  mutex_exit(&dn->dn_mtx);
1361 1360                  dnode_rele(dn, tx);
1362 1361          }
1363 1362  
1364 1363          if (tx->tx_tempreserve_cookie)
1365 1364                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1366 1365  
1367 1366          if (!list_is_empty(&tx->tx_callbacks))
1368 1367                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1369 1368  
1370 1369          if (tx->tx_anyobj == FALSE)
1371 1370                  txg_rele_to_sync(&tx->tx_txgh);
1372 1371  
1373 1372          list_destroy(&tx->tx_callbacks);
1374 1373          list_destroy(&tx->tx_holds);
1375 1374  #ifdef ZFS_DEBUG
1376 1375          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1377 1376              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1378 1377              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1379 1378          refcount_destroy_many(&tx->tx_space_written,
1380 1379              refcount_count(&tx->tx_space_written));
1381 1380          refcount_destroy_many(&tx->tx_space_freed,
1382 1381              refcount_count(&tx->tx_space_freed));
1383 1382  #endif
1384 1383          kmem_free(tx, sizeof (dmu_tx_t));
1385 1384  }
1386 1385  
1387 1386  void
1388 1387  dmu_tx_abort(dmu_tx_t *tx)
1389 1388  {
1390 1389          dmu_tx_hold_t *txh;
1391 1390  
1392 1391          ASSERT(tx->tx_txg == 0);
1393 1392  
1394 1393          while (txh = list_head(&tx->tx_holds)) {
1395 1394                  dnode_t *dn = txh->txh_dnode;
1396 1395  
1397 1396                  list_remove(&tx->tx_holds, txh);
1398 1397                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1399 1398                  if (dn != NULL)
1400 1399                          dnode_rele(dn, tx);
1401 1400          }
1402 1401  
1403 1402          /*
1404 1403           * Call any registered callbacks with an error code.
1405 1404           */
1406 1405          if (!list_is_empty(&tx->tx_callbacks))
1407 1406                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1408 1407  
1409 1408          list_destroy(&tx->tx_callbacks);
1410 1409          list_destroy(&tx->tx_holds);
1411 1410  #ifdef ZFS_DEBUG
1412 1411          refcount_destroy_many(&tx->tx_space_written,
1413 1412              refcount_count(&tx->tx_space_written));
1414 1413          refcount_destroy_many(&tx->tx_space_freed,
1415 1414              refcount_count(&tx->tx_space_freed));
1416 1415  #endif
1417 1416          kmem_free(tx, sizeof (dmu_tx_t));
1418 1417  }
1419 1418  
1420 1419  uint64_t
1421 1420  dmu_tx_get_txg(dmu_tx_t *tx)
1422 1421  {
1423 1422          ASSERT(tx->tx_txg != 0);
1424 1423          return (tx->tx_txg);
1425 1424  }
1426 1425  
1427 1426  dsl_pool_t *
1428 1427  dmu_tx_pool(dmu_tx_t *tx)
1429 1428  {
1430 1429          ASSERT(tx->tx_pool != NULL);
1431 1430          return (tx->tx_pool);
1432 1431  }
1433 1432  
1434 1433  
1435 1434  void
1436 1435  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1437 1436  {
1438 1437          dmu_tx_callback_t *dcb;
1439 1438  
1440 1439          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1441 1440  
1442 1441          dcb->dcb_func = func;
1443 1442          dcb->dcb_data = data;
1444 1443  
1445 1444          list_insert_tail(&tx->tx_callbacks, dcb);
1446 1445  }
1447 1446  
1448 1447  /*
1449 1448   * Call all the commit callbacks on a list, with a given error code.
1450 1449   */
1451 1450  void
1452 1451  dmu_tx_do_callbacks(list_t *cb_list, int error)
1453 1452  {
1454 1453          dmu_tx_callback_t *dcb;
1455 1454  
1456 1455          while (dcb = list_head(cb_list)) {
1457 1456                  list_remove(cb_list, dcb);
1458 1457                  dcb->dcb_func(dcb->dcb_data, error);
1459 1458                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1460 1459          }
1461 1460  }
1462 1461  
/*
 * Interface to hold a bunch of attributes; used when creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
 */
1471 1470  
1472 1471  /*
1473 1472   * hold necessary attribute name for attribute registration.
1474 1473   * should be a very rare case where this is needed.  If it does
1475 1474   * happen it would only happen on the first write to the file system.
1476 1475   */
1477 1476  static void
1478 1477  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1479 1478  {
1480 1479          int i;
1481 1480  
1482 1481          if (!sa->sa_need_attr_registration)
1483 1482                  return;
1484 1483  
1485 1484          for (i = 0; i != sa->sa_num_attrs; i++) {
1486 1485                  if (!sa->sa_attr_table[i].sa_registered) {
1487 1486                          if (sa->sa_reg_attr_obj)
1488 1487                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1489 1488                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1490 1489                          else
1491 1490                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1492 1491                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1493 1492                  }
1494 1493          }
1495 1494  }
1496 1495  
1497 1496  
1498 1497  void
1499 1498  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1500 1499  {
1501 1500          dnode_t *dn;
1502 1501          dmu_tx_hold_t *txh;
1503 1502  
1504 1503          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1505 1504              THT_SPILL, 0, 0);
1506 1505  
1507 1506          dn = txh->txh_dnode;
1508 1507  
1509 1508          if (dn == NULL)
1510 1509                  return;
1511 1510  
1512 1511          /* If blkptr doesn't exist then add space to towrite */
1513 1512          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1514 1513                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1515 1514          } else {
1516 1515                  blkptr_t *bp;
1517 1516  
1518 1517                  bp = &dn->dn_phys->dn_spill;
1519 1518                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1520 1519                      bp, bp->blk_birth))
1521 1520                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1522 1521                  else
1523 1522                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1524 1523                  if (!BP_IS_HOLE(bp))
1525 1524                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1526 1525          }
1527 1526  }
1528 1527  
1529 1528  void
1530 1529  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1531 1530  {
1532 1531          sa_os_t *sa = tx->tx_objset->os_sa;
1533 1532  
1534 1533          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1535 1534  
1536 1535          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1537 1536                  return;
1538 1537  
1539 1538          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1540 1539                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1541 1540          else {
1542 1541                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1543 1542                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1544 1543                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1545 1544                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1546 1545          }
1547 1546  
1548 1547          dmu_tx_sa_registration_hold(sa, tx);
1549 1548  
1550 1549          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1551 1550                  return;
1552 1551  
1553 1552          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1554 1553              THT_SPILL, 0, 0);
1555 1554  }
1556 1555  
1557 1556  /*
1558 1557   * Hold SA attribute
1559 1558   *
1560 1559   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1561 1560   *
1562 1561   * variable_size is the total size of all variable sized attributes
1563 1562   * passed to this function.  It is not the total size of all
1564 1563   * variable size attributes that *may* exist on this object.
1565 1564   */
1566 1565  void
1567 1566  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1568 1567  {
1569 1568          uint64_t object;
1570 1569          sa_os_t *sa = tx->tx_objset->os_sa;
1571 1570  
1572 1571          ASSERT(hdl != NULL);
1573 1572  
1574 1573          object = sa_handle_object(hdl);
1575 1574  
1576 1575          dmu_tx_hold_bonus(tx, object);
1577 1576  
1578 1577          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1579 1578                  return;
1580 1579  
1581 1580          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1582 1581              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1583 1582                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1584 1583                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1585 1584                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1586 1585                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1587 1586          }
1588 1587  
1589 1588          dmu_tx_sa_registration_hold(sa, tx);
1590 1589  
1591 1590          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1592 1591                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1593 1592  
1594 1593          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1595 1594                  ASSERT(tx->tx_txg == 0);
1596 1595                  dmu_tx_hold_spill(tx, object);
1597 1596          } else {
1598 1597                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1599 1598                  dnode_t *dn;
1600 1599  
1601 1600                  DB_DNODE_ENTER(db);
1602 1601                  dn = DB_DNODE(db);
1603 1602                  if (dn->dn_have_spill) {
1604 1603                          ASSERT(tx->tx_txg == 0);
1605 1604                          dmu_tx_hold_spill(tx, object);
1606 1605                  }
1607 1606                  DB_DNODE_EXIT(db);
1608 1607          }
1609 1608  }

↓ open down ↓

979 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX