dlpx-os-diff Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

4188 assertion failed in dmu_tx_hold_free(): dn_datablkshift != 0
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57          tx->tx_start = gethrtime();
  58   58  #ifdef ZFS_DEBUG
  59   59          refcount_create(&tx->tx_space_written);
  60   60          refcount_create(&tx->tx_space_freed);
  61   61  #endif
  62   62          return (tx);
  63   63  }
  64   64  
  65   65  dmu_tx_t *
  66   66  dmu_tx_create(objset_t *os)
  67   67  {
  68   68          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  69   69          tx->tx_objset = os;
  70   70          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  71   71          return (tx);
  72   72  }
  73   73  
  74   74  dmu_tx_t *
  75   75  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  76   76  {
  77   77          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  78   78  
  79   79          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  80   80          tx->tx_pool = dp;
  81   81          tx->tx_txg = txg;
  82   82          tx->tx_anyobj = TRUE;
  83   83  
  84   84          return (tx);
  85   85  }
  86   86  
  87   87  int
  88   88  dmu_tx_is_syncing(dmu_tx_t *tx)
  89   89  {
  90   90          return (tx->tx_anyobj);
  91   91  }
  92   92  
  93   93  int
  94   94  dmu_tx_private_ok(dmu_tx_t *tx)
  95   95  {
  96   96          return (tx->tx_anyobj);
  97   97  }
  98   98  
  99   99  static dmu_tx_hold_t *
 100  100  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 101  101      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 102  102  {
 103  103          dmu_tx_hold_t *txh;
 104  104          dnode_t *dn = NULL;
 105  105          int err;
 106  106  
 107  107          if (object != DMU_NEW_OBJECT) {
 108  108                  err = dnode_hold(os, object, tx, &dn);
 109  109                  if (err) {
 110  110                          tx->tx_err = err;
 111  111                          return (NULL);
 112  112                  }
 113  113  
 114  114                  if (err == 0 && tx->tx_txg != 0) {
 115  115                          mutex_enter(&dn->dn_mtx);
 116  116                          /*
 117  117                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 118  118                           * problem, but there's no way for it to happen (for
 119  119                           * now, at least).
 120  120                           */
 121  121                          ASSERT(dn->dn_assigned_txg == 0);
 122  122                          dn->dn_assigned_txg = tx->tx_txg;
 123  123                          (void) refcount_add(&dn->dn_tx_holds, tx);
 124  124                          mutex_exit(&dn->dn_mtx);
 125  125                  }
 126  126          }
 127  127  
 128  128          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 129  129          txh->txh_tx = tx;
 130  130          txh->txh_dnode = dn;
 131  131  #ifdef ZFS_DEBUG
 132  132          txh->txh_type = type;
 133  133          txh->txh_arg1 = arg1;
 134  134          txh->txh_arg2 = arg2;
 135  135  #endif
 136  136          list_insert_tail(&tx->tx_holds, txh);
 137  137  
 138  138          return (txh);
 139  139  }
 140  140  
 141  141  void
 142  142  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 143  143  {
 144  144          /*
 145  145           * If we're syncing, they can manipulate any object anyhow, and
 146  146           * the hold on the dnode_t can cause problems.
 147  147           */
 148  148          if (!dmu_tx_is_syncing(tx)) {
 149  149                  (void) dmu_tx_hold_object_impl(tx, os,
 150  150                      object, THT_NEWOBJECT, 0, 0);
 151  151          }
 152  152  }
 153  153  
 154  154  static int
 155  155  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 156  156  {
 157  157          int err;
 158  158          dmu_buf_impl_t *db;
 159  159  
 160  160          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 161  161          db = dbuf_hold_level(dn, level, blkid, FTAG);
 162  162          rw_exit(&dn->dn_struct_rwlock);
 163  163          if (db == NULL)
 164  164                  return (SET_ERROR(EIO));
 165  165          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 166  166          dbuf_rele(db, FTAG);
 167  167          return (err);
 168  168  }
 169  169  
 170  170  static void
 171  171  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 172  172      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 173  173  {
 174  174          objset_t *os = dn->dn_objset;
 175  175          dsl_dataset_t *ds = os->os_dsl_dataset;
 176  176          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 177  177          dmu_buf_impl_t *parent = NULL;
 178  178          blkptr_t *bp = NULL;
 179  179          uint64_t space;
 180  180  
 181  181          if (level >= dn->dn_nlevels || history[level] == blkid)
 182  182                  return;
 183  183  
 184  184          history[level] = blkid;
 185  185  
 186  186          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 187  187  
 188  188          if (db == NULL || db == dn->dn_dbuf) {
 189  189                  ASSERT(level != 0);
 190  190                  db = NULL;
 191  191          } else {
 192  192                  ASSERT(DB_DNODE(db) == dn);
 193  193                  ASSERT(db->db_level == level);
 194  194                  ASSERT(db->db.db_size == space);
 195  195                  ASSERT(db->db_blkid == blkid);
 196  196                  bp = db->db_blkptr;
 197  197                  parent = db->db_parent;
 198  198          }
 199  199  
 200  200          freeable = (bp && (freeable ||
 201  201              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 202  202  
 203  203          if (freeable)
 204  204                  txh->txh_space_tooverwrite += space;
 205  205          else
 206  206                  txh->txh_space_towrite += space;
 207  207          if (bp)
 208  208                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 209  209  
 210  210          dmu_tx_count_twig(txh, dn, parent, level + 1,
 211  211              blkid >> epbs, freeable, history);
 212  212  }
 213  213  
 214  214  /* ARGSUSED */
 215  215  static void
 216  216  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 217  217  {
 218  218          dnode_t *dn = txh->txh_dnode;
 219  219          uint64_t start, end, i;
 220  220          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 221  221          int err = 0;
 222  222  
 223  223          if (len == 0)
 224  224                  return;
 225  225  
 226  226          min_bs = SPA_MINBLOCKSHIFT;
 227  227          max_bs = SPA_MAXBLOCKSHIFT;
 228  228          min_ibs = DN_MIN_INDBLKSHIFT;
 229  229          max_ibs = DN_MAX_INDBLKSHIFT;
 230  230  
 231  231          if (dn) {
 232  232                  uint64_t history[DN_MAX_LEVELS];
 233  233                  int nlvls = dn->dn_nlevels;
 234  234                  int delta;
 235  235  
 236  236                  /*
 237  237                   * For i/o error checking, read the first and last level-0
 238  238                   * blocks (if they are not aligned), and all the level-1 blocks.
 239  239                   */
 240  240                  if (dn->dn_maxblkid == 0) {
 241  241                          delta = dn->dn_datablksz;
 242  242                          start = (off < dn->dn_datablksz) ? 0 : 1;
 243  243                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 244  244                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 245  245                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 246  246                                  if (err)
 247  247                                          goto out;
 248  248                                  delta -= off;
 249  249                          }
 250  250                  } else {
 251  251                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 252  252                              NULL, NULL, ZIO_FLAG_CANFAIL);
 253  253  
 254  254                          /* first level-0 block */
 255  255                          start = off >> dn->dn_datablkshift;
 256  256                          if (P2PHASE(off, dn->dn_datablksz) ||
 257  257                              len < dn->dn_datablksz) {
 258  258                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 259  259                                  if (err)
 260  260                                          goto out;
 261  261                          }
 262  262  
 263  263                          /* last level-0 block */
 264  264                          end = (off+len-1) >> dn->dn_datablkshift;
 265  265                          if (end != start && end <= dn->dn_maxblkid &&
 266  266                              P2PHASE(off+len, dn->dn_datablksz)) {
 267  267                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 268  268                                  if (err)
 269  269                                          goto out;
 270  270                          }
 271  271  
 272  272                          /* level-1 blocks */
 273  273                          if (nlvls > 1) {
 274  274                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 275  275                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 276  276                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 277  277                                          if (err)
 278  278                                                  goto out;
 279  279                                  }
 280  280                          }
 281  281  
 282  282                          err = zio_wait(zio);
 283  283                          if (err)
 284  284                                  goto out;
 285  285                          delta = P2NPHASE(off, dn->dn_datablksz);
 286  286                  }
 287  287  
 288  288                  min_ibs = max_ibs = dn->dn_indblkshift;
 289  289                  if (dn->dn_maxblkid > 0) {
 290  290                          /*
 291  291                           * The blocksize can't change,
 292  292                           * so we can make a more precise estimate.
 293  293                           */
 294  294                          ASSERT(dn->dn_datablkshift != 0);
 295  295                          min_bs = max_bs = dn->dn_datablkshift;
 296  296                  }
 297  297  
 298  298                  /*
 299  299                   * If this write is not off the end of the file
 300  300                   * we need to account for overwrites/unref.
 301  301                   */
 302  302                  if (start <= dn->dn_maxblkid) {
 303  303                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 304  304                                  history[l] = -1ULL;
 305  305                  }
 306  306                  while (start <= dn->dn_maxblkid) {
 307  307                          dmu_buf_impl_t *db;
 308  308  
 309  309                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 310  310                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 311  311                          rw_exit(&dn->dn_struct_rwlock);
 312  312  
 313  313                          if (err) {
 314  314                                  txh->txh_tx->tx_err = err;
 315  315                                  return;
 316  316                          }
 317  317  
 318  318                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 319  319                              history);
 320  320                          dbuf_rele(db, FTAG);
 321  321                          if (++start > end) {
 322  322                                  /*
 323  323                                   * Account for new indirects appearing
 324  324                                   * before this IO gets assigned into a txg.
 325  325                                   */
 326  326                                  bits = 64 - min_bs;
 327  327                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 328  328                                  for (bits -= epbs * (nlvls - 1);
 329  329                                      bits >= 0; bits -= epbs)
 330  330                                          txh->txh_fudge += 1ULL << max_ibs;
 331  331                                  goto out;
 332  332                          }
 333  333                          off += delta;
 334  334                          if (len >= delta)
 335  335                                  len -= delta;
 336  336                          delta = dn->dn_datablksz;
 337  337                  }
 338  338          }
 339  339  
 340  340          /*
 341  341           * 'end' is the last thing we will access, not one past.
 342  342           * This way we won't overflow when accessing the last byte.
 343  343           */
 344  344          start = P2ALIGN(off, 1ULL << max_bs);
 345  345          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 346  346          txh->txh_space_towrite += end - start + 1;
 347  347  
 348  348          start >>= min_bs;
 349  349          end >>= min_bs;
 350  350  
 351  351          epbs = min_ibs - SPA_BLKPTRSHIFT;
 352  352  
 353  353          /*
 354  354           * The object contains at most 2^(64 - min_bs) blocks,
 355  355           * and each indirect level maps 2^epbs.
 356  356           */
 357  357          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 358  358                  start >>= epbs;
 359  359                  end >>= epbs;
 360  360                  ASSERT3U(end, >=, start);
 361  361                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 362  362                  if (start != 0) {
 363  363                          /*
 364  364                           * We also need a new blkid=0 indirect block
 365  365                           * to reference any existing file data.
 366  366                           */
 367  367                          txh->txh_space_towrite += 1ULL << max_ibs;
 368  368                  }
 369  369          }
 370  370  
 371  371  out:
 372  372          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 373  373              2 * DMU_MAX_ACCESS)
 374  374                  err = SET_ERROR(EFBIG);
 375  375  
 376  376          if (err)
 377  377                  txh->txh_tx->tx_err = err;
 378  378  }
 379  379  
 380  380  static void
 381  381  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 382  382  {
 383  383          dnode_t *dn = txh->txh_dnode;
 384  384          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 385  385          uint64_t space = mdn->dn_datablksz +
 386  386              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 387  387  
 388  388          if (dn && dn->dn_dbuf->db_blkptr &&
 389  389              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 390  390              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 391  391                  txh->txh_space_tooverwrite += space;
 392  392                  txh->txh_space_tounref += space;
 393  393          } else {
 394  394                  txh->txh_space_towrite += space;
 395  395                  if (dn && dn->dn_dbuf->db_blkptr)
 396  396                          txh->txh_space_tounref += space;
 397  397          }
 398  398  }
 399  399  
 400  400  void
 401  401  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 402  402  {
 403  403          dmu_tx_hold_t *txh;
 404  404  
 405  405          ASSERT(tx->tx_txg == 0);
 406  406          ASSERT(len < DMU_MAX_ACCESS);
 407  407          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 408  408  
 409  409          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 410  410              object, THT_WRITE, off, len);
 411  411          if (txh == NULL)
 412  412                  return;
 413  413  
 414  414          dmu_tx_count_write(txh, off, len);
 415  415          dmu_tx_count_dnode(txh);
 416  416  }
 417  417  
 418  418  static void
 419  419  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 420  420  {
 421  421          uint64_t blkid, nblks, lastblk;
 422  422          uint64_t space = 0, unref = 0, skipped = 0;
 423  423          dnode_t *dn = txh->txh_dnode;
 424  424          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 425  425          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 426  426          int epbs;
 427  427          uint64_t l0span = 0, nl1blks = 0;
 428  428  
 429  429          if (dn->dn_nlevels == 0)
 430  430                  return;
 431  431  
 432  432          /*
 433  433           * The struct_rwlock protects us against dn_nlevels
 434  434           * changing, in case (against all odds) we manage to dirty &
 435  435           * sync out the changes after we check for being dirty.
 436  436           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 437  437           */
 438  438          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 439  439          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 440  440          if (dn->dn_maxblkid == 0) {
 441  441                  if (off == 0 && len >= dn->dn_datablksz) {
 442  442                          blkid = 0;
 443  443                          nblks = 1;
 444  444                  } else {
 445  445                          rw_exit(&dn->dn_struct_rwlock);
 446  446                          return;
 447  447                  }
 448  448          } else {
 449  449                  blkid = off >> dn->dn_datablkshift;
 450  450                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 451  451  
 452  452                  if (blkid > dn->dn_maxblkid) {
 453  453                          rw_exit(&dn->dn_struct_rwlock);
 454  454                          return;
 455  455                  }
 456  456                  if (blkid + nblks > dn->dn_maxblkid)
 457  457                          nblks = dn->dn_maxblkid - blkid + 1;
 458  458  
 459  459          }
 460  460          l0span = nblks;    /* save for later use to calc level > 1 overhead */
 461  461          if (dn->dn_nlevels == 1) {
 462  462                  int i;
 463  463                  for (i = 0; i < nblks; i++) {
 464  464                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 465  465                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 466  466                          bp += blkid + i;
 467  467                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 468  468                                  dprintf_bp(bp, "can free old%s", "");
 469  469                                  space += bp_get_dsize(spa, bp);
 470  470                          }
 471  471                          unref += BP_GET_ASIZE(bp);
 472  472                  }
 473  473                  nl1blks = 1;
 474  474                  nblks = 0;
 475  475          }
 476  476  
 477  477          lastblk = blkid + nblks - 1;
 478  478          while (nblks) {
 479  479                  dmu_buf_impl_t *dbuf;
 480  480                  uint64_t ibyte, new_blkid;
 481  481                  int epb = 1 << epbs;
 482  482                  int err, i, blkoff, tochk;
 483  483                  blkptr_t *bp;
 484  484  
 485  485                  ibyte = blkid << dn->dn_datablkshift;
 486  486                  err = dnode_next_offset(dn,
 487  487                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 488  488                  new_blkid = ibyte >> dn->dn_datablkshift;
 489  489                  if (err == ESRCH) {
 490  490                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 491  491                          break;
 492  492                  }
 493  493                  if (err) {
 494  494                          txh->txh_tx->tx_err = err;
 495  495                          break;
 496  496                  }
 497  497                  if (new_blkid > lastblk) {
 498  498                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 499  499                          break;
 500  500                  }
 501  501  
 502  502                  if (new_blkid > blkid) {
 503  503                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 504  504                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 505  505                          nblks -= new_blkid - blkid;
 506  506                          blkid = new_blkid;
 507  507                  }
 508  508                  blkoff = P2PHASE(blkid, epb);
 509  509                  tochk = MIN(epb - blkoff, nblks);
 510  510  
 511  511                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 512  512                  if (err) {
 513  513                          txh->txh_tx->tx_err = err;
 514  514                          break;
 515  515                  }
 516  516  
 517  517                  txh->txh_memory_tohold += dbuf->db.db_size;
 518  518  
 519  519                  /*
 520  520                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 521  521                   * memory_tohold is an over-estimation (especially the >L1
 522  522                   * indirect blocks), so it could fail.  Callers should have
 523  523                   * already verified that they will not be holding too much
 524  524                   * memory.
 525  525                   */
 526  526  
 527  527                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 528  528                  if (err != 0) {
 529  529                          txh->txh_tx->tx_err = err;
 530  530                          dbuf_rele(dbuf, FTAG);
 531  531                          break;
 532  532                  }
 533  533  
 534  534                  bp = dbuf->db.db_data;
 535  535                  bp += blkoff;
 536  536  
 537  537                  for (i = 0; i < tochk; i++) {
 538  538                          if (dsl_dataset_block_freeable(ds, &bp[i],
 539  539                              bp[i].blk_birth)) {
 540  540                                  dprintf_bp(&bp[i], "can free old%s", "");
 541  541                                  space += bp_get_dsize(spa, &bp[i]);
 542  542                          }
 543  543                          unref += BP_GET_ASIZE(bp);
 544  544                  }
 545  545                  dbuf_rele(dbuf, FTAG);
 546  546  
 547  547                  ++nl1blks;
 548  548                  blkid += tochk;
 549  549                  nblks -= tochk;
 550  550          }
 551  551          rw_exit(&dn->dn_struct_rwlock);
 552  552  
 553  553          /*
 554  554           * Add in memory requirements of higher-level indirects.
 555  555           * This assumes a worst-possible scenario for dn_nlevels and a
 556  556           * worst-possible distribution of l1-blocks over the region to free.
 557  557           */
 558  558          {
 559  559                  uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 560  560                  int level = 2;
 561  561                  /*
 562  562                   * Here we don't use DN_MAX_LEVEL, but calculate it with the
 563  563                   * given datablkshift and indblkshift. This makes the
 564  564                   * difference between 19 and 8 on large files.
 565  565                   */
 566  566                  int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 567  567                      (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 568  568  
 569  569                  while (level++ < maxlevel) {
 570  570                          txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 571  571                              << dn->dn_indblkshift;
 572  572                          blkcnt = 1 + (blkcnt >> epbs);
 573  573                  }
 574  574          }
 575  575  
 576  576          /* account for new level 1 indirect blocks that might show up */
 577  577          if (skipped > 0) {
 578  578                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 579  579                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 580  580                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 581  581          }
 582  582          txh->txh_space_tofree += space;
 583  583          txh->txh_space_tounref += unref;
 584  584  }
 585  585  
 586  586  void
 587  587  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 588  588  {
 589  589          dmu_tx_hold_t *txh;
 590  590          dnode_t *dn;
 591  591          int err;
 592  592          zio_t *zio;
 593  593  
 594  594          ASSERT(tx->tx_txg == 0);
 595  595  
 596  596          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 597  597              object, THT_FREE, off, len);
 598  598          if (txh == NULL)
 599  599                  return;
 600  600          dn = txh->txh_dnode;
 601  601          dmu_tx_count_dnode(txh);
 602  602  
 603  603          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 604  604                  return;
 605  605          if (len == DMU_OBJECT_END)
 606  606                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 607  607  
 608  608          /*
 609  609           * For i/o error checking, we read the first and last level-0
 610  610           * blocks if they are not aligned, and all the level-1 blocks.
 611  611           *
 612  612           * Note:  dbuf_free_range() assumes that we have not instantiated
 613  613           * any level-0 dbufs that will be completely freed.  Therefore we must
 614  614           * exercise care to not read or count the first and last blocks
 615  615           * if they are blocksize-aligned.
 616  616           */
 617  617          if (dn->dn_datablkshift == 0) {
 618  618                  if (off != 0 || len < dn->dn_datablksz)
 619  619                          dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 620  620          } else {
 621  621                  /* first block will be modified if it is not aligned */
 622  622                  if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 623  623                          dmu_tx_count_write(txh, off, 1);
 624  624                  /* last block will be modified if it is not aligned */
 625  625                  if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 626  626                          dmu_tx_count_write(txh, off+len, 1);
 627  627          }

↓ open down ↓

627 lines elided

↑ open up ↑

 628  628  
 629  629          /*
 630  630           * Check level-1 blocks.
 631  631           */
 632  632          if (dn->dn_nlevels > 1) {
 633  633                  int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 634  634                      SPA_BLKPTRSHIFT;
 635  635                  uint64_t start = off >> shift;
 636  636                  uint64_t end = (off + len) >> shift;
 637  637  
 638      -                ASSERT(dn->dn_datablkshift != 0);
 639  638                  ASSERT(dn->dn_indblkshift != 0);
 640  639  
      640 +                /*
      641 +                 * dnode_reallocate() can result in an object with indirect
      642 +                 * blocks having an odd data block size.  In this case,
      643 +                 * just check the single block.
      644 +                 */
      645 +                if (dn->dn_datablkshift == 0)
      646 +                        start = end = 0;
      647 +
 641  648                  zio = zio_root(tx->tx_pool->dp_spa,
 642  649                      NULL, NULL, ZIO_FLAG_CANFAIL);
 643  650                  for (uint64_t i = start; i <= end; i++) {
 644  651                          uint64_t ibyte = i << shift;
 645  652                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 646  653                          i = ibyte >> shift;
 647  654                          if (err == ESRCH)
 648  655                                  break;
 649  656                          if (err) {
 650  657                                  tx->tx_err = err;

 651  658                                  return;
 652  659                          }
 653  660  
 654  661                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 655  662                          if (err) {
 656  663                                  tx->tx_err = err;
 657  664                                  return;
 658  665                          }
 659  666                  }
 660  667                  err = zio_wait(zio);
 661  668                  if (err) {
 662  669                          tx->tx_err = err;
 663  670                          return;
 664  671                  }
 665  672          }
 666  673  
 667  674          dmu_tx_count_free(txh, off, len);
 668  675  }
 669  676  
           /*
            * Add a THT_ZAP hold on `object` to a not-yet-assigned tx and estimate
            * the space that updating the ZAP may require.
            *
            * `add` and `name` are recorded as the hold's arguments and are also
            * passed to zap_count_write().  `name` may be NULL, in which case the
            * zap_lookup() i/o-error probe below is skipped.  Nothing is returned:
            * errors detected here are stored in tx->tx_err, and if no hold could
            * be created (dmu_tx_hold_object_impl() returned NULL) the function
            * simply returns.
            */
 670  677  void
 671  678  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 672  679  {
 673  680          dmu_tx_hold_t *txh;
 674  681          dnode_t *dn;
 675  682          uint64_t nblocks;
 676  683          int epbs, err;
 677  684  
 678  685          ASSERT(tx->tx_txg == 0);
 679  686  
 680  687          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 681  688              object, THT_ZAP, add, (uintptr_t)name);
 682  689          if (txh == NULL)
 683  690                  return;
 684  691          dn = txh->txh_dnode;
 685  692  
 686  693          dmu_tx_count_dnode(txh);
 687  694  
 688  695          if (dn == NULL) {
 689  696                  /*
 690  697                   * We will be able to fit a new object's entries into one leaf
 691  698                   * block.  So there will be at most 2 blocks total,
 692  699                   * including the header block.
 693  700                   */
 694  701                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 695  702                  return;
 696  703          }
 697  704  
 698  705          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 699  706  
 700  707          if (dn->dn_maxblkid == 0 && !add) {
 701  708                  blkptr_t *bp;
 702  709  
 703  710                  /*
 704  711                   * If there is only one block  (i.e. this is a micro-zap)
 705  712                   * and we are not adding anything, the accounting is simple.
 706  713                   */
 707  714                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 708  715                  if (err) {
 709  716                          tx->tx_err = err;
 710  717                          return;
 711  718                  }
 712  719  
 713  720                  /*
 714  721                   * Use max block size here, since we don't know how much
 715  722                   * the size will change between now and the dbuf dirty call.
 716  723                   */
 717  724                  bp = &dn->dn_phys->dn_blkptr[0];
 718  725                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 719  726                      bp, bp->blk_birth))
 720  727                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 721  728                  else
 722  729                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 723  730                  if (!BP_IS_HOLE(bp))
 724  731                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 725  732                  return;
 726  733          }
 727  734  
 728  735          if (dn->dn_maxblkid > 0 && name) {
 729  736                  /*
 730  737                   * access the name in this fat-zap so that we'll check
 731  738                   * for i/o errors to the leaf blocks, etc.
                            * Only EIO is treated as a failure of the hold; any
                            * other zap_lookup() result is ignored here.
 732  739                   */
 733  740                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 734  741                      8, 0, NULL);
 735  742                  if (err == EIO) {
 736  743                          tx->tx_err = err;
 737  744                          return;
 738  745                  }
 739  746          }
 740  747  
 741  748          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 742  749              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
                   /*
                    * NOTE(review): the value returned by zap_count_write() is stored
                    * in err but never examined afterwards -- confirm that ignoring
                    * it is intentional.
                    */
 743  750  
 744  751          /*
 745  752           * If the modified blocks are scattered to the four winds,
 746  753           * we'll have to modify an indirect twig for each.
                    *
                    * The loop runs once for each time dn_maxblkid can be shifted
                    * right by epbs bits and still be non-zero.  Every iteration
                    * charges 3 << dn_indblkshift bytes: to towrite if the dataset
                    * has a previous snapshot (ds_prev_snap_obj != 0), otherwise to
                    * tooverwrite.
 747  754           */
 748  755          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 749  756          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 750  757                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 751  758                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 752  759                  else
 753  760                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 754  761  }
 755  762  
           /*
            * Add a THT_BONUS hold on `object` to a not-yet-assigned tx
            * (tx->tx_txg must still be 0).  If the hold is created, the dnode
            * itself is accounted for via dmu_tx_count_dnode(); if
            * dmu_tx_hold_object_impl() returns NULL nothing further is done.
            */
 756  763  void
 757  764  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 758  765  {
 759  766          dmu_tx_hold_t *txh;
 760  767  
 761  768          ASSERT(tx->tx_txg == 0);
 762  769  
 763  770          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 764  771              object, THT_BONUS, 0, 0);
 765  772          if (txh)
 766  773                  dmu_tx_count_dnode(txh);
 767  774  }
 768  775  
           /*
            * Reserve `space` bytes of write space in a not-yet-assigned tx without
            * tying the reservation to an existing object: the hold is created
            * against DMU_NEW_OBJECT with type THT_SPACE, and `space` is added to
            * the hold's txh_space_towrite.
            */
 769  776  void
 770  777  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 771  778  {
 772  779          dmu_tx_hold_t *txh;
 773  780          ASSERT(tx->tx_txg == 0);
 774  781  
 775  782          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 776  783              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 777  784  
                   /*
                    * NOTE(review): unlike the other dmu_tx_hold_*() functions in
                    * this file, txh is dereferenced without a NULL check.
                    * Presumably dmu_tx_hold_object_impl() cannot fail for
                    * DMU_NEW_OBJECT -- confirm against its implementation.
                    */
 778  785          txh->txh_space_towrite += space;
 779  786  }
 780  787  
           /*
            * Count the holds in an assigned tx that refer to `object`.
            *
            * Walks tx->tx_holds and returns the number of entries whose dnode is
            * non-NULL and whose dn_object equals `object`.  Holds without a dnode
            * are never counted.
            */
 781  788  int
 782  789  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 783  790  {
 784  791          dmu_tx_hold_t *txh;
 785  792          int holds = 0;
 786  793  
 787  794          /*
 788  795           * By asserting that the tx is assigned, we're counting the
 789  796           * number of dn_tx_holds, which is the same as the number of
 790  797           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 791  798           * dn_tx_holds could be 0.
 792  799           */
 793  800          ASSERT(tx->tx_txg != 0);
 794  801  
 795  802          /* if (tx->tx_anyobj == TRUE) */
 796  803                  /* return (0); */
 797  804  
 798  805          for (txh = list_head(&tx->tx_holds); txh;
 799  806              txh = list_next(&tx->tx_holds, txh)) {
 800  807                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 801  808                          holds++;
 802  809          }
 803  810  
 804  811          return (holds);
 805  812  }
 806  813  
 807  814  #ifdef ZFS_DEBUG
           /*
            * Debug-only (ZFS_DEBUG) check that the dbuf `db` being dirtied is
            * covered by the holds recorded in the assigned tx.
            *
            * Returns quietly when tx->tx_anyobj is set, when the dbuf belongs to
            * the meta dnode, or as soon as both match_object and match_offset have
            * been set.  Both flags are declared outside the loop and never reset,
            * so they may be satisfied by different holds.  If the whole hold list
            * is walked without both being set, the function panics.
            */
 808  815  void
 809  816  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 810  817  {
 811  818          dmu_tx_hold_t *txh;
 812  819          int match_object = FALSE, match_offset = FALSE;
 813  820          dnode_t *dn;
 814  821  
 815  822          DB_DNODE_ENTER(db);
 816  823          dn = DB_DNODE(db);
 817  824          ASSERT(tx->tx_txg != 0);
 818  825          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 819  826          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 820  827  
 821  828          if (tx->tx_anyobj) {
 822  829                  DB_DNODE_EXIT(db);
 823  830                  return;
 824  831          }
 825  832  
 826  833          /* XXX No checking on the meta dnode for now */
 827  834          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 828  835                  DB_DNODE_EXIT(db);
 829  836                  return;
 830  837          }
 831  838  
 832  839          for (txh = list_head(&tx->tx_holds); txh;
 833  840              txh = list_next(&tx->tx_holds, txh)) {
                           /*
                            * NOTE(review): dn has already been dereferenced above,
                            * so the "dn == NULL" arm of this assertion looks
                            * redundant -- confirm.
                            */
 834  841                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 835  842                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 836  843                          match_object = TRUE;
 837  844                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
                                   /*
                                    * Convert the hold's range (txh_arg1 through
                                    * txh_arg1 + txh_arg2 - 1; presumably a byte
                                    * offset and length -- confirm) into block ids
                                    * at this dbuf's level.  A data block size that
                                    * is not a power of two (dn_datablkshift == 0)
                                    * is treated as SPA_MAXBLOCKSHIFT, and a shift
                                    * of 64 or more yields block 0 rather than
                                    * shifting a 64-bit value by its full width.
                                    */
 838  845                          int datablkshift = dn->dn_datablkshift ?
 839  846                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 840  847                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 841  848                          int shift = datablkshift + epbs * db->db_level;
 842  849                          uint64_t beginblk = shift >= 64 ? 0 :
 843  850                              (txh->txh_arg1 >> shift);
 844  851                          uint64_t endblk = shift >= 64 ? 0 :
 845  852                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 846  853                          uint64_t blkid = db->db_blkid;
 847  854  
 848  855                          /* XXX txh_arg2 better not be zero... */
 849  856  
 850  857                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 851  858                              txh->txh_type, beginblk, endblk);
 852  859  
 853  860                          switch (txh->txh_type) {
 854  861                          case THT_WRITE:
 855  862                                  if (blkid >= beginblk && blkid <= endblk)
 856  863                                          match_offset = TRUE;
 857  864                                  /*
 858  865                                   * We will let this hold work for the bonus
 859  866                                   * or spill buffer so that we don't need to
 860  867                                   * hold it when creating a new object.
 861  868                                   */
 862  869                                  if (blkid == DMU_BONUS_BLKID ||
 863  870                                      blkid == DMU_SPILL_BLKID)
 864  871                                          match_offset = TRUE;
 865  872                                  /*
 866  873                                   * They might have to increase nlevels,
 867  874                                   * thus dirtying the new TLIBs.  Or they
 868  875                                   * might have to change the block size,
 869  876                                   * thus dirtying the new lvl=0 blk=0.
 870  877                                   */
 871  878                                  if (blkid == 0)
 872  879                                          match_offset = TRUE;
 873  880                                  break;
 874  881                          case THT_FREE:
 875  882                                  /*
 876  883                                   * We will dirty all the level 1 blocks in
 877  884                                   * the free range and perhaps the first and
 878  885                                   * last level 0 block.
 879  886                                   */
 880  887                                  if (blkid >= beginblk && (blkid <= endblk ||
 881  888                                      txh->txh_arg2 == DMU_OBJECT_END))
 882  889                                          match_offset = TRUE;
 883  890                                  break;
 884  891                          case THT_SPILL:
 885  892                                  if (blkid == DMU_SPILL_BLKID)
 886  893                                          match_offset = TRUE;
 887  894                                  break;
 888  895                          case THT_BONUS:
 889  896                                  if (blkid == DMU_BONUS_BLKID)
 890  897                                          match_offset = TRUE;
 891  898                                  break;
 892  899                          case THT_ZAP:
                                           /* A ZAP hold covers every block. */
 893  900                                  match_offset = TRUE;
 894  901                                  break;
 895  902                          case THT_NEWOBJECT:
 896  903                                  match_object = TRUE;
 897  904                                  break;
 898  905                          default:
 899  906                                  ASSERT(!"bad txh_type");
 900  907                          }
 901  908                  }
 902  909                  if (match_object && match_offset) {
 903  910                          DB_DNODE_EXIT(db);
 904  911                          return;
 905  912                  }
 906  913          }
 907  914          DB_DNODE_EXIT(db);
 908  915          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 909  916              (u_longlong_t)db->db.db_object, db->db_level,
 910  917              (u_longlong_t)db->db_blkid);
 911  918  }
 912  919  #endif
 913  920  
 914  921  /*
 915  922   * If we can't do 10 iops, something is wrong.  Let us go ahead
 916  923   * and hit zfs_dirty_data_max.
            *
            * zfs_delay_max_ns (100ms, i.e. 10 per second) is the cap applied to
            * min_tx_time in dmu_tx_delay(); zfs_delay_resolution_ns is the value
            * dmu_tx_delay() passes to cv_timedwait_hires().
 917  924   */
 918  925  hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
 919  926  int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 920  927  
 921  928  /*
 922  929   * We delay transactions when we've determined that the backend storage
 923  930   * isn't able to accommodate the rate of incoming writes.
 924  931   *
 925  932   * If there is already a transaction waiting, we delay relative to when
 926  933   * that transaction finishes waiting.  This way the calculated min_time
 927  934   * is independent of the number of threads concurrently executing
 928  935   * transactions.
 929  936   *
 930  937   * If we are the only waiter, wait relative to when the transaction
 931  938   * started, rather than the current time.  This credits the transaction for
 932  939   * "time already served", e.g. reading indirect blocks.
 933  940   *
 934  941   * The minimum time for a transaction to take is calculated as:
 935  942   *     min_time = scale * (dirty - min) / (max - dirty)
 936  943   *     min_time is then capped at zfs_delay_max_ns.
 937  944   *
 938  945   * The delay has two degrees of freedom that can be adjusted via tunables.
 939  946   * The percentage of dirty data at which we start to delay is defined by
 940  947   * zfs_delay_min_dirty_percent. This should typically be at or above
 941  948   * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 942  949   * delay after writing at full speed has failed to keep up with the incoming
 943  950   * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
 944  951   * speaking, this variable determines the amount of delay at the midpoint of
 945  952   * the curve.
 946  953   *
 947  954   * delay
 948  955   *  10ms +-------------------------------------------------------------*+
 949  956   *       |                                                             *|
 950  957   *   9ms +                                                             *+
 951  958   *       |                                                             *|
 952  959   *   8ms +                                                             *+
 953  960   *       |                                                            * |
 954  961   *   7ms +                                                            * +
 955  962   *       |                                                            * |
 956  963   *   6ms +                                                            * +
 957  964   *       |                                                            * |
 958  965   *   5ms +                                                           *  +
 959  966   *       |                                                           *  |
 960  967   *   4ms +                                                           *  +
 961  968   *       |                                                           *  |
 962  969   *   3ms +                                                          *   +
 963  970   *       |                                                          *   |
 964  971   *   2ms +                                              (midpoint) *    +
 965  972   *       |                                                  |    **     |
 966  973   *   1ms +                                                  v ***       +
 967  974   *       |             zfs_delay_scale ---------->     ********         |
 968  975   *     0 +-------------------------------------*********----------------+
 969  976   *       0%                    <- zfs_dirty_data_max ->               100%
 970  977   *
 971  978   * Note that since the delay is added to the outstanding time remaining on the
 972  979   * most recent transaction, the delay is effectively the inverse of IOPS.
 973  980   * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 974  981   * was chosen such that small changes in the amount of accumulated dirty data
 975  982   * in the first 3/4 of the curve yield relatively small differences in the
 976  983   * amount of delay.
 977  984   *
 978  985   * The effects can be easier to understand when the amount of delay is
 979  986   * represented on a log scale:
 980  987   *
 981  988   * delay
 982  989   * 100ms +-------------------------------------------------------------++
 983  990   *       +                                                              +
 984  991   *       |                                                              |
 985  992   *       +                                                             *+
 986  993   *  10ms +                                                             *+
 987  994   *       +                                                           ** +
 988  995   *       |                                              (midpoint)  **  |
 989  996   *       +                                                  |     **    +
 990  997   *   1ms +                                                  v ****      +
 991  998   *       +             zfs_delay_scale ---------->        *****         +
 992  999   *       |                                             ****             |
 993 1000   *       +                                          ****                +
 994 1001   * 100us +                                        **                    +
 995 1002   *       +                                       *                      +
 996 1003   *       |                                      *                       |
 997 1004   *       +                                     *                        +
 998 1005   *  10us +                                     *                        +
 999 1006   *       +                                                              +
1000 1007   *       |                                                              |
1001 1008   *       +                                                              +
1002 1009   *       +--------------------------------------------------------------+
1003 1010   *       0%                    <- zfs_dirty_data_max ->               100%
1004 1011   *
1005 1012   * Note here that only as the amount of dirty data approaches its limit does
1006 1013   * the delay start to increase rapidly. The goal of a properly tuned system
1007 1014   * should be to keep the amount of dirty data out of that range by first
1008 1015   * ensuring that the appropriate limits are set for the I/O scheduler to reach
1009 1016   * optimal throughput on the backend storage, and then by changing the value
1010 1017   * of zfs_delay_scale to increase the steepness of the curve.
1011 1018   */
1012 1019  static void
1013 1020  dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1014 1021  {
1015 1022          dsl_pool_t *dp = tx->tx_pool;
1016 1023          uint64_t delay_min_bytes =
1017 1024              zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1018 1025          hrtime_t wakeup, min_tx_time, now;
1019 1026  
                   /* At or below the delay threshold: no throttling at all. */
1020 1027          if (dirty <= delay_min_bytes)
1021 1028                  return;
1022 1029  
1023 1030          /*
1024 1031           * The caller has already waited until we are under the max.
1025 1032           * We make them pass us the amount of dirty data so we don't
1026 1033           * have to handle the case of it being >= the max, which could
1027 1034           * cause a divide-by-zero if it's == the max.
1028 1035           */
1029 1036          ASSERT3U(dirty, <, zfs_dirty_data_max);
1030 1037  
1031 1038          now = gethrtime();
1032 1039          min_tx_time = zfs_delay_scale *
1033 1040              (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
                   /*
                    * The tx has already existed for longer than the computed (not
                    * yet capped) minimum, so it owes no further delay.
                    */
1034 1041          if (now > tx->tx_start + min_tx_time)
1035 1042                  return;
1036 1043  
1037 1044          min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1038 1045  
1039 1046          DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1040 1047              uint64_t, min_tx_time);
1041 1048  
                   /*
                    * Under dp_lock, pick the later of "tx start + min_tx_time" and
                    * "previous wakeup + min_tx_time", and publish it as the new
                    * dp_last_wakeup for whichever tx delays next.
                    */
1042 1049          mutex_enter(&dp->dp_lock);
1043 1050          wakeup = MAX(tx->tx_start + min_tx_time,
1044 1051              dp->dp_last_wakeup + min_tx_time);
1045 1052          dp->dp_last_wakeup = wakeup;
1046 1053          mutex_exit(&dp->dp_lock);
1047 1054  
1048 1055  #ifdef _KERNEL
                   /*
                    * Sleep on the current thread's own delay cv until the absolute
                    * time `wakeup`.  NOTE(review): the loop retries while
                    * cv_timedwait_hires() returns a positive value, presumably
                    * meaning "woken before the deadline" -- confirm.
                    */
1049 1056          mutex_enter(&curthread->t_delay_lock);
1050 1057          while (cv_timedwait_hires(&curthread->t_delay_cv,
1051 1058              &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
1052 1059              CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
1053 1060                  continue;
1054 1061          mutex_exit(&curthread->t_delay_lock);
1055 1062  #else
                   /*
                    * Non-kernel build: sleep for the remaining interval instead.
                    * NOTE(review): delta is not checked for being <= 0 (deadline
                    * already passed).  Presumably nanosleep() then just fails
                    * harmlessly, since its result is discarded -- confirm.
                    */
1056 1063          hrtime_t delta = wakeup - gethrtime();
1057 1064          struct timespec ts;
1058 1065          ts.tv_sec = delta / NANOSEC;
1059 1066          ts.tv_nsec = delta % NANOSEC;
1060 1067          (void) nanosleep(&ts, NULL);
1061 1068  #endif
1062 1069  }
1063 1070  
           /*
            * Make one attempt to assign tx to the currently open txg.
            *
            * Returns 0 on success.  Otherwise returns:
            *   - tx->tx_err, if an earlier hold recorded an error;
            *   - EIO or ERESTART, if the pool is suspended (see below);
            *   - ERESTART with tx_wait_dirty set, if the tx has not waited yet
            *     and dsl_pool_need_dirty_delay() says a delay is needed;
            *   - ERESTART with tx_needassign_txh set, if a held dnode is still
            *     assigned to the previous txg;
            *   - whatever dsl_dir_tempreserve_space() returns, if it fails.
            *
            * After txg_hold_open() has succeeded tx->tx_txg is non-zero even on
            * the error paths; the caller (dmu_tx_assign()) calls
            * dmu_tx_unassign() to undo the partial assignment.
            */
1064 1071  static int
1065 1072  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1066 1073  {
1067 1074          dmu_tx_hold_t *txh;
1068 1075          spa_t *spa = tx->tx_pool->dp_spa;
1069 1076          uint64_t memory, asize, fsize, usize;
1070 1077          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1071 1078  
1072 1079          ASSERT0(tx->tx_txg);
1073 1080  
1074 1081          if (tx->tx_err)
1075 1082                  return (tx->tx_err);
1076 1083  
1077 1084          if (spa_suspended(spa)) {
1078 1085                  /*
1079 1086                   * If the user has indicated a blocking failure mode
1080 1087                   * then return ERESTART which will block in dmu_tx_wait().
1081 1088                   * Otherwise, return EIO so that an error can get
1082 1089                   * propagated back to the VOP calls.
1083 1090                   *
1084 1091                   * Note that we always honor the txg_how flag regardless
1085 1092                   * of the failuremode setting.
1086 1093                   */
1087 1094                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1088 1095                      txg_how != TXG_WAIT)
1089 1096                          return (SET_ERROR(EIO));
1090 1097  
1091 1098                  return (SET_ERROR(ERESTART));
1092 1099          }
1093 1100  
1094 1101          if (!tx->tx_waited &&
1095 1102              dsl_pool_need_dirty_delay(tx->tx_pool)) {
1096 1103                  tx->tx_wait_dirty = B_TRUE;
1097 1104                  return (SET_ERROR(ERESTART));
1098 1105          }
1099 1106  
1100 1107          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1101 1108          tx->tx_needassign_txh = NULL;
1102 1109  
1103 1110          /*
1104 1111           * NB: No error returns are allowed after txg_hold_open, but
1105 1112           * before processing the dnode holds, due to the
1106 1113           * dmu_tx_unassign() logic.
1107 1114           */
1108 1115  
1109 1116          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1110 1117          for (txh = list_head(&tx->tx_holds); txh;
1111 1118              txh = list_next(&tx->tx_holds, txh)) {
1112 1119                  dnode_t *dn = txh->txh_dnode;
1113 1120                  if (dn != NULL) {
1114 1121                          mutex_enter(&dn->dn_mtx);
                                   /*
                                    * The dnode still belongs to the previous txg.
                                    * Remember this hold before bailing out:
                                    * dmu_tx_unassign() stops its walk at
                                    * tx_needassign_txh, so only the holds already
                                    * processed here are undone, and dmu_tx_wait()
                                    * uses it to wait on this dnode.
                                    */
1115 1122                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1116 1123                                  mutex_exit(&dn->dn_mtx);
1117 1124                                  tx->tx_needassign_txh = txh;
1118 1125                                  return (SET_ERROR(ERESTART));
1119 1126                          }
1120 1127                          if (dn->dn_assigned_txg == 0)
1121 1128                                  dn->dn_assigned_txg = tx->tx_txg;
1122 1129                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1123 1130                          (void) refcount_add(&dn->dn_tx_holds, tx);
1124 1131                          mutex_exit(&dn->dn_mtx);
1125 1132                  }
                           /* Sum the per-hold space estimates over all holds. */
1126 1133                  towrite += txh->txh_space_towrite;
1127 1134                  tofree += txh->txh_space_tofree;
1128 1135                  tooverwrite += txh->txh_space_tooverwrite;
1129 1136                  tounref += txh->txh_space_tounref;
1130 1137                  tohold += txh->txh_memory_tohold;
1131 1138                  fudge += txh->txh_fudge;
1132 1139          }
1133 1140  
1134 1141          /*
1135 1142           * If a snapshot has been taken since we made our estimates,
1136 1143           * assume that we won't be able to free or overwrite anything.
1137 1144           */
1138 1145          if (tx->tx_objset &&
1139 1146              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
1140 1147              tx->tx_lastsnap_txg) {
1141 1148                  towrite += tooverwrite;
1142 1149                  tooverwrite = tofree = 0;
1143 1150          }
1144 1151  
1145 1152          /* needed allocation: worst-case estimate of write space */
1146 1153          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
1147 1154          /* freed space estimate: worst-case overwrite + free estimate */
1148 1155          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
1149 1156          /* convert unrefd space to worst-case estimate */
1150 1157          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
1151 1158          /* calculate memory footprint estimate */
1152 1159          memory = towrite + tooverwrite + tohold;
1153 1160  
1154 1161  #ifdef ZFS_DEBUG
1155 1162          /*
1156 1163           * Add in 'tohold' to account for our dirty holds on this memory
1157 1164           * XXX - the "fudge" factor is to account for skipped blocks that
1158 1165           * we missed because dnode_next_offset() misses in-core-only blocks.
1159 1166           */
1160 1167          tx->tx_space_towrite = asize +
1161 1168              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1162 1169          tx->tx_space_tofree = tofree;
1163 1170          tx->tx_space_tooverwrite = tooverwrite;
1164 1171          tx->tx_space_tounref = tounref;
1165 1172  #endif
1166 1173  
                   /*
                    * The temporary reservation is only attempted when the tx has
                    * a dsl_dir and actually needs allocated space.
                    */
1167 1174          if (tx->tx_dir && asize != 0) {
1168 1175                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1169 1176                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1170 1177                  if (err)
1171 1178                          return (err);
1172 1179          }
1173 1180  
1174 1181          return (0);
1175 1182  }
1176 1183  
           /*
            * Undo a (possibly partial) dmu_tx_try_assign().
            *
            * Does nothing if the tx never got a txg (tx_txg == 0).  Otherwise it
            * releases the txg handle, drops the dn_tx_holds reference on each held
            * dnode up to, but not including, tx_needassign_txh, records the txg
            * in tx_lasttried_txg and resets tx_txg to 0.  When tx_needassign_txh
            * is NULL the walk ends when list_next() returns NULL, i.e. it covers
            * every hold.
            */
1177 1184  static void
1178 1185  dmu_tx_unassign(dmu_tx_t *tx)
1179 1186  {
1180 1187          dmu_tx_hold_t *txh;
1181 1188  
1182 1189          if (tx->tx_txg == 0)
1183 1190                  return;
1184 1191  
1185 1192          txg_rele_to_quiesce(&tx->tx_txgh);
1186 1193  
1187 1194          /*
1188 1195           * Walk the transaction's hold list, removing the hold on the
1189 1196           * associated dnode, and notifying waiters if the refcount drops to 0.
1190 1197           */
1191 1198          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1192 1199              txh = list_next(&tx->tx_holds, txh)) {
1193 1200                  dnode_t *dn = txh->txh_dnode;
1194 1201  
1195 1202                  if (dn == NULL)
1196 1203                          continue;
1197 1204                  mutex_enter(&dn->dn_mtx);
1198 1205                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1199 1206  
                           /*
                            * Last tx hold on this dnode: mark it unassigned and
                            * wake any thread waiting on dn_notxholds.
                            */
1200 1207                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1201 1208                          dn->dn_assigned_txg = 0;
1202 1209                          cv_broadcast(&dn->dn_notxholds);
1203 1210                  }
1204 1211                  mutex_exit(&dn->dn_mtx);
1205 1212          }
1206 1213  
1207 1214          txg_rele_to_sync(&tx->tx_txgh);
1208 1215  
1209 1216          tx->tx_lasttried_txg = tx->tx_txg;
1210 1217          tx->tx_txg = 0;
1211 1218  }
1212 1219  
1213 1220  /*
1214 1221   * Assign tx to a transaction group.  txg_how can be one of:
1215 1222   *
1216 1223   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1217 1224   *      a new one.  This should be used when you're not holding locks.
1218 1225   *      It will only fail if we're truly out of space (or over quota).
1219 1226   *
1220 1227   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1221 1228   *      blocking, returns immediately with ERESTART.  This should be used
1222 1229   *      whenever you're holding locks.  On an ERESTART error, the caller
1223 1230   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1224 1231   *
1225 1232   * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1226 1233   *      has already been called on behalf of this operation (though
1227 1234   *      most likely on a different tx).
            *
            * Returns 0 once the tx is assigned.  On failure the partial
            * assignment is undone with dmu_tx_unassign() and the error from
            * dmu_tx_try_assign() is returned, except that with TXG_WAIT an
            * ERESTART is not returned: the function calls dmu_tx_wait() and
            * tries again.
1228 1235   */
1229 1236  int
1230 1237  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1231 1238  {
1232 1239          int err;
1233 1240  
1234 1241          ASSERT(tx->tx_txg == 0);
1235 1242          ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1236 1243              txg_how == TXG_WAITED);
1237 1244          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1238 1245  
1239 1246          /* If we might wait, we must not hold the config lock. */
1240 1247          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1241 1248  
                   /*
                    * With tx_waited set, dmu_tx_try_assign() skips its
                    * dirty-data delay check.
                    */
1242 1249          if (txg_how == TXG_WAITED)
1243 1250                  tx->tx_waited = B_TRUE;
1244 1251  
1245 1252          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1246 1253                  dmu_tx_unassign(tx);
1247 1254  
1248 1255                  if (err != ERESTART || txg_how != TXG_WAIT)
1249 1256                          return (err);
1250 1257  
1251 1258                  dmu_tx_wait(tx);
1252 1259          }
1253 1260  
1254 1261          txg_rele_to_quiesce(&tx->tx_txgh);
1255 1262  
1256 1263          return (0);
1257 1264  }
1258 1265  
           /*
            * Block until it is worth retrying the assignment of an unassigned tx.
            * What is waited for depends on the state left behind by the previous
            * attempt, checked in this order:
            *   - tx_wait_dirty set: wait until dp_dirty_total drops below
            *     zfs_dirty_data_max, then apply dmu_tx_delay();
            *   - pool suspended, or no txg tried yet: wait for the next txg to
            *     sync;
            *   - tx_needassign_txh set: wait for that dnode to leave the txg
            *     before the one last tried;
            *   - otherwise: wait for a txg newer than the one last tried to open.
            */
1259 1266  void
1260 1267  dmu_tx_wait(dmu_tx_t *tx)
1261 1268  {
1262 1269          spa_t *spa = tx->tx_pool->dp_spa;
1263 1270          dsl_pool_t *dp = tx->tx_pool;
1264 1271  
1265 1272          ASSERT(tx->tx_txg == 0);
1266 1273          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1267 1274  
1268 1275          if (tx->tx_wait_dirty) {
1269 1276                  /*
1270 1277                   * dmu_tx_try_assign() has determined that we need to wait
1271 1278                   * because we've consumed much or all of the dirty buffer
1272 1279                   * space.
1273 1280                   */
1274 1281                  mutex_enter(&dp->dp_lock);
1275 1282                  while (dp->dp_dirty_total >= zfs_dirty_data_max)
1276 1283                          cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
                           /*
                            * Sample the total while still holding dp_lock so the
                            * value passed to dmu_tx_delay() is the one just seen
                            * to be below zfs_dirty_data_max.
                            */
1277 1284                  uint64_t dirty = dp->dp_dirty_total;
1278 1285                  mutex_exit(&dp->dp_lock);
1279 1286  
1280 1287                  dmu_tx_delay(tx, dirty);
1281 1288  
1282 1289                  tx->tx_wait_dirty = B_FALSE;
1283 1290  
1284 1291                  /*
1285 1292                   * Note: setting tx_waited only has effect if the caller
1286 1293                   * used TXG_WAIT.  Otherwise they are going to destroy
1287 1294                   * this tx and try again.  The common case, zfs_write(),
1288 1295                   * uses TXG_WAIT.
1289 1296                   */
1290 1297                  tx->tx_waited = B_TRUE;
1291 1298          } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1292 1299                  /*
1293 1300                   * If the pool is suspended we need to wait until it
1294 1301                   * is resumed.  Note that it's possible that the pool
1295 1302                   * has become active after this thread has tried to
1296 1303                   * obtain a tx.  If that's the case then tx_lasttried_txg
1297 1304                   * would not have been set.
1298 1305                   */
1299 1306                  txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1300 1307          } else if (tx->tx_needassign_txh) {
1301 1308                  /*
1302 1309                   * A dnode is assigned to the quiescing txg.  Wait for its
1303 1310                   * transaction to complete.
1304 1311                   */
1305 1312                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1306 1313  
1307 1314                  mutex_enter(&dn->dn_mtx);
1308 1315                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1309 1316                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1310 1317                  mutex_exit(&dn->dn_mtx);
1311 1318                  tx->tx_needassign_txh = NULL;
1312 1319          } else {
1313 1320                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1314 1321          }
1315 1322  }
1316 1323  
1317 1324  void
1318 1325  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1319 1326  {
1320 1327  #ifdef ZFS_DEBUG
1321 1328          if (tx->tx_dir == NULL || delta == 0)
1322 1329                  return;
1323 1330  
1324 1331          if (delta > 0) {
1325 1332                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1326 1333                      tx->tx_space_towrite);
1327 1334                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1328 1335          } else {
1329 1336                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1330 1337          }
1331 1338  #endif
1332 1339  }
1333 1340  
1334 1341  void
1335 1342  dmu_tx_commit(dmu_tx_t *tx)
1336 1343  {
1337 1344          dmu_tx_hold_t *txh;
1338 1345  
1339 1346          ASSERT(tx->tx_txg != 0);
1340 1347  
1341 1348          /*
1342 1349           * Go through the transaction's hold list and remove holds on
1343 1350           * associated dnodes, notifying waiters if no holds remain.
1344 1351           */
1345 1352          while (txh = list_head(&tx->tx_holds)) {
1346 1353                  dnode_t *dn = txh->txh_dnode;
1347 1354  
1348 1355                  list_remove(&tx->tx_holds, txh);
1349 1356                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1350 1357                  if (dn == NULL)
1351 1358                          continue;
1352 1359                  mutex_enter(&dn->dn_mtx);
1353 1360                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1354 1361  
1355 1362                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1356 1363                          dn->dn_assigned_txg = 0;
1357 1364                          cv_broadcast(&dn->dn_notxholds);
1358 1365                  }
1359 1366                  mutex_exit(&dn->dn_mtx);
1360 1367                  dnode_rele(dn, tx);
1361 1368          }
1362 1369  
1363 1370          if (tx->tx_tempreserve_cookie)
1364 1371                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1365 1372  
1366 1373          if (!list_is_empty(&tx->tx_callbacks))
1367 1374                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1368 1375  
1369 1376          if (tx->tx_anyobj == FALSE)
1370 1377                  txg_rele_to_sync(&tx->tx_txgh);
1371 1378  
1372 1379          list_destroy(&tx->tx_callbacks);
1373 1380          list_destroy(&tx->tx_holds);
1374 1381  #ifdef ZFS_DEBUG
1375 1382          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1376 1383              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1377 1384              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1378 1385          refcount_destroy_many(&tx->tx_space_written,
1379 1386              refcount_count(&tx->tx_space_written));
1380 1387          refcount_destroy_many(&tx->tx_space_freed,
1381 1388              refcount_count(&tx->tx_space_freed));
1382 1389  #endif
1383 1390          kmem_free(tx, sizeof (dmu_tx_t));
1384 1391  }
1385 1392  
1386 1393  void
1387 1394  dmu_tx_abort(dmu_tx_t *tx)
1388 1395  {
1389 1396          dmu_tx_hold_t *txh;
1390 1397  
1391 1398          ASSERT(tx->tx_txg == 0);
1392 1399  
1393 1400          while (txh = list_head(&tx->tx_holds)) {
1394 1401                  dnode_t *dn = txh->txh_dnode;
1395 1402  
1396 1403                  list_remove(&tx->tx_holds, txh);
1397 1404                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1398 1405                  if (dn != NULL)
1399 1406                          dnode_rele(dn, tx);
1400 1407          }
1401 1408  
1402 1409          /*
1403 1410           * Call any registered callbacks with an error code.
1404 1411           */
1405 1412          if (!list_is_empty(&tx->tx_callbacks))
1406 1413                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1407 1414  
1408 1415          list_destroy(&tx->tx_callbacks);
1409 1416          list_destroy(&tx->tx_holds);
1410 1417  #ifdef ZFS_DEBUG
1411 1418          refcount_destroy_many(&tx->tx_space_written,
1412 1419              refcount_count(&tx->tx_space_written));
1413 1420          refcount_destroy_many(&tx->tx_space_freed,
1414 1421              refcount_count(&tx->tx_space_freed));
1415 1422  #endif
1416 1423          kmem_free(tx, sizeof (dmu_tx_t));
1417 1424  }
1418 1425  
1419 1426  uint64_t
1420 1427  dmu_tx_get_txg(dmu_tx_t *tx)
1421 1428  {
1422 1429          ASSERT(tx->tx_txg != 0);
1423 1430          return (tx->tx_txg);
1424 1431  }
1425 1432  
1426 1433  dsl_pool_t *
1427 1434  dmu_tx_pool(dmu_tx_t *tx)
1428 1435  {
1429 1436          ASSERT(tx->tx_pool != NULL);
1430 1437          return (tx->tx_pool);
1431 1438  }
1432 1439  
1433 1440  
1434 1441  void
1435 1442  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1436 1443  {
1437 1444          dmu_tx_callback_t *dcb;
1438 1445  
1439 1446          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1440 1447  
1441 1448          dcb->dcb_func = func;
1442 1449          dcb->dcb_data = data;
1443 1450  
1444 1451          list_insert_tail(&tx->tx_callbacks, dcb);
1445 1452  }
1446 1453  
1447 1454  /*
1448 1455   * Call all the commit callbacks on a list, with a given error code.
1449 1456   */
1450 1457  void
1451 1458  dmu_tx_do_callbacks(list_t *cb_list, int error)
1452 1459  {
1453 1460          dmu_tx_callback_t *dcb;
1454 1461  
1455 1462          while (dcb = list_head(cb_list)) {
1456 1463                  list_remove(cb_list, dcb);
1457 1464                  dcb->dcb_func(dcb->dcb_data, error);
1458 1465                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1459 1466          }
1460 1467  }
1461 1468  
/*
 * dmu_tx_hold_sa_create():
 *
 * Interface to hold a bunch of attributes; used when creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
 */
1470 1477  
1471 1478  /*
1472 1479   * hold necessary attribute name for attribute registration.
1473 1480   * should be a very rare case where this is needed.  If it does
1474 1481   * happen it would only happen on the first write to the file system.
1475 1482   */
1476 1483  static void
1477 1484  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1478 1485  {
1479 1486          int i;
1480 1487  
1481 1488          if (!sa->sa_need_attr_registration)
1482 1489                  return;
1483 1490  
1484 1491          for (i = 0; i != sa->sa_num_attrs; i++) {
1485 1492                  if (!sa->sa_attr_table[i].sa_registered) {
1486 1493                          if (sa->sa_reg_attr_obj)
1487 1494                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1488 1495                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1489 1496                          else
1490 1497                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1491 1498                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1492 1499                  }
1493 1500          }
1494 1501  }
1495 1502  
1496 1503  
1497 1504  void
1498 1505  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1499 1506  {
1500 1507          dnode_t *dn;
1501 1508          dmu_tx_hold_t *txh;
1502 1509  
1503 1510          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1504 1511              THT_SPILL, 0, 0);
1505 1512  
1506 1513          dn = txh->txh_dnode;
1507 1514  
1508 1515          if (dn == NULL)
1509 1516                  return;
1510 1517  
1511 1518          /* If blkptr doesn't exist then add space to towrite */
1512 1519          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1513 1520                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1514 1521          } else {
1515 1522                  blkptr_t *bp;
1516 1523  
1517 1524                  bp = &dn->dn_phys->dn_spill;
1518 1525                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1519 1526                      bp, bp->blk_birth))
1520 1527                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1521 1528                  else
1522 1529                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1523 1530                  if (!BP_IS_HOLE(bp))
1524 1531                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1525 1532          }
1526 1533  }
1527 1534  
1528 1535  void
1529 1536  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1530 1537  {
1531 1538          sa_os_t *sa = tx->tx_objset->os_sa;
1532 1539  
1533 1540          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1534 1541  
1535 1542          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1536 1543                  return;
1537 1544  
1538 1545          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1539 1546                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1540 1547          else {
1541 1548                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1542 1549                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1543 1550                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1544 1551                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1545 1552          }
1546 1553  
1547 1554          dmu_tx_sa_registration_hold(sa, tx);
1548 1555  
1549 1556          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1550 1557                  return;
1551 1558  
1552 1559          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1553 1560              THT_SPILL, 0, 0);
1554 1561  }
1555 1562  
1556 1563  /*
1557 1564   * Hold SA attribute
1558 1565   *
1559 1566   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1560 1567   *
1561 1568   * variable_size is the total size of all variable sized attributes
1562 1569   * passed to this function.  It is not the total size of all
1563 1570   * variable size attributes that *may* exist on this object.
1564 1571   */
1565 1572  void
1566 1573  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1567 1574  {
1568 1575          uint64_t object;
1569 1576          sa_os_t *sa = tx->tx_objset->os_sa;
1570 1577  
1571 1578          ASSERT(hdl != NULL);
1572 1579  
1573 1580          object = sa_handle_object(hdl);
1574 1581  
1575 1582          dmu_tx_hold_bonus(tx, object);
1576 1583  
1577 1584          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1578 1585                  return;
1579 1586  
1580 1587          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1581 1588              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1582 1589                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1583 1590                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1584 1591                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1585 1592                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1586 1593          }
1587 1594  
1588 1595          dmu_tx_sa_registration_hold(sa, tx);
1589 1596  
1590 1597          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1591 1598                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1592 1599  
1593 1600          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1594 1601                  ASSERT(tx->tx_txg == 0);
1595 1602                  dmu_tx_hold_spill(tx, object);
1596 1603          } else {
1597 1604                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1598 1605                  dnode_t *dn;
1599 1606  
1600 1607                  DB_DNODE_ENTER(db);
1601 1608                  dn = DB_DNODE(db);
1602 1609                  if (dn->dn_have_spill) {
1603 1610                          ASSERT(tx->tx_txg == 0);
1604 1611                          dmu_tx_hold_spill(tx, object);
1605 1612                  }
1606 1613                  DB_DNODE_EXIT(db);
1607 1614          }
1608 1615  }

↓ open down ↓

958 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX