dlpx-os-diff Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
  46   46  dmu_tx_t *

↓ open down ↓

46 lines elided

↑ open up ↑

  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
       57 +        tx->tx_start = gethrtime();
  57   58  #ifdef ZFS_DEBUG
  58   59          refcount_create(&tx->tx_space_written);
  59   60          refcount_create(&tx->tx_space_freed);
  60   61  #endif
  61   62          return (tx);
  62   63  }
  63   64  
  64   65  dmu_tx_t *
  65   66  dmu_tx_create(objset_t *os)
  66   67  {

  67   68          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  68   69          tx->tx_objset = os;
  69   70          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  70   71          return (tx);
  71   72  }
  72   73  
  73   74  dmu_tx_t *
  74   75  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  75   76  {
  76   77          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  77   78  
  78   79          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  79   80          tx->tx_pool = dp;
  80   81          tx->tx_txg = txg;
  81   82          tx->tx_anyobj = TRUE;
  82   83  
  83   84          return (tx);
  84   85  }
  85   86  
  86   87  int
  87   88  dmu_tx_is_syncing(dmu_tx_t *tx)
  88   89  {
  89   90          return (tx->tx_anyobj);
  90   91  }
  91   92  
  92   93  int
  93   94  dmu_tx_private_ok(dmu_tx_t *tx)
  94   95  {
  95   96          return (tx->tx_anyobj);
  96   97  }
  97   98  
  98   99  static dmu_tx_hold_t *
  99  100  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 100  101      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 101  102  {
 102  103          dmu_tx_hold_t *txh;
 103  104          dnode_t *dn = NULL;
 104  105          int err;
 105  106  
 106  107          if (object != DMU_NEW_OBJECT) {
 107  108                  err = dnode_hold(os, object, tx, &dn);
 108  109                  if (err) {
 109  110                          tx->tx_err = err;
 110  111                          return (NULL);
 111  112                  }
 112  113  
 113  114                  if (err == 0 && tx->tx_txg != 0) {
 114  115                          mutex_enter(&dn->dn_mtx);
 115  116                          /*
 116  117                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 117  118                           * problem, but there's no way for it to happen (for
 118  119                           * now, at least).
 119  120                           */
 120  121                          ASSERT(dn->dn_assigned_txg == 0);
 121  122                          dn->dn_assigned_txg = tx->tx_txg;
 122  123                          (void) refcount_add(&dn->dn_tx_holds, tx);
 123  124                          mutex_exit(&dn->dn_mtx);
 124  125                  }
 125  126          }
 126  127  
 127  128          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 128  129          txh->txh_tx = tx;
 129  130          txh->txh_dnode = dn;
 130  131  #ifdef ZFS_DEBUG
 131  132          txh->txh_type = type;
 132  133          txh->txh_arg1 = arg1;
 133  134          txh->txh_arg2 = arg2;
 134  135  #endif
 135  136          list_insert_tail(&tx->tx_holds, txh);
 136  137  
 137  138          return (txh);
 138  139  }
 139  140  
 140  141  void
 141  142  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 142  143  {
 143  144          /*
 144  145           * If we're syncing, they can manipulate any object anyhow, and
 145  146           * the hold on the dnode_t can cause problems.
 146  147           */
 147  148          if (!dmu_tx_is_syncing(tx)) {
 148  149                  (void) dmu_tx_hold_object_impl(tx, os,
 149  150                      object, THT_NEWOBJECT, 0, 0);
 150  151          }
 151  152  }
 152  153  
 153  154  static int
 154  155  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 155  156  {
 156  157          int err;
 157  158          dmu_buf_impl_t *db;
 158  159  
 159  160          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 160  161          db = dbuf_hold_level(dn, level, blkid, FTAG);
 161  162          rw_exit(&dn->dn_struct_rwlock);
 162  163          if (db == NULL)
 163  164                  return (SET_ERROR(EIO));
 164  165          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 165  166          dbuf_rele(db, FTAG);
 166  167          return (err);
 167  168  }
 168  169  
 169  170  static void
 170  171  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 171  172      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 172  173  {
 173  174          objset_t *os = dn->dn_objset;
 174  175          dsl_dataset_t *ds = os->os_dsl_dataset;
 175  176          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 176  177          dmu_buf_impl_t *parent = NULL;
 177  178          blkptr_t *bp = NULL;
 178  179          uint64_t space;
 179  180  
 180  181          if (level >= dn->dn_nlevels || history[level] == blkid)
 181  182                  return;
 182  183  
 183  184          history[level] = blkid;
 184  185  
 185  186          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 186  187  
 187  188          if (db == NULL || db == dn->dn_dbuf) {
 188  189                  ASSERT(level != 0);
 189  190                  db = NULL;
 190  191          } else {
 191  192                  ASSERT(DB_DNODE(db) == dn);
 192  193                  ASSERT(db->db_level == level);
 193  194                  ASSERT(db->db.db_size == space);
 194  195                  ASSERT(db->db_blkid == blkid);
 195  196                  bp = db->db_blkptr;
 196  197                  parent = db->db_parent;
 197  198          }
 198  199  
 199  200          freeable = (bp && (freeable ||
 200  201              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 201  202  
 202  203          if (freeable)
 203  204                  txh->txh_space_tooverwrite += space;
 204  205          else
 205  206                  txh->txh_space_towrite += space;
 206  207          if (bp)
 207  208                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 208  209  
 209  210          dmu_tx_count_twig(txh, dn, parent, level + 1,
 210  211              blkid >> epbs, freeable, history);
 211  212  }
 212  213  
 213  214  /* ARGSUSED */
 214  215  static void
 215  216  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 216  217  {
 217  218          dnode_t *dn = txh->txh_dnode;
 218  219          uint64_t start, end, i;
 219  220          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 220  221          int err = 0;
 221  222  
 222  223          if (len == 0)
 223  224                  return;
 224  225  
 225  226          min_bs = SPA_MINBLOCKSHIFT;
 226  227          max_bs = SPA_MAXBLOCKSHIFT;
 227  228          min_ibs = DN_MIN_INDBLKSHIFT;
 228  229          max_ibs = DN_MAX_INDBLKSHIFT;
 229  230  
 230  231          if (dn) {
 231  232                  uint64_t history[DN_MAX_LEVELS];
 232  233                  int nlvls = dn->dn_nlevels;
 233  234                  int delta;
 234  235  
 235  236                  /*
 236  237                   * For i/o error checking, read the first and last level-0
 237  238                   * blocks (if they are not aligned), and all the level-1 blocks.
 238  239                   */
 239  240                  if (dn->dn_maxblkid == 0) {
 240  241                          delta = dn->dn_datablksz;
 241  242                          start = (off < dn->dn_datablksz) ? 0 : 1;
 242  243                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 243  244                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 244  245                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 245  246                                  if (err)
 246  247                                          goto out;
 247  248                                  delta -= off;
 248  249                          }
 249  250                  } else {
 250  251                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 251  252                              NULL, NULL, ZIO_FLAG_CANFAIL);
 252  253  
 253  254                          /* first level-0 block */
 254  255                          start = off >> dn->dn_datablkshift;
 255  256                          if (P2PHASE(off, dn->dn_datablksz) ||
 256  257                              len < dn->dn_datablksz) {
 257  258                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 258  259                                  if (err)
 259  260                                          goto out;
 260  261                          }
 261  262  
 262  263                          /* last level-0 block */
 263  264                          end = (off+len-1) >> dn->dn_datablkshift;
 264  265                          if (end != start && end <= dn->dn_maxblkid &&
 265  266                              P2PHASE(off+len, dn->dn_datablksz)) {
 266  267                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 267  268                                  if (err)
 268  269                                          goto out;
 269  270                          }
 270  271  
 271  272                          /* level-1 blocks */
 272  273                          if (nlvls > 1) {
 273  274                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 274  275                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 275  276                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 276  277                                          if (err)
 277  278                                                  goto out;
 278  279                                  }
 279  280                          }
 280  281  
 281  282                          err = zio_wait(zio);
 282  283                          if (err)
 283  284                                  goto out;
 284  285                          delta = P2NPHASE(off, dn->dn_datablksz);
 285  286                  }
 286  287  
 287  288                  min_ibs = max_ibs = dn->dn_indblkshift;
 288  289                  if (dn->dn_maxblkid > 0) {
 289  290                          /*
 290  291                           * The blocksize can't change,
 291  292                           * so we can make a more precise estimate.
 292  293                           */
 293  294                          ASSERT(dn->dn_datablkshift != 0);
 294  295                          min_bs = max_bs = dn->dn_datablkshift;
 295  296                  }
 296  297  
 297  298                  /*
 298  299                   * If this write is not off the end of the file
 299  300                   * we need to account for overwrites/unref.
 300  301                   */
 301  302                  if (start <= dn->dn_maxblkid) {
 302  303                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 303  304                                  history[l] = -1ULL;
 304  305                  }
 305  306                  while (start <= dn->dn_maxblkid) {
 306  307                          dmu_buf_impl_t *db;
 307  308  
 308  309                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 309  310                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 310  311                          rw_exit(&dn->dn_struct_rwlock);
 311  312  
 312  313                          if (err) {
 313  314                                  txh->txh_tx->tx_err = err;
 314  315                                  return;
 315  316                          }
 316  317  
 317  318                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 318  319                              history);
 319  320                          dbuf_rele(db, FTAG);
 320  321                          if (++start > end) {
 321  322                                  /*
 322  323                                   * Account for new indirects appearing
 323  324                                   * before this IO gets assigned into a txg.
 324  325                                   */
 325  326                                  bits = 64 - min_bs;
 326  327                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 327  328                                  for (bits -= epbs * (nlvls - 1);
 328  329                                      bits >= 0; bits -= epbs)
 329  330                                          txh->txh_fudge += 1ULL << max_ibs;
 330  331                                  goto out;
 331  332                          }
 332  333                          off += delta;
 333  334                          if (len >= delta)
 334  335                                  len -= delta;
 335  336                          delta = dn->dn_datablksz;
 336  337                  }
 337  338          }
 338  339  
 339  340          /*
 340  341           * 'end' is the last thing we will access, not one past.
 341  342           * This way we won't overflow when accessing the last byte.
 342  343           */
 343  344          start = P2ALIGN(off, 1ULL << max_bs);
 344  345          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 345  346          txh->txh_space_towrite += end - start + 1;
 346  347  
 347  348          start >>= min_bs;
 348  349          end >>= min_bs;
 349  350  
 350  351          epbs = min_ibs - SPA_BLKPTRSHIFT;
 351  352  
 352  353          /*
 353  354           * The object contains at most 2^(64 - min_bs) blocks,
 354  355           * and each indirect level maps 2^epbs.
 355  356           */
 356  357          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 357  358                  start >>= epbs;
 358  359                  end >>= epbs;
 359  360                  ASSERT3U(end, >=, start);
 360  361                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 361  362                  if (start != 0) {
 362  363                          /*
 363  364                           * We also need a new blkid=0 indirect block
 364  365                           * to reference any existing file data.
 365  366                           */
 366  367                          txh->txh_space_towrite += 1ULL << max_ibs;
 367  368                  }
 368  369          }
 369  370  
 370  371  out:
 371  372          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 372  373              2 * DMU_MAX_ACCESS)
 373  374                  err = SET_ERROR(EFBIG);
 374  375  
 375  376          if (err)
 376  377                  txh->txh_tx->tx_err = err;
 377  378  }
 378  379  
 379  380  static void
 380  381  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 381  382  {
 382  383          dnode_t *dn = txh->txh_dnode;
 383  384          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 384  385          uint64_t space = mdn->dn_datablksz +
 385  386              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 386  387  
 387  388          if (dn && dn->dn_dbuf->db_blkptr &&
 388  389              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 389  390              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 390  391                  txh->txh_space_tooverwrite += space;
 391  392                  txh->txh_space_tounref += space;
 392  393          } else {
 393  394                  txh->txh_space_towrite += space;
 394  395                  if (dn && dn->dn_dbuf->db_blkptr)
 395  396                          txh->txh_space_tounref += space;
 396  397          }
 397  398  }
 398  399  
 399  400  void
 400  401  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 401  402  {
 402  403          dmu_tx_hold_t *txh;
 403  404  
 404  405          ASSERT(tx->tx_txg == 0);
 405  406          ASSERT(len < DMU_MAX_ACCESS);
 406  407          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 407  408  
 408  409          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 409  410              object, THT_WRITE, off, len);
 410  411          if (txh == NULL)
 411  412                  return;
 412  413  
 413  414          dmu_tx_count_write(txh, off, len);
 414  415          dmu_tx_count_dnode(txh);
 415  416  }
 416  417  
 417  418  static void
 418  419  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 419  420  {
 420  421          uint64_t blkid, nblks, lastblk;
 421  422          uint64_t space = 0, unref = 0, skipped = 0;
 422  423          dnode_t *dn = txh->txh_dnode;
 423  424          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 424  425          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 425  426          int epbs;
 426  427          uint64_t l0span = 0, nl1blks = 0;
 427  428  
 428  429          if (dn->dn_nlevels == 0)
 429  430                  return;
 430  431  
 431  432          /*
 432  433           * The struct_rwlock protects us against dn_nlevels
 433  434           * changing, in case (against all odds) we manage to dirty &
 434  435           * sync out the changes after we check for being dirty.
 435  436           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 436  437           */
 437  438          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 438  439          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 439  440          if (dn->dn_maxblkid == 0) {
 440  441                  if (off == 0 && len >= dn->dn_datablksz) {
 441  442                          blkid = 0;
 442  443                          nblks = 1;
 443  444                  } else {
 444  445                          rw_exit(&dn->dn_struct_rwlock);
 445  446                          return;
 446  447                  }
 447  448          } else {
 448  449                  blkid = off >> dn->dn_datablkshift;
 449  450                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 450  451  
 451  452                  if (blkid > dn->dn_maxblkid) {
 452  453                          rw_exit(&dn->dn_struct_rwlock);
 453  454                          return;
 454  455                  }
 455  456                  if (blkid + nblks > dn->dn_maxblkid)
 456  457                          nblks = dn->dn_maxblkid - blkid + 1;
 457  458  
 458  459          }
 459  460          l0span = nblks;    /* save for later use to calc level > 1 overhead */
 460  461          if (dn->dn_nlevels == 1) {
 461  462                  int i;
 462  463                  for (i = 0; i < nblks; i++) {
 463  464                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 464  465                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 465  466                          bp += blkid + i;
 466  467                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 467  468                                  dprintf_bp(bp, "can free old%s", "");
 468  469                                  space += bp_get_dsize(spa, bp);
 469  470                          }
 470  471                          unref += BP_GET_ASIZE(bp);
 471  472                  }
 472  473                  nl1blks = 1;
 473  474                  nblks = 0;
 474  475          }
 475  476  
 476  477          lastblk = blkid + nblks - 1;
 477  478          while (nblks) {
 478  479                  dmu_buf_impl_t *dbuf;
 479  480                  uint64_t ibyte, new_blkid;
 480  481                  int epb = 1 << epbs;
 481  482                  int err, i, blkoff, tochk;
 482  483                  blkptr_t *bp;
 483  484  
 484  485                  ibyte = blkid << dn->dn_datablkshift;
 485  486                  err = dnode_next_offset(dn,
 486  487                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 487  488                  new_blkid = ibyte >> dn->dn_datablkshift;
 488  489                  if (err == ESRCH) {
 489  490                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 490  491                          break;
 491  492                  }
 492  493                  if (err) {
 493  494                          txh->txh_tx->tx_err = err;
 494  495                          break;
 495  496                  }
 496  497                  if (new_blkid > lastblk) {
 497  498                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 498  499                          break;
 499  500                  }
 500  501  
 501  502                  if (new_blkid > blkid) {
 502  503                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 503  504                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 504  505                          nblks -= new_blkid - blkid;
 505  506                          blkid = new_blkid;
 506  507                  }
 507  508                  blkoff = P2PHASE(blkid, epb);
 508  509                  tochk = MIN(epb - blkoff, nblks);
 509  510  
 510  511                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 511  512                  if (err) {
 512  513                          txh->txh_tx->tx_err = err;
 513  514                          break;
 514  515                  }
 515  516  
 516  517                  txh->txh_memory_tohold += dbuf->db.db_size;
 517  518  
 518  519                  /*
 519  520                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 520  521                   * memory_tohold is an over-estimation (especially the >L1
 521  522                   * indirect blocks), so it could fail.  Callers should have
 522  523                   * already verified that they will not be holding too much
 523  524                   * memory.
 524  525                   */
 525  526  
 526  527                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 527  528                  if (err != 0) {
 528  529                          txh->txh_tx->tx_err = err;
 529  530                          dbuf_rele(dbuf, FTAG);
 530  531                          break;
 531  532                  }
 532  533  
 533  534                  bp = dbuf->db.db_data;
 534  535                  bp += blkoff;
 535  536  
 536  537                  for (i = 0; i < tochk; i++) {
 537  538                          if (dsl_dataset_block_freeable(ds, &bp[i],
 538  539                              bp[i].blk_birth)) {
 539  540                                  dprintf_bp(&bp[i], "can free old%s", "");
 540  541                                  space += bp_get_dsize(spa, &bp[i]);
 541  542                          }
 542  543                          unref += BP_GET_ASIZE(bp);
 543  544                  }
 544  545                  dbuf_rele(dbuf, FTAG);
 545  546  
 546  547                  ++nl1blks;
 547  548                  blkid += tochk;
 548  549                  nblks -= tochk;
 549  550          }
 550  551          rw_exit(&dn->dn_struct_rwlock);
 551  552  
 552  553          /*
 553  554           * Add in memory requirements of higher-level indirects.
 554  555           * This assumes a worst-possible scenario for dn_nlevels and a
 555  556           * worst-possible distribution of l1-blocks over the region to free.
 556  557           */
 557  558          {
 558  559                  uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 559  560                  int level = 2;
 560  561                  /*
 561  562                   * Here we don't use DN_MAX_LEVEL, but calculate it with the
 562  563                   * given datablkshift and indblkshift. This makes the
 563  564                   * difference between 19 and 8 on large files.
 564  565                   */
 565  566                  int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 566  567                      (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 567  568  
 568  569                  while (level++ < maxlevel) {
 569  570                          txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 570  571                              << dn->dn_indblkshift;
 571  572                          blkcnt = 1 + (blkcnt >> epbs);
 572  573                  }
 573  574          }
 574  575  
 575  576          /* account for new level 1 indirect blocks that might show up */
 576  577          if (skipped > 0) {
 577  578                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 578  579                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 579  580                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 580  581          }
 581  582          txh->txh_space_tofree += space;
 582  583          txh->txh_space_tounref += unref;
 583  584  }
 584  585  
 585  586  void
 586  587  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 587  588  {
 588  589          dmu_tx_hold_t *txh;
 589  590          dnode_t *dn;

↓ open down ↓

523 lines elided

↑ open up ↑

 590  591          int err;
 591  592          zio_t *zio;
 592  593  
 593  594          ASSERT(tx->tx_txg == 0);
 594  595  
 595  596          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 596  597              object, THT_FREE, off, len);
 597  598          if (txh == NULL)
 598  599                  return;
 599  600          dn = txh->txh_dnode;
      601 +        dmu_tx_count_dnode(txh);
 600  602  
 601  603          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 602  604                  return;
 603  605          if (len == DMU_OBJECT_END)
 604  606                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 605  607  
 606      -        dmu_tx_count_dnode(txh);
 607  608  
 608  609          /*
 609  610           * For i/o error checking, we read the first and last level-0
 610  611           * blocks if they are not aligned, and all the level-1 blocks.
 611  612           *
 612  613           * Note:  dbuf_free_range() assumes that we have not instantiated
 613  614           * any level-0 dbufs that will be completely freed.  Therefore we must
 614  615           * exercise care to not read or count the first and last blocks
 615  616           * if they are blocksize-aligned.
 616  617           */

 617  618          if (dn->dn_datablkshift == 0) {
 618  619                  if (off != 0 || len < dn->dn_datablksz)
 619  620                          dmu_tx_count_write(txh, off, len);
 620  621          } else {
 621  622                  /* first block will be modified if it is not aligned */
 622  623                  if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 623  624                          dmu_tx_count_write(txh, off, 1);
 624  625                  /* last block will be modified if it is not aligned */
 625  626                  if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 626  627                          dmu_tx_count_write(txh, off+len, 1);
 627  628          }
 628  629  
 629  630          /*
 630  631           * Check level-1 blocks.
 631  632           */
 632  633          if (dn->dn_nlevels > 1) {
 633  634                  int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 634  635                      SPA_BLKPTRSHIFT;
 635  636                  uint64_t start = off >> shift;
 636  637                  uint64_t end = (off + len) >> shift;
 637  638  
 638  639                  ASSERT(dn->dn_datablkshift != 0);
 639  640                  ASSERT(dn->dn_indblkshift != 0);
 640  641  
 641  642                  zio = zio_root(tx->tx_pool->dp_spa,
 642  643                      NULL, NULL, ZIO_FLAG_CANFAIL);
 643  644                  for (uint64_t i = start; i <= end; i++) {
 644  645                          uint64_t ibyte = i << shift;
 645  646                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 646  647                          i = ibyte >> shift;
 647  648                          if (err == ESRCH)
 648  649                                  break;
 649  650                          if (err) {
 650  651                                  tx->tx_err = err;
 651  652                                  return;
 652  653                          }
 653  654  
 654  655                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 655  656                          if (err) {
 656  657                                  tx->tx_err = err;
 657  658                                  return;
 658  659                          }
 659  660                  }
 660  661                  err = zio_wait(zio);
 661  662                  if (err) {
 662  663                          tx->tx_err = err;
 663  664                          return;
 664  665                  }
 665  666          }
 666  667  
 667  668          dmu_tx_count_free(txh, off, len);
 668  669  }
 669  670  
           /*
            * Add a THT_ZAP hold on "object" to a not-yet-assigned tx and accumulate
            * space estimates for the ZAP update on that hold.  "add" and "name" are
            * forwarded to dmu_tx_hold_object_impl() and zap_count_write().
            * Errors detected here are recorded in tx->tx_err rather than returned.
            */
 670  671  void
 671  672  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 672  673  {
 673  674          dmu_tx_hold_t *txh;
 674  675          dnode_t *dn;
 675  676          uint64_t nblocks;
 676  677          int epbs, err;
 677  678  
 678  679          ASSERT(tx->tx_txg == 0);
 679  680  
 680  681          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 681  682              object, THT_ZAP, add, (uintptr_t)name);
 682  683          if (txh == NULL)
 683  684                  return;
 684  685          dn = txh->txh_dnode;
 685  686  
 686  687          dmu_tx_count_dnode(txh);
 687  688  
 688  689          if (dn == NULL) {
 689  690                  /*
 690  691                   * We will be able to fit a new object's entries into one leaf
 691  692                   * block.  So there will be at most 2 blocks total,
 692  693                   * including the header block.
 693  694                   */
 694  695                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 695  696                  return;
 696  697          }
 697  698  
 698  699          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 699  700  
 700  701          if (dn->dn_maxblkid == 0 && !add) {
 701  702                  blkptr_t *bp;
 702  703  
 703  704                  /*
 704  705                   * If there is only one block (i.e. this is a micro-zap)
 705  706                   * and we are not adding anything, the accounting is simple.
 706  707                   */
 707  708                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 708  709                  if (err) {
 709  710                          tx->tx_err = err;
 710  711                          return;
 711  712                  }
 712  713  
 713  714                  /*
 714  715                   * Use max block size here, since we don't know how much
 715  716                   * the size will change between now and the dbuf dirty call.
 716  717                   */
 717  718                  bp = &dn->dn_phys->dn_blkptr[0];
 718  719                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 719  720                      bp, bp->blk_birth))
 720  721                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 721  722                  else
 722  723                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 723  724                  if (!BP_IS_HOLE(bp))
 724  725                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 725  726                  return;
 726  727          }
 727  728  
 728  729          if (dn->dn_maxblkid > 0 && name) {
 729  730                  /*
 730  731                   * access the name in this fat-zap so that we'll check
 731  732                   * for i/o errors to the leaf blocks, etc.
 732  733                   */
 733  734                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 734  735                      8, 0, NULL);
                           /* Only EIO aborts the hold; any other result is ignored. */
 735  736                  if (err == EIO) {
 736  737                          tx->tx_err = err;
 737  738                          return;
 738  739                  }
 739  740          }
 740  741  
                   /*
                    * NOTE(review): the value stored in err below is never examined
                    * before the function returns -- confirm that ignoring a
                    * zap_count_write() failure is intended.
                    */
 741  742          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 742  743              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 743  744  
 744  745          /*
 745  746           * If the modified blocks are scattered to the four winds,
 746  747           * we'll have to modify an indirect twig for each.
 747  748           */
                   /*
                    * One iteration per shift of dn_maxblkid by epbs; each iteration
                    * charges 3 << dn_indblkshift bytes, to towrite when
                    * ds_prev_snap_obj is nonzero and to tooverwrite otherwise.
                    */
 748  749          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 749  750          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 750  751                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 751  752                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 752  753                  else
 753  754                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 754  755  }
 755  756  
           /*
            * Add a THT_BONUS hold on "object" to a not-yet-assigned tx.  If the
            * hold was created, account for the dnode via dmu_tx_count_dnode();
            * a NULL result from dmu_tx_hold_object_impl() is silently ignored here.
            */
 756  757  void
 757  758  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 758  759  {
 759  760          dmu_tx_hold_t *txh;
 760  761  
 761  762          ASSERT(tx->tx_txg == 0);
 762  763  
 763  764          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 764  765              object, THT_BONUS, 0, 0);
 765  766          if (txh)
 766  767                  dmu_tx_count_dnode(txh);
 767  768  }
 768  769  
           /*
            * Add a THT_SPACE hold (using DMU_NEW_OBJECT rather than a specific
            * object) to a not-yet-assigned tx and add "space" bytes to that hold's
            * towrite estimate.
            */
 769  770  void
 770  771  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 771  772  {
 772  773          dmu_tx_hold_t *txh;
 773  774          ASSERT(tx->tx_txg == 0);
 774  775  
 775  776          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 776  777              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 777  778  
                   /*
                    * NOTE(review): unlike the other hold functions here, txh is used
                    * without a NULL check; presumably dmu_tx_hold_object_impl()
                    * cannot fail for DMU_NEW_OBJECT -- confirm.
                    */
 778  779          txh->txh_space_towrite += space;
 779  780  }
 780  781  
           /*
            * Return the number of holds on tx's hold list whose dnode has object
            * number "object".  Holds without a dnode are not counted.  The tx must
            * already be assigned (tx_txg != 0).
            */
 781  782  int
 782  783  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 783  784  {
 784  785          dmu_tx_hold_t *txh;
 785  786          int holds = 0;
 786  787  
 787  788          /*
 788  789           * By asserting that the tx is assigned, we're counting the
 789  790           * number of dn_tx_holds, which is the same as the number of
 790  791           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 791  792           * dn_tx_holds could be 0.
 792  793           */
 793  794          ASSERT(tx->tx_txg != 0);
 794  795  
 795  796          /* if (tx->tx_anyobj == TRUE) */
 796  797                  /* return (0); */
 797  798  
 798  799          for (txh = list_head(&tx->tx_holds); txh;
 799  800              txh = list_next(&tx->tx_holds, txh)) {
 800  801                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 801  802                          holds++;
 802  803          }
 803  804  
 804  805          return (holds);
 805  806  }
 806  807  
           /*
            * ZFS_DEBUG-only sanity check: verify that the dbuf about to be dirtied
            * is covered by the holds of an assigned tx.  Returns quietly when
            * tx_anyobj is set, when the dbuf belongs to the meta dnode, or when the
            * holds match both the object and the block; otherwise panics.
            */
 807  808  #ifdef ZFS_DEBUG
 808  809  void
 809  810  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 810  811  {
 811  812          dmu_tx_hold_t *txh;
                   /*
                    * These flags are never reset inside the loop below, so the
                    * object match and the offset match may come from different holds.
                    */
 812  813          int match_object = FALSE, match_offset = FALSE;
 813  814          dnode_t *dn;
 814  815  
 815  816          DB_DNODE_ENTER(db);
 816  817          dn = DB_DNODE(db);
 817  818          ASSERT(tx->tx_txg != 0);
 818  819          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 819  820          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 820  821  
 821  822          if (tx->tx_anyobj) {
 822  823                  DB_DNODE_EXIT(db);
 823  824                  return;
 824  825          }
 825  826  
 826  827          /* XXX No checking on the meta dnode for now */
 827  828          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 828  829                  DB_DNODE_EXIT(db);
 829  830                  return;
 830  831          }
 831  832  
 832  833          for (txh = list_head(&tx->tx_holds); txh;
 833  834              txh = list_next(&tx->tx_holds, txh)) {
 834  835                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 835  836                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 836  837                          match_object = TRUE;
 837  838                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
                                   /*
                                    * Convert the hold's byte range (txh_arg1 is used
                                    * as the start, txh_arg2 as the length) into block
                                    * ids at this dbuf's level.
                                    */
 838  839                          int datablkshift = dn->dn_datablkshift ?
 839  840                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 840  841                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 841  842                          int shift = datablkshift + epbs * db->db_level;
 842  843                          uint64_t beginblk = shift >= 64 ? 0 :
 843  844                              (txh->txh_arg1 >> shift);
 844  845                          uint64_t endblk = shift >= 64 ? 0 :
 845  846                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 846  847                          uint64_t blkid = db->db_blkid;
 847  848  
 848  849                          /* XXX txh_arg2 better not be zero... */
 849  850  
 850  851                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 851  852                              txh->txh_type, beginblk, endblk);
 852  853  
 853  854                          switch (txh->txh_type) {
 854  855                          case THT_WRITE:
 855  856                                  if (blkid >= beginblk && blkid <= endblk)
 856  857                                          match_offset = TRUE;
 857  858                                  /*
 858  859                                   * We will let this hold work for the bonus
 859  860                                   * or spill buffer so that we don't need to
 860  861                                   * hold it when creating a new object.
 861  862                                   */
 862  863                                  if (blkid == DMU_BONUS_BLKID ||
 863  864                                      blkid == DMU_SPILL_BLKID)
 864  865                                          match_offset = TRUE;
 865  866                                  /*
 866  867                                   * They might have to increase nlevels,
 867  868                                   * thus dirtying the new TLIBs.  Or they
 868  869                                   * might have to change the block size,
 869  870                                   * thus dirtying the new lvl=0 blk=0.
 870  871                                   */
 871  872                                  if (blkid == 0)
 872  873                                          match_offset = TRUE;
 873  874                                  break;
 874  875                          case THT_FREE:
 875  876                                  /*
 876  877                                   * We will dirty all the level 1 blocks in
 877  878                                   * the free range and perhaps the first and
 878  879                                   * last level 0 block.
 879  880                                   */
 880  881                                  if (blkid >= beginblk && (blkid <= endblk ||
 881  882                                      txh->txh_arg2 == DMU_OBJECT_END))
 882  883                                          match_offset = TRUE;
 883  884                                  break;
 884  885                          case THT_SPILL:
 885  886                                  if (blkid == DMU_SPILL_BLKID)
 886  887                                          match_offset = TRUE;
 887  888                                  break;
 888  889                          case THT_BONUS:
 889  890                                  if (blkid == DMU_BONUS_BLKID)
 890  891                                          match_offset = TRUE;
 891  892                                  break;
 892  893                          case THT_ZAP:
 893  894                                  match_offset = TRUE;
 894  895                                  break;
 895  896                          case THT_NEWOBJECT:
 896  897                                  match_object = TRUE;
 897  898                                  break;
 898  899                          default:
 899  900                                  ASSERT(!"bad txh_type");
 900  901                          }
 901  902                  }
 902  903                  if (match_object && match_offset) {
 903  904                          DB_DNODE_EXIT(db);

↓ open down ↓

287 lines elided

↑ open up ↑

 904  905                          return;
 905  906                  }
 906  907          }
 907  908          DB_DNODE_EXIT(db);
 908  909          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 909  910              (u_longlong_t)db->db.db_object, db->db_level,
 910  911              (u_longlong_t)db->db_blkid);
 911  912  }
 912  913  #endif
 913  914  
      915 +/*
      916 + * If we can't do 10 iops, something is wrong.  Let us go ahead
      917 + * and hit zfs_dirty_data_max.
           *
           * zfs_delay_max_ns is the cap dmu_tx_delay() applies to the computed
           * per-transaction delay; 100ms per transaction is 10 per second.
           * zfs_delay_resolution_ns is the resolution argument dmu_tx_delay()
           * passes to cv_timedwait_hires().
      918 + */
      919 +hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
      920 +int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
      921 +
      922 +/*
      923 + * We delay transactions when we've determined that the backend storage
      924 + * isn't able to accommodate the rate of incoming writes.
      925 + *
      926 + * If there is already a transaction waiting, we delay relative to when
      927 + * that transaction finishes waiting.  This way the calculated min_time
      928 + * is independent of the number of threads concurrently executing
      929 + * transactions.
      930 + *
      931 + * If we are the only waiter, wait relative to when the transaction
      932 + * started, rather than the current time.  This credits the transaction for
      933 + * "time already served", e.g. reading indirect blocks.
      934 + *
      935 + * The minimum time for a transaction to take is calculated as:
      936 + *     min_time = scale * (dirty - min) / (max - dirty)
      937 + *     min_time is then capped at zfs_delay_max_ns.
      938 + *
      939 + * The delay has two degrees of freedom that can be adjusted via tunables.
      940 + * The percentage of dirty data at which we start to delay is defined by
      941 + * zfs_delay_min_dirty_percent. This should typically be at or above
      942 + * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
      943 + * delay after writing at full speed has failed to keep up with the incoming
      944 + * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
      945 + * speaking, this variable determines the amount of delay at the midpoint of
      946 + * the curve.
      947 + *
      948 + * delay
      949 + *  10ms +-------------------------------------------------------------*+
      950 + *       |                                                             *|
      951 + *   9ms +                                                             *+
      952 + *       |                                                             *|
      953 + *   8ms +                                                             *+
      954 + *       |                                                            * |
      955 + *   7ms +                                                            * +
      956 + *       |                                                            * |
      957 + *   6ms +                                                            * +
      958 + *       |                                                            * |
      959 + *   5ms +                                                           *  +
      960 + *       |                                                           *  |
      961 + *   4ms +                                                           *  +
      962 + *       |                                                           *  |
      963 + *   3ms +                                                          *   +
      964 + *       |                                                          *   |
      965 + *   2ms +                                              (midpoint) *    +
      966 + *       |                                                  |    **     |
      967 + *   1ms +                                                  v ***       +
      968 + *       |             zfs_delay_scale ---------->     ********         |
      969 + *     0 +-------------------------------------*********----------------+
      970 + *       0%                    <- zfs_dirty_data_max ->               100%
      971 + *
      972 + * Note that since the delay is added to the outstanding time remaining on the
      973 + * most recent transaction, the delay is effectively the inverse of IOPS.
      974 + * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
      975 + * was chosen such that small changes in the amount of accumulated dirty data
      976 + * in the first 3/4 of the curve yield relatively small differences in the
      977 + * amount of delay.
      978 + *
      979 + * The effects can be easier to understand when the amount of delay is
      980 + * represented on a log scale:
      981 + *
      982 + * delay
      983 + * 100ms +-------------------------------------------------------------++
      984 + *       +                                                              +
      985 + *       |                                                              |
      986 + *       +                                                             *+
      987 + *  10ms +                                                             *+
      988 + *       +                                                           ** +
      989 + *       |                                              (midpoint)  **  |
      990 + *       +                                                  |     **    +
      991 + *   1ms +                                                  v ****      +
      992 + *       +             zfs_delay_scale ---------->        *****         +
      993 + *       |                                             ****             |
      994 + *       +                                          ****                +
      995 + * 100us +                                        **                    +
      996 + *       +                                       *                      +
      997 + *       |                                      *                       |
      998 + *       +                                     *                        +
      999 + *  10us +                                     *                        +
     1000 + *       +                                                              +
     1001 + *       |                                                              |
     1002 + *       +                                                              +
     1003 + *       +--------------------------------------------------------------+
     1004 + *       0%                    <- zfs_dirty_data_max ->               100%
     1005 + *
     1006 + * Note here that only as the amount of dirty data approaches its limit does
     1007 + * the delay start to increase rapidly. The goal of a properly tuned system
     1008 + * should be to keep the amount of dirty data out of that range by first
     1009 + * ensuring that the appropriate limits are set for the I/O scheduler to reach
     1010 + * optimal throughput on the backend storage, and then by changing the value
     1011 + * of zfs_delay_scale to increase the steepness of the curve.
     1012 + */
     1013 +static void
     1014 +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
     1015 +{
     1016 +        dsl_pool_t *dp = tx->tx_pool;
                   /* Amount of dirty data below which no delay is applied. */
     1017 +        uint64_t delay_min_bytes =
     1018 +            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
     1019 +        hrtime_t wakeup, min_tx_time, now;
     1020 +
     1021 +        if (dirty <= delay_min_bytes)
     1022 +                return;
     1023 +
     1024 +        /*
     1025 +         * The caller has already waited until we are under the max.
     1026 +         * We make them pass us the amount of dirty data so we don't
     1027 +         * have to handle the case of it being >= the max, which could
     1028 +         * cause a divide-by-zero if it's == the max.
     1029 +         */
     1030 +        ASSERT3U(dirty, <, zfs_dirty_data_max);
     1031 +
     1032 +        now = gethrtime();
     1033 +        min_tx_time = zfs_delay_scale *
     1034 +            (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
                   /*
                    * The tx has already lasted longer than the required minimum, so
                    * no extra delay is needed.  Note that this test uses min_tx_time
                    * before it is capped at zfs_delay_max_ns below.
                    */
     1035 +        if (now > tx->tx_start + min_tx_time)
     1036 +                return;
     1037 +
     1038 +        min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
     1039 +
     1040 +        DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
     1041 +            uint64_t, min_tx_time);
     1042 +
                   /*
                    * Wake up min_tx_time after the later of this tx's start and the
                    * pool's previously recorded wakeup, and record the result as the
                    * new dp_last_wakeup while holding dp_lock.
                    */
     1043 +        mutex_enter(&dp->dp_lock);
     1044 +        wakeup = MAX(tx->tx_start + min_tx_time,
     1045 +            dp->dp_last_wakeup + min_tx_time);
     1046 +        dp->dp_last_wakeup = wakeup;
     1047 +        mutex_exit(&dp->dp_lock);
     1048 +
     1049 +#ifdef _KERNEL
                   /*
                    * Sleep until the absolute time "wakeup", retrying for as long as
                    * cv_timedwait_hires() returns a positive value (presumably a
                    * wakeup before the deadline -- confirm against its man page).
                    */
     1050 +        mutex_enter(&curthread->t_delay_lock);
     1051 +        while (cv_timedwait_hires(&curthread->t_delay_cv,
     1052 +            &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
     1053 +            CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
     1054 +                continue;
     1055 +        mutex_exit(&curthread->t_delay_lock);
     1056 +#else
                   /*
                    * Userland build: a single relative sleep.
                    * NOTE(review): delta is not checked for being negative and the
                    * nanosleep() result is discarded, so an early or failed sleep is
                    * not retried -- confirm this is acceptable.
                    */
     1057 +        hrtime_t delta = wakeup - gethrtime();
     1058 +        struct timespec ts;
     1059 +        ts.tv_sec = delta / NANOSEC;
     1060 +        ts.tv_nsec = delta % NANOSEC;
     1061 +        (void) nanosleep(&ts, NULL);
     1062 +#endif
     1063 +}
     1064 +
           /*
            * Make one attempt to assign tx to the currently open txg.
            *
            * Returns 0 on success, otherwise:
            *   - tx->tx_err, if an earlier hold recorded an error;
            *   - EIO or ERESTART, if the pool is suspended;
            *   - ERESTART with tx_wait_dirty set, if the tx has not waited yet and
            *     dsl_pool_need_dirty_delay() says a delay is needed;
            *   - ERESTART with tx_needassign_txh set, if a held dnode is still
            *     assigned to txg (tx_txg - 1);
            *   - the error from dsl_dir_tempreserve_space().
            * After a failure that occurs once tx_txg has been set, the caller is
            * expected to call dmu_tx_unassign(), as dmu_tx_assign() does.
            */
 914 1065  static int
 915 1066  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 916 1067  {
 917 1068          dmu_tx_hold_t *txh;
 918 1069          spa_t *spa = tx->tx_pool->dp_spa;
 919 1070          uint64_t memory, asize, fsize, usize;
 920 1071          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 921 1072  
 922 1073          ASSERT0(tx->tx_txg);
 923 1074

 924 1075          if (tx->tx_err)
 925 1076                  return (tx->tx_err);
 926 1077  
 927 1078          if (spa_suspended(spa)) {
 928 1079                  /*
 929 1080                   * If the user has indicated a blocking failure mode
 930 1081                   * then return ERESTART which will block in dmu_tx_wait().
 931 1082                   * Otherwise, return EIO so that an error can get
 932 1083                   * propagated back to the VOP calls.
 933 1084                   *

↓ open down ↓

10 lines elided

↑ open up ↑

 934 1085                   * Note that we always honor the txg_how flag regardless
 935 1086                   * of the failuremode setting.
 936 1087                   */
 937 1088                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 938 1089                      txg_how != TXG_WAIT)
 939 1090                          return (SET_ERROR(EIO));
 940 1091  
 941 1092                  return (SET_ERROR(ERESTART));
 942 1093          }
 943 1094  
                   /*
                    * Back off before taking a txg hold; tx_wait_dirty tells
                    * dmu_tx_wait() to take its dirty-data throttling path.
                    */
     1095 +        if (!tx->tx_waited &&
     1096 +            dsl_pool_need_dirty_delay(tx->tx_pool)) {
     1097 +                tx->tx_wait_dirty = B_TRUE;
     1098 +                return (SET_ERROR(ERESTART));
     1099 +        }
     1100 +
 944 1101          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 945 1102          tx->tx_needassign_txh = NULL;
 946 1103  
 947 1104          /*
 948 1105           * NB: No error returns are allowed after txg_hold_open, but
 949 1106           * before processing the dnode holds, due to the
 950 1107           * dmu_tx_unassign() logic.
 951 1108           */
 952 1109  
 953 1110          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;

 954 1111          for (txh = list_head(&tx->tx_holds); txh;
 955 1112              txh = list_next(&tx->tx_holds, txh)) {
 956 1113                  dnode_t *dn = txh->txh_dnode;
 957 1114                  if (dn != NULL) {
 958 1115                          mutex_enter(&dn->dn_mtx);
                                   /*
                                    * Remember which hold stopped us:
                                    * dmu_tx_unassign() stops its walk at this
                                    * hold, and dmu_tx_wait() waits on its dnode.
                                    */
 959 1116                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 960 1117                                  mutex_exit(&dn->dn_mtx);
 961 1118                                  tx->tx_needassign_txh = txh;
 962 1119                                  return (SET_ERROR(ERESTART));
 963 1120                          }
 964 1121                          if (dn->dn_assigned_txg == 0)
 965 1122                                  dn->dn_assigned_txg = tx->tx_txg;
 966 1123                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 967 1124                          (void) refcount_add(&dn->dn_tx_holds, tx);
 968 1125                          mutex_exit(&dn->dn_mtx);
 969 1126                  }
 970 1127                  towrite += txh->txh_space_towrite;
 971 1128                  tofree += txh->txh_space_tofree;
 972 1129                  tooverwrite += txh->txh_space_tooverwrite;
 973 1130                  tounref += txh->txh_space_tounref;
 974 1131                  tohold += txh->txh_memory_tohold;
 975 1132                  fudge += txh->txh_fudge;
 976 1133          }
 977 1134  
 978 1135          /*
 979 1136           * If a snapshot has been taken since we made our estimates,
 980 1137           * assume that we won't be able to free or overwrite anything.
 981 1138           */
 982 1139          if (tx->tx_objset &&
 983 1140              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 984 1141              tx->tx_lastsnap_txg) {
 985 1142                  towrite += tooverwrite;
 986 1143                  tooverwrite = tofree = 0;
 987 1144          }
 988 1145  
 989 1146          /* needed allocation: worst-case estimate of write space */
 990 1147          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 991 1148          /* freed space estimate: worst-case overwrite + free estimate */
 992 1149          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 993 1150          /* convert unrefd space to worst-case estimate */
 994 1151          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 995 1152          /* calculate memory footprint estimate */
 996 1153          memory = towrite + tooverwrite + tohold;
 997 1154  
 998 1155  #ifdef ZFS_DEBUG
 999 1156          /*
1000 1157           * Add in 'tohold' to account for our dirty holds on this memory
1001 1158           * XXX - the "fudge" factor is to account for skipped blocks that
1002 1159           * we missed because dnode_next_offset() misses in-core-only blocks.
1003 1160           */
1004 1161          tx->tx_space_towrite = asize +
1005 1162              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1006 1163          tx->tx_space_tofree = tofree;
1007 1164          tx->tx_space_tooverwrite = tooverwrite;
1008 1165          tx->tx_space_tounref = tounref;
1009 1166  #endif
1010 1167  
                   /* Space is reserved only for a tx with a dsl_dir and asize != 0. */
1011 1168          if (tx->tx_dir && asize != 0) {
1012 1169                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1013 1170                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1014 1171                  if (err)
1015 1172                          return (err);
1016 1173          }
1017 1174  
1018 1175          return (0);
1019 1176  }
1020 1177  
           /*
            * Undo a (possibly partial) dmu_tx_try_assign().  Does nothing if the tx
            * has no txg.  Otherwise releases the txg hold, drops the dn_tx_holds
            * reference on every held dnode before tx_needassign_txh, records the
            * txg in tx_lasttried_txg and clears tx_txg.
            */
1021 1178  static void
1022 1179  dmu_tx_unassign(dmu_tx_t *tx)
1023 1180  {
1024 1181          dmu_tx_hold_t *txh;
1025 1182  
1026 1183          if (tx->tx_txg == 0)
1027 1184                  return;
1028 1185  
1029 1186          txg_rele_to_quiesce(&tx->tx_txgh);
1030 1187  
1031 1188          /*
1032 1189           * Walk the transaction's hold list, removing the hold on the
1033 1190           * associated dnode, and notifying waiters if the refcount drops to 0.
1034 1191           */
                   /*
                    * The walk ends at tx_needassign_txh, the hold at which
                    * dmu_tx_try_assign() stopped (NULL, i.e. end of list, if it
                    * processed every hold).
                    */
1035 1192          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1036 1193              txh = list_next(&tx->tx_holds, txh)) {
1037 1194                  dnode_t *dn = txh->txh_dnode;
1038 1195  
1039 1196                  if (dn == NULL)
1040 1197                          continue;
1041 1198                  mutex_enter(&dn->dn_mtx);
1042 1199                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1043 1200  
1044 1201                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1045 1202                          dn->dn_assigned_txg = 0;
1046 1203                          cv_broadcast(&dn->dn_notxholds);
1047 1204                  }
1048 1205                  mutex_exit(&dn->dn_mtx);
1049 1206          }
1050 1207  
1051 1208          txg_rele_to_sync(&tx->tx_txgh);
1052 1209  
1053 1210          tx->tx_lasttried_txg = tx->tx_txg;
1054 1211          tx->tx_txg = 0;
1055 1212  }
1056 1213  
1057 1214  /*

↓ open down ↓

104 lines elided

↑ open up ↑

1058 1215   * Assign tx to a transaction group.  txg_how can be one of:
1059 1216   *
1060 1217   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1061 1218   *      a new one.  This should be used when you're not holding locks.
1062 1219   *      It will only fail if we're truly out of space (or over quota).
1063 1220   *
1064 1221   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1065 1222   *      blocking, returns immediately with ERESTART.  This should be used
1066 1223   *      whenever you're holding locks.  On an ERESTART error, the caller
1067 1224   *      should drop locks, do a dmu_tx_wait(tx), and try again.
     1225 + *
     1226 + * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
     1227 + *      has already been called on behalf of this operation (though
     1228 + *      most likely on a different tx).
1068 1229   */
1069 1230  int
1070 1231  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1071 1232  {
1072 1233          int err;
1073 1234  
1074 1235          ASSERT(tx->tx_txg == 0);
1075      -        ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
     1236 +        ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
     1237 +            txg_how == TXG_WAITED);
1076 1238          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1077 1239  
1078 1240          /* If we might wait, we must not hold the config lock. */
1079 1241          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1080 1242  
                   /*
                    * With tx_waited set, dmu_tx_try_assign() skips its
                    * dsl_pool_need_dirty_delay() check.
                    */
     1243 +        if (txg_how == TXG_WAITED)
     1244 +                tx->tx_waited = B_TRUE;
     1245 +
                   /*
                    * Every failed attempt is unassigned.  Only a TXG_WAIT caller that
                    * got ERESTART waits and retries; any other failure is returned.
                    */
1081 1246          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1082 1247                  dmu_tx_unassign(tx);
1083 1248  
1084 1249                  if (err != ERESTART || txg_how != TXG_WAIT)
1085 1250                          return (err);
1086 1251  
1087 1252                  dmu_tx_wait(tx);
1088 1253          }
1089 1254  
1090 1255          txg_rele_to_quiesce(&tx->tx_txgh);
1091 1256  
1092 1257          return (0);
1093 1258  }
1094 1259  
           /*
            * Block until the condition that made the last assignment attempt back
            * off has passed.  In order of precedence: dirty-data throttling
            * (tx_wait_dirty), a suspended pool or a tx that has not attempted
            * assignment yet, a dnode still assigned to an earlier txg
            * (tx_needassign_txh), or otherwise until the txg after the last one
            * tried is open.  The tx must be unassigned and the caller must not hold
            * the pool config lock.
            */
1095 1260  void
1096 1261  dmu_tx_wait(dmu_tx_t *tx)
1097 1262  {
1098 1263          spa_t *spa = tx->tx_pool->dp_spa;
     1264 +        dsl_pool_t *dp = tx->tx_pool;
1099 1265  
1100 1266          ASSERT(tx->tx_txg == 0);
1101 1267          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1102 1268  
1103      -        /*
1104      -         * It's possible that the pool has become active after this thread
1105      -         * has tried to obtain a tx. If that's the case then his
1106      -         * tx_lasttried_txg would not have been assigned.
1107      -         */
1108      -        if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1109      -                txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
     1269 +        if (tx->tx_wait_dirty) {
     1270 +                /*
     1271 +                 * dmu_tx_try_assign() has determined that we need to wait
     1272 +                 * because we've consumed much or all of the dirty buffer
     1273 +                 * space.
     1274 +                 */
     1275 +                mutex_enter(&dp->dp_lock);
     1276 +                while (dp->dp_dirty_total >= zfs_dirty_data_max)
     1277 +                        cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
                           /*
                            * Sample the dirty total under dp_lock, while it is known
                            * to be below zfs_dirty_data_max, as dmu_tx_delay()
                            * asserts.
                            */
     1278 +                uint64_t dirty = dp->dp_dirty_total;
     1279 +                mutex_exit(&dp->dp_lock);
     1280 +
     1281 +                dmu_tx_delay(tx, dirty);
     1282 +
     1283 +                tx->tx_wait_dirty = B_FALSE;
     1284 +
     1285 +                /*
     1286 +                 * Note: setting tx_waited only has effect if the caller
     1287 +                 * used TXG_WAIT.  Otherwise they are going to destroy
     1288 +                 * this tx and try again.  The common case, zfs_write(),
     1289 +                 * uses TXG_WAIT.
     1290 +                 */
     1291 +                tx->tx_waited = B_TRUE;
     1292 +        } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
     1293 +                /*
     1294 +                 * If the pool is suspended we need to wait until it
     1295 +                 * is resumed.  Note that it's possible that the pool
     1296 +                 * has become active after this thread has tried to
     1297 +                 * obtain a tx.  If that's the case then tx_lasttried_txg
     1298 +                 * would not have been set.
     1299 +                 */
     1300 +                txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1110 1301          } else if (tx->tx_needassign_txh) {
     1302 +                /*
     1303 +                 * A dnode is assigned to the quiescing txg.  Wait for its
     1304 +                 * transaction to complete.
     1305 +                 */
1111 1306                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1112 1307  
1113 1308                  mutex_enter(&dn->dn_mtx);
1114 1309                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1115 1310                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1116 1311                  mutex_exit(&dn->dn_mtx);
1117 1312                  tx->tx_needassign_txh = NULL;
1118 1313          } else {
                           /* No specific blocker recorded: wait for the next txg to open. */
1119 1314                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1120 1315          }

1121 1316  }
1122 1317  
           /*
            * Debug-only accounting of the space a tx actually uses.  The whole
            * body is compiled only under ZFS_DEBUG; otherwise this is a no-op.
            *
            * A positive delta is added to tx_space_written, after asserting that
            * the running total stays within tx_space_towrite.  A negative delta
            * is added (as -delta) to tx_space_freed.  Nothing is recorded when
            * the tx has no tx_dir or delta is zero.
            */
1123 1318  void
1124 1319  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1125 1320  {
1126 1321  #ifdef ZFS_DEBUG
1127 1322          if (tx->tx_dir == NULL || delta == 0)
1128 1323                  return;
1129 1324  
1130 1325          if (delta > 0) {
1131 1326                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1132 1327                      tx->tx_space_towrite);
1133 1328                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1134 1329          } else {
1135 1330                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1136 1331          }
1137 1332  #endif
1138 1333  }
1139 1334  
           /*
            * Commit an assigned tx (tx_txg != 0, asserted) and free it.
            *
            * All holds are released, any temporary reservation is cleared, pending
            * callbacks are handed to txg_register_callbacks(), and the txg handle
            * is released with txg_rele_to_sync() unless tx_anyobj is set.  The tx
            * itself is freed, so the caller must not touch it afterwards.
            */
1140 1335  void
1141 1336  dmu_tx_commit(dmu_tx_t *tx)
1142 1337  {
1143 1338          dmu_tx_hold_t *txh;
1144 1339  
1145 1340          ASSERT(tx->tx_txg != 0);
1146 1341  
1147 1342          /*
1148 1343           * Go through the transaction's hold list and remove holds on
1149 1344           * associated dnodes, notifying waiters if no holds remain.
1150 1345           */
1151 1346          while (txh = list_head(&tx->tx_holds)) {
1152 1347                  dnode_t *dn = txh->txh_dnode;
1153 1348  
1154 1349                  list_remove(&tx->tx_holds, txh);
1155 1350                  kmem_free(txh, sizeof (dmu_tx_hold_t));
                           /* A hold without a dnode has nothing further to release. */
1156 1351                  if (dn == NULL)
1157 1352                          continue;
1158 1353                  mutex_enter(&dn->dn_mtx);
1159 1354                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1160 1355  
                           /*
                            * When this was the last tx hold on the dnode, mark it
                            * unassigned and wake everyone sleeping on dn_notxholds.
                            */
1161 1356                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1162 1357                          dn->dn_assigned_txg = 0;
1163 1358                          cv_broadcast(&dn->dn_notxholds);
1164 1359                  }
1165 1360                  mutex_exit(&dn->dn_mtx);
1166 1361                  dnode_rele(dn, tx);
1167 1362          }
1168 1363  
1169 1364          if (tx->tx_tempreserve_cookie)
1170 1365                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1171 1366  
1172 1367          if (!list_is_empty(&tx->tx_callbacks))
1173 1368                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1174 1369  
1175 1370          if (tx->tx_anyobj == FALSE)
1176 1371                  txg_rele_to_sync(&tx->tx_txgh);
1177 1372  
1178 1373          list_destroy(&tx->tx_callbacks);
1179 1374          list_destroy(&tx->tx_holds);
1180 1375  #ifdef ZFS_DEBUG
1181 1376          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1182 1377              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1183 1378              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
                   /* Drop whatever counts remain so the refcounts can be destroyed. */
1184 1379          refcount_destroy_many(&tx->tx_space_written,
1185 1380              refcount_count(&tx->tx_space_written));
1186 1381          refcount_destroy_many(&tx->tx_space_freed,
1187 1382              refcount_count(&tx->tx_space_freed));
1188 1383  #endif
1189 1384          kmem_free(tx, sizeof (dmu_tx_t));
1190 1385  }
1191 1386  
           /*
            * Abandon a tx that was never assigned to a txg (tx_txg == 0, asserted)
            * and free it.
            *
            * Every hold is freed and its dnode, if any, is released.  Registered
            * callbacks are invoked with ECANCELED.  The tx itself is freed, so the
            * caller must not touch it afterwards.
            */
1192 1387  void
1193 1388  dmu_tx_abort(dmu_tx_t *tx)
1194 1389  {
1195 1390          dmu_tx_hold_t *txh;
1196 1391  
1197 1392          ASSERT(tx->tx_txg == 0);
1198 1393  
1199 1394          while (txh = list_head(&tx->tx_holds)) {
1200 1395                  dnode_t *dn = txh->txh_dnode;
1201 1396  
1202 1397                  list_remove(&tx->tx_holds, txh);
1203 1398                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1204 1399                  if (dn != NULL)
1205 1400                          dnode_rele(dn, tx);
1206 1401          }
1207 1402  
1208 1403          /*
1209 1404           * Call any registered callbacks with an error code.
1210 1405           */
1211 1406          if (!list_is_empty(&tx->tx_callbacks))
1212 1407                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1213 1408  
1214 1409          list_destroy(&tx->tx_callbacks);
1215 1410          list_destroy(&tx->tx_holds);
1216 1411  #ifdef ZFS_DEBUG
                   /* Drop whatever counts remain so the refcounts can be destroyed. */
1217 1412          refcount_destroy_many(&tx->tx_space_written,
1218 1413              refcount_count(&tx->tx_space_written));
1219 1414          refcount_destroy_many(&tx->tx_space_freed,
1220 1415              refcount_count(&tx->tx_space_freed));
1221 1416  #endif
1222 1417          kmem_free(tx, sizeof (dmu_tx_t));
1223 1418  }
1224 1419  
           /*
            * Return the txg this tx is assigned to.  Asserts that the tx has been
            * assigned (tx_txg != 0).
            */
1225 1420  uint64_t
1226 1421  dmu_tx_get_txg(dmu_tx_t *tx)
1227 1422  {
1228 1423          ASSERT(tx->tx_txg != 0);
1229 1424          return (tx->tx_txg);
1230 1425  }
1231 1426  
           /*
            * Return the dsl_pool_t this tx belongs to.  Asserts that tx_pool is
            * set.
            */
1232 1427  dsl_pool_t *
1233 1428  dmu_tx_pool(dmu_tx_t *tx)
1234 1429  {
1235 1430          ASSERT(tx->tx_pool != NULL);
1236 1431          return (tx->tx_pool);
1237 1432  }
1238 1433  
1239 1434  
           /*
            * Register func to be called with data for this tx.
            *
            * A callback record is allocated with KM_SLEEP and appended to the tail
            * of tx_callbacks.  dmu_tx_do_callbacks() frees each record after
            * invoking it.
            */
1240 1435  void
1241 1436  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1242 1437  {
1243 1438          dmu_tx_callback_t *dcb;
1244 1439  
1245 1440          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1246 1441  
1247 1442          dcb->dcb_func = func;
1248 1443          dcb->dcb_data = data;
1249 1444  
1250 1445          list_insert_tail(&tx->tx_callbacks, dcb);
1251 1446  }
1252 1447  
1253 1448  /*
1254 1449   * Call all the commit callbacks on a list, with a given error code.
            *
            * Records are taken from the head of cb_list one at a time.  Each is
            * unlinked before its function runs and freed afterwards, so the list
            * is empty when this returns.
1255 1450   */
1256 1451  void
1257 1452  dmu_tx_do_callbacks(list_t *cb_list, int error)
1258 1453  {
1259 1454          dmu_tx_callback_t *dcb;
1260 1455  
1261 1456          while (dcb = list_head(cb_list)) {
1262 1457                  list_remove(cb_list, dcb);
1263 1458                  dcb->dcb_func(dcb->dcb_data, error);
1264 1459                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1265 1460          }
1266 1461  }
1267 1462  
1268 1463  /*
1269 1464   * Interface to hold a bunch of attributes; used when creating new
1270 1465   * files (see dmu_tx_hold_sa_create() below).
1271 1466   * attrsize is the total size of all attributes
1272 1467   * to be added during object creation.
1273 1468   *
1274 1469   * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1275 1470   */
1276 1471  
1277 1472  /*
1278 1473   * Hold the necessary attribute names for attribute registration.
1279 1474   * This should be a very rare case where this is needed.  If it does
1280 1475   * happen it would only happen on the first write to the file system.
1281 1476   */
1282 1477  static void
1283 1478  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1284 1479  {
1285 1480          int i;
1286 1481  
                   /* Nothing to hold unless attribute registration is still pending. */
1287 1482          if (!sa->sa_need_attr_registration)
1288 1483                  return;
1289 1484  
                   /*
                    * For every attribute table entry not yet marked registered, take
                    * a ZAP hold for its name: on sa_reg_attr_obj when that object
                    * number is nonzero, otherwise on DMU_NEW_OBJECT.
                    */
1290 1485          for (i = 0; i != sa->sa_num_attrs; i++) {
1291 1486                  if (!sa->sa_attr_table[i].sa_registered) {
1292 1487                          if (sa->sa_reg_attr_obj)
1293 1488                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1294 1489                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1295 1490                          else
1296 1491                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1297 1492                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1298 1493                  }
1299 1494          }
1300 1495  }
1301 1496  
1302 1497  
           /*
            * Take a THT_SPILL hold on object in the tx's objset and charge the hold
            * for one SPA_MAXBLOCKSIZE spill block.
            *
            * With no spill block pointer on the dnode the charge goes to
            * txh_space_towrite.  Otherwise it goes to txh_space_tooverwrite when
            * dsl_dataset_block_freeable() says the existing block is freeable, and
            * to txh_space_towrite when it is not; a non-hole block pointer is also
            * charged to txh_space_tounref.  No space is charged when the hold has
            * no dnode.
            */
1303 1498  void
1304 1499  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1305 1500  {
1306 1501          dnode_t *dn;
1307 1502          dmu_tx_hold_t *txh;
1308 1503  
1309 1504          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1310 1505              THT_SPILL, 0, 0);
1311 1506  
                   /*
                    * NOTE(review): txh is dereferenced unchecked; this assumes
                    * dmu_tx_hold_object_impl() cannot return NULL here -- confirm.
                    */
1312 1507          dn = txh->txh_dnode;
1313 1508  
1314 1509          if (dn == NULL)
1315 1510                  return;
1316 1511  
1317 1512          /* If blkptr doesn't exist then add space to towrite */
1318 1513          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1319 1514                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1320 1515          } else {
1321 1516                  blkptr_t *bp;
1322 1517  
1323 1518                  bp = &dn->dn_phys->dn_spill;
1324 1519                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1325 1520                      bp, bp->blk_birth))
1326 1521                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1327 1522                  else
1328 1523                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1329 1524                  if (!BP_IS_HOLE(bp))
1330 1525                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1331 1526          }
1332 1527  }
1333 1528  
           /*
            * Take the holds needed to create a new object with system attributes.
            * attrsize is compared against DN_MAX_BONUSLEN to decide whether a spill
            * hold is required.
            *
            * The bonus buffer of the new object is always held.  The remaining
            * holds are taken only when the objset has an SA master object.
            */
1334 1529  void
1335 1530  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1336 1531  {
1337 1532          sa_os_t *sa = tx->tx_objset->os_sa;
1338 1533  
1339 1534          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1340 1535  
1341 1536          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1342 1537                  return;
1343 1538  
                   /*
                    * Hold the layout object when it already exists.  Otherwise hold
                    * the SA_LAYOUTS and SA_REGISTRY entries of the master object
                    * plus two new ZAP objects.
                    */
1344 1539          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1345 1540                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1346 1541          else {
1347 1542                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1348 1543                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1349 1544                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1350 1545                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1351 1546          }
1352 1547  
1353 1548          dmu_tx_sa_registration_hold(sa, tx);
1354 1549  
                   /*
                    * No spill hold is needed when the attributes fit within
                    * DN_MAX_BONUSLEN and sa_force_spill is not set.
                    */
1355 1550          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1356 1551                  return;
1357 1552  
1358 1553          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1359 1554              THT_SPILL, 0, 0);
1360 1555  }
1361 1556  
1362 1557  /*
1363 1558   * Hold SA attribute
1364 1559   *
1365 1560   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1366 1561   *
1367 1562   * Holds the bonus buffer of the object behind hdl and, when the objset
1368 1563   * has an SA master object, the SA ZAP objects listed in the body.
1369 1564   * If may_grow is set, the layout object (if any) and the spill block
            * are held as well.  The spill block is also held when sa_force_spill
            * is set, when the handle already has a spill buffer (hdl->sa_spill),
            * or when the bonus buffer's dnode reports dn_have_spill.
1370 1565   */
1371 1566  void
1372 1567  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1373 1568  {
1374 1569          uint64_t object;
1375 1570          sa_os_t *sa = tx->tx_objset->os_sa;
1376 1571  
1377 1572          ASSERT(hdl != NULL);
1378 1573  
1379 1574          object = sa_handle_object(hdl);
1380 1575  
1381 1576          dmu_tx_hold_bonus(tx, object);
1382 1577  
1383 1578          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1384 1579                  return;
1385 1580  
                   /*
                    * If either the registry or the layout object is still missing,
                    * hold the SA_LAYOUTS and SA_REGISTRY entries of the master
                    * object plus two new ZAP objects.
                    */
1386 1581          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1387 1582              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1388 1583                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1389 1584                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1390 1585                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1391 1586                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1392 1587          }
1393 1588  
1394 1589          dmu_tx_sa_registration_hold(sa, tx);
1395 1590  
1396 1591          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1397 1592                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1398 1593  
                   /*
                    * Spill holds are only taken on a tx that has not been assigned
                    * to a txg yet; both paths below assert that.
                    */
1399 1594          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1400 1595                  ASSERT(tx->tx_txg == 0);
1401 1596                  dmu_tx_hold_spill(tx, object);
1402 1597          } else {
1403 1598                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1404 1599                  dnode_t *dn;
1405 1600  
                           /* dn is only used between DB_DNODE_ENTER and DB_DNODE_EXIT. */
1406 1601                  DB_DNODE_ENTER(db);
1407 1602                  dn = DB_DNODE(db);
1408 1603                  if (dn->dn_have_spill) {
1409 1604                          ASSERT(tx->tx_txg == 0);
1410 1605                          dmu_tx_hold_spill(tx, object);
1411 1606                  }
1412 1607                  DB_DNODE_EXIT(db);
1413 1608          }
1414 1609  }

↓ open down ↓

294 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX