illumos-gate Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

1862 incremental zfs receive fails for sparse file > 8PB
dmu_tx_count_free() grossly over-estimates the memory it will need. It
assumes that the file is fully non-sparse and computes a worst-case estimate
of how much memory is needed to hold all metadata for the file. If a large
hole needs to be freed, the estimate reaches the terabyte range, which
then fails later on.
This patch computes a more realistic estimate by counting the level-1 (L1)
indirect blocks that are actually present (the loop for this already exists)
and assuming a worst-case distribution of those blocks over the full length given.
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Simon Klinkert <klinkert@webgods.de>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2012 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57  #ifdef ZFS_DEBUG
  58   58          refcount_create(&tx->tx_space_written);
  59   59          refcount_create(&tx->tx_space_freed);
  60   60  #endif
  61   61          return (tx);
  62   62  }
  63   63  
  64   64  dmu_tx_t *
  65   65  dmu_tx_create(objset_t *os)
  66   66  {
  67   67          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  68   68          tx->tx_objset = os;
  69   69          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  70   70          return (tx);
  71   71  }
  72   72  
  73   73  dmu_tx_t *
  74   74  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  75   75  {
  76   76          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  77   77  
  78   78          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  79   79          tx->tx_pool = dp;
  80   80          tx->tx_txg = txg;
  81   81          tx->tx_anyobj = TRUE;
  82   82  
  83   83          return (tx);
  84   84  }
  85   85  
  86   86  int
  87   87  dmu_tx_is_syncing(dmu_tx_t *tx)
  88   88  {
  89   89          return (tx->tx_anyobj);
  90   90  }
  91   91  
  92   92  int
  93   93  dmu_tx_private_ok(dmu_tx_t *tx)
  94   94  {
  95   95          return (tx->tx_anyobj);
  96   96  }
  97   97  
  98   98  static dmu_tx_hold_t *
  99   99  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 100  100      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 101  101  {
 102  102          dmu_tx_hold_t *txh;
 103  103          dnode_t *dn = NULL;
 104  104          int err;
 105  105  
 106  106          if (object != DMU_NEW_OBJECT) {
 107  107                  err = dnode_hold(os, object, tx, &dn);
 108  108                  if (err) {
 109  109                          tx->tx_err = err;
 110  110                          return (NULL);
 111  111                  }
 112  112  
 113  113                  if (err == 0 && tx->tx_txg != 0) {
 114  114                          mutex_enter(&dn->dn_mtx);
 115  115                          /*
 116  116                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 117  117                           * problem, but there's no way for it to happen (for
 118  118                           * now, at least).
 119  119                           */
 120  120                          ASSERT(dn->dn_assigned_txg == 0);
 121  121                          dn->dn_assigned_txg = tx->tx_txg;
 122  122                          (void) refcount_add(&dn->dn_tx_holds, tx);
 123  123                          mutex_exit(&dn->dn_mtx);
 124  124                  }
 125  125          }
 126  126  
 127  127          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 128  128          txh->txh_tx = tx;
 129  129          txh->txh_dnode = dn;
 130  130  #ifdef ZFS_DEBUG
 131  131          txh->txh_type = type;
 132  132          txh->txh_arg1 = arg1;
 133  133          txh->txh_arg2 = arg2;
 134  134  #endif
 135  135          list_insert_tail(&tx->tx_holds, txh);
 136  136  
 137  137          return (txh);
 138  138  }
 139  139  
 140  140  void
 141  141  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 142  142  {
 143  143          /*
 144  144           * If we're syncing, they can manipulate any object anyhow, and
 145  145           * the hold on the dnode_t can cause problems.
 146  146           */
 147  147          if (!dmu_tx_is_syncing(tx)) {
 148  148                  (void) dmu_tx_hold_object_impl(tx, os,
 149  149                      object, THT_NEWOBJECT, 0, 0);
 150  150          }
 151  151  }
 152  152  
 153  153  static int
 154  154  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 155  155  {
 156  156          int err;
 157  157          dmu_buf_impl_t *db;
 158  158  
 159  159          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 160  160          db = dbuf_hold_level(dn, level, blkid, FTAG);
 161  161          rw_exit(&dn->dn_struct_rwlock);
 162  162          if (db == NULL)
 163  163                  return (EIO);
 164  164          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 165  165          dbuf_rele(db, FTAG);
 166  166          return (err);
 167  167  }
 168  168  
 169  169  static void
 170  170  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 171  171      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 172  172  {
 173  173          objset_t *os = dn->dn_objset;
 174  174          dsl_dataset_t *ds = os->os_dsl_dataset;
 175  175          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 176  176          dmu_buf_impl_t *parent = NULL;
 177  177          blkptr_t *bp = NULL;
 178  178          uint64_t space;
 179  179  
 180  180          if (level >= dn->dn_nlevels || history[level] == blkid)
 181  181                  return;
 182  182  
 183  183          history[level] = blkid;
 184  184  
 185  185          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 186  186  
 187  187          if (db == NULL || db == dn->dn_dbuf) {
 188  188                  ASSERT(level != 0);
 189  189                  db = NULL;
 190  190          } else {
 191  191                  ASSERT(DB_DNODE(db) == dn);
 192  192                  ASSERT(db->db_level == level);
 193  193                  ASSERT(db->db.db_size == space);
 194  194                  ASSERT(db->db_blkid == blkid);
 195  195                  bp = db->db_blkptr;
 196  196                  parent = db->db_parent;
 197  197          }
 198  198  
 199  199          freeable = (bp && (freeable ||
 200  200              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 201  201  
 202  202          if (freeable)
 203  203                  txh->txh_space_tooverwrite += space;
 204  204          else
 205  205                  txh->txh_space_towrite += space;
 206  206          if (bp)
 207  207                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 208  208  
 209  209          dmu_tx_count_twig(txh, dn, parent, level + 1,
 210  210              blkid >> epbs, freeable, history);
 211  211  }
 212  212  
 213  213  /* ARGSUSED */
 214  214  static void
 215  215  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 216  216  {
 217  217          dnode_t *dn = txh->txh_dnode;
 218  218          uint64_t start, end, i;
 219  219          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 220  220          int err = 0;
 221  221  
 222  222          if (len == 0)
 223  223                  return;
 224  224  
 225  225          min_bs = SPA_MINBLOCKSHIFT;
 226  226          max_bs = SPA_MAXBLOCKSHIFT;
 227  227          min_ibs = DN_MIN_INDBLKSHIFT;
 228  228          max_ibs = DN_MAX_INDBLKSHIFT;
 229  229  
 230  230          if (dn) {
 231  231                  uint64_t history[DN_MAX_LEVELS];
 232  232                  int nlvls = dn->dn_nlevels;
 233  233                  int delta;
 234  234  
 235  235                  /*
 236  236                   * For i/o error checking, read the first and last level-0
 237  237                   * blocks (if they are not aligned), and all the level-1 blocks.
 238  238                   */
 239  239                  if (dn->dn_maxblkid == 0) {
 240  240                          delta = dn->dn_datablksz;
 241  241                          start = (off < dn->dn_datablksz) ? 0 : 1;
 242  242                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 243  243                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 244  244                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 245  245                                  if (err)
 246  246                                          goto out;
 247  247                                  delta -= off;
 248  248                          }
 249  249                  } else {
 250  250                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 251  251                              NULL, NULL, ZIO_FLAG_CANFAIL);
 252  252  
 253  253                          /* first level-0 block */
 254  254                          start = off >> dn->dn_datablkshift;
 255  255                          if (P2PHASE(off, dn->dn_datablksz) ||
 256  256                              len < dn->dn_datablksz) {
 257  257                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 258  258                                  if (err)
 259  259                                          goto out;
 260  260                          }
 261  261  
 262  262                          /* last level-0 block */
 263  263                          end = (off+len-1) >> dn->dn_datablkshift;
 264  264                          if (end != start && end <= dn->dn_maxblkid &&
 265  265                              P2PHASE(off+len, dn->dn_datablksz)) {
 266  266                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 267  267                                  if (err)
 268  268                                          goto out;
 269  269                          }
 270  270  
 271  271                          /* level-1 blocks */
 272  272                          if (nlvls > 1) {
 273  273                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 274  274                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 275  275                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 276  276                                          if (err)
 277  277                                                  goto out;
 278  278                                  }
 279  279                          }
 280  280  
 281  281                          err = zio_wait(zio);
 282  282                          if (err)
 283  283                                  goto out;
 284  284                          delta = P2NPHASE(off, dn->dn_datablksz);
 285  285                  }
 286  286  
 287  287                  if (dn->dn_maxblkid > 0) {
 288  288                          /*
 289  289                           * The blocksize can't change,
 290  290                           * so we can make a more precise estimate.
 291  291                           */
 292  292                          ASSERT(dn->dn_datablkshift != 0);
 293  293                          min_bs = max_bs = dn->dn_datablkshift;
 294  294                          min_ibs = max_ibs = dn->dn_indblkshift;
 295  295                  } else if (dn->dn_indblkshift > max_ibs) {
 296  296                          /*
 297  297                           * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
 298  298                           * the code will still work correctly on older pools.
 299  299                           */
 300  300                          min_ibs = max_ibs = dn->dn_indblkshift;
 301  301                  }
 302  302  
 303  303                  /*
 304  304                   * If this write is not off the end of the file
 305  305                   * we need to account for overwrites/unref.
 306  306                   */
 307  307                  if (start <= dn->dn_maxblkid) {
 308  308                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 309  309                                  history[l] = -1ULL;
 310  310                  }
 311  311                  while (start <= dn->dn_maxblkid) {
 312  312                          dmu_buf_impl_t *db;
 313  313  
 314  314                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 315  315                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 316  316                          rw_exit(&dn->dn_struct_rwlock);
 317  317  
 318  318                          if (err) {
 319  319                                  txh->txh_tx->tx_err = err;
 320  320                                  return;
 321  321                          }
 322  322  
 323  323                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 324  324                              history);
 325  325                          dbuf_rele(db, FTAG);
 326  326                          if (++start > end) {
 327  327                                  /*
 328  328                                   * Account for new indirects appearing
 329  329                                   * before this IO gets assigned into a txg.
 330  330                                   */
 331  331                                  bits = 64 - min_bs;
 332  332                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 333  333                                  for (bits -= epbs * (nlvls - 1);
 334  334                                      bits >= 0; bits -= epbs)
 335  335                                          txh->txh_fudge += 1ULL << max_ibs;
 336  336                                  goto out;
 337  337                          }
 338  338                          off += delta;
 339  339                          if (len >= delta)
 340  340                                  len -= delta;
 341  341                          delta = dn->dn_datablksz;
 342  342                  }
 343  343          }
 344  344  
 345  345          /*
 346  346           * 'end' is the last thing we will access, not one past.
 347  347           * This way we won't overflow when accessing the last byte.
 348  348           */
 349  349          start = P2ALIGN(off, 1ULL << max_bs);
 350  350          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 351  351          txh->txh_space_towrite += end - start + 1;
 352  352  
 353  353          start >>= min_bs;
 354  354          end >>= min_bs;
 355  355  
 356  356          epbs = min_ibs - SPA_BLKPTRSHIFT;
 357  357  
 358  358          /*
 359  359           * The object contains at most 2^(64 - min_bs) blocks,
 360  360           * and each indirect level maps 2^epbs.
 361  361           */
 362  362          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 363  363                  start >>= epbs;
 364  364                  end >>= epbs;
 365  365                  ASSERT3U(end, >=, start);
 366  366                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 367  367                  if (start != 0) {
 368  368                          /*
 369  369                           * We also need a new blkid=0 indirect block
 370  370                           * to reference any existing file data.
 371  371                           */
 372  372                          txh->txh_space_towrite += 1ULL << max_ibs;
 373  373                  }
 374  374          }
 375  375  
 376  376  out:
 377  377          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 378  378              2 * DMU_MAX_ACCESS)
 379  379                  err = EFBIG;
 380  380  
 381  381          if (err)
 382  382                  txh->txh_tx->tx_err = err;
 383  383  }
 384  384  
 385  385  static void
 386  386  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 387  387  {
 388  388          dnode_t *dn = txh->txh_dnode;
 389  389          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 390  390          uint64_t space = mdn->dn_datablksz +
 391  391              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 392  392  
 393  393          if (dn && dn->dn_dbuf->db_blkptr &&
 394  394              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 395  395              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 396  396                  txh->txh_space_tooverwrite += space;
 397  397                  txh->txh_space_tounref += space;
 398  398          } else {
 399  399                  txh->txh_space_towrite += space;
 400  400                  if (dn && dn->dn_dbuf->db_blkptr)
 401  401                          txh->txh_space_tounref += space;
 402  402          }
 403  403  }
 404  404  
 405  405  void
 406  406  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 407  407  {
 408  408          dmu_tx_hold_t *txh;
 409  409  
 410  410          ASSERT(tx->tx_txg == 0);
 411  411          ASSERT(len < DMU_MAX_ACCESS);
 412  412          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 413  413  
 414  414          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 415  415              object, THT_WRITE, off, len);
 416  416          if (txh == NULL)
 417  417                  return;
 418  418  
 419  419          dmu_tx_count_write(txh, off, len);
 420  420          dmu_tx_count_dnode(txh);
 421  421  }

↓ open down ↓

421 lines elided

↑ open up ↑

 422  422  
 423  423  static void
 424  424  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 425  425  {
 426  426          uint64_t blkid, nblks, lastblk;
 427  427          uint64_t space = 0, unref = 0, skipped = 0;
 428  428          dnode_t *dn = txh->txh_dnode;
 429  429          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 430  430          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 431  431          int epbs;
      432 +        uint64_t l0span = 0, nl1blks = 0;
 432  433  
 433  434          if (dn->dn_nlevels == 0)
 434  435                  return;
 435  436  
 436  437          /*
 437  438           * The struct_rwlock protects us against dn_nlevels
 438  439           * changing, in case (against all odds) we manage to dirty &
 439  440           * sync out the changes after we check for being dirty.
 440  441           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 441  442           */

 442  443          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 443  444          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 444  445          if (dn->dn_maxblkid == 0) {
 445  446                  if (off == 0 && len >= dn->dn_datablksz) {
 446  447                          blkid = 0;
 447  448                          nblks = 1;
 448  449                  } else {
 449  450                          rw_exit(&dn->dn_struct_rwlock);
 450  451                          return;
 451  452                  }
 452  453          } else {
 453  454                  blkid = off >> dn->dn_datablkshift;

↓ open down ↓

12 lines elided

↑ open up ↑

 454  455                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 455  456  
 456  457                  if (blkid >= dn->dn_maxblkid) {
 457  458                          rw_exit(&dn->dn_struct_rwlock);
 458  459                          return;
 459  460                  }
 460  461                  if (blkid + nblks > dn->dn_maxblkid)
 461  462                          nblks = dn->dn_maxblkid - blkid;
 462  463  
 463  464          }
      465 +        l0span = nblks;    /* save for later use to calc level > 1 overhead */
 464  466          if (dn->dn_nlevels == 1) {
 465  467                  int i;
 466  468                  for (i = 0; i < nblks; i++) {
 467  469                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 468  470                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 469  471                          bp += blkid + i;
 470  472                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 471  473                                  dprintf_bp(bp, "can free old%s", "");
 472  474                                  space += bp_get_dsize(spa, bp);
 473  475                          }
 474  476                          unref += BP_GET_ASIZE(bp);
 475  477                  }
      478 +                nl1blks = 1;
 476  479                  nblks = 0;
 477  480          }
 478  481  
 479      -        /*
 480      -         * Add in memory requirements of higher-level indirects.
 481      -         * This assumes a worst-possible scenario for dn_nlevels.
 482      -         */
 483      -        {
 484      -                uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
 485      -                int level = (dn->dn_nlevels > 1) ? 2 : 1;
 486      -
 487      -                while (level++ < DN_MAX_LEVELS) {
 488      -                        txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
 489      -                        blkcnt = 1 + (blkcnt >> epbs);
 490      -                }
 491      -                ASSERT(blkcnt <= dn->dn_nblkptr);
 492      -        }
 493      -
 494  482          lastblk = blkid + nblks - 1;
 495  483          while (nblks) {
 496  484                  dmu_buf_impl_t *dbuf;
 497  485                  uint64_t ibyte, new_blkid;
 498  486                  int epb = 1 << epbs;
 499  487                  int err, i, blkoff, tochk;
 500  488                  blkptr_t *bp;
 501  489  
 502  490                  ibyte = blkid << dn->dn_datablkshift;
 503  491                  err = dnode_next_offset(dn,

 504  492                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 505  493                  new_blkid = ibyte >> dn->dn_datablkshift;
 506  494                  if (err == ESRCH) {
 507  495                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 508  496                          break;
 509  497                  }
 510  498                  if (err) {
 511  499                          txh->txh_tx->tx_err = err;
 512  500                          break;
 513  501                  }
 514  502                  if (new_blkid > lastblk) {
 515  503                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 516  504                          break;
 517  505                  }
 518  506  
 519  507                  if (new_blkid > blkid) {
 520  508                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 521  509                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 522  510                          nblks -= new_blkid - blkid;
 523  511                          blkid = new_blkid;
 524  512                  }
 525  513                  blkoff = P2PHASE(blkid, epb);
 526  514                  tochk = MIN(epb - blkoff, nblks);
 527  515  
 528  516                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 529  517                  if (err) {
 530  518                          txh->txh_tx->tx_err = err;
 531  519                          break;
 532  520                  }
 533  521  
 534  522                  txh->txh_memory_tohold += dbuf->db.db_size;
 535  523  
 536  524                  /*
 537  525                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 538  526                   * memory_tohold is an over-estimation (especially the >L1
 539  527                   * indirect blocks), so it could fail.  Callers should have
 540  528                   * already verified that they will not be holding too much
 541  529                   * memory.
 542  530                   */
 543  531  
 544  532                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 545  533                  if (err != 0) {
 546  534                          txh->txh_tx->tx_err = err;
 547  535                          dbuf_rele(dbuf, FTAG);
 548  536                          break;
 549  537                  }
 550  538  
 551  539                  bp = dbuf->db.db_data;
 552  540                  bp += blkoff;
 553  541

↓ open down ↓

50 lines elided

↑ open up ↑

 554  542                  for (i = 0; i < tochk; i++) {
 555  543                          if (dsl_dataset_block_freeable(ds, &bp[i],
 556  544                              bp[i].blk_birth)) {
 557  545                                  dprintf_bp(&bp[i], "can free old%s", "");
 558  546                                  space += bp_get_dsize(spa, &bp[i]);
 559  547                          }
 560  548                          unref += BP_GET_ASIZE(bp);
 561  549                  }
 562  550                  dbuf_rele(dbuf, FTAG);
 563  551  
      552 +                ++nl1blks;
 564  553                  blkid += tochk;
 565  554                  nblks -= tochk;
 566  555          }
 567  556          rw_exit(&dn->dn_struct_rwlock);
 568  557  
      558 +        /*
      559 +         * Add in memory requirements of higher-level indirects.
      560 +         * This assumes a worst-possible scenario for dn_nlevels and a
      561 +         * worst-possible distribution of l1-blocks over the region to free.
      562 +         */
      563 +        {
      564 +                uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
      565 +                int level = 2;
      566 +                /*
      567 +                 * Here we don't use DN_MAX_LEVEL, but calculate it with the
      568 +                 * given datablkshift and indblkshift. This makes the
      569 +                 * difference between 19 and 8 on large files.
      570 +                 */
      571 +                int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
      572 +                    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
      573 +
      574 +                while (level++ < maxlevel) {
      575 +                        txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs))
      576 +                            << dn->dn_indblkshift;
      577 +                        blkcnt = 1 + (blkcnt >> epbs);
      578 +                }
      579 +        }
      580 +
 569  581          /* account for new level 1 indirect blocks that might show up */
 570  582          if (skipped > 0) {
 571  583                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 572  584                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 573  585                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 574  586          }
 575  587          txh->txh_space_tofree += space;
 576  588          txh->txh_space_tounref += unref;
 577  589  }
 578  590

 579  591  void
 580  592  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 581  593  {
 582  594          dmu_tx_hold_t *txh;
 583  595          dnode_t *dn;
 584  596          uint64_t start, end, i;
 585  597          int err, shift;
 586  598          zio_t *zio;
 587  599  
 588  600          ASSERT(tx->tx_txg == 0);
 589  601  
 590  602          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 591  603              object, THT_FREE, off, len);
 592  604          if (txh == NULL)
 593  605                  return;
 594  606          dn = txh->txh_dnode;
 595  607  
 596  608          /* first block */
 597  609          if (off != 0)
 598  610                  dmu_tx_count_write(txh, off, 1);
 599  611          /* last block */
 600  612          if (len != DMU_OBJECT_END)
 601  613                  dmu_tx_count_write(txh, off+len, 1);
 602  614  
 603  615          dmu_tx_count_dnode(txh);
 604  616  
 605  617          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 606  618                  return;
 607  619          if (len == DMU_OBJECT_END)
 608  620                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 609  621  
 610  622          /*
 611  623           * For i/o error checking, read the first and last level-0
 612  624           * blocks, and all the level-1 blocks.  The above count_write's
 613  625           * have already taken care of the level-0 blocks.
 614  626           */
 615  627          if (dn->dn_nlevels > 1) {
 616  628                  shift = dn->dn_datablkshift + dn->dn_indblkshift -
 617  629                      SPA_BLKPTRSHIFT;
 618  630                  start = off >> shift;
 619  631                  end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
 620  632  
 621  633                  zio = zio_root(tx->tx_pool->dp_spa,
 622  634                      NULL, NULL, ZIO_FLAG_CANFAIL);
 623  635                  for (i = start; i <= end; i++) {
 624  636                          uint64_t ibyte = i << shift;
 625  637                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 626  638                          i = ibyte >> shift;
 627  639                          if (err == ESRCH)
 628  640                                  break;
 629  641                          if (err) {
 630  642                                  tx->tx_err = err;
 631  643                                  return;
 632  644                          }
 633  645  
 634  646                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 635  647                          if (err) {
 636  648                                  tx->tx_err = err;
 637  649                                  return;
 638  650                          }
 639  651                  }
 640  652                  err = zio_wait(zio);
 641  653                  if (err) {
 642  654                          tx->tx_err = err;
 643  655                          return;
 644  656                  }
 645  657          }
 646  658  
 647  659          dmu_tx_count_free(txh, off, len);
 648  660  }
 649  661  
 650  662  void
 651  663  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 652  664  {
 653  665          dmu_tx_hold_t *txh;
 654  666          dnode_t *dn;
 655  667          uint64_t nblocks;
 656  668          int epbs, err;
 657  669  
 658  670          ASSERT(tx->tx_txg == 0);
 659  671  
 660  672          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 661  673              object, THT_ZAP, add, (uintptr_t)name);
 662  674          if (txh == NULL)
 663  675                  return;
 664  676          dn = txh->txh_dnode;
 665  677  
 666  678          dmu_tx_count_dnode(txh);
 667  679  
 668  680          if (dn == NULL) {
 669  681                  /*
 670  682                   * We will be able to fit a new object's entries into one leaf
 671  683                   * block.  So there will be at most 2 blocks total,
 672  684                   * including the header block.
 673  685                   */
 674  686                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 675  687                  return;
 676  688          }
 677  689  
 678  690          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 679  691  
 680  692          if (dn->dn_maxblkid == 0 && !add) {
 681  693                  blkptr_t *bp;
 682  694  
 683  695                  /*
 684  696                   * If there is only one block  (i.e. this is a micro-zap)
 685  697                   * and we are not adding anything, the accounting is simple.
 686  698                   */
 687  699                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 688  700                  if (err) {
 689  701                          tx->tx_err = err;
 690  702                          return;
 691  703                  }
 692  704  
 693  705                  /*
 694  706                   * Use max block size here, since we don't know how much
 695  707                   * the size will change between now and the dbuf dirty call.
 696  708                   */
 697  709                  bp = &dn->dn_phys->dn_blkptr[0];
 698  710                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 699  711                      bp, bp->blk_birth))
 700  712                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 701  713                  else
 702  714                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 703  715                  if (!BP_IS_HOLE(bp))
 704  716                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 705  717                  return;
 706  718          }
 707  719  
 708  720          if (dn->dn_maxblkid > 0 && name) {
 709  721                  /*
 710  722                   * access the name in this fat-zap so that we'll check
 711  723                   * for i/o errors to the leaf blocks, etc.
 712  724                   */
 713  725                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 714  726                      8, 0, NULL);
 715  727                  if (err == EIO) {
 716  728                          tx->tx_err = err;
 717  729                          return;
 718  730                  }
 719  731          }
 720  732  
 721  733          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 722  734              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 723  735  
 724  736          /*
 725  737           * If the modified blocks are scattered to the four winds,
 726  738           * we'll have to modify an indirect twig for each.
 727  739           */
 728  740          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 729  741          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 730  742                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 731  743                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 732  744                  else
 733  745                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 734  746  }
 735  747  
 736  748  void
 737  749  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 738  750  {
 739  751          dmu_tx_hold_t *txh;
 740  752  
 741  753          ASSERT(tx->tx_txg == 0);
 742  754  
 743  755          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 744  756              object, THT_BONUS, 0, 0);
 745  757          if (txh)
 746  758                  dmu_tx_count_dnode(txh);
 747  759  }
 748  760  
 749  761  void
 750  762  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 751  763  {
 752  764          dmu_tx_hold_t *txh;
 753  765          ASSERT(tx->tx_txg == 0);
 754  766  
 755  767          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 756  768              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 757  769  
 758  770          txh->txh_space_towrite += space;
 759  771  }
 760  772  
 761  773  int
 762  774  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 763  775  {
 764  776          dmu_tx_hold_t *txh;
 765  777          int holds = 0;
 766  778  
 767  779          /*
 768  780           * By asserting that the tx is assigned, we're counting the
 769  781           * number of dn_tx_holds, which is the same as the number of
 770  782           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 771  783           * dn_tx_holds could be 0.
 772  784           */
 773  785          ASSERT(tx->tx_txg != 0);
 774  786  
 775  787          /* if (tx->tx_anyobj == TRUE) */
 776  788                  /* return (0); */
 777  789  
 778  790          for (txh = list_head(&tx->tx_holds); txh;
 779  791              txh = list_next(&tx->tx_holds, txh)) {
 780  792                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 781  793                          holds++;
 782  794          }
 783  795  
 784  796          return (holds);
 785  797  }
 786  798  
 787  799  #ifdef ZFS_DEBUG
 788  800  void
 789  801  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 790  802  {
 791  803          dmu_tx_hold_t *txh;
 792  804          int match_object = FALSE, match_offset = FALSE;
 793  805          dnode_t *dn;
 794  806  
 795  807          DB_DNODE_ENTER(db);
 796  808          dn = DB_DNODE(db);
 797  809          ASSERT(tx->tx_txg != 0);
 798  810          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 799  811          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 800  812  
 801  813          if (tx->tx_anyobj) {
 802  814                  DB_DNODE_EXIT(db);
 803  815                  return;
 804  816          }
 805  817  
 806  818          /* XXX No checking on the meta dnode for now */
 807  819          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 808  820                  DB_DNODE_EXIT(db);
 809  821                  return;
 810  822          }
 811  823  
 812  824          for (txh = list_head(&tx->tx_holds); txh;
 813  825              txh = list_next(&tx->tx_holds, txh)) {
 814  826                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 815  827                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 816  828                          match_object = TRUE;
 817  829                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 818  830                          int datablkshift = dn->dn_datablkshift ?
 819  831                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 820  832                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 821  833                          int shift = datablkshift + epbs * db->db_level;
 822  834                          uint64_t beginblk = shift >= 64 ? 0 :
 823  835                              (txh->txh_arg1 >> shift);
 824  836                          uint64_t endblk = shift >= 64 ? 0 :
 825  837                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 826  838                          uint64_t blkid = db->db_blkid;
 827  839  
 828  840                          /* XXX txh_arg2 better not be zero... */
 829  841  
 830  842                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 831  843                              txh->txh_type, beginblk, endblk);
 832  844  
 833  845                          switch (txh->txh_type) {
 834  846                          case THT_WRITE:
 835  847                                  if (blkid >= beginblk && blkid <= endblk)
 836  848                                          match_offset = TRUE;
 837  849                                  /*
 838  850                                   * We will let this hold work for the bonus
 839  851                                   * or spill buffer so that we don't need to
 840  852                                   * hold it when creating a new object.
 841  853                                   */
 842  854                                  if (blkid == DMU_BONUS_BLKID ||
 843  855                                      blkid == DMU_SPILL_BLKID)
 844  856                                          match_offset = TRUE;
 845  857                                  /*
 846  858                                   * They might have to increase nlevels,
 847  859                                   * thus dirtying the new TLIBs.  Or the
 848  860                                   * might have to change the block size,
 849  861                                   * thus dirying the new lvl=0 blk=0.
 850  862                                   */
 851  863                                  if (blkid == 0)
 852  864                                          match_offset = TRUE;
 853  865                                  break;
 854  866                          case THT_FREE:
 855  867                                  /*
 856  868                                   * We will dirty all the level 1 blocks in
 857  869                                   * the free range and perhaps the first and
 858  870                                   * last level 0 block.
 859  871                                   */
 860  872                                  if (blkid >= beginblk && (blkid <= endblk ||
 861  873                                      txh->txh_arg2 == DMU_OBJECT_END))
 862  874                                          match_offset = TRUE;
 863  875                                  break;
 864  876                          case THT_SPILL:
 865  877                                  if (blkid == DMU_SPILL_BLKID)
 866  878                                          match_offset = TRUE;
 867  879                                  break;
 868  880                          case THT_BONUS:
 869  881                                  if (blkid == DMU_BONUS_BLKID)
 870  882                                          match_offset = TRUE;
 871  883                                  break;
 872  884                          case THT_ZAP:
 873  885                                  match_offset = TRUE;
 874  886                                  break;
 875  887                          case THT_NEWOBJECT:
 876  888                                  match_object = TRUE;
 877  889                                  break;
 878  890                          default:
 879  891                                  ASSERT(!"bad txh_type");
 880  892                          }
 881  893                  }
 882  894                  if (match_object && match_offset) {
 883  895                          DB_DNODE_EXIT(db);
 884  896                          return;
 885  897                  }
 886  898          }
 887  899          DB_DNODE_EXIT(db);
 888  900          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 889  901              (u_longlong_t)db->db.db_object, db->db_level,
 890  902              (u_longlong_t)db->db_blkid);
 891  903  }
 892  904  #endif
 893  905  
 894  906  static int
 895  907  dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 896  908  {
 897  909          dmu_tx_hold_t *txh;
 898  910          spa_t *spa = tx->tx_pool->dp_spa;
 899  911          uint64_t memory, asize, fsize, usize;
 900  912          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 901  913  
 902  914          ASSERT3U(tx->tx_txg, ==, 0);
 903  915  
 904  916          if (tx->tx_err)
 905  917                  return (tx->tx_err);
 906  918  
 907  919          if (spa_suspended(spa)) {
 908  920                  /*
 909  921                   * If the user has indicated a blocking failure mode
 910  922                   * then return ERESTART which will block in dmu_tx_wait().
 911  923                   * Otherwise, return EIO so that an error can get
 912  924                   * propagated back to the VOP calls.
 913  925                   *
 914  926                   * Note that we always honor the txg_how flag regardless
 915  927                   * of the failuremode setting.
 916  928                   */
 917  929                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 918  930                      txg_how != TXG_WAIT)
 919  931                          return (EIO);
 920  932  
 921  933                  return (ERESTART);
 922  934          }
 923  935  
 924  936          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 925  937          tx->tx_needassign_txh = NULL;
 926  938  
 927  939          /*
 928  940           * NB: No error returns are allowed after txg_hold_open, but
 929  941           * before processing the dnode holds, due to the
 930  942           * dmu_tx_unassign() logic.
 931  943           */
 932  944  
 933  945          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 934  946          for (txh = list_head(&tx->tx_holds); txh;
 935  947              txh = list_next(&tx->tx_holds, txh)) {
 936  948                  dnode_t *dn = txh->txh_dnode;
 937  949                  if (dn != NULL) {
 938  950                          mutex_enter(&dn->dn_mtx);
 939  951                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 940  952                                  mutex_exit(&dn->dn_mtx);
 941  953                                  tx->tx_needassign_txh = txh;
 942  954                                  return (ERESTART);
 943  955                          }
 944  956                          if (dn->dn_assigned_txg == 0)
 945  957                                  dn->dn_assigned_txg = tx->tx_txg;
 946  958                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 947  959                          (void) refcount_add(&dn->dn_tx_holds, tx);
 948  960                          mutex_exit(&dn->dn_mtx);
 949  961                  }
 950  962                  towrite += txh->txh_space_towrite;
 951  963                  tofree += txh->txh_space_tofree;
 952  964                  tooverwrite += txh->txh_space_tooverwrite;
 953  965                  tounref += txh->txh_space_tounref;
 954  966                  tohold += txh->txh_memory_tohold;
 955  967                  fudge += txh->txh_fudge;
 956  968          }
 957  969  
 958  970          /*
 959  971           * NB: This check must be after we've held the dnodes, so that
 960  972           * the dmu_tx_unassign() logic will work properly
 961  973           */
 962  974          if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
 963  975                  return (ERESTART);
 964  976  
 965  977          /*
 966  978           * If a snapshot has been taken since we made our estimates,
 967  979           * assume that we won't be able to free or overwrite anything.
 968  980           */
 969  981          if (tx->tx_objset &&
 970  982              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 971  983              tx->tx_lastsnap_txg) {
 972  984                  towrite += tooverwrite;
 973  985                  tooverwrite = tofree = 0;
 974  986          }
 975  987  
 976  988          /* needed allocation: worst-case estimate of write space */
 977  989          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 978  990          /* freed space estimate: worst-case overwrite + free estimate */
 979  991          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 980  992          /* convert unrefd space to worst-case estimate */
 981  993          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 982  994          /* calculate memory footprint estimate */
 983  995          memory = towrite + tooverwrite + tohold;
 984  996  
 985  997  #ifdef ZFS_DEBUG
 986  998          /*
 987  999           * Add in 'tohold' to account for our dirty holds on this memory
 988 1000           * XXX - the "fudge" factor is to account for skipped blocks that
 989 1001           * we missed because dnode_next_offset() misses in-core-only blocks.
 990 1002           */
 991 1003          tx->tx_space_towrite = asize +
 992 1004              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
 993 1005          tx->tx_space_tofree = tofree;
 994 1006          tx->tx_space_tooverwrite = tooverwrite;
 995 1007          tx->tx_space_tounref = tounref;
 996 1008  #endif
 997 1009  
 998 1010          if (tx->tx_dir && asize != 0) {
 999 1011                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1000 1012                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1001 1013                  if (err)
1002 1014                          return (err);
1003 1015          }
1004 1016  
1005 1017          return (0);
1006 1018  }
1007 1019  
1008 1020  static void
1009 1021  dmu_tx_unassign(dmu_tx_t *tx)
1010 1022  {
1011 1023          dmu_tx_hold_t *txh;
1012 1024  
1013 1025          if (tx->tx_txg == 0)
1014 1026                  return;
1015 1027  
1016 1028          txg_rele_to_quiesce(&tx->tx_txgh);
1017 1029  
1018 1030          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1019 1031              txh = list_next(&tx->tx_holds, txh)) {
1020 1032                  dnode_t *dn = txh->txh_dnode;
1021 1033  
1022 1034                  if (dn == NULL)
1023 1035                          continue;
1024 1036                  mutex_enter(&dn->dn_mtx);
1025 1037                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1026 1038  
1027 1039                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1028 1040                          dn->dn_assigned_txg = 0;
1029 1041                          cv_broadcast(&dn->dn_notxholds);
1030 1042                  }
1031 1043                  mutex_exit(&dn->dn_mtx);
1032 1044          }
1033 1045  
1034 1046          txg_rele_to_sync(&tx->tx_txgh);
1035 1047  
1036 1048          tx->tx_lasttried_txg = tx->tx_txg;
1037 1049          tx->tx_txg = 0;
1038 1050  }
1039 1051  
1040 1052  /*
1041 1053   * Assign tx to a transaction group.  txg_how can be one of:
1042 1054   *
1043 1055   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1044 1056   *      a new one.  This should be used when you're not holding locks.
1045 1057   *      If will only fail if we're truly out of space (or over quota).
1046 1058   *
1047 1059   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1048 1060   *      blocking, returns immediately with ERESTART.  This should be used
1049 1061   *      whenever you're holding locks.  On an ERESTART error, the caller
1050 1062   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1051 1063   *
1052 1064   * (3)  A specific txg.  Use this if you need to ensure that multiple
1053 1065   *      transactions all sync in the same txg.  Like TXG_NOWAIT, it
1054 1066   *      returns ERESTART if it can't assign you into the requested txg.
1055 1067   */
1056 1068  int
1057 1069  dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
1058 1070  {
1059 1071          int err;
1060 1072  
1061 1073          ASSERT(tx->tx_txg == 0);
1062 1074          ASSERT(txg_how != 0);
1063 1075          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1064 1076  
1065 1077          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1066 1078                  dmu_tx_unassign(tx);
1067 1079  
1068 1080                  if (err != ERESTART || txg_how != TXG_WAIT)
1069 1081                          return (err);
1070 1082  
1071 1083                  dmu_tx_wait(tx);
1072 1084          }
1073 1085  
1074 1086          txg_rele_to_quiesce(&tx->tx_txgh);
1075 1087  
1076 1088          return (0);
1077 1089  }
1078 1090  
1079 1091  void
1080 1092  dmu_tx_wait(dmu_tx_t *tx)
1081 1093  {
1082 1094          spa_t *spa = tx->tx_pool->dp_spa;
1083 1095  
1084 1096          ASSERT(tx->tx_txg == 0);
1085 1097  
1086 1098          /*
1087 1099           * It's possible that the pool has become active after this thread
1088 1100           * has tried to obtain a tx. If that's the case then his
1089 1101           * tx_lasttried_txg would not have been assigned.
1090 1102           */
1091 1103          if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1092 1104                  txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
1093 1105          } else if (tx->tx_needassign_txh) {
1094 1106                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1095 1107  
1096 1108                  mutex_enter(&dn->dn_mtx);
1097 1109                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1098 1110                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1099 1111                  mutex_exit(&dn->dn_mtx);
1100 1112                  tx->tx_needassign_txh = NULL;
1101 1113          } else {
1102 1114                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1103 1115          }
1104 1116  }
1105 1117  
1106 1118  void
1107 1119  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1108 1120  {
1109 1121  #ifdef ZFS_DEBUG
1110 1122          if (tx->tx_dir == NULL || delta == 0)
1111 1123                  return;
1112 1124  
1113 1125          if (delta > 0) {
1114 1126                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1115 1127                      tx->tx_space_towrite);
1116 1128                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1117 1129          } else {
1118 1130                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1119 1131          }
1120 1132  #endif
1121 1133  }
1122 1134  
1123 1135  void
1124 1136  dmu_tx_commit(dmu_tx_t *tx)
1125 1137  {
1126 1138          dmu_tx_hold_t *txh;
1127 1139  
1128 1140          ASSERT(tx->tx_txg != 0);
1129 1141  
1130 1142          while (txh = list_head(&tx->tx_holds)) {
1131 1143                  dnode_t *dn = txh->txh_dnode;
1132 1144  
1133 1145                  list_remove(&tx->tx_holds, txh);
1134 1146                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1135 1147                  if (dn == NULL)
1136 1148                          continue;
1137 1149                  mutex_enter(&dn->dn_mtx);
1138 1150                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1139 1151  
1140 1152                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1141 1153                          dn->dn_assigned_txg = 0;
1142 1154                          cv_broadcast(&dn->dn_notxholds);
1143 1155                  }
1144 1156                  mutex_exit(&dn->dn_mtx);
1145 1157                  dnode_rele(dn, tx);
1146 1158          }
1147 1159  
1148 1160          if (tx->tx_tempreserve_cookie)
1149 1161                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1150 1162  
1151 1163          if (!list_is_empty(&tx->tx_callbacks))
1152 1164                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1153 1165  
1154 1166          if (tx->tx_anyobj == FALSE)
1155 1167                  txg_rele_to_sync(&tx->tx_txgh);
1156 1168  
1157 1169          list_destroy(&tx->tx_callbacks);
1158 1170          list_destroy(&tx->tx_holds);
1159 1171  #ifdef ZFS_DEBUG
1160 1172          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1161 1173              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1162 1174              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1163 1175          refcount_destroy_many(&tx->tx_space_written,
1164 1176              refcount_count(&tx->tx_space_written));
1165 1177          refcount_destroy_many(&tx->tx_space_freed,
1166 1178              refcount_count(&tx->tx_space_freed));
1167 1179  #endif
1168 1180          kmem_free(tx, sizeof (dmu_tx_t));
1169 1181  }
1170 1182  
1171 1183  void
1172 1184  dmu_tx_abort(dmu_tx_t *tx)
1173 1185  {
1174 1186          dmu_tx_hold_t *txh;
1175 1187  
1176 1188          ASSERT(tx->tx_txg == 0);
1177 1189  
1178 1190          while (txh = list_head(&tx->tx_holds)) {
1179 1191                  dnode_t *dn = txh->txh_dnode;
1180 1192  
1181 1193                  list_remove(&tx->tx_holds, txh);
1182 1194                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1183 1195                  if (dn != NULL)
1184 1196                          dnode_rele(dn, tx);
1185 1197          }
1186 1198  
1187 1199          /*
1188 1200           * Call any registered callbacks with an error code.
1189 1201           */
1190 1202          if (!list_is_empty(&tx->tx_callbacks))
1191 1203                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1192 1204  
1193 1205          list_destroy(&tx->tx_callbacks);
1194 1206          list_destroy(&tx->tx_holds);
1195 1207  #ifdef ZFS_DEBUG
1196 1208          refcount_destroy_many(&tx->tx_space_written,
1197 1209              refcount_count(&tx->tx_space_written));
1198 1210          refcount_destroy_many(&tx->tx_space_freed,
1199 1211              refcount_count(&tx->tx_space_freed));
1200 1212  #endif
1201 1213          kmem_free(tx, sizeof (dmu_tx_t));
1202 1214  }
1203 1215  
1204 1216  uint64_t
1205 1217  dmu_tx_get_txg(dmu_tx_t *tx)
1206 1218  {
1207 1219          ASSERT(tx->tx_txg != 0);
1208 1220          return (tx->tx_txg);
1209 1221  }
1210 1222  
1211 1223  void
1212 1224  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1213 1225  {
1214 1226          dmu_tx_callback_t *dcb;
1215 1227  
1216 1228          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1217 1229  
1218 1230          dcb->dcb_func = func;
1219 1231          dcb->dcb_data = data;
1220 1232  
1221 1233          list_insert_tail(&tx->tx_callbacks, dcb);
1222 1234  }
1223 1235  
1224 1236  /*
1225 1237   * Call all the commit callbacks on a list, with a given error code.
1226 1238   */
1227 1239  void
1228 1240  dmu_tx_do_callbacks(list_t *cb_list, int error)
1229 1241  {
1230 1242          dmu_tx_callback_t *dcb;
1231 1243  
1232 1244          while (dcb = list_head(cb_list)) {
1233 1245                  list_remove(cb_list, dcb);
1234 1246                  dcb->dcb_func(dcb->dcb_data, error);
1235 1247                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1236 1248          }
1237 1249  }
1238 1250  
1239 1251  /*
1240 1252   * Interface to hold a bunch of attributes.
1241 1253   * used for creating new files.
1242 1254   * attrsize is the total size of all attributes
1243 1255   * to be added during object creation
1244 1256   *
1245 1257   * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1246 1258   */
1247 1259  
1248 1260  /*
1249 1261   * hold necessary attribute name for attribute registration.
1250 1262   * should be a very rare case where this is needed.  If it does
1251 1263   * happen it would only happen on the first write to the file system.
1252 1264   */
1253 1265  static void
1254 1266  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1255 1267  {
1256 1268          int i;
1257 1269  
1258 1270          if (!sa->sa_need_attr_registration)
1259 1271                  return;
1260 1272  
1261 1273          for (i = 0; i != sa->sa_num_attrs; i++) {
1262 1274                  if (!sa->sa_attr_table[i].sa_registered) {
1263 1275                          if (sa->sa_reg_attr_obj)
1264 1276                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1265 1277                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1266 1278                          else
1267 1279                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1268 1280                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1269 1281                  }
1270 1282          }
1271 1283  }
1272 1284  
1273 1285  
1274 1286  void
1275 1287  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1276 1288  {
1277 1289          dnode_t *dn;
1278 1290          dmu_tx_hold_t *txh;
1279 1291  
1280 1292          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1281 1293              THT_SPILL, 0, 0);
1282 1294  
1283 1295          dn = txh->txh_dnode;
1284 1296  
1285 1297          if (dn == NULL)
1286 1298                  return;
1287 1299  
1288 1300          /* If blkptr doesn't exist then add space to towrite */
1289 1301          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1290 1302                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1291 1303          } else {
1292 1304                  blkptr_t *bp;
1293 1305  
1294 1306                  bp = &dn->dn_phys->dn_spill;
1295 1307                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1296 1308                      bp, bp->blk_birth))
1297 1309                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1298 1310                  else
1299 1311                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1300 1312                  if (!BP_IS_HOLE(bp))
1301 1313                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1302 1314          }
1303 1315  }
1304 1316  
1305 1317  void
1306 1318  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1307 1319  {
1308 1320          sa_os_t *sa = tx->tx_objset->os_sa;
1309 1321  
1310 1322          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1311 1323  
1312 1324          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1313 1325                  return;
1314 1326  
1315 1327          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1316 1328                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1317 1329          else {
1318 1330                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1319 1331                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1320 1332                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1321 1333                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1322 1334          }
1323 1335  
1324 1336          dmu_tx_sa_registration_hold(sa, tx);
1325 1337  
1326 1338          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1327 1339                  return;
1328 1340  
1329 1341          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1330 1342              THT_SPILL, 0, 0);
1331 1343  }
1332 1344  
1333 1345  /*
1334 1346   * Hold SA attribute
1335 1347   *
1336 1348   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1337 1349   *
1338 1350   * variable_size is the total size of all variable sized attributes
1339 1351   * passed to this function.  It is not the total size of all
1340 1352   * variable size attributes that *may* exist on this object.
1341 1353   */
1342 1354  void
1343 1355  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1344 1356  {
1345 1357          uint64_t object;
1346 1358          sa_os_t *sa = tx->tx_objset->os_sa;
1347 1359  
1348 1360          ASSERT(hdl != NULL);
1349 1361  
1350 1362          object = sa_handle_object(hdl);
1351 1363  
1352 1364          dmu_tx_hold_bonus(tx, object);
1353 1365  
1354 1366          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1355 1367                  return;
1356 1368  
1357 1369          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1358 1370              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1359 1371                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1360 1372                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1361 1373                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1362 1374                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1363 1375          }
1364 1376  
1365 1377          dmu_tx_sa_registration_hold(sa, tx);
1366 1378  
1367 1379          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1368 1380                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1369 1381  
1370 1382          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1371 1383                  ASSERT(tx->tx_txg == 0);
1372 1384                  dmu_tx_hold_spill(tx, object);
1373 1385          } else {
1374 1386                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1375 1387                  dnode_t *dn;
1376 1388  
1377 1389                  DB_DNODE_ENTER(db);
1378 1390                  dn = DB_DNODE(db);
1379 1391                  if (dn->dn_have_spill) {
1380 1392                          ASSERT(tx->tx_txg == 0);
1381 1393                          dmu_tx_hold_spill(tx, object);
1382 1394                  }
1383 1395                  DB_DNODE_EXIT(db);
1384 1396          }
1385 1397  }

↓ open down ↓

807 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX