illumos-gate Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2012 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57  #ifdef ZFS_DEBUG
  58   58          refcount_create(&tx->tx_space_written);
  59   59          refcount_create(&tx->tx_space_freed);
  60   60  #endif
  61   61          return (tx);
  62   62  }
  63   63  
  64   64  dmu_tx_t *
  65   65  dmu_tx_create(objset_t *os)
  66   66  {
  67   67          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  68   68          tx->tx_objset = os;
  69   69          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  70   70          return (tx);
  71   71  }
  72   72  
  73   73  dmu_tx_t *
  74   74  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  75   75  {
  76   76          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  77   77  
  78   78          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  79   79          tx->tx_pool = dp;
  80   80          tx->tx_txg = txg;
  81   81          tx->tx_anyobj = TRUE;
  82   82  
  83   83          return (tx);
  84   84  }
  85   85  
  86   86  int
  87   87  dmu_tx_is_syncing(dmu_tx_t *tx)
  88   88  {
  89   89          return (tx->tx_anyobj);
  90   90  }
  91   91  
  92   92  int
  93   93  dmu_tx_private_ok(dmu_tx_t *tx)
  94   94  {
  95   95          return (tx->tx_anyobj);
  96   96  }
  97   97  
  98   98  static dmu_tx_hold_t *
  99   99  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 100  100      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 101  101  {
 102  102          dmu_tx_hold_t *txh;
 103  103          dnode_t *dn = NULL;
 104  104          int err;
 105  105  
 106  106          if (object != DMU_NEW_OBJECT) {
 107  107                  err = dnode_hold(os, object, tx, &dn);
 108  108                  if (err) {
 109  109                          tx->tx_err = err;
 110  110                          return (NULL);
 111  111                  }
 112  112  
 113  113                  if (err == 0 && tx->tx_txg != 0) {
 114  114                          mutex_enter(&dn->dn_mtx);
 115  115                          /*
 116  116                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 117  117                           * problem, but there's no way for it to happen (for
 118  118                           * now, at least).
 119  119                           */
 120  120                          ASSERT(dn->dn_assigned_txg == 0);
 121  121                          dn->dn_assigned_txg = tx->tx_txg;
 122  122                          (void) refcount_add(&dn->dn_tx_holds, tx);
 123  123                          mutex_exit(&dn->dn_mtx);
 124  124                  }
 125  125          }
 126  126  
 127  127          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 128  128          txh->txh_tx = tx;
 129  129          txh->txh_dnode = dn;
 130  130  #ifdef ZFS_DEBUG
 131  131          txh->txh_type = type;
 132  132          txh->txh_arg1 = arg1;
 133  133          txh->txh_arg2 = arg2;
 134  134  #endif
 135  135          list_insert_tail(&tx->tx_holds, txh);
 136  136  
 137  137          return (txh);
 138  138  }
 139  139  
 140  140  void
 141  141  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 142  142  {
 143  143          /*
 144  144           * If we're syncing, they can manipulate any object anyhow, and
 145  145           * the hold on the dnode_t can cause problems.
 146  146           */
 147  147          if (!dmu_tx_is_syncing(tx)) {
 148  148                  (void) dmu_tx_hold_object_impl(tx, os,
 149  149                      object, THT_NEWOBJECT, 0, 0);
 150  150          }
 151  151  }
 152  152  
 153  153  static int
 154  154  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 155  155  {
 156  156          int err;
 157  157          dmu_buf_impl_t *db;
 158  158  
 159  159          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 160  160          db = dbuf_hold_level(dn, level, blkid, FTAG);
 161  161          rw_exit(&dn->dn_struct_rwlock);
 162  162          if (db == NULL)
 163  163                  return (EIO);
 164  164          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 165  165          dbuf_rele(db, FTAG);
 166  166          return (err);
 167  167  }
 168  168  
 169  169  static void
 170  170  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 171  171      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 172  172  {
 173  173          objset_t *os = dn->dn_objset;
 174  174          dsl_dataset_t *ds = os->os_dsl_dataset;
 175  175          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 176  176          dmu_buf_impl_t *parent = NULL;
 177  177          blkptr_t *bp = NULL;
 178  178          uint64_t space;
 179  179  
 180  180          if (level >= dn->dn_nlevels || history[level] == blkid)
 181  181                  return;
 182  182  
 183  183          history[level] = blkid;
 184  184  
 185  185          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 186  186  
 187  187          if (db == NULL || db == dn->dn_dbuf) {
 188  188                  ASSERT(level != 0);
 189  189                  db = NULL;
 190  190          } else {
 191  191                  ASSERT(DB_DNODE(db) == dn);
 192  192                  ASSERT(db->db_level == level);
 193  193                  ASSERT(db->db.db_size == space);
 194  194                  ASSERT(db->db_blkid == blkid);
 195  195                  bp = db->db_blkptr;
 196  196                  parent = db->db_parent;
 197  197          }
 198  198  
 199  199          freeable = (bp && (freeable ||
 200  200              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 201  201  
 202  202          if (freeable)
 203  203                  txh->txh_space_tooverwrite += space;
 204  204          else
 205  205                  txh->txh_space_towrite += space;
 206  206          if (bp)
 207  207                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 208  208  
 209  209          dmu_tx_count_twig(txh, dn, parent, level + 1,
 210  210              blkid >> epbs, freeable, history);
 211  211  }
 212  212  
 213  213  /* ARGSUSED */
 214  214  static void
 215  215  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 216  216  {
 217  217          dnode_t *dn = txh->txh_dnode;
 218  218          uint64_t start, end, i;
 219  219          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 220  220          int err = 0;
 221  221  
 222  222          if (len == 0)
 223  223                  return;
 224  224  
 225  225          min_bs = SPA_MINBLOCKSHIFT;
 226  226          max_bs = SPA_MAXBLOCKSHIFT;
 227  227          min_ibs = DN_MIN_INDBLKSHIFT;
 228  228          max_ibs = DN_MAX_INDBLKSHIFT;
 229  229  
 230  230          if (dn) {
 231  231                  uint64_t history[DN_MAX_LEVELS];
 232  232                  int nlvls = dn->dn_nlevels;
 233  233                  int delta;
 234  234  
 235  235                  /*
 236  236                   * For i/o error checking, read the first and last level-0
 237  237                   * blocks (if they are not aligned), and all the level-1 blocks.
 238  238                   */
 239  239                  if (dn->dn_maxblkid == 0) {
 240  240                          delta = dn->dn_datablksz;
 241  241                          start = (off < dn->dn_datablksz) ? 0 : 1;
 242  242                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 243  243                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 244  244                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 245  245                                  if (err)
 246  246                                          goto out;
 247  247                                  delta -= off;
 248  248                          }
 249  249                  } else {
 250  250                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 251  251                              NULL, NULL, ZIO_FLAG_CANFAIL);
 252  252  
 253  253                          /* first level-0 block */
 254  254                          start = off >> dn->dn_datablkshift;
 255  255                          if (P2PHASE(off, dn->dn_datablksz) ||
 256  256                              len < dn->dn_datablksz) {
 257  257                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 258  258                                  if (err)
 259  259                                          goto out;
 260  260                          }
 261  261  
 262  262                          /* last level-0 block */
 263  263                          end = (off+len-1) >> dn->dn_datablkshift;
 264  264                          if (end != start && end <= dn->dn_maxblkid &&
 265  265                              P2PHASE(off+len, dn->dn_datablksz)) {
 266  266                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 267  267                                  if (err)
 268  268                                          goto out;
 269  269                          }
 270  270  
 271  271                          /* level-1 blocks */
 272  272                          if (nlvls > 1) {
 273  273                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 274  274                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 275  275                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 276  276                                          if (err)
 277  277                                                  goto out;
 278  278                                  }
 279  279                          }
 280  280  
 281  281                          err = zio_wait(zio);
 282  282                          if (err)
 283  283                                  goto out;
 284  284                          delta = P2NPHASE(off, dn->dn_datablksz);
 285  285                  }
 286  286  
 287  287                  if (dn->dn_maxblkid > 0) {
 288  288                          /*
 289  289                           * The blocksize can't change,
 290  290                           * so we can make a more precise estimate.
 291  291                           */
 292  292                          ASSERT(dn->dn_datablkshift != 0);
 293  293                          min_bs = max_bs = dn->dn_datablkshift;
 294  294                          min_ibs = max_ibs = dn->dn_indblkshift;
 295  295                  } else if (dn->dn_indblkshift > max_ibs) {
 296  296                          /*
 297  297                           * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
 298  298                           * the code will still work correctly on older pools.
 299  299                           */
 300  300                          min_ibs = max_ibs = dn->dn_indblkshift;
 301  301                  }
 302  302  
 303  303                  /*
 304  304                   * If this write is not off the end of the file
 305  305                   * we need to account for overwrites/unref.
 306  306                   */
 307  307                  if (start <= dn->dn_maxblkid) {
 308  308                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 309  309                                  history[l] = -1ULL;
 310  310                  }
 311  311                  while (start <= dn->dn_maxblkid) {
 312  312                          dmu_buf_impl_t *db;
 313  313  
 314  314                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 315  315                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 316  316                          rw_exit(&dn->dn_struct_rwlock);
 317  317  
 318  318                          if (err) {
 319  319                                  txh->txh_tx->tx_err = err;
 320  320                                  return;
 321  321                          }
 322  322  
 323  323                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 324  324                              history);
 325  325                          dbuf_rele(db, FTAG);
 326  326                          if (++start > end) {
 327  327                                  /*
 328  328                                   * Account for new indirects appearing
 329  329                                   * before this IO gets assigned into a txg.
 330  330                                   */
 331  331                                  bits = 64 - min_bs;
 332  332                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 333  333                                  for (bits -= epbs * (nlvls - 1);
 334  334                                      bits >= 0; bits -= epbs)
 335  335                                          txh->txh_fudge += 1ULL << max_ibs;
 336  336                                  goto out;
 337  337                          }
 338  338                          off += delta;
 339  339                          if (len >= delta)
 340  340                                  len -= delta;
 341  341                          delta = dn->dn_datablksz;
 342  342                  }
 343  343          }
 344  344  
 345  345          /*
 346  346           * 'end' is the last thing we will access, not one past.
 347  347           * This way we won't overflow when accessing the last byte.
 348  348           */
 349  349          start = P2ALIGN(off, 1ULL << max_bs);
 350  350          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 351  351          txh->txh_space_towrite += end - start + 1;
 352  352  
 353  353          start >>= min_bs;
 354  354          end >>= min_bs;
 355  355  
 356  356          epbs = min_ibs - SPA_BLKPTRSHIFT;
 357  357  
 358  358          /*
 359  359           * The object contains at most 2^(64 - min_bs) blocks,
 360  360           * and each indirect level maps 2^epbs.
 361  361           */
 362  362          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 363  363                  start >>= epbs;
 364  364                  end >>= epbs;
 365  365                  ASSERT3U(end, >=, start);
 366  366                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 367  367                  if (start != 0) {
 368  368                          /*
 369  369                           * We also need a new blkid=0 indirect block
 370  370                           * to reference any existing file data.
 371  371                           */
 372  372                          txh->txh_space_towrite += 1ULL << max_ibs;
 373  373                  }
 374  374          }
 375  375  
 376  376  out:
 377  377          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 378  378              2 * DMU_MAX_ACCESS)
 379  379                  err = EFBIG;
 380  380  
 381  381          if (err)
 382  382                  txh->txh_tx->tx_err = err;
 383  383  }
 384  384  
 385  385  static void
 386  386  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 387  387  {
 388  388          dnode_t *dn = txh->txh_dnode;
 389  389          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 390  390          uint64_t space = mdn->dn_datablksz +
 391  391              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 392  392  
 393  393          if (dn && dn->dn_dbuf->db_blkptr &&
 394  394              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 395  395              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 396  396                  txh->txh_space_tooverwrite += space;
 397  397                  txh->txh_space_tounref += space;
 398  398          } else {
 399  399                  txh->txh_space_towrite += space;
 400  400                  if (dn && dn->dn_dbuf->db_blkptr)
 401  401                          txh->txh_space_tounref += space;
 402  402          }
 403  403  }
 404  404  
 405  405  void
 406  406  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 407  407  {
 408  408          dmu_tx_hold_t *txh;
 409  409  
 410  410          ASSERT(tx->tx_txg == 0);
 411  411          ASSERT(len < DMU_MAX_ACCESS);
 412  412          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 413  413  
 414  414          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 415  415              object, THT_WRITE, off, len);
 416  416          if (txh == NULL)
 417  417                  return;
 418  418  
 419  419          dmu_tx_count_write(txh, off, len);
 420  420          dmu_tx_count_dnode(txh);
 421  421  }
 422  422  
 423  423  static void
 424  424  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 425  425  {
 426  426          uint64_t blkid, nblks, lastblk;
 427  427          uint64_t space = 0, unref = 0, skipped = 0;
 428  428          dnode_t *dn = txh->txh_dnode;
 429  429          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 430  430          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 431  431          int epbs;
 432  432  
 433  433          if (dn->dn_nlevels == 0)
 434  434                  return;
 435  435  
 436  436          /*
 437  437           * The struct_rwlock protects us against dn_nlevels
 438  438           * changing, in case (against all odds) we manage to dirty &
 439  439           * sync out the changes after we check for being dirty.
 440  440           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 441  441           */
 442  442          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 443  443          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 444  444          if (dn->dn_maxblkid == 0) {
 445  445                  if (off == 0 && len >= dn->dn_datablksz) {
 446  446                          blkid = 0;
 447  447                          nblks = 1;
 448  448                  } else {
 449  449                          rw_exit(&dn->dn_struct_rwlock);
 450  450                          return;
 451  451                  }
 452  452          } else {
 453  453                  blkid = off >> dn->dn_datablkshift;
 454  454                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 455  455  
 456  456                  if (blkid >= dn->dn_maxblkid) {
 457  457                          rw_exit(&dn->dn_struct_rwlock);
 458  458                          return;
 459  459                  }
 460  460                  if (blkid + nblks > dn->dn_maxblkid)
 461  461                          nblks = dn->dn_maxblkid - blkid;
 462  462  
 463  463          }
 464  464          if (dn->dn_nlevels == 1) {
 465  465                  int i;
 466  466                  for (i = 0; i < nblks; i++) {
 467  467                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 468  468                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 469  469                          bp += blkid + i;
 470  470                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 471  471                                  dprintf_bp(bp, "can free old%s", "");
 472  472                                  space += bp_get_dsize(spa, bp);
 473  473                          }
 474  474                          unref += BP_GET_ASIZE(bp);
 475  475                  }
 476  476                  nblks = 0;
 477  477          }
 478  478  
 479  479          /*
 480  480           * Add in memory requirements of higher-level indirects.
 481  481           * This assumes a worst-possible scenario for dn_nlevels.
 482  482           */
 483  483          {
 484  484                  uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
 485  485                  int level = (dn->dn_nlevels > 1) ? 2 : 1;
 486  486  
 487  487                  while (level++ < DN_MAX_LEVELS) {
 488  488                          txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
 489  489                          blkcnt = 1 + (blkcnt >> epbs);
 490  490                  }
 491  491                  ASSERT(blkcnt <= dn->dn_nblkptr);
 492  492          }
 493  493  
 494  494          lastblk = blkid + nblks - 1;
 495  495          while (nblks) {
 496  496                  dmu_buf_impl_t *dbuf;
 497  497                  uint64_t ibyte, new_blkid;
 498  498                  int epb = 1 << epbs;
 499  499                  int err, i, blkoff, tochk;
 500  500                  blkptr_t *bp;
 501  501  
 502  502                  ibyte = blkid << dn->dn_datablkshift;
 503  503                  err = dnode_next_offset(dn,
 504  504                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 505  505                  new_blkid = ibyte >> dn->dn_datablkshift;
 506  506                  if (err == ESRCH) {
 507  507                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 508  508                          break;
 509  509                  }
 510  510                  if (err) {
 511  511                          txh->txh_tx->tx_err = err;
 512  512                          break;
 513  513                  }
 514  514                  if (new_blkid > lastblk) {
 515  515                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 516  516                          break;
 517  517                  }
 518  518  
 519  519                  if (new_blkid > blkid) {
 520  520                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 521  521                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 522  522                          nblks -= new_blkid - blkid;
 523  523                          blkid = new_blkid;
 524  524                  }
 525  525                  blkoff = P2PHASE(blkid, epb);
 526  526                  tochk = MIN(epb - blkoff, nblks);
 527  527  
 528  528                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 529  529                  if (err) {
 530  530                          txh->txh_tx->tx_err = err;
 531  531                          break;
 532  532                  }
 533  533  
 534  534                  txh->txh_memory_tohold += dbuf->db.db_size;
 535  535  
 536  536                  /*
 537  537                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 538  538                   * memory_tohold is an over-estimation (especially the >L1
 539  539                   * indirect blocks), so it could fail.  Callers should have
 540  540                   * already verified that they will not be holding too much
 541  541                   * memory.
 542  542                   */
 543  543  
 544  544                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 545  545                  if (err != 0) {
 546  546                          txh->txh_tx->tx_err = err;
 547  547                          dbuf_rele(dbuf, FTAG);
 548  548                          break;
 549  549                  }
 550  550  
 551  551                  bp = dbuf->db.db_data;
 552  552                  bp += blkoff;
 553  553  
 554  554                  for (i = 0; i < tochk; i++) {
 555  555                          if (dsl_dataset_block_freeable(ds, &bp[i],
 556  556                              bp[i].blk_birth)) {
 557  557                                  dprintf_bp(&bp[i], "can free old%s", "");
 558  558                                  space += bp_get_dsize(spa, &bp[i]);
 559  559                          }
 560  560                          unref += BP_GET_ASIZE(bp);
 561  561                  }
 562  562                  dbuf_rele(dbuf, FTAG);
 563  563  
 564  564                  blkid += tochk;
 565  565                  nblks -= tochk;
 566  566          }
 567  567          rw_exit(&dn->dn_struct_rwlock);
 568  568  
 569  569          /* account for new level 1 indirect blocks that might show up */
 570  570          if (skipped > 0) {
 571  571                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 572  572                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 573  573                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 574  574          }
 575  575          txh->txh_space_tofree += space;
 576  576          txh->txh_space_tounref += unref;
 577  577  }
 578  578  
 579  579  void
 580  580  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 581  581  {
 582  582          dmu_tx_hold_t *txh;
 583  583          dnode_t *dn;
 584  584          uint64_t start, end, i;
 585  585          int err, shift;
 586  586          zio_t *zio;
 587  587  
 588  588          ASSERT(tx->tx_txg == 0);
 589  589  
 590  590          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 591  591              object, THT_FREE, off, len);
 592  592          if (txh == NULL)
 593  593                  return;
 594  594          dn = txh->txh_dnode;
 595  595  
 596  596          /* first block */
 597  597          if (off != 0)
 598  598                  dmu_tx_count_write(txh, off, 1);
 599  599          /* last block */
 600  600          if (len != DMU_OBJECT_END)
 601  601                  dmu_tx_count_write(txh, off+len, 1);
 602  602  
 603  603          dmu_tx_count_dnode(txh);
 604  604  
 605  605          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 606  606                  return;
 607  607          if (len == DMU_OBJECT_END)
 608  608                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 609  609  
 610  610          /*
 611  611           * For i/o error checking, read the first and last level-0
 612  612           * blocks, and all the level-1 blocks.  The above count_write's
 613  613           * have already taken care of the level-0 blocks.
 614  614           */
 615  615          if (dn->dn_nlevels > 1) {
 616  616                  shift = dn->dn_datablkshift + dn->dn_indblkshift -
 617  617                      SPA_BLKPTRSHIFT;
 618  618                  start = off >> shift;
 619  619                  end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
 620  620  
 621  621                  zio = zio_root(tx->tx_pool->dp_spa,
 622  622                      NULL, NULL, ZIO_FLAG_CANFAIL);
 623  623                  for (i = start; i <= end; i++) {
 624  624                          uint64_t ibyte = i << shift;
 625  625                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 626  626                          i = ibyte >> shift;
 627  627                          if (err == ESRCH)
 628  628                                  break;
 629  629                          if (err) {
 630  630                                  tx->tx_err = err;
 631  631                                  return;
 632  632                          }
 633  633  
 634  634                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 635  635                          if (err) {
 636  636                                  tx->tx_err = err;
 637  637                                  return;
 638  638                          }
 639  639                  }
 640  640                  err = zio_wait(zio);
 641  641                  if (err) {
 642  642                          tx->tx_err = err;
 643  643                          return;
 644  644                  }
 645  645          }
 646  646  
 647  647          dmu_tx_count_free(txh, off, len);
 648  648  }
 649  649  
 650  650  void
 651  651  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 652  652  {
 653  653          dmu_tx_hold_t *txh;
 654  654          dnode_t *dn;
 655  655          uint64_t nblocks;
 656  656          int epbs, err;
 657  657  
 658  658          ASSERT(tx->tx_txg == 0);
 659  659  
 660  660          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 661  661              object, THT_ZAP, add, (uintptr_t)name);
 662  662          if (txh == NULL)
 663  663                  return;
 664  664          dn = txh->txh_dnode;
 665  665  
 666  666          dmu_tx_count_dnode(txh);
 667  667  
 668  668          if (dn == NULL) {
 669  669                  /*
 670  670                   * We will be able to fit a new object's entries into one leaf
 671  671                   * block.  So there will be at most 2 blocks total,
 672  672                   * including the header block.
 673  673                   */
 674  674                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 675  675                  return;
 676  676          }
 677  677  
 678  678          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 679  679  
 680  680          if (dn->dn_maxblkid == 0 && !add) {
 681  681                  blkptr_t *bp;
 682  682  
 683  683                  /*
 684  684                   * If there is only one block  (i.e. this is a micro-zap)
 685  685                   * and we are not adding anything, the accounting is simple.
 686  686                   */
 687  687                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 688  688                  if (err) {
 689  689                          tx->tx_err = err;
 690  690                          return;
 691  691                  }
 692  692  
 693  693                  /*
 694  694                   * Use max block size here, since we don't know how much
 695  695                   * the size will change between now and the dbuf dirty call.
 696  696                   */
 697  697                  bp = &dn->dn_phys->dn_blkptr[0];
 698  698                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 699  699                      bp, bp->blk_birth))
 700  700                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 701  701                  else
 702  702                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 703  703                  if (!BP_IS_HOLE(bp))
 704  704                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 705  705                  return;
 706  706          }
 707  707  
 708  708          if (dn->dn_maxblkid > 0 && name) {
 709  709                  /*
 710  710                   * access the name in this fat-zap so that we'll check
 711  711                   * for i/o errors to the leaf blocks, etc.
 712  712                   */
 713  713                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 714  714                      8, 0, NULL);
 715  715                  if (err == EIO) {
 716  716                          tx->tx_err = err;
 717  717                          return;
 718  718                  }
 719  719          }
 720  720  
 721  721          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 722  722              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 723  723  
 724  724          /*
 725  725           * If the modified blocks are scattered to the four winds,
 726  726           * we'll have to modify an indirect twig for each.
 727  727           */
 728  728          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 729  729          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 730  730                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 731  731                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 732  732                  else
 733  733                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 734  734  }
 735  735  
 736  736  void
 737  737  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 738  738  {
 739  739          dmu_tx_hold_t *txh;
 740  740  
 741  741          ASSERT(tx->tx_txg == 0);
 742  742  
 743  743          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 744  744              object, THT_BONUS, 0, 0);
 745  745          if (txh)
 746  746                  dmu_tx_count_dnode(txh);
 747  747  }
 748  748  
 749  749  void
 750  750  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 751  751  {
 752  752          dmu_tx_hold_t *txh;
 753  753          ASSERT(tx->tx_txg == 0);
 754  754  
 755  755          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 756  756              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 757  757  
 758  758          txh->txh_space_towrite += space;
 759  759  }
 760  760  
 761  761  int
 762  762  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 763  763  {
 764  764          dmu_tx_hold_t *txh;
 765  765          int holds = 0;
 766  766  
 767  767          /*
 768  768           * By asserting that the tx is assigned, we're counting the
 769  769           * number of dn_tx_holds, which is the same as the number of
 770  770           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 771  771           * dn_tx_holds could be 0.
 772  772           */
 773  773          ASSERT(tx->tx_txg != 0);
 774  774  
 775  775          /* if (tx->tx_anyobj == TRUE) */
 776  776                  /* return (0); */
 777  777  
 778  778          for (txh = list_head(&tx->tx_holds); txh;
 779  779              txh = list_next(&tx->tx_holds, txh)) {
 780  780                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 781  781                          holds++;
 782  782          }
 783  783  
 784  784          return (holds);
 785  785  }
 786  786  
 787  787  #ifdef ZFS_DEBUG
 788  788  void
 789  789  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 790  790  {
 791  791          dmu_tx_hold_t *txh;
 792  792          int match_object = FALSE, match_offset = FALSE;
 793  793          dnode_t *dn;
 794  794  
 795  795          DB_DNODE_ENTER(db);
 796  796          dn = DB_DNODE(db);
 797  797          ASSERT(tx->tx_txg != 0);
 798  798          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 799  799          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 800  800  
 801  801          if (tx->tx_anyobj) {
 802  802                  DB_DNODE_EXIT(db);
 803  803                  return;
 804  804          }
 805  805  
 806  806          /* XXX No checking on the meta dnode for now */
 807  807          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 808  808                  DB_DNODE_EXIT(db);
 809  809                  return;
 810  810          }
 811  811  
 812  812          for (txh = list_head(&tx->tx_holds); txh;
 813  813              txh = list_next(&tx->tx_holds, txh)) {
 814  814                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 815  815                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 816  816                          match_object = TRUE;
 817  817                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 818  818                          int datablkshift = dn->dn_datablkshift ?
 819  819                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 820  820                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 821  821                          int shift = datablkshift + epbs * db->db_level;
 822  822                          uint64_t beginblk = shift >= 64 ? 0 :
 823  823                              (txh->txh_arg1 >> shift);
 824  824                          uint64_t endblk = shift >= 64 ? 0 :
 825  825                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 826  826                          uint64_t blkid = db->db_blkid;
 827  827  
 828  828                          /* XXX txh_arg2 better not be zero... */
 829  829  
 830  830                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 831  831                              txh->txh_type, beginblk, endblk);
 832  832  
 833  833                          switch (txh->txh_type) {
 834  834                          case THT_WRITE:
 835  835                                  if (blkid >= beginblk && blkid <= endblk)
 836  836                                          match_offset = TRUE;
 837  837                                  /*
 838  838                                   * We will let this hold work for the bonus
 839  839                                   * or spill buffer so that we don't need to
 840  840                                   * hold it when creating a new object.
 841  841                                   */
 842  842                                  if (blkid == DMU_BONUS_BLKID ||
 843  843                                      blkid == DMU_SPILL_BLKID)
 844  844                                          match_offset = TRUE;
 845  845                                  /*
 846  846                                   * They might have to increase nlevels,
 847  847                                   * thus dirtying the new TLIBs.  Or the
 848  848                                   * might have to change the block size,
 849  849                                   * thus dirying the new lvl=0 blk=0.
 850  850                                   */
 851  851                                  if (blkid == 0)
 852  852                                          match_offset = TRUE;
 853  853                                  break;
 854  854                          case THT_FREE:
 855  855                                  /*
 856  856                                   * We will dirty all the level 1 blocks in
 857  857                                   * the free range and perhaps the first and
 858  858                                   * last level 0 block.
 859  859                                   */
 860  860                                  if (blkid >= beginblk && (blkid <= endblk ||
 861  861                                      txh->txh_arg2 == DMU_OBJECT_END))
 862  862                                          match_offset = TRUE;
 863  863                                  break;
 864  864                          case THT_SPILL:
 865  865                                  if (blkid == DMU_SPILL_BLKID)
 866  866                                          match_offset = TRUE;
 867  867                                  break;
 868  868                          case THT_BONUS:
 869  869                                  if (blkid == DMU_BONUS_BLKID)
 870  870                                          match_offset = TRUE;
 871  871                                  break;
 872  872                          case THT_ZAP:
 873  873                                  match_offset = TRUE;
 874  874                                  break;
 875  875                          case THT_NEWOBJECT:
 876  876                                  match_object = TRUE;
 877  877                                  break;
 878  878                          default:
 879  879                                  ASSERT(!"bad txh_type");
 880  880                          }
 881  881                  }
 882  882                  if (match_object && match_offset) {
 883  883                          DB_DNODE_EXIT(db);
 884  884                          return;
 885  885                  }
 886  886          }
 887  887          DB_DNODE_EXIT(db);
 888  888          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 889  889              (u_longlong_t)db->db.db_object, db->db_level,
 890  890              (u_longlong_t)db->db_blkid);
 891  891  }

↓ open down ↓

891 lines elided

↑ open up ↑

 892  892  #endif
 893  893  
 894  894  static int
 895  895  dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 896  896  {
 897  897          dmu_tx_hold_t *txh;
 898  898          spa_t *spa = tx->tx_pool->dp_spa;
 899  899          uint64_t memory, asize, fsize, usize;
 900  900          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 901  901  
 902      -        ASSERT3U(tx->tx_txg, ==, 0);
      902 +        ASSERT0(tx->tx_txg);
 903  903  
 904  904          if (tx->tx_err)
 905  905                  return (tx->tx_err);
 906  906  
 907  907          if (spa_suspended(spa)) {
 908  908                  /*
 909  909                   * If the user has indicated a blocking failure mode
 910  910                   * then return ERESTART which will block in dmu_tx_wait().
 911  911                   * Otherwise, return EIO so that an error can get
 912  912                   * propagated back to the VOP calls.

 913  913                   *
 914  914                   * Note that we always honor the txg_how flag regardless
 915  915                   * of the failuremode setting.
 916  916                   */
 917  917                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 918  918                      txg_how != TXG_WAIT)
 919  919                          return (EIO);
 920  920  
 921  921                  return (ERESTART);
 922  922          }
 923  923  
 924  924          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 925  925          tx->tx_needassign_txh = NULL;
 926  926  
 927  927          /*
 928  928           * NB: No error returns are allowed after txg_hold_open, but
 929  929           * before processing the dnode holds, due to the
 930  930           * dmu_tx_unassign() logic.
 931  931           */
 932  932  
 933  933          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 934  934          for (txh = list_head(&tx->tx_holds); txh;
 935  935              txh = list_next(&tx->tx_holds, txh)) {
 936  936                  dnode_t *dn = txh->txh_dnode;
 937  937                  if (dn != NULL) {
 938  938                          mutex_enter(&dn->dn_mtx);
 939  939                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 940  940                                  mutex_exit(&dn->dn_mtx);
 941  941                                  tx->tx_needassign_txh = txh;
 942  942                                  return (ERESTART);
 943  943                          }
 944  944                          if (dn->dn_assigned_txg == 0)
 945  945                                  dn->dn_assigned_txg = tx->tx_txg;
 946  946                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 947  947                          (void) refcount_add(&dn->dn_tx_holds, tx);
 948  948                          mutex_exit(&dn->dn_mtx);
 949  949                  }
 950  950                  towrite += txh->txh_space_towrite;
 951  951                  tofree += txh->txh_space_tofree;
 952  952                  tooverwrite += txh->txh_space_tooverwrite;
 953  953                  tounref += txh->txh_space_tounref;
 954  954                  tohold += txh->txh_memory_tohold;
 955  955                  fudge += txh->txh_fudge;
 956  956          }
 957  957  
 958  958          /*
 959  959           * NB: This check must be after we've held the dnodes, so that
 960  960           * the dmu_tx_unassign() logic will work properly
 961  961           */
 962  962          if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
 963  963                  return (ERESTART);
 964  964  
 965  965          /*
 966  966           * If a snapshot has been taken since we made our estimates,
 967  967           * assume that we won't be able to free or overwrite anything.
 968  968           */
 969  969          if (tx->tx_objset &&
 970  970              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 971  971              tx->tx_lastsnap_txg) {
 972  972                  towrite += tooverwrite;
 973  973                  tooverwrite = tofree = 0;
 974  974          }
 975  975  
 976  976          /* needed allocation: worst-case estimate of write space */
 977  977          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 978  978          /* freed space estimate: worst-case overwrite + free estimate */
 979  979          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 980  980          /* convert unrefd space to worst-case estimate */
 981  981          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 982  982          /* calculate memory footprint estimate */
 983  983          memory = towrite + tooverwrite + tohold;
 984  984  
 985  985  #ifdef ZFS_DEBUG
 986  986          /*
 987  987           * Add in 'tohold' to account for our dirty holds on this memory
 988  988           * XXX - the "fudge" factor is to account for skipped blocks that
 989  989           * we missed because dnode_next_offset() misses in-core-only blocks.
 990  990           */
 991  991          tx->tx_space_towrite = asize +
 992  992              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
 993  993          tx->tx_space_tofree = tofree;
 994  994          tx->tx_space_tooverwrite = tooverwrite;
 995  995          tx->tx_space_tounref = tounref;
 996  996  #endif
 997  997  
 998  998          if (tx->tx_dir && asize != 0) {
 999  999                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1000 1000                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1001 1001                  if (err)
1002 1002                          return (err);
1003 1003          }
1004 1004  
1005 1005          return (0);
1006 1006  }
1007 1007  
1008 1008  static void
1009 1009  dmu_tx_unassign(dmu_tx_t *tx)
1010 1010  {
1011 1011          dmu_tx_hold_t *txh;
1012 1012  
1013 1013          if (tx->tx_txg == 0)
1014 1014                  return;
1015 1015  
1016 1016          txg_rele_to_quiesce(&tx->tx_txgh);
1017 1017  
1018 1018          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1019 1019              txh = list_next(&tx->tx_holds, txh)) {
1020 1020                  dnode_t *dn = txh->txh_dnode;
1021 1021  
1022 1022                  if (dn == NULL)
1023 1023                          continue;
1024 1024                  mutex_enter(&dn->dn_mtx);
1025 1025                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1026 1026  
1027 1027                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1028 1028                          dn->dn_assigned_txg = 0;
1029 1029                          cv_broadcast(&dn->dn_notxholds);
1030 1030                  }
1031 1031                  mutex_exit(&dn->dn_mtx);
1032 1032          }
1033 1033  
1034 1034          txg_rele_to_sync(&tx->tx_txgh);
1035 1035  
1036 1036          tx->tx_lasttried_txg = tx->tx_txg;
1037 1037          tx->tx_txg = 0;
1038 1038  }
1039 1039  
1040 1040  /*
1041 1041   * Assign tx to a transaction group.  txg_how can be one of:
1042 1042   *
1043 1043   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1044 1044   *      a new one.  This should be used when you're not holding locks.
1045 1045   *      If will only fail if we're truly out of space (or over quota).
1046 1046   *
1047 1047   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1048 1048   *      blocking, returns immediately with ERESTART.  This should be used
1049 1049   *      whenever you're holding locks.  On an ERESTART error, the caller
1050 1050   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1051 1051   *
1052 1052   * (3)  A specific txg.  Use this if you need to ensure that multiple
1053 1053   *      transactions all sync in the same txg.  Like TXG_NOWAIT, it
1054 1054   *      returns ERESTART if it can't assign you into the requested txg.
1055 1055   */
1056 1056  int
1057 1057  dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
1058 1058  {
1059 1059          int err;
1060 1060  
1061 1061          ASSERT(tx->tx_txg == 0);
1062 1062          ASSERT(txg_how != 0);
1063 1063          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1064 1064  
1065 1065          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1066 1066                  dmu_tx_unassign(tx);
1067 1067  
1068 1068                  if (err != ERESTART || txg_how != TXG_WAIT)
1069 1069                          return (err);
1070 1070  
1071 1071                  dmu_tx_wait(tx);
1072 1072          }
1073 1073  
1074 1074          txg_rele_to_quiesce(&tx->tx_txgh);
1075 1075  
1076 1076          return (0);
1077 1077  }
1078 1078  
1079 1079  void
1080 1080  dmu_tx_wait(dmu_tx_t *tx)
1081 1081  {
1082 1082          spa_t *spa = tx->tx_pool->dp_spa;
1083 1083  
1084 1084          ASSERT(tx->tx_txg == 0);
1085 1085  
1086 1086          /*
1087 1087           * It's possible that the pool has become active after this thread
1088 1088           * has tried to obtain a tx. If that's the case then his
1089 1089           * tx_lasttried_txg would not have been assigned.
1090 1090           */
1091 1091          if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1092 1092                  txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
1093 1093          } else if (tx->tx_needassign_txh) {
1094 1094                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1095 1095  
1096 1096                  mutex_enter(&dn->dn_mtx);
1097 1097                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1098 1098                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1099 1099                  mutex_exit(&dn->dn_mtx);
1100 1100                  tx->tx_needassign_txh = NULL;
1101 1101          } else {
1102 1102                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1103 1103          }
1104 1104  }
1105 1105  
1106 1106  void
1107 1107  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1108 1108  {
1109 1109  #ifdef ZFS_DEBUG
1110 1110          if (tx->tx_dir == NULL || delta == 0)
1111 1111                  return;
1112 1112  
1113 1113          if (delta > 0) {
1114 1114                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1115 1115                      tx->tx_space_towrite);
1116 1116                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1117 1117          } else {
1118 1118                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1119 1119          }
1120 1120  #endif
1121 1121  }
1122 1122  
1123 1123  void
1124 1124  dmu_tx_commit(dmu_tx_t *tx)
1125 1125  {
1126 1126          dmu_tx_hold_t *txh;
1127 1127  
1128 1128          ASSERT(tx->tx_txg != 0);
1129 1129  
1130 1130          while (txh = list_head(&tx->tx_holds)) {
1131 1131                  dnode_t *dn = txh->txh_dnode;
1132 1132  
1133 1133                  list_remove(&tx->tx_holds, txh);
1134 1134                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1135 1135                  if (dn == NULL)
1136 1136                          continue;
1137 1137                  mutex_enter(&dn->dn_mtx);
1138 1138                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1139 1139  
1140 1140                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1141 1141                          dn->dn_assigned_txg = 0;
1142 1142                          cv_broadcast(&dn->dn_notxholds);
1143 1143                  }
1144 1144                  mutex_exit(&dn->dn_mtx);
1145 1145                  dnode_rele(dn, tx);
1146 1146          }
1147 1147  
1148 1148          if (tx->tx_tempreserve_cookie)
1149 1149                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1150 1150  
1151 1151          if (!list_is_empty(&tx->tx_callbacks))
1152 1152                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1153 1153  
1154 1154          if (tx->tx_anyobj == FALSE)
1155 1155                  txg_rele_to_sync(&tx->tx_txgh);
1156 1156  
1157 1157          list_destroy(&tx->tx_callbacks);
1158 1158          list_destroy(&tx->tx_holds);
1159 1159  #ifdef ZFS_DEBUG
1160 1160          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1161 1161              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1162 1162              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1163 1163          refcount_destroy_many(&tx->tx_space_written,
1164 1164              refcount_count(&tx->tx_space_written));
1165 1165          refcount_destroy_many(&tx->tx_space_freed,
1166 1166              refcount_count(&tx->tx_space_freed));
1167 1167  #endif
1168 1168          kmem_free(tx, sizeof (dmu_tx_t));
1169 1169  }
1170 1170  
1171 1171  void
1172 1172  dmu_tx_abort(dmu_tx_t *tx)
1173 1173  {
1174 1174          dmu_tx_hold_t *txh;
1175 1175  
1176 1176          ASSERT(tx->tx_txg == 0);
1177 1177  
1178 1178          while (txh = list_head(&tx->tx_holds)) {
1179 1179                  dnode_t *dn = txh->txh_dnode;
1180 1180  
1181 1181                  list_remove(&tx->tx_holds, txh);
1182 1182                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1183 1183                  if (dn != NULL)
1184 1184                          dnode_rele(dn, tx);
1185 1185          }
1186 1186  
1187 1187          /*
1188 1188           * Call any registered callbacks with an error code.
1189 1189           */
1190 1190          if (!list_is_empty(&tx->tx_callbacks))
1191 1191                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1192 1192  
1193 1193          list_destroy(&tx->tx_callbacks);
1194 1194          list_destroy(&tx->tx_holds);
1195 1195  #ifdef ZFS_DEBUG
1196 1196          refcount_destroy_many(&tx->tx_space_written,
1197 1197              refcount_count(&tx->tx_space_written));
1198 1198          refcount_destroy_many(&tx->tx_space_freed,
1199 1199              refcount_count(&tx->tx_space_freed));
1200 1200  #endif
1201 1201          kmem_free(tx, sizeof (dmu_tx_t));
1202 1202  }
1203 1203  
1204 1204  uint64_t
1205 1205  dmu_tx_get_txg(dmu_tx_t *tx)
1206 1206  {
1207 1207          ASSERT(tx->tx_txg != 0);
1208 1208          return (tx->tx_txg);
1209 1209  }
1210 1210  
1211 1211  void
1212 1212  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1213 1213  {
1214 1214          dmu_tx_callback_t *dcb;
1215 1215  
1216 1216          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1217 1217  
1218 1218          dcb->dcb_func = func;
1219 1219          dcb->dcb_data = data;
1220 1220  
1221 1221          list_insert_tail(&tx->tx_callbacks, dcb);
1222 1222  }
1223 1223  
1224 1224  /*
1225 1225   * Call all the commit callbacks on a list, with a given error code.
1226 1226   */
1227 1227  void
1228 1228  dmu_tx_do_callbacks(list_t *cb_list, int error)
1229 1229  {
1230 1230          dmu_tx_callback_t *dcb;
1231 1231  
1232 1232          while (dcb = list_head(cb_list)) {
1233 1233                  list_remove(cb_list, dcb);
1234 1234                  dcb->dcb_func(dcb->dcb_data, error);
1235 1235                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1236 1236          }
1237 1237  }
1238 1238  
/*
 * Interface to hold a bunch of attributes; used when creating new files.
 * attrsize is the total size of all attributes to be added during object
 * creation.
 *
 * To update or add a single attribute, dmu_tx_hold_sa() should be used.
 */
1247 1247  
1248 1248  /*
1249 1249   * hold necessary attribute name for attribute registration.
1250 1250   * should be a very rare case where this is needed.  If it does
1251 1251   * happen it would only happen on the first write to the file system.
1252 1252   */
1253 1253  static void
1254 1254  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1255 1255  {
1256 1256          int i;
1257 1257  
1258 1258          if (!sa->sa_need_attr_registration)
1259 1259                  return;
1260 1260  
1261 1261          for (i = 0; i != sa->sa_num_attrs; i++) {
1262 1262                  if (!sa->sa_attr_table[i].sa_registered) {
1263 1263                          if (sa->sa_reg_attr_obj)
1264 1264                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1265 1265                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1266 1266                          else
1267 1267                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1268 1268                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1269 1269                  }
1270 1270          }
1271 1271  }
1272 1272  
1273 1273  
1274 1274  void
1275 1275  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1276 1276  {
1277 1277          dnode_t *dn;
1278 1278          dmu_tx_hold_t *txh;
1279 1279  
1280 1280          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1281 1281              THT_SPILL, 0, 0);
1282 1282  
1283 1283          dn = txh->txh_dnode;
1284 1284  
1285 1285          if (dn == NULL)
1286 1286                  return;
1287 1287  
1288 1288          /* If blkptr doesn't exist then add space to towrite */
1289 1289          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1290 1290                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1291 1291          } else {
1292 1292                  blkptr_t *bp;
1293 1293  
1294 1294                  bp = &dn->dn_phys->dn_spill;
1295 1295                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1296 1296                      bp, bp->blk_birth))
1297 1297                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1298 1298                  else
1299 1299                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1300 1300                  if (!BP_IS_HOLE(bp))
1301 1301                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1302 1302          }
1303 1303  }
1304 1304  
1305 1305  void
1306 1306  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1307 1307  {
1308 1308          sa_os_t *sa = tx->tx_objset->os_sa;
1309 1309  
1310 1310          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1311 1311  
1312 1312          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1313 1313                  return;
1314 1314  
1315 1315          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1316 1316                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1317 1317          else {
1318 1318                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1319 1319                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1320 1320                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1321 1321                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1322 1322          }
1323 1323  
1324 1324          dmu_tx_sa_registration_hold(sa, tx);
1325 1325  
1326 1326          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1327 1327                  return;
1328 1328  
1329 1329          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1330 1330              THT_SPILL, 0, 0);
1331 1331  }
1332 1332  
1333 1333  /*
1334 1334   * Hold SA attribute
1335 1335   *
1336 1336   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1337 1337   *
1338 1338   * variable_size is the total size of all variable sized attributes
1339 1339   * passed to this function.  It is not the total size of all
1340 1340   * variable size attributes that *may* exist on this object.
1341 1341   */
1342 1342  void
1343 1343  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1344 1344  {
1345 1345          uint64_t object;
1346 1346          sa_os_t *sa = tx->tx_objset->os_sa;
1347 1347  
1348 1348          ASSERT(hdl != NULL);
1349 1349  
1350 1350          object = sa_handle_object(hdl);
1351 1351  
1352 1352          dmu_tx_hold_bonus(tx, object);
1353 1353  
1354 1354          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1355 1355                  return;
1356 1356  
1357 1357          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1358 1358              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1359 1359                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1360 1360                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1361 1361                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1362 1362                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1363 1363          }
1364 1364  
1365 1365          dmu_tx_sa_registration_hold(sa, tx);
1366 1366  
1367 1367          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1368 1368                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1369 1369  
1370 1370          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1371 1371                  ASSERT(tx->tx_txg == 0);
1372 1372                  dmu_tx_hold_spill(tx, object);
1373 1373          } else {
1374 1374                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1375 1375                  dnode_t *dn;
1376 1376  
1377 1377                  DB_DNODE_ENTER(db);
1378 1378                  dn = DB_DNODE(db);
1379 1379                  if (dn->dn_have_spill) {
1380 1380                          ASSERT(tx->tx_txg == 0);
1381 1381                          dmu_tx_hold_spill(tx, object);
1382 1382                  }
1383 1383                  DB_DNODE_EXIT(db);
1384 1384          }
1385 1385  }

↓ open down ↓

473 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX