dlpx-os-diff Wdiff usr/src/uts/common/fs/zfs/dmu_tx.c

Print this page

4047 panic from dbuf_free_range() from dmu_free_object() while doing zfs receive
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_impl.h>
  29   29  #include <sys/dbuf.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36   36  #include <sys/spa.h>
  37   37  #include <sys/sa.h>
  38   38  #include <sys/sa_impl.h>
  39   39  #include <sys/zfs_context.h>
  40   40  #include <sys/varargs.h>
  41   41  
  42   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  43   43      uint64_t arg1, uint64_t arg2);
  44   44  
  45   45  
  46   46  dmu_tx_t *
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
  57   57  #ifdef ZFS_DEBUG
  58   58          refcount_create(&tx->tx_space_written);
  59   59          refcount_create(&tx->tx_space_freed);
  60   60  #endif
  61   61          return (tx);
  62   62  }
  63   63  
  64   64  dmu_tx_t *
  65   65  dmu_tx_create(objset_t *os)
  66   66  {
  67   67          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  68   68          tx->tx_objset = os;
  69   69          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  70   70          return (tx);
  71   71  }
  72   72  
  73   73  dmu_tx_t *
  74   74  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  75   75  {
  76   76          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  77   77  
  78   78          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  79   79          tx->tx_pool = dp;
  80   80          tx->tx_txg = txg;
  81   81          tx->tx_anyobj = TRUE;
  82   82  
  83   83          return (tx);
  84   84  }
  85   85  
  86   86  int
  87   87  dmu_tx_is_syncing(dmu_tx_t *tx)
  88   88  {
  89   89          return (tx->tx_anyobj);
  90   90  }
  91   91  
  92   92  int
  93   93  dmu_tx_private_ok(dmu_tx_t *tx)
  94   94  {
  95   95          return (tx->tx_anyobj);
  96   96  }
  97   97  
  98   98  static dmu_tx_hold_t *
  99   99  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 100  100      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 101  101  {
 102  102          dmu_tx_hold_t *txh;
 103  103          dnode_t *dn = NULL;
 104  104          int err;
 105  105  
 106  106          if (object != DMU_NEW_OBJECT) {
 107  107                  err = dnode_hold(os, object, tx, &dn);
 108  108                  if (err) {
 109  109                          tx->tx_err = err;
 110  110                          return (NULL);
 111  111                  }
 112  112  
 113  113                  if (err == 0 && tx->tx_txg != 0) {
 114  114                          mutex_enter(&dn->dn_mtx);
 115  115                          /*
 116  116                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 117  117                           * problem, but there's no way for it to happen (for
 118  118                           * now, at least).
 119  119                           */
 120  120                          ASSERT(dn->dn_assigned_txg == 0);
 121  121                          dn->dn_assigned_txg = tx->tx_txg;
 122  122                          (void) refcount_add(&dn->dn_tx_holds, tx);
 123  123                          mutex_exit(&dn->dn_mtx);
 124  124                  }
 125  125          }
 126  126  
 127  127          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 128  128          txh->txh_tx = tx;
 129  129          txh->txh_dnode = dn;
 130  130  #ifdef ZFS_DEBUG
 131  131          txh->txh_type = type;
 132  132          txh->txh_arg1 = arg1;
 133  133          txh->txh_arg2 = arg2;
 134  134  #endif
 135  135          list_insert_tail(&tx->tx_holds, txh);
 136  136  
 137  137          return (txh);
 138  138  }
 139  139  
 140  140  void
 141  141  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 142  142  {
 143  143          /*
 144  144           * If we're syncing, they can manipulate any object anyhow, and
 145  145           * the hold on the dnode_t can cause problems.
 146  146           */
 147  147          if (!dmu_tx_is_syncing(tx)) {
 148  148                  (void) dmu_tx_hold_object_impl(tx, os,
 149  149                      object, THT_NEWOBJECT, 0, 0);
 150  150          }
 151  151  }
 152  152  
 153  153  static int
 154  154  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 155  155  {
 156  156          int err;
 157  157          dmu_buf_impl_t *db;
 158  158  
 159  159          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 160  160          db = dbuf_hold_level(dn, level, blkid, FTAG);
 161  161          rw_exit(&dn->dn_struct_rwlock);
 162  162          if (db == NULL)
 163  163                  return (SET_ERROR(EIO));
 164  164          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 165  165          dbuf_rele(db, FTAG);
 166  166          return (err);
 167  167  }
 168  168  
 169  169  static void
 170  170  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 171  171      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 172  172  {
 173  173          objset_t *os = dn->dn_objset;
 174  174          dsl_dataset_t *ds = os->os_dsl_dataset;
 175  175          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 176  176          dmu_buf_impl_t *parent = NULL;
 177  177          blkptr_t *bp = NULL;
 178  178          uint64_t space;
 179  179  
 180  180          if (level >= dn->dn_nlevels || history[level] == blkid)
 181  181                  return;
 182  182  
 183  183          history[level] = blkid;
 184  184  
 185  185          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 186  186  
 187  187          if (db == NULL || db == dn->dn_dbuf) {
 188  188                  ASSERT(level != 0);
 189  189                  db = NULL;
 190  190          } else {
 191  191                  ASSERT(DB_DNODE(db) == dn);
 192  192                  ASSERT(db->db_level == level);
 193  193                  ASSERT(db->db.db_size == space);
 194  194                  ASSERT(db->db_blkid == blkid);
 195  195                  bp = db->db_blkptr;
 196  196                  parent = db->db_parent;
 197  197          }
 198  198  
 199  199          freeable = (bp && (freeable ||
 200  200              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 201  201  
 202  202          if (freeable)
 203  203                  txh->txh_space_tooverwrite += space;
 204  204          else
 205  205                  txh->txh_space_towrite += space;
 206  206          if (bp)
 207  207                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 208  208  
 209  209          dmu_tx_count_twig(txh, dn, parent, level + 1,
 210  210              blkid >> epbs, freeable, history);
 211  211  }
 212  212  
 213  213  /* ARGSUSED */
 214  214  static void
 215  215  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 216  216  {
 217  217          dnode_t *dn = txh->txh_dnode;
 218  218          uint64_t start, end, i;
 219  219          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 220  220          int err = 0;
 221  221  
 222  222          if (len == 0)
 223  223                  return;
 224  224  
 225  225          min_bs = SPA_MINBLOCKSHIFT;
 226  226          max_bs = SPA_MAXBLOCKSHIFT;
 227  227          min_ibs = DN_MIN_INDBLKSHIFT;
 228  228          max_ibs = DN_MAX_INDBLKSHIFT;
 229  229  
 230  230          if (dn) {
 231  231                  uint64_t history[DN_MAX_LEVELS];
 232  232                  int nlvls = dn->dn_nlevels;
 233  233                  int delta;
 234  234  
 235  235                  /*
 236  236                   * For i/o error checking, read the first and last level-0
 237  237                   * blocks (if they are not aligned), and all the level-1 blocks.
 238  238                   */
 239  239                  if (dn->dn_maxblkid == 0) {
 240  240                          delta = dn->dn_datablksz;
 241  241                          start = (off < dn->dn_datablksz) ? 0 : 1;
 242  242                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 243  243                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 244  244                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 245  245                                  if (err)
 246  246                                          goto out;
 247  247                                  delta -= off;
 248  248                          }
 249  249                  } else {
 250  250                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 251  251                              NULL, NULL, ZIO_FLAG_CANFAIL);
 252  252  
 253  253                          /* first level-0 block */
 254  254                          start = off >> dn->dn_datablkshift;
 255  255                          if (P2PHASE(off, dn->dn_datablksz) ||
 256  256                              len < dn->dn_datablksz) {
 257  257                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 258  258                                  if (err)
 259  259                                          goto out;
 260  260                          }
 261  261  
 262  262                          /* last level-0 block */
 263  263                          end = (off+len-1) >> dn->dn_datablkshift;
 264  264                          if (end != start && end <= dn->dn_maxblkid &&
 265  265                              P2PHASE(off+len, dn->dn_datablksz)) {
 266  266                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 267  267                                  if (err)
 268  268                                          goto out;
 269  269                          }
 270  270  
 271  271                          /* level-1 blocks */
 272  272                          if (nlvls > 1) {
 273  273                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 274  274                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 275  275                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 276  276                                          if (err)
 277  277                                                  goto out;
 278  278                                  }
 279  279                          }
 280  280  
 281  281                          err = zio_wait(zio);
 282  282                          if (err)
 283  283                                  goto out;
 284  284                          delta = P2NPHASE(off, dn->dn_datablksz);
 285  285                  }
 286  286  
 287  287                  min_ibs = max_ibs = dn->dn_indblkshift;
 288  288                  if (dn->dn_maxblkid > 0) {
 289  289                          /*
 290  290                           * The blocksize can't change,
 291  291                           * so we can make a more precise estimate.
 292  292                           */
 293  293                          ASSERT(dn->dn_datablkshift != 0);
 294  294                          min_bs = max_bs = dn->dn_datablkshift;
 295  295                  }
 296  296  
 297  297                  /*
 298  298                   * If this write is not off the end of the file
 299  299                   * we need to account for overwrites/unref.
 300  300                   */
 301  301                  if (start <= dn->dn_maxblkid) {
 302  302                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 303  303                                  history[l] = -1ULL;
 304  304                  }
 305  305                  while (start <= dn->dn_maxblkid) {
 306  306                          dmu_buf_impl_t *db;
 307  307  
 308  308                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 309  309                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 310  310                          rw_exit(&dn->dn_struct_rwlock);
 311  311  
 312  312                          if (err) {
 313  313                                  txh->txh_tx->tx_err = err;
 314  314                                  return;
 315  315                          }
 316  316  
 317  317                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 318  318                              history);
 319  319                          dbuf_rele(db, FTAG);
 320  320                          if (++start > end) {
 321  321                                  /*
 322  322                                   * Account for new indirects appearing
 323  323                                   * before this IO gets assigned into a txg.
 324  324                                   */
 325  325                                  bits = 64 - min_bs;
 326  326                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 327  327                                  for (bits -= epbs * (nlvls - 1);
 328  328                                      bits >= 0; bits -= epbs)
 329  329                                          txh->txh_fudge += 1ULL << max_ibs;
 330  330                                  goto out;
 331  331                          }
 332  332                          off += delta;
 333  333                          if (len >= delta)
 334  334                                  len -= delta;
 335  335                          delta = dn->dn_datablksz;
 336  336                  }
 337  337          }
 338  338  
 339  339          /*
 340  340           * 'end' is the last thing we will access, not one past.
 341  341           * This way we won't overflow when accessing the last byte.
 342  342           */
 343  343          start = P2ALIGN(off, 1ULL << max_bs);
 344  344          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 345  345          txh->txh_space_towrite += end - start + 1;
 346  346  
 347  347          start >>= min_bs;
 348  348          end >>= min_bs;
 349  349  
 350  350          epbs = min_ibs - SPA_BLKPTRSHIFT;
 351  351  
 352  352          /*
 353  353           * The object contains at most 2^(64 - min_bs) blocks,
 354  354           * and each indirect level maps 2^epbs.
 355  355           */
 356  356          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 357  357                  start >>= epbs;
 358  358                  end >>= epbs;
 359  359                  ASSERT3U(end, >=, start);
 360  360                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 361  361                  if (start != 0) {
 362  362                          /*
 363  363                           * We also need a new blkid=0 indirect block
 364  364                           * to reference any existing file data.
 365  365                           */
 366  366                          txh->txh_space_towrite += 1ULL << max_ibs;
 367  367                  }
 368  368          }
 369  369  
 370  370  out:
 371  371          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 372  372              2 * DMU_MAX_ACCESS)
 373  373                  err = SET_ERROR(EFBIG);
 374  374  
 375  375          if (err)
 376  376                  txh->txh_tx->tx_err = err;
 377  377  }
 378  378  
 379  379  static void
 380  380  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 381  381  {
 382  382          dnode_t *dn = txh->txh_dnode;
 383  383          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 384  384          uint64_t space = mdn->dn_datablksz +
 385  385              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 386  386  
 387  387          if (dn && dn->dn_dbuf->db_blkptr &&
 388  388              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 389  389              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 390  390                  txh->txh_space_tooverwrite += space;
 391  391                  txh->txh_space_tounref += space;
 392  392          } else {
 393  393                  txh->txh_space_towrite += space;
 394  394                  if (dn && dn->dn_dbuf->db_blkptr)
 395  395                          txh->txh_space_tounref += space;
 396  396          }
 397  397  }
 398  398  
 399  399  void
 400  400  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 401  401  {
 402  402          dmu_tx_hold_t *txh;
 403  403  
 404  404          ASSERT(tx->tx_txg == 0);
 405  405          ASSERT(len < DMU_MAX_ACCESS);
 406  406          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 407  407  
 408  408          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 409  409              object, THT_WRITE, off, len);
 410  410          if (txh == NULL)
 411  411                  return;
 412  412  
 413  413          dmu_tx_count_write(txh, off, len);
 414  414          dmu_tx_count_dnode(txh);
 415  415  }
 416  416  
 417  417  static void
 418  418  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 419  419  {
 420  420          uint64_t blkid, nblks, lastblk;
 421  421          uint64_t space = 0, unref = 0, skipped = 0;
 422  422          dnode_t *dn = txh->txh_dnode;
 423  423          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 424  424          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 425  425          int epbs;
 426  426          uint64_t l0span = 0, nl1blks = 0;
 427  427  
 428  428          if (dn->dn_nlevels == 0)
 429  429                  return;
 430  430  
 431  431          /*
 432  432           * The struct_rwlock protects us against dn_nlevels
 433  433           * changing, in case (against all odds) we manage to dirty &
 434  434           * sync out the changes after we check for being dirty.
 435  435           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 436  436           */
 437  437          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 438  438          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 439  439          if (dn->dn_maxblkid == 0) {
 440  440                  if (off == 0 && len >= dn->dn_datablksz) {
 441  441                          blkid = 0;
 442  442                          nblks = 1;
 443  443                  } else {
 444  444                          rw_exit(&dn->dn_struct_rwlock);
 445  445                          return;
 446  446                  }
 447  447          } else {
 448  448                  blkid = off >> dn->dn_datablkshift;
 449  449                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 450  450  
 451  451                  if (blkid > dn->dn_maxblkid) {
 452  452                          rw_exit(&dn->dn_struct_rwlock);
 453  453                          return;
 454  454                  }
 455  455                  if (blkid + nblks > dn->dn_maxblkid)
 456  456                          nblks = dn->dn_maxblkid - blkid + 1;
 457  457  
 458  458          }
 459  459          l0span = nblks;    /* save for later use to calc level > 1 overhead */
 460  460          if (dn->dn_nlevels == 1) {
 461  461                  int i;
 462  462                  for (i = 0; i < nblks; i++) {
 463  463                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 464  464                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 465  465                          bp += blkid + i;
 466  466                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 467  467                                  dprintf_bp(bp, "can free old%s", "");
 468  468                                  space += bp_get_dsize(spa, bp);
 469  469                          }
 470  470                          unref += BP_GET_ASIZE(bp);
 471  471                  }
 472  472                  nl1blks = 1;
 473  473                  nblks = 0;
 474  474          }
 475  475  
 476  476          lastblk = blkid + nblks - 1;
 477  477          while (nblks) {
 478  478                  dmu_buf_impl_t *dbuf;
 479  479                  uint64_t ibyte, new_blkid;
 480  480                  int epb = 1 << epbs;
 481  481                  int err, i, blkoff, tochk;
 482  482                  blkptr_t *bp;
 483  483  
 484  484                  ibyte = blkid << dn->dn_datablkshift;
 485  485                  err = dnode_next_offset(dn,
 486  486                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 487  487                  new_blkid = ibyte >> dn->dn_datablkshift;
 488  488                  if (err == ESRCH) {
 489  489                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 490  490                          break;
 491  491                  }
 492  492                  if (err) {
 493  493                          txh->txh_tx->tx_err = err;
 494  494                          break;
 495  495                  }
 496  496                  if (new_blkid > lastblk) {
 497  497                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 498  498                          break;
 499  499                  }
 500  500  
 501  501                  if (new_blkid > blkid) {
 502  502                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 503  503                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 504  504                          nblks -= new_blkid - blkid;
 505  505                          blkid = new_blkid;
 506  506                  }
 507  507                  blkoff = P2PHASE(blkid, epb);
 508  508                  tochk = MIN(epb - blkoff, nblks);
 509  509  
 510  510                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 511  511                  if (err) {
 512  512                          txh->txh_tx->tx_err = err;
 513  513                          break;
 514  514                  }
 515  515  
 516  516                  txh->txh_memory_tohold += dbuf->db.db_size;
 517  517  
 518  518                  /*
 519  519                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 520  520                   * memory_tohold is an over-estimation (especially the >L1
 521  521                   * indirect blocks), so it could fail.  Callers should have
 522  522                   * already verified that they will not be holding too much
 523  523                   * memory.
 524  524                   */
 525  525  
 526  526                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 527  527                  if (err != 0) {
 528  528                          txh->txh_tx->tx_err = err;
 529  529                          dbuf_rele(dbuf, FTAG);
 530  530                          break;
 531  531                  }
 532  532  
 533  533                  bp = dbuf->db.db_data;
 534  534                  bp += blkoff;
 535  535  
 536  536                  for (i = 0; i < tochk; i++) {
 537  537                          if (dsl_dataset_block_freeable(ds, &bp[i],
 538  538                              bp[i].blk_birth)) {
 539  539                                  dprintf_bp(&bp[i], "can free old%s", "");
 540  540                                  space += bp_get_dsize(spa, &bp[i]);
 541  541                          }
 542  542                          unref += BP_GET_ASIZE(bp);
 543  543                  }
 544  544                  dbuf_rele(dbuf, FTAG);
 545  545  
 546  546                  ++nl1blks;
 547  547                  blkid += tochk;
 548  548                  nblks -= tochk;
 549  549          }
 550  550          rw_exit(&dn->dn_struct_rwlock);
 551  551  
 552  552          /*
 553  553           * Add in memory requirements of higher-level indirects.
 554  554           * This assumes a worst-possible scenario for dn_nlevels and a
 555  555           * worst-possible distribution of l1-blocks over the region to free.
 556  556           */
 557  557          {
 558  558                  uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 559  559                  int level = 2;
 560  560                  /*
 561  561                   * Here we don't use DN_MAX_LEVEL, but calculate it with the
 562  562                   * given datablkshift and indblkshift. This makes the
 563  563                   * difference between 19 and 8 on large files.
 564  564                   */
 565  565                  int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 566  566                      (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 567  567  
 568  568                  while (level++ < maxlevel) {
 569  569                          txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 570  570                              << dn->dn_indblkshift;
 571  571                          blkcnt = 1 + (blkcnt >> epbs);
 572  572                  }
 573  573          }
 574  574  
 575  575          /* account for new level 1 indirect blocks that might show up */
 576  576          if (skipped > 0) {
 577  577                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 578  578                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 579  579                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 580  580          }
 581  581          txh->txh_space_tofree += space;
 582  582          txh->txh_space_tounref += unref;
 583  583  }
 584  584  
 585  585  void
 586  586  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 587  587  {
 588  588          dmu_tx_hold_t *txh;
 589  589          dnode_t *dn;
 590  590          int err;
 591  591          zio_t *zio;
 592  592  
 593  593          ASSERT(tx->tx_txg == 0);
 594  594  
 595  595          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 596  596              object, THT_FREE, off, len);
 597  597          if (txh == NULL)
 598  598                  return;
 599  599          dn = txh->txh_dnode;
 600  600  
 601  601          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 602  602                  return;
 603  603          if (len == DMU_OBJECT_END)
 604  604                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 605  605  
 606  606          dmu_tx_count_dnode(txh);
 607  607

↓ open down ↓

607 lines elided

↑ open up ↑

 608  608          /*
 609  609           * For i/o error checking, we read the first and last level-0
 610  610           * blocks if they are not aligned, and all the level-1 blocks.
 611  611           *
 612  612           * Note:  dbuf_free_range() assumes that we have not instantiated
 613  613           * any level-0 dbufs that will be completely freed.  Therefore we must
 614  614           * exercise care to not read or count the first and last blocks
 615  615           * if they are blocksize-aligned.
 616  616           */
 617  617          if (dn->dn_datablkshift == 0) {
 618      -                dmu_tx_count_write(txh, off, len);
      618 +                if (off != 0 || len < dn->dn_datablksz)
      619 +                        dmu_tx_count_write(txh, off, len);
 619  620          } else {
 620  621                  /* first block will be modified if it is not aligned */
 621  622                  if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 622  623                          dmu_tx_count_write(txh, off, 1);
 623  624                  /* last block will be modified if it is not aligned */
 624  625                  if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 625  626                          dmu_tx_count_write(txh, off+len, 1);
 626  627          }
 627  628  
 628  629          /*

 629  630           * Check level-1 blocks.
 630  631           */
 631  632          if (dn->dn_nlevels > 1) {
 632  633                  int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 633  634                      SPA_BLKPTRSHIFT;
 634  635                  uint64_t start = off >> shift;
 635  636                  uint64_t end = (off + len) >> shift;
 636  637  
 637  638                  ASSERT(dn->dn_datablkshift != 0);
 638  639                  ASSERT(dn->dn_indblkshift != 0);
 639  640  
 640  641                  zio = zio_root(tx->tx_pool->dp_spa,
 641  642                      NULL, NULL, ZIO_FLAG_CANFAIL);
 642  643                  for (uint64_t i = start; i <= end; i++) {
 643  644                          uint64_t ibyte = i << shift;
 644  645                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 645  646                          i = ibyte >> shift;
 646  647                          if (err == ESRCH)
 647  648                                  break;
 648  649                          if (err) {
 649  650                                  tx->tx_err = err;
 650  651                                  return;
 651  652                          }
 652  653  
 653  654                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 654  655                          if (err) {
 655  656                                  tx->tx_err = err;
 656  657                                  return;
 657  658                          }
 658  659                  }
 659  660                  err = zio_wait(zio);
 660  661                  if (err) {
 661  662                          tx->tx_err = err;
 662  663                          return;
 663  664                  }
 664  665          }
 665  666  
 666  667          dmu_tx_count_free(txh, off, len);
 667  668  }
 668  669  
 669  670  void
 670  671  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 671  672  {
 672  673          dmu_tx_hold_t *txh;
 673  674          dnode_t *dn;
 674  675          uint64_t nblocks;
 675  676          int epbs, err;
 676  677  
 677  678          ASSERT(tx->tx_txg == 0);
 678  679  
 679  680          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 680  681              object, THT_ZAP, add, (uintptr_t)name);
 681  682          if (txh == NULL)
 682  683                  return;
 683  684          dn = txh->txh_dnode;
 684  685  
 685  686          dmu_tx_count_dnode(txh);
 686  687  
 687  688          if (dn == NULL) {
 688  689                  /*
 689  690                   * We will be able to fit a new object's entries into one leaf
 690  691                   * block.  So there will be at most 2 blocks total,
 691  692                   * including the header block.
 692  693                   */
 693  694                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 694  695                  return;
 695  696          }
 696  697  
 697  698          ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 698  699  
 699  700          if (dn->dn_maxblkid == 0 && !add) {
 700  701                  blkptr_t *bp;
 701  702  
 702  703                  /*
 703  704                   * If there is only one block  (i.e. this is a micro-zap)
 704  705                   * and we are not adding anything, the accounting is simple.
 705  706                   */
 706  707                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 707  708                  if (err) {
 708  709                          tx->tx_err = err;
 709  710                          return;
 710  711                  }
 711  712  
 712  713                  /*
 713  714                   * Use max block size here, since we don't know how much
 714  715                   * the size will change between now and the dbuf dirty call.
 715  716                   */
 716  717                  bp = &dn->dn_phys->dn_blkptr[0];
 717  718                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 718  719                      bp, bp->blk_birth))
 719  720                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 720  721                  else
 721  722                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 722  723                  if (!BP_IS_HOLE(bp))
 723  724                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 724  725                  return;
 725  726          }
 726  727  
 727  728          if (dn->dn_maxblkid > 0 && name) {
 728  729                  /*
 729  730                   * access the name in this fat-zap so that we'll check
 730  731                   * for i/o errors to the leaf blocks, etc.
 731  732                   */
 732  733                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 733  734                      8, 0, NULL);
 734  735                  if (err == EIO) {
 735  736                          tx->tx_err = err;
 736  737                          return;
 737  738                  }
 738  739          }
 739  740  
 740  741          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 741  742              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 742  743  
 743  744          /*
 744  745           * If the modified blocks are scattered to the four winds,
 745  746           * we'll have to modify an indirect twig for each.
 746  747           */
 747  748          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 748  749          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 749  750                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 750  751                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 751  752                  else
 752  753                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 753  754  }
 754  755  
 755  756  void
 756  757  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 757  758  {
 758  759          dmu_tx_hold_t *txh;
 759  760  
 760  761          ASSERT(tx->tx_txg == 0);
 761  762  
 762  763          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 763  764              object, THT_BONUS, 0, 0);
 764  765          if (txh)
 765  766                  dmu_tx_count_dnode(txh);
 766  767  }
 767  768  
 768  769  void
 769  770  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 770  771  {
 771  772          dmu_tx_hold_t *txh;
 772  773          ASSERT(tx->tx_txg == 0);
 773  774  
 774  775          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 775  776              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 776  777  
 777  778          txh->txh_space_towrite += space;
 778  779  }
 779  780  
 780  781  int
 781  782  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 782  783  {
 783  784          dmu_tx_hold_t *txh;
 784  785          int holds = 0;
 785  786  
 786  787          /*
 787  788           * By asserting that the tx is assigned, we're counting the
 788  789           * number of dn_tx_holds, which is the same as the number of
 789  790           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 790  791           * dn_tx_holds could be 0.
 791  792           */
 792  793          ASSERT(tx->tx_txg != 0);
 793  794  
 794  795          /* if (tx->tx_anyobj == TRUE) */
 795  796                  /* return (0); */
 796  797  
 797  798          for (txh = list_head(&tx->tx_holds); txh;
 798  799              txh = list_next(&tx->tx_holds, txh)) {
 799  800                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 800  801                          holds++;
 801  802          }
 802  803  
 803  804          return (holds);
 804  805  }
 805  806  
 806  807  #ifdef ZFS_DEBUG
 807  808  void
 808  809  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 809  810  {
 810  811          dmu_tx_hold_t *txh;
 811  812          int match_object = FALSE, match_offset = FALSE;
 812  813          dnode_t *dn;
 813  814  
 814  815          DB_DNODE_ENTER(db);
 815  816          dn = DB_DNODE(db);
 816  817          ASSERT(tx->tx_txg != 0);
 817  818          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 818  819          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 819  820  
 820  821          if (tx->tx_anyobj) {
 821  822                  DB_DNODE_EXIT(db);
 822  823                  return;
 823  824          }
 824  825  
 825  826          /* XXX No checking on the meta dnode for now */
 826  827          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 827  828                  DB_DNODE_EXIT(db);
 828  829                  return;
 829  830          }
 830  831  
 831  832          for (txh = list_head(&tx->tx_holds); txh;
 832  833              txh = list_next(&tx->tx_holds, txh)) {
 833  834                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 834  835                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 835  836                          match_object = TRUE;
 836  837                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 837  838                          int datablkshift = dn->dn_datablkshift ?
 838  839                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 839  840                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 840  841                          int shift = datablkshift + epbs * db->db_level;
 841  842                          uint64_t beginblk = shift >= 64 ? 0 :
 842  843                              (txh->txh_arg1 >> shift);
 843  844                          uint64_t endblk = shift >= 64 ? 0 :
 844  845                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 845  846                          uint64_t blkid = db->db_blkid;
 846  847  
 847  848                          /* XXX txh_arg2 better not be zero... */
 848  849  
 849  850                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 850  851                              txh->txh_type, beginblk, endblk);
 851  852  
 852  853                          switch (txh->txh_type) {
 853  854                          case THT_WRITE:
 854  855                                  if (blkid >= beginblk && blkid <= endblk)
 855  856                                          match_offset = TRUE;
 856  857                                  /*
 857  858                                   * We will let this hold work for the bonus
 858  859                                   * or spill buffer so that we don't need to
 859  860                                   * hold it when creating a new object.
 860  861                                   */
 861  862                                  if (blkid == DMU_BONUS_BLKID ||
 862  863                                      blkid == DMU_SPILL_BLKID)
 863  864                                          match_offset = TRUE;
 864  865                                  /*
 865  866                                   * They might have to increase nlevels,
 866  867                                   * thus dirtying the new TLIBs.  Or the
 867  868                                   * might have to change the block size,
 868  869                                   * thus dirying the new lvl=0 blk=0.
 869  870                                   */
 870  871                                  if (blkid == 0)
 871  872                                          match_offset = TRUE;
 872  873                                  break;
 873  874                          case THT_FREE:
 874  875                                  /*
 875  876                                   * We will dirty all the level 1 blocks in
 876  877                                   * the free range and perhaps the first and
 877  878                                   * last level 0 block.
 878  879                                   */
 879  880                                  if (blkid >= beginblk && (blkid <= endblk ||
 880  881                                      txh->txh_arg2 == DMU_OBJECT_END))
 881  882                                          match_offset = TRUE;
 882  883                                  break;
 883  884                          case THT_SPILL:
 884  885                                  if (blkid == DMU_SPILL_BLKID)
 885  886                                          match_offset = TRUE;
 886  887                                  break;
 887  888                          case THT_BONUS:
 888  889                                  if (blkid == DMU_BONUS_BLKID)
 889  890                                          match_offset = TRUE;
 890  891                                  break;
 891  892                          case THT_ZAP:
 892  893                                  match_offset = TRUE;
 893  894                                  break;
 894  895                          case THT_NEWOBJECT:
 895  896                                  match_object = TRUE;
 896  897                                  break;
 897  898                          default:
 898  899                                  ASSERT(!"bad txh_type");
 899  900                          }
 900  901                  }
 901  902                  if (match_object && match_offset) {
 902  903                          DB_DNODE_EXIT(db);
 903  904                          return;
 904  905                  }
 905  906          }
 906  907          DB_DNODE_EXIT(db);
 907  908          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 908  909              (u_longlong_t)db->db.db_object, db->db_level,
 909  910              (u_longlong_t)db->db_blkid);
 910  911  }
 911  912  #endif
 912  913  
 913  914  static int
 914  915  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 915  916  {
 916  917          dmu_tx_hold_t *txh;
 917  918          spa_t *spa = tx->tx_pool->dp_spa;
 918  919          uint64_t memory, asize, fsize, usize;
 919  920          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 920  921  
 921  922          ASSERT0(tx->tx_txg);
 922  923  
 923  924          if (tx->tx_err)
 924  925                  return (tx->tx_err);
 925  926  
 926  927          if (spa_suspended(spa)) {
 927  928                  /*
 928  929                   * If the user has indicated a blocking failure mode
 929  930                   * then return ERESTART which will block in dmu_tx_wait().
 930  931                   * Otherwise, return EIO so that an error can get
 931  932                   * propagated back to the VOP calls.
 932  933                   *
 933  934                   * Note that we always honor the txg_how flag regardless
 934  935                   * of the failuremode setting.
 935  936                   */
 936  937                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 937  938                      txg_how != TXG_WAIT)
 938  939                          return (SET_ERROR(EIO));
 939  940  
 940  941                  return (SET_ERROR(ERESTART));
 941  942          }
 942  943  
 943  944          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 944  945          tx->tx_needassign_txh = NULL;
 945  946  
 946  947          /*
 947  948           * NB: No error returns are allowed after txg_hold_open, but
 948  949           * before processing the dnode holds, due to the
 949  950           * dmu_tx_unassign() logic.
 950  951           */
 951  952  
 952  953          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 953  954          for (txh = list_head(&tx->tx_holds); txh;
 954  955              txh = list_next(&tx->tx_holds, txh)) {
 955  956                  dnode_t *dn = txh->txh_dnode;
 956  957                  if (dn != NULL) {
 957  958                          mutex_enter(&dn->dn_mtx);
 958  959                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 959  960                                  mutex_exit(&dn->dn_mtx);
 960  961                                  tx->tx_needassign_txh = txh;
 961  962                                  return (SET_ERROR(ERESTART));
 962  963                          }
 963  964                          if (dn->dn_assigned_txg == 0)
 964  965                                  dn->dn_assigned_txg = tx->tx_txg;
 965  966                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 966  967                          (void) refcount_add(&dn->dn_tx_holds, tx);
 967  968                          mutex_exit(&dn->dn_mtx);
 968  969                  }
 969  970                  towrite += txh->txh_space_towrite;
 970  971                  tofree += txh->txh_space_tofree;
 971  972                  tooverwrite += txh->txh_space_tooverwrite;
 972  973                  tounref += txh->txh_space_tounref;
 973  974                  tohold += txh->txh_memory_tohold;
 974  975                  fudge += txh->txh_fudge;
 975  976          }
 976  977  
 977  978          /*
 978  979           * If a snapshot has been taken since we made our estimates,
 979  980           * assume that we won't be able to free or overwrite anything.
 980  981           */
 981  982          if (tx->tx_objset &&
 982  983              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 983  984              tx->tx_lastsnap_txg) {
 984  985                  towrite += tooverwrite;
 985  986                  tooverwrite = tofree = 0;
 986  987          }
 987  988  
 988  989          /* needed allocation: worst-case estimate of write space */
 989  990          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 990  991          /* freed space estimate: worst-case overwrite + free estimate */
 991  992          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 992  993          /* convert unrefd space to worst-case estimate */
 993  994          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 994  995          /* calculate memory footprint estimate */
 995  996          memory = towrite + tooverwrite + tohold;
 996  997  
 997  998  #ifdef ZFS_DEBUG
 998  999          /*
 999 1000           * Add in 'tohold' to account for our dirty holds on this memory
1000 1001           * XXX - the "fudge" factor is to account for skipped blocks that
1001 1002           * we missed because dnode_next_offset() misses in-core-only blocks.
1002 1003           */
1003 1004          tx->tx_space_towrite = asize +
1004 1005              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1005 1006          tx->tx_space_tofree = tofree;
1006 1007          tx->tx_space_tooverwrite = tooverwrite;
1007 1008          tx->tx_space_tounref = tounref;
1008 1009  #endif
1009 1010  
1010 1011          if (tx->tx_dir && asize != 0) {
1011 1012                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1012 1013                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1013 1014                  if (err)
1014 1015                          return (err);
1015 1016          }
1016 1017  
1017 1018          return (0);
1018 1019  }
1019 1020  
1020 1021  static void
1021 1022  dmu_tx_unassign(dmu_tx_t *tx)
1022 1023  {
1023 1024          dmu_tx_hold_t *txh;
1024 1025  
1025 1026          if (tx->tx_txg == 0)
1026 1027                  return;
1027 1028  
1028 1029          txg_rele_to_quiesce(&tx->tx_txgh);
1029 1030  
1030 1031          /*
1031 1032           * Walk the transaction's hold list, removing the hold on the
1032 1033           * associated dnode, and notifying waiters if the refcount drops to 0.
1033 1034           */
1034 1035          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1035 1036              txh = list_next(&tx->tx_holds, txh)) {
1036 1037                  dnode_t *dn = txh->txh_dnode;
1037 1038  
1038 1039                  if (dn == NULL)
1039 1040                          continue;
1040 1041                  mutex_enter(&dn->dn_mtx);
1041 1042                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1042 1043  
1043 1044                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1044 1045                          dn->dn_assigned_txg = 0;
1045 1046                          cv_broadcast(&dn->dn_notxholds);
1046 1047                  }
1047 1048                  mutex_exit(&dn->dn_mtx);
1048 1049          }
1049 1050  
1050 1051          txg_rele_to_sync(&tx->tx_txgh);
1051 1052  
1052 1053          tx->tx_lasttried_txg = tx->tx_txg;
1053 1054          tx->tx_txg = 0;
1054 1055  }
1055 1056  
1056 1057  /*
1057 1058   * Assign tx to a transaction group.  txg_how can be one of:
1058 1059   *
1059 1060   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1060 1061   *      a new one.  This should be used when you're not holding locks.
1061 1062   *      It will only fail if we're truly out of space (or over quota).
1062 1063   *
1063 1064   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1064 1065   *      blocking, returns immediately with ERESTART.  This should be used
1065 1066   *      whenever you're holding locks.  On an ERESTART error, the caller
1066 1067   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1067 1068   */
1068 1069  int
1069 1070  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1070 1071  {
1071 1072          int err;
1072 1073  
1073 1074          ASSERT(tx->tx_txg == 0);
1074 1075          ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
1075 1076          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1076 1077  
1077 1078          /* If we might wait, we must not hold the config lock. */
1078 1079          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1079 1080  
1080 1081          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1081 1082                  dmu_tx_unassign(tx);
1082 1083  
1083 1084                  if (err != ERESTART || txg_how != TXG_WAIT)
1084 1085                          return (err);
1085 1086  
1086 1087                  dmu_tx_wait(tx);
1087 1088          }
1088 1089  
1089 1090          txg_rele_to_quiesce(&tx->tx_txgh);
1090 1091  
1091 1092          return (0);
1092 1093  }
1093 1094  
1094 1095  void
1095 1096  dmu_tx_wait(dmu_tx_t *tx)
1096 1097  {
1097 1098          spa_t *spa = tx->tx_pool->dp_spa;
1098 1099  
1099 1100          ASSERT(tx->tx_txg == 0);
1100 1101          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1101 1102  
1102 1103          /*
1103 1104           * It's possible that the pool has become active after this thread
1104 1105           * has tried to obtain a tx. If that's the case then his
1105 1106           * tx_lasttried_txg would not have been assigned.
1106 1107           */
1107 1108          if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1108 1109                  txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
1109 1110          } else if (tx->tx_needassign_txh) {
1110 1111                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1111 1112  
1112 1113                  mutex_enter(&dn->dn_mtx);
1113 1114                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1114 1115                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1115 1116                  mutex_exit(&dn->dn_mtx);
1116 1117                  tx->tx_needassign_txh = NULL;
1117 1118          } else {
1118 1119                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1119 1120          }
1120 1121  }
1121 1122  
1122 1123  void
1123 1124  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1124 1125  {
1125 1126  #ifdef ZFS_DEBUG
1126 1127          if (tx->tx_dir == NULL || delta == 0)
1127 1128                  return;
1128 1129  
1129 1130          if (delta > 0) {
1130 1131                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1131 1132                      tx->tx_space_towrite);
1132 1133                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1133 1134          } else {
1134 1135                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1135 1136          }
1136 1137  #endif
1137 1138  }
1138 1139  
1139 1140  void
1140 1141  dmu_tx_commit(dmu_tx_t *tx)
1141 1142  {
1142 1143          dmu_tx_hold_t *txh;
1143 1144  
1144 1145          ASSERT(tx->tx_txg != 0);
1145 1146  
1146 1147          /*
1147 1148           * Go through the transaction's hold list and remove holds on
1148 1149           * associated dnodes, notifying waiters if no holds remain.
1149 1150           */
1150 1151          while (txh = list_head(&tx->tx_holds)) {
1151 1152                  dnode_t *dn = txh->txh_dnode;
1152 1153  
1153 1154                  list_remove(&tx->tx_holds, txh);
1154 1155                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1155 1156                  if (dn == NULL)
1156 1157                          continue;
1157 1158                  mutex_enter(&dn->dn_mtx);
1158 1159                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1159 1160  
1160 1161                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1161 1162                          dn->dn_assigned_txg = 0;
1162 1163                          cv_broadcast(&dn->dn_notxholds);
1163 1164                  }
1164 1165                  mutex_exit(&dn->dn_mtx);
1165 1166                  dnode_rele(dn, tx);
1166 1167          }
1167 1168  
1168 1169          if (tx->tx_tempreserve_cookie)
1169 1170                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1170 1171  
1171 1172          if (!list_is_empty(&tx->tx_callbacks))
1172 1173                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1173 1174  
1174 1175          if (tx->tx_anyobj == FALSE)
1175 1176                  txg_rele_to_sync(&tx->tx_txgh);
1176 1177  
1177 1178          list_destroy(&tx->tx_callbacks);
1178 1179          list_destroy(&tx->tx_holds);
1179 1180  #ifdef ZFS_DEBUG
1180 1181          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1181 1182              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1182 1183              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1183 1184          refcount_destroy_many(&tx->tx_space_written,
1184 1185              refcount_count(&tx->tx_space_written));
1185 1186          refcount_destroy_many(&tx->tx_space_freed,
1186 1187              refcount_count(&tx->tx_space_freed));
1187 1188  #endif
1188 1189          kmem_free(tx, sizeof (dmu_tx_t));
1189 1190  }
1190 1191  
1191 1192  void
1192 1193  dmu_tx_abort(dmu_tx_t *tx)
1193 1194  {
1194 1195          dmu_tx_hold_t *txh;
1195 1196  
1196 1197          ASSERT(tx->tx_txg == 0);
1197 1198  
1198 1199          while (txh = list_head(&tx->tx_holds)) {
1199 1200                  dnode_t *dn = txh->txh_dnode;
1200 1201  
1201 1202                  list_remove(&tx->tx_holds, txh);
1202 1203                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1203 1204                  if (dn != NULL)
1204 1205                          dnode_rele(dn, tx);
1205 1206          }
1206 1207  
1207 1208          /*
1208 1209           * Call any registered callbacks with an error code.
1209 1210           */
1210 1211          if (!list_is_empty(&tx->tx_callbacks))
1211 1212                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1212 1213  
1213 1214          list_destroy(&tx->tx_callbacks);
1214 1215          list_destroy(&tx->tx_holds);
1215 1216  #ifdef ZFS_DEBUG
1216 1217          refcount_destroy_many(&tx->tx_space_written,
1217 1218              refcount_count(&tx->tx_space_written));
1218 1219          refcount_destroy_many(&tx->tx_space_freed,
1219 1220              refcount_count(&tx->tx_space_freed));
1220 1221  #endif
1221 1222          kmem_free(tx, sizeof (dmu_tx_t));
1222 1223  }
1223 1224  
1224 1225  uint64_t
1225 1226  dmu_tx_get_txg(dmu_tx_t *tx)
1226 1227  {
1227 1228          ASSERT(tx->tx_txg != 0);
1228 1229          return (tx->tx_txg);
1229 1230  }
1230 1231  
1231 1232  dsl_pool_t *
1232 1233  dmu_tx_pool(dmu_tx_t *tx)
1233 1234  {
1234 1235          ASSERT(tx->tx_pool != NULL);
1235 1236          return (tx->tx_pool);
1236 1237  }
1237 1238  
1238 1239  
1239 1240  void
1240 1241  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1241 1242  {
1242 1243          dmu_tx_callback_t *dcb;
1243 1244  
1244 1245          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1245 1246  
1246 1247          dcb->dcb_func = func;
1247 1248          dcb->dcb_data = data;
1248 1249  
1249 1250          list_insert_tail(&tx->tx_callbacks, dcb);
1250 1251  }
1251 1252  
1252 1253  /*
1253 1254   * Call all the commit callbacks on a list, with a given error code.
1254 1255   */
1255 1256  void
1256 1257  dmu_tx_do_callbacks(list_t *cb_list, int error)
1257 1258  {
1258 1259          dmu_tx_callback_t *dcb;
1259 1260  
1260 1261          while (dcb = list_head(cb_list)) {
1261 1262                  list_remove(cb_list, dcb);
1262 1263                  dcb->dcb_func(dcb->dcb_data, error);
1263 1264                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1264 1265          }
1265 1266  }
1266 1267  
/*
 * Interface to hold a bunch of attributes; used when creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
 */
1275 1276  
1276 1277  /*
1277 1278   * hold necessary attribute name for attribute registration.
1278 1279   * should be a very rare case where this is needed.  If it does
1279 1280   * happen it would only happen on the first write to the file system.
1280 1281   */
1281 1282  static void
1282 1283  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1283 1284  {
1284 1285          int i;
1285 1286  
1286 1287          if (!sa->sa_need_attr_registration)
1287 1288                  return;
1288 1289  
1289 1290          for (i = 0; i != sa->sa_num_attrs; i++) {
1290 1291                  if (!sa->sa_attr_table[i].sa_registered) {
1291 1292                          if (sa->sa_reg_attr_obj)
1292 1293                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1293 1294                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1294 1295                          else
1295 1296                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1296 1297                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1297 1298                  }
1298 1299          }
1299 1300  }
1300 1301  
1301 1302  
1302 1303  void
1303 1304  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1304 1305  {
1305 1306          dnode_t *dn;
1306 1307          dmu_tx_hold_t *txh;
1307 1308  
1308 1309          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1309 1310              THT_SPILL, 0, 0);
1310 1311  
1311 1312          dn = txh->txh_dnode;
1312 1313  
1313 1314          if (dn == NULL)
1314 1315                  return;
1315 1316  
1316 1317          /* If blkptr doesn't exist then add space to towrite */
1317 1318          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1318 1319                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1319 1320          } else {
1320 1321                  blkptr_t *bp;
1321 1322  
1322 1323                  bp = &dn->dn_phys->dn_spill;
1323 1324                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1324 1325                      bp, bp->blk_birth))
1325 1326                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1326 1327                  else
1327 1328                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1328 1329                  if (!BP_IS_HOLE(bp))
1329 1330                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1330 1331          }
1331 1332  }
1332 1333  
1333 1334  void
1334 1335  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1335 1336  {
1336 1337          sa_os_t *sa = tx->tx_objset->os_sa;
1337 1338  
1338 1339          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1339 1340  
1340 1341          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1341 1342                  return;
1342 1343  
1343 1344          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1344 1345                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1345 1346          else {
1346 1347                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1347 1348                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1348 1349                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1349 1350                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1350 1351          }
1351 1352  
1352 1353          dmu_tx_sa_registration_hold(sa, tx);
1353 1354  
1354 1355          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1355 1356                  return;
1356 1357  
1357 1358          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1358 1359              THT_SPILL, 0, 0);
1359 1360  }
1360 1361  
1361 1362  /*
1362 1363   * Hold SA attribute
1363 1364   *
1364 1365   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1365 1366   *
1366 1367   * variable_size is the total size of all variable sized attributes
1367 1368   * passed to this function.  It is not the total size of all
1368 1369   * variable size attributes that *may* exist on this object.
1369 1370   */
1370 1371  void
1371 1372  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1372 1373  {
1373 1374          uint64_t object;
1374 1375          sa_os_t *sa = tx->tx_objset->os_sa;
1375 1376  
1376 1377          ASSERT(hdl != NULL);
1377 1378  
1378 1379          object = sa_handle_object(hdl);
1379 1380  
1380 1381          dmu_tx_hold_bonus(tx, object);
1381 1382  
1382 1383          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1383 1384                  return;
1384 1385  
1385 1386          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1386 1387              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1387 1388                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1388 1389                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1389 1390                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1390 1391                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1391 1392          }
1392 1393  
1393 1394          dmu_tx_sa_registration_hold(sa, tx);
1394 1395  
1395 1396          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1396 1397                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1397 1398  
1398 1399          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1399 1400                  ASSERT(tx->tx_txg == 0);
1400 1401                  dmu_tx_hold_spill(tx, object);
1401 1402          } else {
1402 1403                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1403 1404                  dnode_t *dn;
1404 1405  
1405 1406                  DB_DNODE_ENTER(db);
1406 1407                  dn = DB_DNODE(db);
1407 1408                  if (dn->dn_have_spill) {
1408 1409                          ASSERT(tx->tx_txg == 0);
1409 1410                          dmu_tx_hold_spill(tx, object);
1410 1411                  }
1411 1412                  DB_DNODE_EXIT(db);
1412 1413          }
1413 1414  }

↓ open down ↓

785 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX