3752 want more verifiable dbuf user eviction
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Will Andrews <willa@spectralogic.com>
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/dnode_sync.c
          +++ new/usr/src/uts/common/fs/zfs/dnode_sync.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2012 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/dbuf.h>
  29   29  #include <sys/dnode.h>
  30   30  #include <sys/dmu.h>
  31   31  #include <sys/dmu_tx.h>
  32   32  #include <sys/dmu_objset.h>
  33   33  #include <sys/dsl_dataset.h>
  34   34  #include <sys/spa.h>
  35   35  
  36   36  static void
  37   37  dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
  38   38  {
  39   39          dmu_buf_impl_t *db;
  40   40          int txgoff = tx->tx_txg & TXG_MASK;
  41   41          int nblkptr = dn->dn_phys->dn_nblkptr;
  42   42          int old_toplvl = dn->dn_phys->dn_nlevels - 1;
  43   43          int new_level = dn->dn_next_nlevels[txgoff];
  44   44          int i;
  45   45  
  46   46          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
  47   47  
  48   48          /* this dnode can't be paged out because it's dirty */
  49   49          ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
  50   50          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
  51   51          ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
  52   52  
  53   53          db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
  54   54          ASSERT(db != NULL);
  55   55  
  56   56          dn->dn_phys->dn_nlevels = new_level;
  57   57          dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
  58   58              dn->dn_object, dn->dn_phys->dn_nlevels);
  59   59  
  60   60          /* check for existing blkptrs in the dnode */
  61   61          for (i = 0; i < nblkptr; i++)
  62   62                  if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
  63   63                          break;
  64   64          if (i != nblkptr) {
  65   65                  /* transfer dnode's block pointers to new indirect block */
  66   66                  (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
  67   67                  ASSERT(db->db.db_data);
  68   68                  ASSERT(arc_released(db->db_buf));
  69   69                  ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
  70   70                  bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
  71   71                      sizeof (blkptr_t) * nblkptr);
  72   72                  arc_buf_freeze(db->db_buf);
  73   73          }
  74   74  
  75   75          /* set dbuf's parent pointers to new indirect buf */
  76   76          for (i = 0; i < nblkptr; i++) {
  77   77                  dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
  78   78  
  79   79                  if (child == NULL)
  80   80                          continue;
  81   81  #ifdef  DEBUG
  82   82                  DB_DNODE_ENTER(child);
  83   83                  ASSERT3P(DB_DNODE(child), ==, dn);
  84   84                  DB_DNODE_EXIT(child);
  85   85  #endif  /* DEBUG */
  86   86                  if (child->db_parent && child->db_parent != dn->dn_dbuf) {
  87   87                          ASSERT(child->db_parent->db_level == db->db_level);
  88   88                          ASSERT(child->db_blkptr !=
  89   89                              &dn->dn_phys->dn_blkptr[child->db_blkid]);
  90   90                          mutex_exit(&child->db_mtx);
  91   91                          continue;
  92   92                  }
  93   93                  ASSERT(child->db_parent == NULL ||
  94   94                      child->db_parent == dn->dn_dbuf);
  95   95  
  96   96                  child->db_parent = db;
  97   97                  dbuf_add_ref(db, child);
  98   98                  if (db->db.db_data)
  99   99                          child->db_blkptr = (blkptr_t *)db->db.db_data + i;
 100  100                  else
 101  101                          child->db_blkptr = NULL;
 102  102                  dprintf_dbuf_bp(child, child->db_blkptr,
 103  103                      "changed db_blkptr to new indirect %s", "");
 104  104  
 105  105                  mutex_exit(&child->db_mtx);
 106  106          }
 107  107  
 108  108          bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
 109  109  
 110  110          dbuf_rele(db, FTAG);
 111  111  
 112  112          rw_exit(&dn->dn_struct_rwlock);
 113  113  }
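An editor's aside on the capacity math behind dnode_increase_indirection(): epbs is the number of block pointers per indirect block expressed as a shift, so each added top level multiplies the addressable data-block range by 1 << epbs. The standalone sketch below uses assumed typical values, not values from this diff: a 16K indirect block (dn_indblkshift == 14, giving epbs == 7 since a blkptr_t is 128 bytes) and the common 3 block pointers embedded in the dnode.

/*
 * Illustrative sketch only: capacity growth per indirection level.
 * Assumes dn_indblkshift == 14 and dn_nblkptr == 3 (typical defaults).
 */
#include <stdio.h>

int
main(void)
{
	int indblkshift = 14;			/* 16K indirect blocks */
	int epbs = indblkshift - 7;		/* blkptrs per block, as a shift */
	int nblkptr = 3;			/* blkptrs embedded in the dnode */
	unsigned long long nblks = nblkptr;	/* capacity with nlevels == 1 */
	int level;

	/* Each extra level multiplies capacity by 1 << epbs (here, 128). */
	for (level = 1; level <= 6; level++) {
		printf("nlevels=%d  max data blocks=%llu\n", level, nblks);
		nblks <<= epbs;
	}
	return (0);
}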
 114  114  
 115  115  static int
 116  116  free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
 117  117  {
 118  118          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 119  119          uint64_t bytesfreed = 0;
 120  120          int i, blocks_freed = 0;
 121  121  
 122  122          dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
 123  123  
 124  124          for (i = 0; i < num; i++, bp++) {
 125  125                  if (BP_IS_HOLE(bp))
 126  126                          continue;
 127  127  
 128  128                  bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
 129  129                  ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
 130  130                  bzero(bp, sizeof (blkptr_t));
 131  131                  blocks_freed += 1;
 132  132          }
 133  133          dnode_diduse_space(dn, -bytesfreed);
 134  134          return (blocks_freed);
 135  135  }
 136  136  
 137  137  #ifdef ZFS_DEBUG
 138  138  static void
 139  139  free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 140  140  {
 141  141          int off, num;
 142  142          int i, err, epbs;
 143  143          uint64_t txg = tx->tx_txg;
 144  144          dnode_t *dn;
 145  145  
 146  146          DB_DNODE_ENTER(db);
 147  147          dn = DB_DNODE(db);
 148  148          epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 149  149          off = start - (db->db_blkid * 1<<epbs);
 150  150          num = end - start + 1;
 151  151  
 152  152          ASSERT3U(off, >=, 0);
 153  153          ASSERT3U(num, >=, 0);
 154  154          ASSERT3U(db->db_level, >, 0);
 155  155          ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 156  156          ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
 157  157          ASSERT(db->db_blkptr != NULL);
 158  158  
 159  159          for (i = off; i < off+num; i++) {
 160  160                  uint64_t *buf;
 161  161                  dmu_buf_impl_t *child;
 162  162                  dbuf_dirty_record_t *dr;
 163  163                  int j;
 164  164  
 165  165                  ASSERT(db->db_level == 1);
 166  166  
 167  167                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 168  168                  err = dbuf_hold_impl(dn, db->db_level-1,
 169  169                      (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
 170  170                  rw_exit(&dn->dn_struct_rwlock);
 171  171                  if (err == ENOENT)
 172  172                          continue;
 173  173                  ASSERT(err == 0);
 174  174                  ASSERT(child->db_level == 0);
 175  175                  dr = child->db_last_dirty;
 176  176                  while (dr && dr->dr_txg > txg)
 177  177                          dr = dr->dr_next;
 178  178                  ASSERT(dr == NULL || dr->dr_txg == txg);
 179  179  
 180  180                  /* data_old better be zeroed */
 181  181                  if (dr) {
 182  182                          buf = dr->dt.dl.dr_data->b_data;
 183  183                          for (j = 0; j < child->db.db_size >> 3; j++) {
 184  184                                  if (buf[j] != 0) {
 185  185                                          panic("freed data not zero: "
 186  186                                              "child=%p i=%d off=%d num=%d\n",
 187  187                                              (void *)child, i, off, num);
 188  188                                  }
 189  189                          }
 190  190                  }
 191  191  
 192  192                  /*
 193  193                   * db_data better be zeroed unless it's dirty in a
 194  194                   * future txg.
 195  195                   */
 196  196                  mutex_enter(&child->db_mtx);
 197  197                  buf = child->db.db_data;
 198  198                  if (buf != NULL && child->db_state != DB_FILL &&
 199  199                      child->db_last_dirty == NULL) {
 200  200                          for (j = 0; j < child->db.db_size >> 3; j++) {
 201  201                                  if (buf[j] != 0) {
 202  202                                          panic("freed data not zero: "
 203  203                                              "child=%p i=%d off=%d num=%d\n",
 204  204                                              (void *)child, i, off, num);
 205  205                                  }
 206  206                          }
 207  207                  }
 208  208                  mutex_exit(&child->db_mtx);
 209  209  
 210  210                  dbuf_rele(child, FTAG);
 211  211          }
 212  212          DB_DNODE_EXIT(db);
 213  213  }
 214  214  #endif
 215  215  
 216  216  #define ALL -1
 217  217  
 218  218  static int
 219  219  free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
 220  220      dmu_tx_t *tx)
 221  221  {
 222  222          dnode_t *dn;
 223  223          blkptr_t *bp;
 224  224          dmu_buf_impl_t *subdb;
 225  225          uint64_t start, end, dbstart, dbend, i;
 226  226          int epbs, shift, err;
 227  227          int all = TRUE;
 228  228          int blocks_freed = 0;
 229  229  
 230  230          /*
 231  231           * There is a small possibility that this block will not be cached:
 232  232           *   1 - if level > 1 and there are no children with level <= 1
 233  233           *   2 - if we didn't get a dirty hold (because this block had just
 234  234           *       finished being written -- and so had no holds), and then this
 235  235           *       block got evicted before we got here.
 236  236           */
 237  237          if (db->db_state != DB_CACHED)
 238  238                  (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 239  239  
 240  240          dbuf_release_bp(db);
 241  241          bp = (blkptr_t *)db->db.db_data;
 242  242  
 243  243          DB_DNODE_ENTER(db);
 244  244          dn = DB_DNODE(db);
 245  245          epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 246  246          shift = (db->db_level - 1) * epbs;
 247  247          dbstart = db->db_blkid << epbs;
 248  248          start = blkid >> shift;
 249  249          if (dbstart < start) {
 250  250                  bp += start - dbstart;
 251  251                  all = FALSE;
 252  252          } else {
 253  253                  start = dbstart;
 254  254          }
 255  255          dbend = ((db->db_blkid + 1) << epbs) - 1;
 256  256          end = (blkid + nblks - 1) >> shift;
 257  257          if (dbend <= end)
 258  258                  end = dbend;
 259  259          else if (all)
 260  260                  all = trunc;
 261  261          ASSERT3U(start, <=, end);
 262  262  
 263  263          if (db->db_level == 1) {
 264  264                  FREE_VERIFY(db, start, end, tx);
 265  265                  blocks_freed = free_blocks(dn, bp, end-start+1, tx);
 266  266                  arc_buf_freeze(db->db_buf);
 267  267                  ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
 268  268                  DB_DNODE_EXIT(db);
 269  269                  return (all ? ALL : blocks_freed);
 270  270          }
 271  271  
 272  272          for (i = start; i <= end; i++, bp++) {
 273  273                  if (BP_IS_HOLE(bp))
 274  274                          continue;
 275  275                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 276  276                  err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
 277  277                  ASSERT0(err);
 278  278                  rw_exit(&dn->dn_struct_rwlock);
 279  279  
 280  280                  if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
 281  281                          ASSERT3P(subdb->db_blkptr, ==, bp);
 282  282                          blocks_freed += free_blocks(dn, bp, 1, tx);
 283  283                  } else {
 284  284                          all = FALSE;
 285  285                  }
 286  286                  dbuf_rele(subdb, FTAG);
 287  287          }
 288  288          DB_DNODE_EXIT(db);
 289  289          arc_buf_freeze(db->db_buf);
 290  290  #ifdef ZFS_DEBUG
 291  291          bp -= (end-start)+1;
 292  292          for (i = start; i <= end; i++, bp++) {
 293  293                  if (i == start && blkid != 0)
 294  294                          continue;
 295  295                  else if (i == end && !trunc)
 296  296                          continue;
 297  297                  ASSERT0(bp->blk_birth);
 298  298          }
 299  299  #endif
 300  300          ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
 301  301          return (all ? ALL : blocks_freed);
 302  302  }
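The start/end clamping above is the heart of the recursion: at level db_level, each slot of an indirect block covers 1 << ((db_level - 1) * epbs) level-0 blocks, so the requested [blkid, blkid + nblks) range is shifted down to slot numbers and clamped to the slots this particular block owns. A small standalone sketch of that mapping follows; epbs == 7 and the block ids are made-up sample values, not taken from this diff.

/*
 * Illustrative sketch only: the slot math used by free_children().
 * Assumes epbs == 7 (128 block pointers per 16K indirect block).
 */
#include <stdio.h>

int
main(void)
{
	int epbs = 7;
	int db_level = 2;			/* a level-2 indirect block */
	int shift;
	unsigned long long db_blkid = 0;	/* its block id at that level */
	unsigned long long blkid = 1000;	/* first level-0 block to free */
	unsigned long long nblks = 50000;	/* number of level-0 blocks */
	unsigned long long start, end, dbstart, dbend;

	/* Each slot at this level covers 1 << shift level-0 blocks. */
	shift = (db_level - 1) * epbs;
	start = blkid >> shift;
	end = (blkid + nblks - 1) >> shift;

	/* Clamp to the slots this indirect block actually owns. */
	dbstart = db_blkid << epbs;
	dbend = ((db_blkid + 1) << epbs) - 1;
	if (start < dbstart)
		start = dbstart;
	if (end > dbend)
		end = dbend;

	printf("visit slots %llu..%llu of this indirect block\n", start, end);
	return (0);
}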
 303  303  
 304  304  /*
 305  305   * free_range: Traverse the indicated range of the provided file
 306  306   * and "free" all the blocks contained there.
 307  307   */
 308  308  static void
 309  309  dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 310  310  {
 311  311          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 312  312          dmu_buf_impl_t *db;
 313  313          int trunc, start, end, shift, i, err;
 314  314          int dnlevel = dn->dn_phys->dn_nlevels;
 315  315  
 316  316          if (blkid > dn->dn_phys->dn_maxblkid)
 317  317                  return;
 318  318  
 319  319          ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
 320  320          trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
 321  321          if (trunc)
 322  322                  nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
 323  323  
 324  324          /* There are no indirect blocks in the object */
 325  325          if (dnlevel == 1) {
 326  326                  if (blkid >= dn->dn_phys->dn_nblkptr) {
 327  327                          /* this range was never made persistent */
 328  328                          return;
 329  329                  }
 330  330                  ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
 331  331                  (void) free_blocks(dn, bp + blkid, nblks, tx);
 332  332                  if (trunc) {
 333  333                          uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
 334  334                              (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 335  335                          dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
 336  336                          ASSERT(off < dn->dn_phys->dn_maxblkid ||
 337  337                              dn->dn_phys->dn_maxblkid == 0 ||
 338  338                              dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
 339  339                  }
 340  340                  return;
 341  341          }
 342  342  
 343  343          shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
 344  344          start = blkid >> shift;
 345  345          ASSERT(start < dn->dn_phys->dn_nblkptr);
 346  346          end = (blkid + nblks - 1) >> shift;
 347  347          bp += start;
 348  348          for (i = start; i <= end; i++, bp++) {
 349  349                  if (BP_IS_HOLE(bp))
 350  350                          continue;
 351  351                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 352  352                  err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
 353  353                  ASSERT0(err);
 354  354                  rw_exit(&dn->dn_struct_rwlock);
 355  355  
 356  356                  if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
 357  357                          ASSERT3P(db->db_blkptr, ==, bp);
 358  358                          (void) free_blocks(dn, bp, 1, tx);
 359  359                  }
 360  360                  dbuf_rele(db, FTAG);
 361  361          }
 362  362          if (trunc) {
 363  363                  uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
 364  364                      (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 365  365                  dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
 366  366                  ASSERT(off < dn->dn_phys->dn_maxblkid ||
 367  367                      dn->dn_phys->dn_maxblkid == 0 ||
 368  368                      dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
 369  369          }
  
 370  370  }
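A note on the truncation math used in both branches above: the byte offset of the old end of file is derived from dn_datablkszsec, which counts 512-byte sectors (SPA_MINBLOCKSHIFT == 9), and the new dn_maxblkid becomes blkid - 1 (or 0). A tiny standalone sketch with illustrative sample values:

/*
 * Illustrative sketch only: the truncation math in
 * dnode_sync_free_range().  dn_datablkszsec counts 512-byte sectors.
 */
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9

int
main(void)
{
	unsigned long long maxblkid = 1023;	/* sample old last block */
	int datablkszsec = 256;			/* 256 sectors = 128K blocks */
	unsigned long long blkid = 512;		/* first freed block */
	unsigned long long off;

	/* Byte offset of the first block past the old end of file. */
	off = (maxblkid + 1) *
	    (unsigned long long)(datablkszsec << SPA_MINBLOCKSHIFT);

	/* After truncation, the last surviving block is blkid - 1. */
	maxblkid = (blkid ? blkid - 1 : 0);

	printf("old EOF at byte %llu, new maxblkid %llu\n", off, maxblkid);
	return (0);
}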
 371  371  
 372  372  /*
  373  373   * Try to kick all the dnode's dbufs out of the cache...
 374  374   */
 375  375  void
 376  376  dnode_evict_dbufs(dnode_t *dn)
 377  377  {
 378  378          int progress;
 379  379          int pass = 0;
      380 +        list_t evict_list;
      381 +
      382 +        dmu_buf_create_user_evict_list(&evict_list);
 380  383  
 381  384          do {
 382  385                  dmu_buf_impl_t *db, marker;
 383  386                  int evicting = FALSE;
 384  387  
 385  388                  progress = FALSE;
 386  389                  mutex_enter(&dn->dn_dbufs_mtx);
 387  390                  list_insert_tail(&dn->dn_dbufs, &marker);
 388  391                  db = list_head(&dn->dn_dbufs);
  389  392                  for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
 390  393                          list_remove(&dn->dn_dbufs, db);
 391  394                          list_insert_tail(&dn->dn_dbufs, db);
 392  395  #ifdef  DEBUG
 393  396                          DB_DNODE_ENTER(db);
 394  397                          ASSERT3P(DB_DNODE(db), ==, dn);
  
 395  398                          DB_DNODE_EXIT(db);
 396  399  #endif  /* DEBUG */
 397  400  
 398  401                          mutex_enter(&db->db_mtx);
 399  402                          if (db->db_state == DB_EVICTING) {
 400  403                                  progress = TRUE;
 401  404                                  evicting = TRUE;
 402  405                                  mutex_exit(&db->db_mtx);
 403  406                          } else if (refcount_is_zero(&db->db_holds)) {
 404  407                                  progress = TRUE;
 405      -                                dbuf_clear(db); /* exits db_mtx for us */
      408 +                                dbuf_clear(db, &evict_list); /* exits db_mtx */
 406  409                          } else {
 407  410                                  mutex_exit(&db->db_mtx);
 408  411                          }
 409      -
      412 +                        ASSERT(MUTEX_NOT_HELD(&db->db_mtx));
      413 +                        dmu_buf_process_user_evicts(&evict_list);
 410  414                  }
 411  415                  list_remove(&dn->dn_dbufs, &marker);
 412  416                  /*
 413  417                   * NB: we need to drop dn_dbufs_mtx between passes so
 414  418                   * that any DB_EVICTING dbufs can make progress.
 415  419                   * Ideally, we would have some cv we could wait on, but
 416  420                   * since we don't, just wait a bit to give the other
 417  421                   * thread a chance to run.
 418  422                   */
 419  423                  mutex_exit(&dn->dn_dbufs_mtx);
 420  424                  if (evicting)
 421  425                          delay(1);
 422  426                  pass++;
 423  427                  ASSERT(pass < 100); /* sanity check */
 424  428          } while (progress);
 425  429  
 426  430          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 427  431          if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 428  432                  mutex_enter(&dn->dn_bonus->db_mtx);
 429      -                dbuf_evict(dn->dn_bonus);
      433 +                dbuf_evict(dn->dn_bonus, &evict_list);
 430  434                  dn->dn_bonus = NULL;
 431  435          }
 432  436          rw_exit(&dn->dn_struct_rwlock);
      437 +        dmu_buf_destroy_user_evict_list(&evict_list);
 433  438  }
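The evict_list threaded through dbuf_clear() and dbuf_evict() above is what makes the user eviction "more verifiable": callbacks are queued on a caller-owned list while db_mtx is held and only run after the lock is dropped, which is why the new code can assert MUTEX_NOT_HELD(&db->db_mtx) before calling dmu_buf_process_user_evicts(). Below is a minimal self-contained sketch of that deferral pattern using pthreads rather than the real DMU API; every name in it is illustrative, not part of this change.

/*
 * Illustrative sketch only: defer user eviction callbacks collected
 * under a buffer lock and run them after the lock is dropped.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct evict_node {
	struct evict_node *next;
	void (*evict_cb)(void *);
	void *arg;
};

/* Queue a callback instead of invoking it; caller holds the buf lock. */
static void
queue_evict(struct evict_node **list, void (*cb)(void *), void *arg)
{
	struct evict_node *n = malloc(sizeof (*n));

	n->evict_cb = cb;
	n->arg = arg;
	n->next = *list;
	*list = n;
}

/* Run the deferred callbacks; caller must have dropped the buf lock. */
static void
process_evicts(struct evict_node **list)
{
	while (*list != NULL) {
		struct evict_node *n = *list;

		*list = n->next;
		n->evict_cb(n->arg);
		free(n);
	}
}

static void
user_evict(void *arg)
{
	printf("evicted user data %s\n", (char *)arg);
}

int
main(void)
{
	pthread_mutex_t db_mtx = PTHREAD_MUTEX_INITIALIZER;
	struct evict_node *evict_list = NULL;

	pthread_mutex_lock(&db_mtx);
	/* Analogous to dbuf_clear(db, &evict_list): queue, don't call. */
	queue_evict(&evict_list, user_evict, "A");
	queue_evict(&evict_list, user_evict, "B");
	pthread_mutex_unlock(&db_mtx);

	/* Analogous to dmu_buf_process_user_evicts(&evict_list). */
	process_evicts(&evict_list);
	return (0);
}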
 434  439  
 435  440  static void
 436  441  dnode_undirty_dbufs(list_t *list)
 437  442  {
 438  443          dbuf_dirty_record_t *dr;
 439  444  
 440  445          while (dr = list_head(list)) {
 441  446                  dmu_buf_impl_t *db = dr->dr_dbuf;
 442  447                  uint64_t txg = dr->dr_txg;
 443  448  
 444  449                  if (db->db_level != 0)
 445  450                          dnode_undirty_dbufs(&dr->dt.di.dr_children);
 446  451  
 447  452                  mutex_enter(&db->db_mtx);
 448  453                  /* XXX - use dbuf_undirty()? */
 449  454                  list_remove(list, dr);
 450  455                  ASSERT(db->db_last_dirty == dr);
 451  456                  db->db_last_dirty = NULL;
 452  457                  db->db_dirtycnt -= 1;
 453  458                  if (db->db_level == 0) {
 454  459                          ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 455  460                              dr->dt.dl.dr_data == db->db_buf);
 456  461                          dbuf_unoverride(dr);
 457  462                  }
 458  463                  kmem_free(dr, sizeof (dbuf_dirty_record_t));
 459  464                  dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 460  465          }
 461  466  }
 462  467  
 463  468  static void
 464  469  dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 465  470  {
 466  471          int txgoff = tx->tx_txg & TXG_MASK;
 467  472  
 468  473          ASSERT(dmu_tx_is_syncing(tx));
 469  474  
 470  475          /*
 471  476           * Our contents should have been freed in dnode_sync() by the
 472  477           * free range record inserted by the caller of dnode_free().
 473  478           */
 474  479          ASSERT0(DN_USED_BYTES(dn->dn_phys));
 475  480          ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
 476  481  
 477  482          dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 478  483          dnode_evict_dbufs(dn);
 479  484          ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 480  485          ASSERT3P(dn->dn_bonus, ==, NULL);
 481  486  
 482  487          /*
 483  488           * XXX - It would be nice to assert this, but we may still
 484  489           * have residual holds from async evictions from the arc...
 485  490           *
 486  491           * zfs_obj_to_path() also depends on this being
 487  492           * commented out.
 488  493           *
 489  494           * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
 490  495           */
 491  496  
 492  497          /* Undirty next bits */
 493  498          dn->dn_next_nlevels[txgoff] = 0;
 494  499          dn->dn_next_indblkshift[txgoff] = 0;
 495  500          dn->dn_next_blksz[txgoff] = 0;
 496  501  
 497  502          /* ASSERT(blkptrs are zero); */
 498  503          ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
 499  504          ASSERT(dn->dn_type != DMU_OT_NONE);
 500  505  
 501  506          ASSERT(dn->dn_free_txg > 0);
 502  507          if (dn->dn_allocated_txg != dn->dn_free_txg)
 503  508                  dbuf_will_dirty(dn->dn_dbuf, tx);
 504  509          bzero(dn->dn_phys, sizeof (dnode_phys_t));
 505  510  
 506  511          mutex_enter(&dn->dn_mtx);
 507  512          dn->dn_type = DMU_OT_NONE;
 508  513          dn->dn_maxblkid = 0;
 509  514          dn->dn_allocated_txg = 0;
 510  515          dn->dn_free_txg = 0;
 511  516          dn->dn_have_spill = B_FALSE;
 512  517          mutex_exit(&dn->dn_mtx);
 513  518  
 514  519          ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 515  520  
 516  521          dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
 517  522          /*
 518  523           * Now that we've released our hold, the dnode may
  519  524           * be evicted, so we mustn't access it.
 520  525           */
 521  526  }
 522  527  
 523  528  /*
 524  529   * Write out the dnode's dirty buffers.
 525  530   */
 526  531  void
 527  532  dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 528  533  {
 529  534          free_range_t *rp;
 530  535          dnode_phys_t *dnp = dn->dn_phys;
 531  536          int txgoff = tx->tx_txg & TXG_MASK;
 532  537          list_t *list = &dn->dn_dirty_records[txgoff];
 533  538          static const dnode_phys_t zerodn = { 0 };
 534  539          boolean_t kill_spill = B_FALSE;
 535  540  
 536  541          ASSERT(dmu_tx_is_syncing(tx));
 537  542          ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
 538  543          ASSERT(dnp->dn_type != DMU_OT_NONE ||
 539  544              bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
 540  545          DNODE_VERIFY(dn);
 541  546  
 542  547          ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
 543  548  
 544  549          if (dmu_objset_userused_enabled(dn->dn_objset) &&
 545  550              !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 546  551                  mutex_enter(&dn->dn_mtx);
 547  552                  dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
 548  553                  dn->dn_oldflags = dn->dn_phys->dn_flags;
 549  554                  dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
 550  555                  mutex_exit(&dn->dn_mtx);
 551  556                  dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
 552  557          } else {
 553  558                  /* Once we account for it, we should always account for it. */
 554  559                  ASSERT(!(dn->dn_phys->dn_flags &
 555  560                      DNODE_FLAG_USERUSED_ACCOUNTED));
 556  561          }
 557  562  
 558  563          mutex_enter(&dn->dn_mtx);
 559  564          if (dn->dn_allocated_txg == tx->tx_txg) {
 560  565                  /* The dnode is newly allocated or reallocated */
 561  566                  if (dnp->dn_type == DMU_OT_NONE) {
 562  567                          /* this is a first alloc, not a realloc */
 563  568                          dnp->dn_nlevels = 1;
 564  569                          dnp->dn_nblkptr = dn->dn_nblkptr;
 565  570                  }
 566  571  
 567  572                  dnp->dn_type = dn->dn_type;
 568  573                  dnp->dn_bonustype = dn->dn_bonustype;
 569  574                  dnp->dn_bonuslen = dn->dn_bonuslen;
 570  575          }
 571  576  
 572  577          ASSERT(dnp->dn_nlevels > 1 ||
 573  578              BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 574  579              BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
 575  580              dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 576  581  
 577  582          if (dn->dn_next_blksz[txgoff]) {
 578  583                  ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 579  584                      SPA_MINBLOCKSIZE) == 0);
 580  585                  ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 581  586                      dn->dn_maxblkid == 0 || list_head(list) != NULL ||
 582  587                      avl_last(&dn->dn_ranges[txgoff]) ||
 583  588                      dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
 584  589                      dnp->dn_datablkszsec);
 585  590                  dnp->dn_datablkszsec =
 586  591                      dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
 587  592                  dn->dn_next_blksz[txgoff] = 0;
 588  593          }
 589  594  
 590  595          if (dn->dn_next_bonuslen[txgoff]) {
 591  596                  if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
 592  597                          dnp->dn_bonuslen = 0;
 593  598                  else
 594  599                          dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
 595  600                  ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
 596  601                  dn->dn_next_bonuslen[txgoff] = 0;
 597  602          }
 598  603  
 599  604          if (dn->dn_next_bonustype[txgoff]) {
 600  605                  ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
 601  606                  dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
 602  607                  dn->dn_next_bonustype[txgoff] = 0;
 603  608          }
 604  609  
 605  610          /*
 606  611           * We will either remove a spill block when a file is being removed
 607  612           * or we have been asked to remove it.
 608  613           */
 609  614          if (dn->dn_rm_spillblk[txgoff] ||
 610  615              ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
 611  616              dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
 612  617                  if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 613  618                          kill_spill = B_TRUE;
 614  619                  dn->dn_rm_spillblk[txgoff] = 0;
 615  620          }
 616  621  
 617  622          if (dn->dn_next_indblkshift[txgoff]) {
 618  623                  ASSERT(dnp->dn_nlevels == 1);
 619  624                  dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
 620  625                  dn->dn_next_indblkshift[txgoff] = 0;
 621  626          }
 622  627  
 623  628          /*
 624  629           * Just take the live (open-context) values for checksum and compress.
 625  630           * Strictly speaking it's a future leak, but nothing bad happens if we
 626  631           * start using the new checksum or compress algorithm a little early.
 627  632           */
 628  633          dnp->dn_checksum = dn->dn_checksum;
 629  634          dnp->dn_compress = dn->dn_compress;
 630  635  
 631  636          mutex_exit(&dn->dn_mtx);
 632  637  
 633  638          if (kill_spill) {
 634  639                  (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
 635  640                  mutex_enter(&dn->dn_mtx);
 636  641                  dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
 637  642                  mutex_exit(&dn->dn_mtx);
 638  643          }
 639  644  
 640  645          /* process all the "freed" ranges in the file */
 641  646          while (rp = avl_last(&dn->dn_ranges[txgoff])) {
 642  647                  dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
 643  648                  /* grab the mutex so we don't race with dnode_block_freed() */
 644  649                  mutex_enter(&dn->dn_mtx);
 645  650                  avl_remove(&dn->dn_ranges[txgoff], rp);
 646  651                  mutex_exit(&dn->dn_mtx);
 647  652                  kmem_free(rp, sizeof (free_range_t));
 648  653          }
 649  654  
 650  655          if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
 651  656                  dnode_sync_free(dn, tx);
 652  657                  return;
 653  658          }
 654  659  
 655  660          if (dn->dn_next_nblkptr[txgoff]) {
 656  661                  /* this should only happen on a realloc */
 657  662                  ASSERT(dn->dn_allocated_txg == tx->tx_txg);
 658  663                  if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
 659  664                          /* zero the new blkptrs we are gaining */
 660  665                          bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
 661  666                              sizeof (blkptr_t) *
 662  667                              (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
 663  668  #ifdef ZFS_DEBUG
 664  669                  } else {
 665  670                          int i;
 666  671                          ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
 667  672                          /* the blkptrs we are losing better be unallocated */
 668  673                          for (i = dn->dn_next_nblkptr[txgoff];
 669  674                              i < dnp->dn_nblkptr; i++)
 670  675                                  ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
 671  676  #endif
 672  677                  }
 673  678                  mutex_enter(&dn->dn_mtx);
 674  679                  dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
 675  680                  dn->dn_next_nblkptr[txgoff] = 0;
 676  681                  mutex_exit(&dn->dn_mtx);
 677  682          }
 678  683  
 679  684          if (dn->dn_next_nlevels[txgoff]) {
 680  685                  dnode_increase_indirection(dn, tx);
 681  686                  dn->dn_next_nlevels[txgoff] = 0;
 682  687          }
 683  688  
 684  689          dbuf_sync_list(list, tx);
 685  690  
 686  691          if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 687  692                  ASSERT3P(list_head(list), ==, NULL);
 688  693                  dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
 689  694          }
 690  695  
 691  696          /*
 692  697           * Although we have dropped our reference to the dnode, it
  693  698           * can't be evicted until it's written, and we haven't yet
 694  699           * initiated the IO for the dnode's dbuf.
 695  700           */
 696  701  }
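Finally, a note on the indexing used throughout this file: per-txg state (dn_next_nlevels, dn_next_blksz, dn_dirty_records, and so on) is selected with tx->tx_txg & TXG_MASK. ZFS keeps TXG_SIZE transaction groups in flight at once (4 in the stock tree), so the low bits of the txg number pick a slot in a small ring of per-txg arrays. A standalone sketch with a made-up payload value:

/*
 * Illustrative sketch only: per-txg ring indexing via TXG_MASK.
 */
#include <stdio.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

int
main(void)
{
	unsigned long long txg;
	int next_blksz[TXG_SIZE] = { 0 };
	int txgoff;

	for (txg = 100; txg < 106; txg++) {
		txgoff = txg & TXG_MASK;	/* slot for this txg */
		next_blksz[txgoff] = 512 << txgoff;	/* stand-in value */
		printf("txg %llu -> slot %d (blksz %d)\n",
		    txg, txgoff, next_blksz[txgoff]);
	}
	return (0);
}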
  