Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dbuf.c
          +++ new/usr/src/uts/common/fs/zfs/dbuf.c
↓ open down ↓ 833 lines elided ↑ open up ↑
 834  834                  /*
 835  835                   * If we are receiving, we expect there to be no dbufs in
 836  836                   * the range to be freed, because receive modifies each
 837  837                   * block at most once, and in offset order.  If this is
 838  838                   * not the case, it can lead to performance problems,
 839  839                   * so note that we unexpectedly took the slow path.
 840  840                   */
 841  841                  atomic_inc_64(&zfs_free_range_recv_miss);
 842  842          }
 843  843  
 844      -        for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
      844 +        for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
 845  845                  db_next = list_next(&dn->dn_dbufs, db);
 846  846                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 847  847  
 848  848                  if (db->db_level == 1 &&
 849  849                      db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
 850  850                          mutex_enter(&db->db_mtx);
 851  851                          if (db->db_last_dirty &&
 852  852                              db->db_last_dirty->dr_txg < txg) {
 853  853                                  dbuf_add_ref(db, FTAG);
 854  854                                  mutex_exit(&db->db_mtx);
↓ open down ↓ 325 lines elided ↑ open up ↑
1180 1180                          }
1181 1181                          ASSERT(data_old != NULL);
1182 1182                  }
1183 1183                  dr->dt.dl.dr_data = data_old;
1184 1184          } else {
1185 1185                  mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1186 1186                  list_create(&dr->dt.di.dr_children,
1187 1187                      sizeof (dbuf_dirty_record_t),
1188 1188                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
1189 1189          }
     1190 +        if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
     1191 +                dr->dr_accounted = db->db.db_size;
1190 1192          dr->dr_dbuf = db;
1191 1193          dr->dr_txg = tx->tx_txg;
1192 1194          dr->dr_next = *drp;
1193 1195          *drp = dr;
1194 1196  
1195 1197          /*
1196 1198           * We could have been freed_in_flight between the dbuf_noread
1197 1199           * and dbuf_dirty.  We win, as though the dbuf_noread() had
1198 1200           * happened after the free.
1199 1201           */
↓ open down ↓ 63 lines elided ↑ open up ↑
1263 1265                          parent_held = TRUE;
1264 1266                  }
1265 1267                  if (drop_struct_lock)
1266 1268                          rw_exit(&dn->dn_struct_rwlock);
1267 1269                  ASSERT3U(db->db_level+1, ==, parent->db_level);
1268 1270                  di = dbuf_dirty(parent, tx);
1269 1271                  if (parent_held)
1270 1272                          dbuf_rele(parent, FTAG);
1271 1273  
1272 1274                  mutex_enter(&db->db_mtx);
1273      -                /*  possible race with dbuf_undirty() */
     1275 +                /*
     1276 +                 * Since we've dropped the mutex, it's possible that
     1277 +                 * dbuf_undirty() might have changed this out from under us.
     1278 +                 */
1274 1279                  if (db->db_last_dirty == dr ||
1275 1280                      dn->dn_object == DMU_META_DNODE_OBJECT) {
1276 1281                          mutex_enter(&di->dt.di.dr_mtx);
1277 1282                          ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1278 1283                          ASSERT(!list_link_active(&dr->dr_dirty_node));
1279 1284                          list_insert_tail(&di->dt.di.dr_children, dr);
1280 1285                          mutex_exit(&di->dt.di.dr_mtx);
1281 1286                          dr->dr_parent = di;
1282 1287                  }
1283 1288                  mutex_exit(&db->db_mtx);
↓ open down ↓ 49 lines elided ↑ open up ↑
 1333 1338           * holders, but it is untested in that scenario, as the ZPL and
1334 1339           * ztest have additional locking (the range locks) that prevents
1335 1340           * that type of concurrent access.
1336 1341           */
1337 1342          ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1338 1343  
1339 1344          dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1340 1345  
1341 1346          ASSERT(db->db.db_size != 0);
1342 1347  
1343      -        /* XXX would be nice to fix up dn_towrite_space[] */
     1348 +        /*
     1349 +         * Any space we accounted for in dp_dirty_* will be cleaned up by
     1350 +         * dsl_pool_sync().  This is relatively rare so the discrepancy
     1351 +         * is not a big deal.
     1352 +         */
1344 1353  
1345 1354          *drp = dr->dr_next;
1346 1355  
1347 1356          /*
1348 1357           * Note that there are three places in dbuf_dirty()
1349 1358           * where this dirty record may be put on a list.
1350 1359           * Make sure to do a list_remove corresponding to
1351 1360           * every one of those list_insert calls.
1352 1361           */
1353 1362          if (dr->dr_parent) {
↓ open down ↓ 159 lines elided ↑ open up ↑
1513 1522          ASSERT(db->db_buf == NULL);
1514 1523          dbuf_set_data(db, buf);
1515 1524          db->db_state = DB_FILL;
1516 1525          mutex_exit(&db->db_mtx);
1517 1526          (void) dbuf_dirty(db, tx);
1518 1527          dbuf_fill_done(db, tx);
1519 1528  }
1520 1529  
1521 1530  /*
1522 1531   * "Clear" the contents of this dbuf.  This will mark the dbuf
1523      - * EVICTING and clear *most* of its references.  Unfortunetely,
     1532 + * EVICTING and clear *most* of its references.  Unfortunately,
1524 1533   * when we are not holding the dn_dbufs_mtx, we can't clear the
1525 1534   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1526 1535   * in this case.  For callers from the DMU we will usually see:
1527 1536   *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1528 1537   * For the arc callback, we will usually see:
1529 1538   *      dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1530 1539   * Sometimes, though, we will get a mix of these two:
1531 1540   *      DMU: dbuf_clear()->arc_buf_evict()
1532 1541   *      ARC: dbuf_do_evict()->dbuf_destroy()
1533 1542   */
↓ open down ↓ 166 lines elided ↑ open up ↑
1700 1709                  db->db_state = DB_UNCACHED;
1701 1710                  /* the bonus dbuf is not placed in the hash table */
1702 1711                  arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1703 1712                  return (db);
1704 1713          } else if (blkid == DMU_SPILL_BLKID) {
1705 1714                  db->db.db_size = (blkptr != NULL) ?
1706 1715                      BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1707 1716                  db->db.db_offset = 0;
1708 1717          } else {
1709 1718                  int blocksize =
1710      -                    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
     1719 +                    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1711 1720                  db->db.db_size = blocksize;
1712 1721                  db->db.db_offset = db->db_blkid * blocksize;
1713 1722          }
1714 1723  
1715 1724          /*
1716 1725           * Hold the dn_dbufs_mtx while we get the new dbuf
1717 1726           * in the hash table *and* added to the dbufs list.
1718 1727           * This prevents a possible deadlock with someone
1719 1728           * trying to look up this dbuf before its added to the
1720 1729           * dn_dbufs list.
↓ open down ↓ 88 lines elided ↑ open up ↑
1809 1818          ASSERT(db->db.db_data == NULL);
1810 1819          ASSERT(db->db_hash_next == NULL);
1811 1820          ASSERT(db->db_blkptr == NULL);
1812 1821          ASSERT(db->db_data_pending == NULL);
1813 1822  
1814 1823          kmem_cache_free(dbuf_cache, db);
1815 1824          arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1816 1825  }
1817 1826  
1818 1827  void
1819      -dbuf_prefetch(dnode_t *dn, uint64_t blkid)
     1828 +dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1820 1829  {
1821 1830          dmu_buf_impl_t *db = NULL;
1822 1831          blkptr_t *bp = NULL;
1823 1832  
1824 1833          ASSERT(blkid != DMU_BONUS_BLKID);
1825 1834          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1826 1835  
1827 1836          if (dnode_block_freed(dn, blkid))
1828 1837                  return;
1829 1838  
↓ open down ↓ 3 lines elided ↑ open up ↑
1833 1842                   * This dbuf is already in the cache.  We assume that
1834 1843                   * it is already CACHED, or else about to be either
1835 1844                   * read or filled.
1836 1845                   */
1837 1846                  mutex_exit(&db->db_mtx);
1838 1847                  return;
1839 1848          }
1840 1849  
1841 1850          if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1842 1851                  if (bp && !BP_IS_HOLE(bp)) {
1843      -                        int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1844      -                            ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1845 1852                          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1846 1853                          uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1847 1854                          zbookmark_t zb;
1848 1855  
1849 1856                          SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1850 1857                              dn->dn_object, 0, blkid);
1851 1858  
1852 1859                          (void) arc_read(NULL, dn->dn_objset->os_spa,
1853      -                            bp, NULL, NULL, priority,
     1860 +                            bp, NULL, NULL, prio,
1854 1861                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1855 1862                              &aflags, &zb);
1856 1863                  }
1857 1864                  if (db)
1858 1865                          dbuf_rele(db, NULL);
1859 1866          }
1860 1867  }
1861 1868  
1862 1869  /*
1863 1870   * Returns with db_holds incremented, and db_mtx not held.
↓ open down ↓ 660 lines elided ↑ open up ↑
2524 2531                          fill += ibp->blk_fill;
2525 2532                  }
2526 2533          }
2527 2534          DB_DNODE_EXIT(db);
2528 2535  
2529 2536          bp->blk_fill = fill;
2530 2537  
2531 2538          mutex_exit(&db->db_mtx);
2532 2539  }
2533 2540  
     2541 +/*
     2542 + * The SPA will call this callback several times for each zio - once
     2543 + * for every physical child i/o (zio->io_phys_children times).  This
     2544 + * allows the DMU to monitor the progress of each logical i/o.  For example,
     2545 + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
     2546 + * block.  There may be a long delay before all copies/fragments are completed,
     2547 + * so this callback allows us to retire dirty space gradually, as the physical
     2548 + * i/os complete.
     2549 + */
 2534 2550  /* ARGSUSED */
 2535 2551  static void
     2552 +dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
     2553 +{
                /* buf is unused here (hence ARGSUSED); arg is the dirty dmu_buf_impl_t. */
     2554 +        dmu_buf_impl_t *db = arg;
     2555 +        objset_t *os = db->db_objset;
     2556 +        dsl_pool_t *dp = dmu_objset_pool(os);
     2557 +        dbuf_dirty_record_t *dr;
                /* NOTE(review): the = 0 initializer is redundant; delta is
                 * unconditionally assigned below before its only use. */
     2558 +        int delta = 0;
     2559 +
     2560 +        dr = db->db_data_pending;
                /* The pending dirty record must belong to the txg this zio is syncing. */
     2561 +        ASSERT3U(dr->dr_txg, ==, zio->io_txg);
     2562 +
     2563 +        /*
     2564 +         * The callback will be called io_phys_children times.  Retire one
     2565 +         * portion of our dirty space each time we are called.  Any rounding
     2566 +         * error will be cleaned up by dsl_pool_sync()'s call to
     2567 +         * dsl_pool_undirty_space().
     2568 +         */
     2569 +        delta = dr->dr_accounted / zio->io_phys_children;
     2570 +        dsl_pool_undirty_space(dp, delta, zio->io_txg);
     2571 +}
     2572 +
     2573 +/* ARGSUSED */
     2574 +static void
2536 2575  dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2537 2576  {
2538 2577          dmu_buf_impl_t *db = vdb;
2539 2578          blkptr_t *bp = zio->io_bp;
2540 2579          blkptr_t *bp_orig = &zio->io_bp_orig;
2541 2580          uint64_t txg = zio->io_txg;
2542 2581          dbuf_dirty_record_t **drp, *dr;
2543 2582  
2544 2583          ASSERT0(zio->io_error);
2545 2584          ASSERT(db->db_blkptr == bp);
↓ open down ↓ 72 lines elided ↑ open up ↑
2618 2657                  DB_DNODE_EXIT(db);
2619 2658                  mutex_destroy(&dr->dt.di.dr_mtx);
2620 2659                  list_destroy(&dr->dt.di.dr_children);
2621 2660          }
2622 2661          kmem_free(dr, sizeof (dbuf_dirty_record_t));
2623 2662  
2624 2663          cv_broadcast(&db->db_changed);
2625 2664          ASSERT(db->db_dirtycnt > 0);
2626 2665          db->db_dirtycnt -= 1;
2627 2666          db->db_data_pending = NULL;
     2667 +
2628 2668          dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2629 2669  }
2630 2670  
            /*
             * zio ready-callback for NOFILL dbufs: forwards to dbuf_write_ready()
             * with a NULL arc buf, passing the dbuf stashed in zio->io_private.
             */
 2631 2671  static void
 2632 2672  dbuf_write_nofill_ready(zio_t *zio)
 2633 2673  {
 2634 2674          dbuf_write_ready(zio, NULL, zio->io_private);
 2635 2675  }
2636 2676  
2637 2677  static void
↓ open down ↓ 98 lines elided ↑ open up ↑
2736 2776                  wp_flag = WP_SPILL;
2737 2777          wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2738 2778  
2739 2779          dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2740 2780          DB_DNODE_EXIT(db);
2741 2781  
2742 2782          if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2743 2783                  ASSERT(db->db_state != DB_NOFILL);
2744 2784                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
2745 2785                      db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2746      -                    dbuf_write_override_ready, dbuf_write_override_done, dr,
2747      -                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
     2786 +                    dbuf_write_override_ready, NULL, dbuf_write_override_done,
     2787 +                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2748 2788                  mutex_enter(&db->db_mtx);
2749 2789                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2750 2790                  zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2751 2791                      dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2752 2792                  mutex_exit(&db->db_mtx);
2753 2793          } else if (db->db_state == DB_NOFILL) {
2754 2794                  ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2755 2795                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
2756 2796                      db->db_blkptr, NULL, db->db.db_size, &zp,
2757      -                    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
     2797 +                    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2758 2798                      ZIO_PRIORITY_ASYNC_WRITE,
2759 2799                      ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2760 2800          } else {
2761 2801                  ASSERT(arc_released(data));
2762 2802                  dr->dr_zio = arc_write(zio, os->os_spa, txg,
2763 2803                      db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2764 2804                      DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2765      -                    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
2766      -                    ZIO_FLAG_MUSTSUCCEED, &zb);
     2805 +                    dbuf_write_physdone, dbuf_write_done, db,
     2806 +                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2767 2807          }
2768 2808  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX