Print this page
    
3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/dbuf.c
          +++ new/usr/src/uts/common/fs/zfs/dbuf.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2012 by Delphix. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/dmu.h>
  29   29  #include <sys/dmu_impl.h>
  30   30  #include <sys/dbuf.h>
  31   31  #include <sys/dmu_objset.h>
  32   32  #include <sys/dsl_dataset.h>
  33   33  #include <sys/dsl_dir.h>
  34   34  #include <sys/dmu_tx.h>
  35   35  #include <sys/spa.h>
  36   36  #include <sys/zio.h>
  37   37  #include <sys/dmu_zfetch.h>
  38   38  #include <sys/sa.h>
  39   39  #include <sys/sa_impl.h>
  40   40  
  41   41  static void dbuf_destroy(dmu_buf_impl_t *db);
  42   42  static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
  43   43  static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
  44   44  
  45   45  /*
  46   46   * Global data structures and functions for the dbuf cache.
  47   47   */
  48   48  static kmem_cache_t *dbuf_cache;
  49   49  
  50   50  /* ARGSUSED */
  51   51  static int
  52   52  dbuf_cons(void *vdb, void *unused, int kmflag)
  53   53  {
  54   54          dmu_buf_impl_t *db = vdb;
  55   55          bzero(db, sizeof (dmu_buf_impl_t));
  56   56  
  57   57          mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
  58   58          cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
  59   59          refcount_create(&db->db_holds);
  60   60          return (0);
  61   61  }
  62   62  
  63   63  /* ARGSUSED */
  64   64  static void
  65   65  dbuf_dest(void *vdb, void *unused)
  66   66  {
  67   67          dmu_buf_impl_t *db = vdb;
  68   68          mutex_destroy(&db->db_mtx);
  69   69          cv_destroy(&db->db_changed);
  70   70          refcount_destroy(&db->db_holds);
  71   71  }
  72   72  
  73   73  /*
  74   74   * dbuf hash table routines
  75   75   */
  76   76  static dbuf_hash_table_t dbuf_hash_table;
  77   77  
  78   78  static uint64_t dbuf_hash_count;
  79   79  
  80   80  static uint64_t
  81   81  dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
  82   82  {
  83   83          uintptr_t osv = (uintptr_t)os;
  84   84          uint64_t crc = -1ULL;
  85   85  
  86   86          ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
  87   87          crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
  88   88          crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
  89   89          crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
  90   90          crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
  91   91          crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
  92   92          crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
  93   93  
  94   94          crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
  95   95  
  96   96          return (crc);
  97   97  }
  98   98  
  99   99  #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
 100  100  
 101  101  #define DBUF_EQUAL(dbuf, os, obj, level, blkid)         \
 102  102          ((dbuf)->db.db_object == (obj) &&               \
 103  103          (dbuf)->db_objset == (os) &&                    \
 104  104          (dbuf)->db_level == (level) &&                  \
 105  105          (dbuf)->db_blkid == (blkid))
 106  106  
 107  107  dmu_buf_impl_t *
 108  108  dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 109  109  {
 110  110          dbuf_hash_table_t *h = &dbuf_hash_table;
 111  111          objset_t *os = dn->dn_objset;
 112  112          uint64_t obj = dn->dn_object;
 113  113          uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 114  114          uint64_t idx = hv & h->hash_table_mask;
 115  115          dmu_buf_impl_t *db;
 116  116  
 117  117          mutex_enter(DBUF_HASH_MUTEX(h, idx));
 118  118          for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 119  119                  if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 120  120                          mutex_enter(&db->db_mtx);
 121  121                          if (db->db_state != DB_EVICTING) {
 122  122                                  mutex_exit(DBUF_HASH_MUTEX(h, idx));
 123  123                                  return (db);
 124  124                          }
 125  125                          mutex_exit(&db->db_mtx);
 126  126                  }
 127  127          }
 128  128          mutex_exit(DBUF_HASH_MUTEX(h, idx));
 129  129          return (NULL);
 130  130  }
 131  131  
 132  132  /*
 133  133   * Insert an entry into the hash table.  If there is already an element
 134  134   * equal to elem in the hash table, then the already existing element
 135  135   * will be returned and the new element will not be inserted.
 136  136   * Otherwise returns NULL.
 137  137   */
 138  138  static dmu_buf_impl_t *
 139  139  dbuf_hash_insert(dmu_buf_impl_t *db)
 140  140  {
 141  141          dbuf_hash_table_t *h = &dbuf_hash_table;
 142  142          objset_t *os = db->db_objset;
 143  143          uint64_t obj = db->db.db_object;
 144  144          int level = db->db_level;
 145  145          uint64_t blkid = db->db_blkid;
 146  146          uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 147  147          uint64_t idx = hv & h->hash_table_mask;
 148  148          dmu_buf_impl_t *dbf;
 149  149  
 150  150          mutex_enter(DBUF_HASH_MUTEX(h, idx));
 151  151          for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
 152  152                  if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 153  153                          mutex_enter(&dbf->db_mtx);
 154  154                          if (dbf->db_state != DB_EVICTING) {
 155  155                                  mutex_exit(DBUF_HASH_MUTEX(h, idx));
 156  156                                  return (dbf);
 157  157                          }
 158  158                          mutex_exit(&dbf->db_mtx);
 159  159                  }
 160  160          }
 161  161  
 162  162          mutex_enter(&db->db_mtx);
 163  163          db->db_hash_next = h->hash_table[idx];
 164  164          h->hash_table[idx] = db;
 165  165          mutex_exit(DBUF_HASH_MUTEX(h, idx));
 166  166          atomic_add_64(&dbuf_hash_count, 1);
 167  167  
 168  168          return (NULL);
 169  169  }
 170  170  
 171  171  /*
 172  172   * Remove an entry from the hash table.  This operation will
 173  173   * fail if there are any existing holds on the db.
 174  174   */
 175  175  static void
 176  176  dbuf_hash_remove(dmu_buf_impl_t *db)
 177  177  {
 178  178          dbuf_hash_table_t *h = &dbuf_hash_table;
 179  179          uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
 180  180              db->db_level, db->db_blkid);
 181  181          uint64_t idx = hv & h->hash_table_mask;
 182  182          dmu_buf_impl_t *dbf, **dbp;
 183  183  
 184  184          /*
 185  185           * We mustn't hold db_mtx to maintain lock ordering:
 186  186           * DBUF_HASH_MUTEX > db_mtx.
 187  187           */
 188  188          ASSERT(refcount_is_zero(&db->db_holds));
 189  189          ASSERT(db->db_state == DB_EVICTING);
 190  190          ASSERT(!MUTEX_HELD(&db->db_mtx));
 191  191  
 192  192          mutex_enter(DBUF_HASH_MUTEX(h, idx));
 193  193          dbp = &h->hash_table[idx];
 194  194          while ((dbf = *dbp) != db) {
 195  195                  dbp = &dbf->db_hash_next;
 196  196                  ASSERT(dbf != NULL);
 197  197          }
 198  198          *dbp = db->db_hash_next;
 199  199          db->db_hash_next = NULL;
 200  200          mutex_exit(DBUF_HASH_MUTEX(h, idx));
 201  201          atomic_add_64(&dbuf_hash_count, -1);
 202  202  }
 203  203  
 204  204  static arc_evict_func_t dbuf_do_evict;
 205  205  
 206  206  static void
 207  207  dbuf_evict_user(dmu_buf_impl_t *db)
 208  208  {
 209  209          ASSERT(MUTEX_HELD(&db->db_mtx));
 210  210  
 211  211          if (db->db_level != 0 || db->db_evict_func == NULL)
 212  212                  return;
 213  213  
 214  214          if (db->db_user_data_ptr_ptr)
 215  215                  *db->db_user_data_ptr_ptr = db->db.db_data;
 216  216          db->db_evict_func(&db->db, db->db_user_ptr);
 217  217          db->db_user_ptr = NULL;
 218  218          db->db_user_data_ptr_ptr = NULL;
 219  219          db->db_evict_func = NULL;
 220  220  }
 221  221  
 222  222  boolean_t
 223  223  dbuf_is_metadata(dmu_buf_impl_t *db)
 224  224  {
 225  225          if (db->db_level > 0) {
 226  226                  return (B_TRUE);
 227  227          } else {
 228  228                  boolean_t is_metadata;
 229  229  
 230  230                  DB_DNODE_ENTER(db);
 231  231                  is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 232  232                  DB_DNODE_EXIT(db);
 233  233  
 234  234                  return (is_metadata);
 235  235          }
 236  236  }
 237  237  
 238  238  void
 239  239  dbuf_evict(dmu_buf_impl_t *db)
 240  240  {
 241  241          ASSERT(MUTEX_HELD(&db->db_mtx));
 242  242          ASSERT(db->db_buf == NULL);
 243  243          ASSERT(db->db_data_pending == NULL);
 244  244  
 245  245          dbuf_clear(db);
 246  246          dbuf_destroy(db);
 247  247  }
 248  248  
 249  249  void
 250  250  dbuf_init(void)
 251  251  {
 252  252          uint64_t hsize = 1ULL << 16;
 253  253          dbuf_hash_table_t *h = &dbuf_hash_table;
 254  254          int i;
 255  255  
 256  256          /*
 257  257           * The hash table is big enough to fill all of physical memory
 258  258           * with an average 4K block size.  The table will take up
 259  259           * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 260  260           */
 261  261          while (hsize * 4096 < physmem * PAGESIZE)
 262  262                  hsize <<= 1;
 263  263  
 264  264  retry:
 265  265          h->hash_table_mask = hsize - 1;
 266  266          h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 267  267          if (h->hash_table == NULL) {
 268  268                  /* XXX - we should really return an error instead of assert */
 269  269                  ASSERT(hsize > (1ULL << 10));
 270  270                  hsize >>= 1;
 271  271                  goto retry;
 272  272          }
 273  273  
 274  274          dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 275  275              sizeof (dmu_buf_impl_t),
 276  276              0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 277  277  
 278  278          for (i = 0; i < DBUF_MUTEXES; i++)
 279  279                  mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 280  280  }
 281  281  
 282  282  void
 283  283  dbuf_fini(void)
 284  284  {
 285  285          dbuf_hash_table_t *h = &dbuf_hash_table;
 286  286          int i;
 287  287  
 288  288          for (i = 0; i < DBUF_MUTEXES; i++)
 289  289                  mutex_destroy(&h->hash_mutexes[i]);
 290  290          kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 291  291          kmem_cache_destroy(dbuf_cache);
 292  292  }
 293  293  
 294  294  /*
 295  295   * Other stuff.
 296  296   */
 297  297  
 298  298  #ifdef ZFS_DEBUG
 299  299  static void
 300  300  dbuf_verify(dmu_buf_impl_t *db)
 301  301  {
 302  302          dnode_t *dn;
 303  303          dbuf_dirty_record_t *dr;
 304  304  
 305  305          ASSERT(MUTEX_HELD(&db->db_mtx));
 306  306  
 307  307          if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 308  308                  return;
 309  309  
 310  310          ASSERT(db->db_objset != NULL);
 311  311          DB_DNODE_ENTER(db);
 312  312          dn = DB_DNODE(db);
 313  313          if (dn == NULL) {
 314  314                  ASSERT(db->db_parent == NULL);
 315  315                  ASSERT(db->db_blkptr == NULL);
 316  316          } else {
 317  317                  ASSERT3U(db->db.db_object, ==, dn->dn_object);
 318  318                  ASSERT3P(db->db_objset, ==, dn->dn_objset);
 319  319                  ASSERT3U(db->db_level, <, dn->dn_nlevels);
 320  320                  ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
  
    | 
      ↓ open down ↓ | 
    320 lines elided | 
    
      ↑ open up ↑ | 
  
 321  321                      db->db_blkid == DMU_SPILL_BLKID ||
 322  322                      !list_is_empty(&dn->dn_dbufs));
 323  323          }
 324  324          if (db->db_blkid == DMU_BONUS_BLKID) {
 325  325                  ASSERT(dn != NULL);
 326  326                  ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 327  327                  ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 328  328          } else if (db->db_blkid == DMU_SPILL_BLKID) {
 329  329                  ASSERT(dn != NULL);
 330  330                  ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 331      -                ASSERT3U(db->db.db_offset, ==, 0);
      331 +                ASSERT0(db->db.db_offset);
 332  332          } else {
 333  333                  ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 334  334          }
 335  335  
 336  336          for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
 337  337                  ASSERT(dr->dr_dbuf == db);
 338  338  
 339  339          for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
 340  340                  ASSERT(dr->dr_dbuf == db);
 341  341  
 342  342          /*
 343  343           * We can't assert that db_size matches dn_datablksz because it
 344  344           * can be momentarily different when another thread is doing
 345  345           * dnode_set_blksz().
 346  346           */
 347  347          if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 348  348                  dr = db->db_data_pending;
 349  349                  /*
 350  350                   * It should only be modified in syncing context, so
 351  351                   * make sure we only have one copy of the data.
 352  352                   */
 353  353                  ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 354  354          }
 355  355  
 356  356          /* verify db->db_blkptr */
 357  357          if (db->db_blkptr) {
 358  358                  if (db->db_parent == dn->dn_dbuf) {
 359  359                          /* db is pointed to by the dnode */
 360  360                          /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 361  361                          if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 362  362                                  ASSERT(db->db_parent == NULL);
 363  363                          else
 364  364                                  ASSERT(db->db_parent != NULL);
 365  365                          if (db->db_blkid != DMU_SPILL_BLKID)
 366  366                                  ASSERT3P(db->db_blkptr, ==,
 367  367                                      &dn->dn_phys->dn_blkptr[db->db_blkid]);
 368  368                  } else {
 369  369                          /* db is pointed to by an indirect block */
 370  370                          int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
 371  371                          ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 372  372                          ASSERT3U(db->db_parent->db.db_object, ==,
 373  373                              db->db.db_object);
 374  374                          /*
 375  375                           * dnode_grow_indblksz() can make this fail if we don't
 376  376                           * have the struct_rwlock.  XXX indblksz no longer
 377  377                           * grows.  safe to do this now?
 378  378                           */
 379  379                          if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 380  380                                  ASSERT3P(db->db_blkptr, ==,
 381  381                                      ((blkptr_t *)db->db_parent->db.db_data +
 382  382                                      db->db_blkid % epb));
 383  383                          }
 384  384                  }
 385  385          }
 386  386          if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 387  387              (db->db_buf == NULL || db->db_buf->b_data) &&
 388  388              db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 389  389              db->db_state != DB_FILL && !dn->dn_free_txg) {
 390  390                  /*
 391  391                   * If the blkptr isn't set but they have nonzero data,
 392  392                   * it had better be dirty, otherwise we'll lose that
 393  393                   * data when we evict this buffer.
 394  394                   */
 395  395                  if (db->db_dirtycnt == 0) {
 396  396                          uint64_t *buf = db->db.db_data;
 397  397                          int i;
 398  398  
 399  399                          for (i = 0; i < db->db.db_size >> 3; i++) {
 400  400                                  ASSERT(buf[i] == 0);
 401  401                          }
 402  402                  }
 403  403          }
 404  404          DB_DNODE_EXIT(db);
 405  405  }
 406  406  #endif
 407  407  
 408  408  static void
 409  409  dbuf_update_data(dmu_buf_impl_t *db)
 410  410  {
 411  411          ASSERT(MUTEX_HELD(&db->db_mtx));
 412  412          if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
 413  413                  ASSERT(!refcount_is_zero(&db->db_holds));
 414  414                  *db->db_user_data_ptr_ptr = db->db.db_data;
 415  415          }
 416  416  }
 417  417  
 418  418  static void
 419  419  dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 420  420  {
 421  421          ASSERT(MUTEX_HELD(&db->db_mtx));
 422  422          ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
 423  423          db->db_buf = buf;
 424  424          if (buf != NULL) {
 425  425                  ASSERT(buf->b_data != NULL);
 426  426                  db->db.db_data = buf->b_data;
 427  427                  if (!arc_released(buf))
 428  428                          arc_set_callback(buf, dbuf_do_evict, db);
 429  429                  dbuf_update_data(db);
 430  430          } else {
 431  431                  dbuf_evict_user(db);
 432  432                  db->db.db_data = NULL;
 433  433                  if (db->db_state != DB_NOFILL)
 434  434                          db->db_state = DB_UNCACHED;
 435  435          }
 436  436  }
 437  437  
 438  438  /*
 439  439   * Loan out an arc_buf for read.  Return the loaned arc_buf.
 440  440   */
 441  441  arc_buf_t *
 442  442  dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 443  443  {
 444  444          arc_buf_t *abuf;
 445  445  
 446  446          mutex_enter(&db->db_mtx);
 447  447          if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 448  448                  int blksz = db->db.db_size;
 449  449                  spa_t *spa;
 450  450  
 451  451                  mutex_exit(&db->db_mtx);
 452  452                  DB_GET_SPA(&spa, db);
 453  453                  abuf = arc_loan_buf(spa, blksz);
 454  454                  bcopy(db->db.db_data, abuf->b_data, blksz);
 455  455          } else {
 456  456                  abuf = db->db_buf;
 457  457                  arc_loan_inuse_buf(abuf, db);
 458  458                  dbuf_set_data(db, NULL);
 459  459                  mutex_exit(&db->db_mtx);
 460  460          }
 461  461          return (abuf);
 462  462  }
 463  463  
 464  464  uint64_t
 465  465  dbuf_whichblock(dnode_t *dn, uint64_t offset)
 466  466  {
 467  467          if (dn->dn_datablkshift) {
 468  468                  return (offset >> dn->dn_datablkshift);
 469  469          } else {
 470  470                  ASSERT3U(offset, <, dn->dn_datablksz);
 471  471                  return (0);
 472  472          }
 473  473  }
 474  474  
 475  475  static void
 476  476  dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 477  477  {
 478  478          dmu_buf_impl_t *db = vdb;
 479  479  
 480  480          mutex_enter(&db->db_mtx);
 481  481          ASSERT3U(db->db_state, ==, DB_READ);
 482  482          /*
 483  483           * All reads are synchronous, so we must have a hold on the dbuf
 484  484           */
 485  485          ASSERT(refcount_count(&db->db_holds) > 0);
 486  486          ASSERT(db->db_buf == NULL);
 487  487          ASSERT(db->db.db_data == NULL);
 488  488          if (db->db_level == 0 && db->db_freed_in_flight) {
 489  489                  /* we were freed in flight; disregard any error */
 490  490                  arc_release(buf, db);
 491  491                  bzero(buf->b_data, db->db.db_size);
 492  492                  arc_buf_freeze(buf);
 493  493                  db->db_freed_in_flight = FALSE;
 494  494                  dbuf_set_data(db, buf);
 495  495                  db->db_state = DB_CACHED;
 496  496          } else if (zio == NULL || zio->io_error == 0) {
 497  497                  dbuf_set_data(db, buf);
 498  498                  db->db_state = DB_CACHED;
 499  499          } else {
 500  500                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 501  501                  ASSERT3P(db->db_buf, ==, NULL);
 502  502                  VERIFY(arc_buf_remove_ref(buf, db) == 1);
 503  503                  db->db_state = DB_UNCACHED;
 504  504          }
 505  505          cv_broadcast(&db->db_changed);
 506  506          dbuf_rele_and_unlock(db, NULL);
 507  507  }
 508  508  
 509  509  static void
 510  510  dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 511  511  {
 512  512          dnode_t *dn;
 513  513          spa_t *spa;
 514  514          zbookmark_t zb;
 515  515          uint32_t aflags = ARC_NOWAIT;
 516  516          arc_buf_t *pbuf;
 517  517  
 518  518          DB_DNODE_ENTER(db);
 519  519          dn = DB_DNODE(db);
 520  520          ASSERT(!refcount_is_zero(&db->db_holds));
 521  521          /* We need the struct_rwlock to prevent db_blkptr from changing. */
 522  522          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 523  523          ASSERT(MUTEX_HELD(&db->db_mtx));
 524  524          ASSERT(db->db_state == DB_UNCACHED);
 525  525          ASSERT(db->db_buf == NULL);
 526  526  
 527  527          if (db->db_blkid == DMU_BONUS_BLKID) {
 528  528                  int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 529  529  
 530  530                  ASSERT3U(bonuslen, <=, db->db.db_size);
 531  531                  db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 532  532                  arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 533  533                  if (bonuslen < DN_MAX_BONUSLEN)
 534  534                          bzero(db->db.db_data, DN_MAX_BONUSLEN);
 535  535                  if (bonuslen)
 536  536                          bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
 537  537                  DB_DNODE_EXIT(db);
 538  538                  dbuf_update_data(db);
 539  539                  db->db_state = DB_CACHED;
 540  540                  mutex_exit(&db->db_mtx);
 541  541                  return;
 542  542          }
 543  543  
 544  544          /*
 545  545           * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 546  546           * processes the delete record and clears the bp while we are waiting
 547  547           * for the dn_mtx (resulting in a "no" from block_freed).
 548  548           */
 549  549          if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
 550  550              (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
 551  551              BP_IS_HOLE(db->db_blkptr)))) {
 552  552                  arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 553  553  
 554  554                  dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
 555  555                      db->db.db_size, db, type));
 556  556                  DB_DNODE_EXIT(db);
 557  557                  bzero(db->db.db_data, db->db.db_size);
 558  558                  db->db_state = DB_CACHED;
 559  559                  *flags |= DB_RF_CACHED;
 560  560                  mutex_exit(&db->db_mtx);
 561  561                  return;
 562  562          }
 563  563  
 564  564          spa = dn->dn_objset->os_spa;
 565  565          DB_DNODE_EXIT(db);
 566  566  
 567  567          db->db_state = DB_READ;
 568  568          mutex_exit(&db->db_mtx);
 569  569  
 570  570          if (DBUF_IS_L2CACHEABLE(db))
 571  571                  aflags |= ARC_L2CACHE;
 572  572  
 573  573          SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 574  574              db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 575  575              db->db.db_object, db->db_level, db->db_blkid);
 576  576  
 577  577          dbuf_add_ref(db, NULL);
 578  578          /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
 579  579  
 580  580          if (db->db_parent)
 581  581                  pbuf = db->db_parent->db_buf;
 582  582          else
 583  583                  pbuf = db->db_objset->os_phys_buf;
 584  584  
 585  585          (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
 586  586              dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 587  587              (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 588  588              &aflags, &zb);
 589  589          if (aflags & ARC_CACHED)
 590  590                  *flags |= DB_RF_CACHED;
 591  591  }
 592  592  
 593  593  int
 594  594  dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 595  595  {
 596  596          int err = 0;
 597  597          int havepzio = (zio != NULL);
 598  598          int prefetch;
 599  599          dnode_t *dn;
 600  600  
 601  601          /*
 602  602           * We don't have to hold the mutex to check db_state because it
 603  603           * can't be freed while we have a hold on the buffer.
 604  604           */
 605  605          ASSERT(!refcount_is_zero(&db->db_holds));
 606  606  
 607  607          if (db->db_state == DB_NOFILL)
 608  608                  return (EIO);
 609  609  
 610  610          DB_DNODE_ENTER(db);
 611  611          dn = DB_DNODE(db);
 612  612          if ((flags & DB_RF_HAVESTRUCT) == 0)
 613  613                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 614  614  
 615  615          prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 616  616              (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 617  617              DBUF_IS_CACHEABLE(db);
 618  618  
 619  619          mutex_enter(&db->db_mtx);
 620  620          if (db->db_state == DB_CACHED) {
 621  621                  mutex_exit(&db->db_mtx);
 622  622                  if (prefetch)
 623  623                          dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 624  624                              db->db.db_size, TRUE);
 625  625                  if ((flags & DB_RF_HAVESTRUCT) == 0)
 626  626                          rw_exit(&dn->dn_struct_rwlock);
 627  627                  DB_DNODE_EXIT(db);
 628  628          } else if (db->db_state == DB_UNCACHED) {
 629  629                  spa_t *spa = dn->dn_objset->os_spa;
 630  630  
 631  631                  if (zio == NULL)
 632  632                          zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 633  633                  dbuf_read_impl(db, zio, &flags);
 634  634  
 635  635                  /* dbuf_read_impl has dropped db_mtx for us */
 636  636  
 637  637                  if (prefetch)
 638  638                          dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 639  639                              db->db.db_size, flags & DB_RF_CACHED);
 640  640  
 641  641                  if ((flags & DB_RF_HAVESTRUCT) == 0)
 642  642                          rw_exit(&dn->dn_struct_rwlock);
 643  643                  DB_DNODE_EXIT(db);
 644  644  
 645  645                  if (!havepzio)
 646  646                          err = zio_wait(zio);
 647  647          } else {
 648  648                  mutex_exit(&db->db_mtx);
 649  649                  if (prefetch)
 650  650                          dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 651  651                              db->db.db_size, TRUE);
 652  652                  if ((flags & DB_RF_HAVESTRUCT) == 0)
 653  653                          rw_exit(&dn->dn_struct_rwlock);
 654  654                  DB_DNODE_EXIT(db);
 655  655  
 656  656                  mutex_enter(&db->db_mtx);
 657  657                  if ((flags & DB_RF_NEVERWAIT) == 0) {
 658  658                          while (db->db_state == DB_READ ||
 659  659                              db->db_state == DB_FILL) {
 660  660                                  ASSERT(db->db_state == DB_READ ||
 661  661                                      (flags & DB_RF_HAVESTRUCT) == 0);
 662  662                                  cv_wait(&db->db_changed, &db->db_mtx);
 663  663                          }
 664  664                          if (db->db_state == DB_UNCACHED)
 665  665                                  err = EIO;
 666  666                  }
 667  667                  mutex_exit(&db->db_mtx);
 668  668          }
 669  669  
 670  670          ASSERT(err || havepzio || db->db_state == DB_CACHED);
 671  671          return (err);
 672  672  }
 673  673  
 674  674  static void
 675  675  dbuf_noread(dmu_buf_impl_t *db)
 676  676  {
 677  677          ASSERT(!refcount_is_zero(&db->db_holds));
 678  678          ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 679  679          mutex_enter(&db->db_mtx);
 680  680          while (db->db_state == DB_READ || db->db_state == DB_FILL)
 681  681                  cv_wait(&db->db_changed, &db->db_mtx);
 682  682          if (db->db_state == DB_UNCACHED) {
 683  683                  arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 684  684                  spa_t *spa;
 685  685  
 686  686                  ASSERT(db->db_buf == NULL);
 687  687                  ASSERT(db->db.db_data == NULL);
 688  688                  DB_GET_SPA(&spa, db);
 689  689                  dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 690  690                  db->db_state = DB_FILL;
 691  691          } else if (db->db_state == DB_NOFILL) {
 692  692                  dbuf_set_data(db, NULL);
 693  693          } else {
 694  694                  ASSERT3U(db->db_state, ==, DB_CACHED);
 695  695          }
 696  696          mutex_exit(&db->db_mtx);
 697  697  }
 698  698  
 699  699  /*
 700  700   * This is our just-in-time copy function.  It makes a copy of
 701  701   * buffers, that have been modified in a previous transaction
 702  702   * group, before we modify them in the current active group.
 703  703   *
 704  704   * This function is used in two places: when we are dirtying a
 705  705   * buffer for the first time in a txg, and when we are freeing
 706  706   * a range in a dnode that includes this buffer.
 707  707   *
 708  708   * Note that when we are called from dbuf_free_range() we do
 709  709   * not put a hold on the buffer, we just traverse the active
 710  710   * dbuf list for the dnode.
 711  711   */
 712  712  static void
 713  713  dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 714  714  {
 715  715          dbuf_dirty_record_t *dr = db->db_last_dirty;
 716  716  
 717  717          ASSERT(MUTEX_HELD(&db->db_mtx));
 718  718          ASSERT(db->db.db_data != NULL);
 719  719          ASSERT(db->db_level == 0);
 720  720          ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 721  721  
 722  722          if (dr == NULL ||
 723  723              (dr->dt.dl.dr_data !=
 724  724              ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 725  725                  return;
 726  726  
 727  727          /*
 728  728           * If the last dirty record for this dbuf has not yet synced
 729  729           * and its referencing the dbuf data, either:
 730  730           *      reset the reference to point to a new copy,
 731  731           * or (if there a no active holders)
 732  732           *      just null out the current db_data pointer.
 733  733           */
 734  734          ASSERT(dr->dr_txg >= txg - 2);
 735  735          if (db->db_blkid == DMU_BONUS_BLKID) {
 736  736                  /* Note that the data bufs here are zio_bufs */
 737  737                  dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 738  738                  arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 739  739                  bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 740  740          } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 741  741                  int size = db->db.db_size;
 742  742                  arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 743  743                  spa_t *spa;
 744  744  
 745  745                  DB_GET_SPA(&spa, db);
 746  746                  dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 747  747                  bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 748  748          } else {
 749  749                  dbuf_set_data(db, NULL);
 750  750          }
 751  751  }
 752  752  
 753  753  void
 754  754  dbuf_unoverride(dbuf_dirty_record_t *dr)
 755  755  {
 756  756          dmu_buf_impl_t *db = dr->dr_dbuf;
 757  757          blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 758  758          uint64_t txg = dr->dr_txg;
 759  759  
 760  760          ASSERT(MUTEX_HELD(&db->db_mtx));
 761  761          ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 762  762          ASSERT(db->db_level == 0);
 763  763  
 764  764          if (db->db_blkid == DMU_BONUS_BLKID ||
 765  765              dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 766  766                  return;
 767  767  
 768  768          ASSERT(db->db_data_pending != dr);
 769  769  
 770  770          /* free this block */
 771  771          if (!BP_IS_HOLE(bp)) {
 772  772                  spa_t *spa;
 773  773  
 774  774                  DB_GET_SPA(&spa, db);
 775  775                  zio_free(spa, txg, bp);
 776  776          }
 777  777          dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 778  778          /*
 779  779           * Release the already-written buffer, so we leave it in
 780  780           * a consistent dirty state.  Note that all callers are
 781  781           * modifying the buffer, so they will immediately do
 782  782           * another (redundant) arc_release().  Therefore, leave
 783  783           * the buf thawed to save the effort of freezing &
 784  784           * immediately re-thawing it.
 785  785           */
 786  786          arc_release(dr->dt.dl.dr_data, db);
 787  787  }
 788  788  
 789  789  /*
 790  790   * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 791  791   * data blocks in the free range, so that any future readers will find
 792  792   * empty blocks.  Also, if we happen across any level-1 dbufs in the
 793  793   * range that have not already been marked dirty, mark them dirty so
 794  794   * they stay in memory.
 795  795   */
 796  796  void
 797  797  dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 798  798  {
 799  799          dmu_buf_impl_t *db, *db_next;
 800  800          uint64_t txg = tx->tx_txg;
 801  801          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
                   /*
                    * Shifting a level-0 block id right by epbs yields the id of
                    * the level-1 block above it (dbuf_dirty() locates a parent
                    * the same way), so these bound the level-1 dbufs that cover
                    * the range.
                    */
 802  802          uint64_t first_l1 = start >> epbs;
 803  803          uint64_t last_l1 = end >> epbs;
 804  804  
                   /*
                    * Clamp the range to the dnode's current last block, except
                    * when the caller passed DMU_SPILL_BLKID as the end.
                    */
 805  805          if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
 806  806                  end = dn->dn_maxblkid;
 807  807                  last_l1 = end >> epbs;
 808  808          }
 809  809          dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
 810  810          mutex_enter(&dn->dn_dbufs_mtx);
 811  811          for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
                           /*
                            * Pick up the successor before touching db: the
                            * dbuf_undirty() and dbuf_clear() calls below may
                            * evict db, after which it must not be used to
                            * advance the walk.
                            */
 812  812                  db_next = list_next(&dn->dn_dbufs, db);
 813  813                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 814  814  
                           /*
                            * Level-1 dbuf over the range whose newest dirty
                            * record is from an earlier txg: dirty it in this
                            * txg.  A temporary hold is taken so that db_mtx can
                            * be dropped before calling dbuf_will_dirty().
                            */
 815  815                  if (db->db_level == 1 &&
 816  816                      db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
 817  817                          mutex_enter(&db->db_mtx);
 818  818                          if (db->db_last_dirty &&
 819  819                              db->db_last_dirty->dr_txg < txg) {
 820  820                                  dbuf_add_ref(db, FTAG);
 821  821                                  mutex_exit(&db->db_mtx);
 822  822                                  dbuf_will_dirty(db, tx);
 823  823                                  dbuf_rele(db, FTAG);
 824  824                          } else {
 825  825                                  mutex_exit(&db->db_mtx);
 826  826                          }
 827  827                  }
 828  828  
                           /* Everything below applies to level-0 dbufs only. */
 829  829                  if (db->db_level != 0)
 830  830                          continue;
 831  831                  dprintf_dbuf(db, "found buf %s\n", "");
 832  832                  if (db->db_blkid < start || db->db_blkid > end)
 833  833                          continue;
 834  834  
 835  835                  /* found a level 0 buffer in the range */
                           /*
                            * dbuf_undirty() returns nonzero only when it ended
                            * up evicting the dbuf, in which case we are done
                            * with it.
                            */
 836  836                  if (dbuf_undirty(db, tx))
 837  837                          continue;
 838  838  
 839  839                  mutex_enter(&db->db_mtx);
                           /* No data attached in these states: nothing to clear. */
 840  840                  if (db->db_state == DB_UNCACHED ||
 841  841                      db->db_state == DB_NOFILL ||
 842  842                      db->db_state == DB_EVICTING) {
 843  843                          ASSERT(db->db.db_data == NULL);
 844  844                          mutex_exit(&db->db_mtx);
 845  845                          continue;
 846  846                  }
 847  847                  if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 848  848                          /* will be handled in dbuf_read_done or dbuf_rele */
 849  849                          db->db_freed_in_flight = TRUE;
 850  850                          mutex_exit(&db->db_mtx);
 851  851                          continue;
 852  852                  }
 853  853                  if (refcount_count(&db->db_holds) == 0) {
 854  854                          ASSERT(db->db_buf);
                                   /*
                                    * NOTE(review): db_mtx is not exited on this
                                    * path; presumably dbuf_clear() drops it --
                                    * confirm against dbuf_clear().
                                    */
 855  855                          dbuf_clear(db);
 856  856                          continue;
 857  857                  }
 858  858                  /* The dbuf is referenced */
 859  859  
 860  860                  if (db->db_last_dirty != NULL) {
 861  861                          dbuf_dirty_record_t *dr = db->db_last_dirty;
 862  862  
 863  863                          if (dr->dr_txg == txg) {
 864  864                                  /*
 865  865                                   * This buffer is "in-use", re-adjust the file
 866  866                                   * size to reflect that this buffer may
 867  867                                   * contain new data when we sync.
 868  868                                   */
 869  869                                  if (db->db_blkid != DMU_SPILL_BLKID &&
 870  870                                      db->db_blkid > dn->dn_maxblkid)
 871  871                                          dn->dn_maxblkid = db->db_blkid;
 872  872                                  dbuf_unoverride(dr);
 873  873                          } else {
 874  874                                  /*
 875  875                                   * This dbuf is not dirty in the open context.
 876  876                                   * Either uncache it (if its not referenced in
 877  877                                   * the open context) or reset its contents to
 878  878                                   * empty.
 879  879                                   */
 880  880                                  dbuf_fix_old_data(db, txg);
 881  881                          }
 882  882                  }
 883  883                  /* clear the contents if its cached */
 884  884                  if (db->db_state == DB_CACHED) {
 885  885                          ASSERT(db->db.db_data != NULL);
                                   /*
                                    * Release the buf before writing to it, zero
                                    * it, then freeze it again.
                                    */
 886  886                          arc_release(db->db_buf, db);
 887  887                          bzero(db->db.db_data, db->db.db_size);
 888  888                          arc_buf_freeze(db->db_buf);
 889  889                  }
 890  890  
 891  891                  mutex_exit(&db->db_mtx);
 892  892          }
 893  893          mutex_exit(&dn->dn_dbufs_mtx);
 894  894  }
 895  895  
 896  896  static int
 897  897  dbuf_block_freeable(dmu_buf_impl_t *db)
 898  898  {
 899  899          dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 900  900          uint64_t birth_txg = 0;
 901  901  
 902  902          /*
 903  903           * We don't need any locking to protect db_blkptr:
 904  904           * If it's syncing, then db_last_dirty will be set
 905  905           * so we'll ignore db_blkptr.
 906  906           */
 907  907          ASSERT(MUTEX_HELD(&db->db_mtx));
 908  908          if (db->db_last_dirty)
 909  909                  birth_txg = db->db_last_dirty->dr_txg;
 910  910          else if (db->db_blkptr)
 911  911                  birth_txg = db->db_blkptr->blk_birth;
 912  912  
 913  913          /*
 914  914           * If we don't exist or are in a snapshot, we can't be freed.
 915  915           * Don't pass the bp to dsl_dataset_block_freeable() since we
 916  916           * are holding the db_mtx lock and might deadlock if we are
 917  917           * prefetching a dedup-ed block.
 918  918           */
 919  919          if (birth_txg)
 920  920                  return (ds == NULL ||
 921  921                      dsl_dataset_block_freeable(ds, NULL, birth_txg));
 922  922          else
 923  923                  return (FALSE);
 924  924  }
 925  925  
 926  926  void
 927  927  dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 928  928  {
 929  929          arc_buf_t *buf, *obuf;
 930  930          int osize = db->db.db_size;
 931  931          arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 932  932          dnode_t *dn;
 933  933  
 934  934          ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 935  935  
 936  936          DB_DNODE_ENTER(db);
 937  937          dn = DB_DNODE(db);
 938  938  
 939  939          /* XXX does *this* func really need the lock? */
 940  940          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 941  941  
 942  942          /*
 943  943           * This call to dbuf_will_dirty() with the dn_struct_rwlock held
 944  944           * is OK, because there can be no other references to the db
 945  945           * when we are changing its size, so no concurrent DB_FILL can
 946  946           * be happening.
 947  947           */
 948  948          /*
 949  949           * XXX we should be doing a dbuf_read, checking the return
 950  950           * value and returning that up to our callers
 951  951           */
 952  952          dbuf_will_dirty(db, tx);
 953  953  
 954  954          /* create the data buffer for the new block */
 955  955          buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
 956  956  
 957  957          /* copy old block data to the new block */
 958  958          obuf = db->db_buf;
 959  959          bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
 960  960          /* zero the remainder */
 961  961          if (size > osize)
 962  962                  bzero((uint8_t *)buf->b_data + osize, size - osize);
 963  963  
 964  964          mutex_enter(&db->db_mtx);
 965  965          dbuf_set_data(db, buf);
 966  966          VERIFY(arc_buf_remove_ref(obuf, db) == 1);
 967  967          db->db.db_size = size;
 968  968  
 969  969          if (db->db_level == 0) {
 970  970                  ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 971  971                  db->db_last_dirty->dt.dl.dr_data = buf;
 972  972          }
 973  973          mutex_exit(&db->db_mtx);
 974  974  
 975  975          dnode_willuse_space(dn, size-osize, tx);
 976  976          DB_DNODE_EXIT(db);
 977  977  }
 978  978  
 979  979  void
 980  980  dbuf_release_bp(dmu_buf_impl_t *db)
 981  981  {
 982  982          objset_t *os;
 983  983          zbookmark_t zb;
 984  984  
 985  985          DB_GET_OBJSET(&os, db);
 986  986          ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 987  987          ASSERT(arc_released(os->os_phys_buf) ||
 988  988              list_link_active(&os->os_dsl_dataset->ds_synced_link));
 989  989          ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 990  990  
 991  991          zb.zb_objset = os->os_dsl_dataset ?
 992  992              os->os_dsl_dataset->ds_object : 0;
 993  993          zb.zb_object = db->db.db_object;
 994  994          zb.zb_level = db->db_level;
 995  995          zb.zb_blkid = db->db_blkid;
 996  996          (void) arc_release_bp(db->db_buf, db,
 997  997              db->db_blkptr, os->os_spa, &zb);
 998  998  }
 999  999  
/*
 * Mark db dirty in tx's transaction group and return the
 * dbuf_dirty_record_t that tracks it.
 *
 * The caller must hold db (asserted) and tx must have a non-zero txg.
 * If db already has a dirty record for tx->tx_txg, that record is
 * returned (after undoing any override on a non-bonus level-0 buffer).
 * Otherwise a new record is allocated, linked into db's db_last_dirty
 * list, and db gains a hold tagged with the txg plus one db_dirtycnt.
 * The new record is then placed on exactly one list:
 *	- the dnode's dn_dirty_records[] for this txg, for bonus and
 *	  spill blocks and for blocks at the dnode's top level; or
 *	- the dr_children list of the parent's dirty record, which is
 *	  obtained by calling dbuf_dirty() recursively on the parent.
 * Finally the dnode itself is marked dirty via dnode_setdirty().
 *
 * Locks taken here: db_mtx, dn_mtx, the parent record's dr_mtx, and
 * dn_struct_rwlock as reader when it is not already write-held.
 */
1000 1000  dbuf_dirty_record_t *
1001 1001  dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1002 1002  {
1003 1003          dnode_t *dn;
1004 1004          objset_t *os;
1005 1005          dbuf_dirty_record_t **drp, *dr;
1006 1006          int drop_struct_lock = FALSE;
1007 1007          boolean_t do_free_accounting = B_FALSE;
1008 1008          int txgoff = tx->tx_txg & TXG_MASK;
1009 1009  
1010 1010          ASSERT(tx->tx_txg != 0);
1011 1011          ASSERT(!refcount_is_zero(&db->db_holds));
1012 1012          DMU_TX_DIRTY_BUF(tx, db);
1013 1013  
1014 1014          DB_DNODE_ENTER(db);
1015 1015          dn = DB_DNODE(db);
1016 1016          /*
1017 1017           * Shouldn't dirty a regular buffer in syncing context.  Private
1018 1018           * objects may be dirtied in syncing context, but only if they
1019 1019           * were already pre-dirtied in open context.
1020 1020           */
1021 1021          ASSERT(!dmu_tx_is_syncing(tx) ||
1022 1022              BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1023 1023              DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1024 1024              dn->dn_objset->os_dsl_dataset == NULL);
1025 1025          /*
1026 1026           * We make this assert for private objects as well, but after we
1027 1027           * check if we're already dirty.  They are allowed to re-dirty
1028 1028           * in syncing context.
1029 1029           */
1030 1030          ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1031 1031              dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1032 1032              (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1033 1033  
1034 1034          mutex_enter(&db->db_mtx);
1035 1035          /*
1036 1036           * XXX make this true for indirects too?  The problem is that
1037 1037           * transactions created with dmu_tx_create_assigned() from
1038 1038           * syncing context don't bother holding ahead.
1039 1039           */
1040 1040          ASSERT(db->db_level != 0 ||
1041 1041              db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1042 1042              db->db_state == DB_NOFILL);
1043 1043  
1044 1044          mutex_enter(&dn->dn_mtx);
1045 1045          /*
1046 1046           * Don't set dirtyctx to SYNC if we're just modifying this as we
1047 1047           * initialize the objset.
1048 1048           */
1049 1049          if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1050 1050              !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1051 1051                  dn->dn_dirtyctx =
1052 1052                      (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1053 1053                  ASSERT(dn->dn_dirtyctx_firstset == NULL);
1054 1054                  dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1055 1055          }
1056 1056          mutex_exit(&dn->dn_mtx);
1057 1057  
1058 1058          if (db->db_blkid == DMU_SPILL_BLKID)
1059 1059                  dn->dn_have_spill = B_TRUE;
1060 1060  
1061 1061          /*
1062 1062           * If this buffer is already dirty, we're done.
1063 1063           */
1064 1064          drp = &db->db_last_dirty;
1065 1065          ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1066 1066              db->db.db_object == DMU_META_DNODE_OBJECT);
                   /*
                    * Step past records that belong to txgs newer than this tx;
                    * drp is left pointing at the link where a record for this
                    * txg is, or would be inserted.
                    */
1067 1067          while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1068 1068                  drp = &dr->dr_next;
1069 1069          if (dr && dr->dr_txg == tx->tx_txg) {
1070 1070                  DB_DNODE_EXIT(db);
1071 1071  
1072 1072                  if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1073 1073                          /*
1074 1074                           * If this buffer has already been written out,
1075 1075                           * we now need to reset its state.
1076 1076                           */
1077 1077                          dbuf_unoverride(dr);
1078 1078                          if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1079 1079                              db->db_state != DB_NOFILL)
1080 1080                                  arc_buf_thaw(db->db_buf);
1081 1081                  }
1082 1082                  mutex_exit(&db->db_mtx);
1083 1083                  return (dr);
1084 1084          }
1085 1085  
1086 1086          /*
1087 1087           * Only valid if not already dirty.
1088 1088           */
1089 1089          ASSERT(dn->dn_object == 0 ||
1090 1090              dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1091 1091              (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1092 1092  
1093 1093          ASSERT3U(dn->dn_nlevels, >, db->db_level);
1094 1094          ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1095 1095              dn->dn_phys->dn_nlevels > db->db_level ||
1096 1096              dn->dn_next_nlevels[txgoff] > db->db_level ||
1097 1097              dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1098 1098              dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1099 1099  
1100 1100          /*
1101 1101           * We should only be dirtying in syncing context if it's the
1102 1102           * mos or we're initializing the os or it's a special object.
1103 1103           * However, we are allowed to dirty in syncing context provided
1104 1104           * we already dirtied it in open context.  Hence we must make
1105 1105           * this assertion only if we're not already dirty.
1106 1106           */
1107 1107          os = dn->dn_objset;
1108 1108          ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1109 1109              os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1110 1110          ASSERT(db->db.db_size != 0);
1111 1111  
1112 1112          dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1113 1113  
1114 1114          if (db->db_blkid != DMU_BONUS_BLKID) {
1115 1115                  /*
1116 1116                   * Update the accounting.
1117 1117                   * Note: we delay "free accounting" until after we drop
1118 1118                   * the db_mtx.  This keeps us from grabbing other locks
1119 1119                   * (and possibly deadlocking) in bp_get_dsize() while
1120 1120                   * also holding the db_mtx.
1121 1121                   */
1122 1122                  dnode_willuse_space(dn, db->db.db_size, tx);
1123 1123                  do_free_accounting = dbuf_block_freeable(db);
1124 1124          }
1125 1125  
1126 1126          /*
1127 1127           * If this buffer is dirty in an old transaction group we need
1128 1128           * to make a copy of it so that the changes we make in this
1129 1129           * transaction group won't leak out when we sync the older txg.
1130 1130           */
1131 1131          dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1132 1132          if (db->db_level == 0) {
1133 1133                  void *data_old = db->db_buf;
1134 1134  
1135 1135                  if (db->db_state != DB_NOFILL) {
1136 1136                          if (db->db_blkid == DMU_BONUS_BLKID) {
1137 1137                                  dbuf_fix_old_data(db, tx->tx_txg);
1138 1138                                  data_old = db->db.db_data;
1139 1139                          } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1140 1140                                  /*
1141 1141                                   * Release the data buffer from the cache so
1142 1142                                   * that we can modify it without impacting
1143 1143                                   * possible other users of this cached data
1144 1144                                   * block.  Note that indirect blocks and
1145 1145                                   * private objects are not released until the
1146 1146                                   * syncing state (since they are only modified
1147 1147                                   * then).
1148 1148                                   */
1149 1149                                  arc_release(db->db_buf, db);
1150 1150                                  dbuf_fix_old_data(db, tx->tx_txg);
1151 1151                                  data_old = db->db_buf;
1152 1152                          }
1153 1153                          ASSERT(data_old != NULL);
1154 1154                  }
1155 1155                  dr->dt.dl.dr_data = data_old;
1156 1156          } else {
                           /*
                            * Indirect block: the record carries a list of its
                            * children's dirty records, guarded by dr_mtx.
                            */
1157 1157                  mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1158 1158                  list_create(&dr->dt.di.dr_children,
1159 1159                      sizeof (dbuf_dirty_record_t),
1160 1160                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
1161 1161          }
                   /* Splice the new record into db's list at the spot found above. */
1162 1162          dr->dr_dbuf = db;
1163 1163          dr->dr_txg = tx->tx_txg;
1164 1164          dr->dr_next = *drp;
1165 1165          *drp = dr;
1166 1166  
1167 1167          /*
1168 1168           * We could have been freed_in_flight between the dbuf_noread
1169 1169           * and dbuf_dirty.  We win, as though the dbuf_noread() had
1170 1170           * happened after the free.
1171 1171           */
1172 1172          if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1173 1173              db->db_blkid != DMU_SPILL_BLKID) {
1174 1174                  mutex_enter(&dn->dn_mtx);
1175 1175                  dnode_clear_range(dn, db->db_blkid, 1, tx);
1176 1176                  mutex_exit(&dn->dn_mtx);
1177 1177                  db->db_freed_in_flight = FALSE;
1178 1178          }
1179 1179  
1180 1180          /*
1181 1181           * This buffer is now part of this txg
1182 1182           */
1183 1183          dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1184 1184          db->db_dirtycnt += 1;
1185 1185          ASSERT3U(db->db_dirtycnt, <=, 3);
1186 1186  
1187 1187          mutex_exit(&db->db_mtx);
1188 1188  
                   /*
                    * Bonus and spill blocks go straight onto the dnode's dirty
                    * list for this txg; no parent dbuf is dirtied for them.
                    */
1189 1189          if (db->db_blkid == DMU_BONUS_BLKID ||
1190 1190              db->db_blkid == DMU_SPILL_BLKID) {
1191 1191                  mutex_enter(&dn->dn_mtx);
1192 1192                  ASSERT(!list_link_active(&dr->dr_dirty_node));
1193 1193                  list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1194 1194                  mutex_exit(&dn->dn_mtx);
1195 1195                  dnode_setdirty(dn, tx);
1196 1196                  DB_DNODE_EXIT(db);
1197 1197                  return (dr);
1198 1198          } else if (do_free_accounting) {
1199 1199                  blkptr_t *bp = db->db_blkptr;
1200 1200                  int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1201 1201                      bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1202 1202                  /*
1203 1203                   * This is only a guess -- if the dbuf is dirty
1204 1204                   * in a previous txg, we don't know how much
1205 1205                   * space it will use on disk yet.  We should
1206 1206                   * really have the struct_rwlock to access
1207 1207                   * db_blkptr, but since this is just a guess,
1208 1208                   * it's OK if we get an odd answer.
1209 1209                   */
1210 1210                  ddt_prefetch(os->os_spa, bp);
1211 1211                  dnode_willuse_space(dn, -willfree, tx);
1212 1212          }
1213 1213  
                   /*
                    * Take dn_struct_rwlock as reader unless RW_WRITE_HELD()
                    * reports it write-held, and remember that it must be
                    * dropped again below.
                    */
1214 1214          if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1215 1215                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
1216 1216                  drop_struct_lock = TRUE;
1217 1217          }
1218 1218  
1219 1219          if (db->db_level == 0) {
1220 1220                  dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1221 1221                  ASSERT(dn->dn_maxblkid >= db->db_blkid);
1222 1222          }
1223 1223  
                   /*
                    * If there is a level above db, dirty the parent and hang dr
                    * off the parent's dirty record; otherwise (else branch) db
                    * is at the dnode's top level and dr goes on the dnode's
                    * dirty list for this txg.
                    */
1224 1224          if (db->db_level+1 < dn->dn_nlevels) {
1225 1225                  dmu_buf_impl_t *parent = db->db_parent;
1226 1226                  dbuf_dirty_record_t *di;
1227 1227                  int parent_held = FALSE;
1228 1228  
                           /*
                            * No usable parent pointer yet (NULL, or it still
                            * points at the dnode's own dbuf): look the parent
                            * up and hold it for the duration of the call.
                            */
1229 1229                  if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1230 1230                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1231 1231  
1232 1232                          parent = dbuf_hold_level(dn, db->db_level+1,
1233 1233                              db->db_blkid >> epbs, FTAG);
1234 1234                          ASSERT(parent != NULL);
1235 1235                          parent_held = TRUE;
1236 1236                  }
1237 1237                  if (drop_struct_lock)
1238 1238                          rw_exit(&dn->dn_struct_rwlock);
1239 1239                  ASSERT3U(db->db_level+1, ==, parent->db_level);
1240 1240                  di = dbuf_dirty(parent, tx);
1241 1241                  if (parent_held)
1242 1242                          dbuf_rele(parent, FTAG);
1243 1243  
1244 1244                  mutex_enter(&db->db_mtx);
1245 1245                  /*  possible race with dbuf_undirty() */
1246 1246                  if (db->db_last_dirty == dr ||
1247 1247                      dn->dn_object == DMU_META_DNODE_OBJECT) {
1248 1248                          mutex_enter(&di->dt.di.dr_mtx);
1249 1249                          ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1250 1250                          ASSERT(!list_link_active(&dr->dr_dirty_node));
1251 1251                          list_insert_tail(&di->dt.di.dr_children, dr);
1252 1252                          mutex_exit(&di->dt.di.dr_mtx);
1253 1253                          dr->dr_parent = di;
1254 1254                  }
1255 1255                  mutex_exit(&db->db_mtx);
1256 1256          } else {
1257 1257                  ASSERT(db->db_level+1 == dn->dn_nlevels);
1258 1258                  ASSERT(db->db_blkid < dn->dn_nblkptr);
1259 1259                  ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1260 1260                  mutex_enter(&dn->dn_mtx);
1261 1261                  ASSERT(!list_link_active(&dr->dr_dirty_node));
1262 1262                  list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1263 1263                  mutex_exit(&dn->dn_mtx);
1264 1264                  if (drop_struct_lock)
1265 1265                          rw_exit(&dn->dn_struct_rwlock);
1266 1266          }
1267 1267  
1268 1268          dnode_setdirty(dn, tx);
1269 1269          DB_DNODE_EXIT(db);
1270 1270          return (dr);
1271 1271  }
1272 1272  
/*
 * Try to discard db's dirty record for tx's txg.
 *
 * Nothing is done (and 0 is returned) when db has no record for exactly
 * this txg, or when db has more holds than dirty records; in the latter
 * case the block is still removed from the dnode's range via
 * dnode_clear_range() unless it is the spill block.
 *
 * Otherwise the record is unlinked from db's list and from whichever
 * list dbuf_dirty() put it on, its resources are freed, and the hold
 * that was tagged with the txg is dropped.  If that was the last hold
 * the dbuf is evicted and 1 is returned; otherwise 0 is returned.
 *
 * db must not be the bonus buffer.  db_mtx is taken here.
 */
1273 1273  static int
1274 1274  dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1275 1275  {
1276 1276          dnode_t *dn;
1277 1277          uint64_t txg = tx->tx_txg;
1278 1278          dbuf_dirty_record_t *dr, **drp;
1279 1279  
1280 1280          ASSERT(txg != 0);
1281 1281          ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1282 1282  
1283 1283          mutex_enter(&db->db_mtx);
1284 1284          /*
1285 1285           * If this buffer is not dirty, we're done.
1286 1286           */
                   /*
                    * Stop at the first record whose txg is not newer than ours;
                    * drp remembers the link that points at it so the record
                    * can be unlinked below.
                    */
1287 1287          for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1288 1288                  if (dr->dr_txg <= txg)
1289 1289                          break;
1290 1290          if (dr == NULL || dr->dr_txg < txg) {
1291 1291                  mutex_exit(&db->db_mtx);
1292 1292                  return (0);
1293 1293          }
1294 1294          ASSERT(dr->dr_txg == txg);
1295 1295          ASSERT(dr->dr_dbuf == db);
1296 1296  
1297 1297          DB_DNODE_ENTER(db);
1298 1298          dn = DB_DNODE(db);
1299 1299  
1300 1300          /*
1301 1301           * If this buffer is currently held, we cannot undirty
1302 1302           * it, since one of the current holders may be in the
1303 1303           * middle of an update.  Note that users of dbuf_undirty()
1304 1304           * should not place a hold on the dbuf before the call.
1305 1305           * Also note: we can get here with a spill block, so
1306 1306           * test for that similar to how dbuf_dirty does.
1307 1307           */
1308 1308          if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1309 1309                  mutex_exit(&db->db_mtx);
1310 1310                  /* Make sure we don't toss this buffer at sync phase */
1311 1311                  if (db->db_blkid != DMU_SPILL_BLKID) {
1312 1312                          mutex_enter(&dn->dn_mtx);
1313 1313                          dnode_clear_range(dn, db->db_blkid, 1, tx);
1314 1314                          mutex_exit(&dn->dn_mtx);
1315 1315                  }
1316 1316                  DB_DNODE_EXIT(db);
1317 1317                  return (0);
1318 1318          }
1319 1319  
1320 1320          dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1321 1321  
1322 1322          ASSERT(db->db.db_size != 0);
1323 1323  
1324 1324          /* XXX would be nice to fix up dn_towrite_space[] */
1325 1325  
                   /* Unlink the record from db's own list of dirty records. */
1326 1326          *drp = dr->dr_next;
1327 1327  
1328 1328          /*
1329 1329           * Note that there are three places in dbuf_dirty()
1330 1330           * where this dirty record may be put on a list.
1331 1331           * Make sure to do a list_remove corresponding to
1332 1332           * every one of those list_insert calls.
1333 1333           */
1334 1334          if (dr->dr_parent) {
1335 1335                  mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1336 1336                  list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1337 1337                  mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1338 1338          } else if (db->db_blkid == DMU_SPILL_BLKID ||
1339 1339              db->db_level+1 == dn->dn_nlevels) {
1340 1340                  ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1341 1341                  mutex_enter(&dn->dn_mtx);
1342 1342                  list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1343 1343                  mutex_exit(&dn->dn_mtx);
1344 1344          }
1345 1345          DB_DNODE_EXIT(db);
1346 1346  
1347 1347          if (db->db_level == 0) {
1348 1348                  if (db->db_state != DB_NOFILL) {
1349 1349                          dbuf_unoverride(dr);
1350 1350  
1351 1351                          ASSERT(db->db_buf != NULL);
1352 1352                          ASSERT(dr->dt.dl.dr_data != NULL);
                                   /*
                                    * The record has a buffer of its own that is
                                    * not the dbuf's current one: drop it.
                                    */
1353 1353                          if (dr->dt.dl.dr_data != db->db_buf)
1354 1354                                  VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
1355 1355                                      db) == 1);
1356 1356                  }
1357 1357          } else {
1358 1358                  ASSERT(db->db_buf != NULL);
1359 1359                  ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1360 1360                  mutex_destroy(&dr->dt.di.dr_mtx);
1361 1361                  list_destroy(&dr->dt.di.dr_children);
1362 1362          }
1363 1363          kmem_free(dr, sizeof (dbuf_dirty_record_t));
1364 1364  
1365 1365          ASSERT(db->db_dirtycnt > 0);
1366 1366          db->db_dirtycnt -= 1;
1367 1367  
                   /*
                    * Drop the hold that dbuf_dirty() added with the txg as its
                    * tag.  If no holds remain, detach and free the buf and
                    * evict the dbuf.  NOTE(review): db_mtx is not exited on
                    * this path; presumably dbuf_evict() takes care of it --
                    * confirm against dbuf_evict().
                    */
1368 1368          if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1369 1369                  arc_buf_t *buf = db->db_buf;
1370 1370  
1371 1371                  ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1372 1372                  dbuf_set_data(db, NULL);
1373 1373                  VERIFY(arc_buf_remove_ref(buf, db) == 1);
1374 1374                  dbuf_evict(db);
1375 1375                  return (1);
1376 1376          }
1377 1377  
1378 1378          mutex_exit(&db->db_mtx);
1379 1379          return (0);
1380 1380  }
1381 1381  
1382 1382  #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1383 1383  void
1384 1384  dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1385 1385  {
1386 1386          int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1387 1387  
1388 1388          ASSERT(tx->tx_txg != 0);
1389 1389          ASSERT(!refcount_is_zero(&db->db_holds));
1390 1390  
1391 1391          DB_DNODE_ENTER(db);
1392 1392          if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1393 1393                  rf |= DB_RF_HAVESTRUCT;
1394 1394          DB_DNODE_EXIT(db);
1395 1395          (void) dbuf_read(db, NULL, rf);
1396 1396          (void) dbuf_dirty(db, tx);
1397 1397  }
1398 1398  
1399 1399  void
1400 1400  dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1401 1401  {
1402 1402          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1403 1403  
1404 1404          db->db_state = DB_NOFILL;
1405 1405  
1406 1406          dmu_buf_will_fill(db_fake, tx);
1407 1407  }
1408 1408  
1409 1409  void
1410 1410  dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1411 1411  {
1412 1412          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1413 1413  
1414 1414          ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1415 1415          ASSERT(tx->tx_txg != 0);
1416 1416          ASSERT(db->db_level == 0);
1417 1417          ASSERT(!refcount_is_zero(&db->db_holds));
1418 1418  
1419 1419          ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1420 1420              dmu_tx_private_ok(tx));
1421 1421  
1422 1422          dbuf_noread(db);
1423 1423          (void) dbuf_dirty(db, tx);
1424 1424  }
1425 1425  
1426 1426  #pragma weak dmu_buf_fill_done = dbuf_fill_done
1427 1427  /* ARGSUSED */
1428 1428  void
1429 1429  dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1430 1430  {
1431 1431          mutex_enter(&db->db_mtx);
1432 1432          DBUF_VERIFY(db);
1433 1433  
1434 1434          if (db->db_state == DB_FILL) {
1435 1435                  if (db->db_level == 0 && db->db_freed_in_flight) {
1436 1436                          ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1437 1437                          /* we were freed while filling */
1438 1438                          /* XXX dbuf_undirty? */
1439 1439                          bzero(db->db.db_data, db->db.db_size);
1440 1440                          db->db_freed_in_flight = FALSE;
1441 1441                  }
1442 1442                  db->db_state = DB_CACHED;
1443 1443                  cv_broadcast(&db->db_changed);
1444 1444          }
1445 1445          mutex_exit(&db->db_mtx);
1446 1446  }
1447 1447  
1448 1448  /*
1449 1449   * Directly assign a provided arc buf to a given dbuf if it's not referenced
1450 1450   * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1451 1451   */
1452 1452  void
1453 1453  dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1454 1454  {
1455 1455          ASSERT(!refcount_is_zero(&db->db_holds));
1456 1456          ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1457 1457          ASSERT(db->db_level == 0);
1458 1458          ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1459 1459          ASSERT(buf != NULL);
1460 1460          ASSERT(arc_buf_size(buf) == db->db.db_size);
1461 1461          ASSERT(tx->tx_txg != 0);
1462 1462  
1463 1463          arc_return_buf(buf, db);
1464 1464          ASSERT(arc_released(buf));
1465 1465  
1466 1466          mutex_enter(&db->db_mtx);
1467 1467  
1468 1468          while (db->db_state == DB_READ || db->db_state == DB_FILL)
1469 1469                  cv_wait(&db->db_changed, &db->db_mtx);
1470 1470  
1471 1471          ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1472 1472  
1473 1473          if (db->db_state == DB_CACHED &&
1474 1474              refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1475 1475                  mutex_exit(&db->db_mtx);
1476 1476                  (void) dbuf_dirty(db, tx);
1477 1477                  bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1478 1478                  VERIFY(arc_buf_remove_ref(buf, db) == 1);
1479 1479                  xuio_stat_wbuf_copied();
1480 1480                  return;
1481 1481          }
1482 1482  
1483 1483          xuio_stat_wbuf_nocopy();
1484 1484          if (db->db_state == DB_CACHED) {
1485 1485                  dbuf_dirty_record_t *dr = db->db_last_dirty;
1486 1486  
1487 1487                  ASSERT(db->db_buf != NULL);
1488 1488                  if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1489 1489                          ASSERT(dr->dt.dl.dr_data == db->db_buf);
1490 1490                          if (!arc_released(db->db_buf)) {
1491 1491                                  ASSERT(dr->dt.dl.dr_override_state ==
1492 1492                                      DR_OVERRIDDEN);
1493 1493                                  arc_release(db->db_buf, db);
1494 1494                          }
1495 1495                          dr->dt.dl.dr_data = buf;
1496 1496                          VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
1497 1497                  } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1498 1498                          arc_release(db->db_buf, db);
1499 1499                          VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
1500 1500                  }
1501 1501                  db->db_buf = NULL;
1502 1502          }
1503 1503          ASSERT(db->db_buf == NULL);
1504 1504          dbuf_set_data(db, buf);
1505 1505          db->db_state = DB_FILL;
1506 1506          mutex_exit(&db->db_mtx);
1507 1507          (void) dbuf_dirty(db, tx);
1508 1508          dbuf_fill_done(db, tx);
1509 1509  }
1510 1510  
1511 1511  /*
1512 1512   * "Clear" the contents of this dbuf.  This will mark the dbuf
1513 1513   * EVICTING and clear *most* of its references.  Unfortunately,
1514 1514   * when we are not holding the dn_dbufs_mtx, we can't clear the
1515 1515   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1516 1516   * in this case.  For callers from the DMU we will usually see:
1517 1517   *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1518 1518   * For the arc callback, we will usually see:
1519 1519   *      dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1520 1520   * Sometimes, though, we will get a mix of these two:
1521 1521   *      DMU: dbuf_clear()->arc_buf_evict()
1522 1522   *      ARC: dbuf_do_evict()->dbuf_destroy()
1523 1523   */
1524 1524  void
1525 1525  dbuf_clear(dmu_buf_impl_t *db)
1526 1526  {
1527 1527          dnode_t *dn;
1528 1528          dmu_buf_impl_t *parent = db->db_parent;
1529 1529          dmu_buf_impl_t *dndb;
1530 1530          int dbuf_gone = FALSE;
1531 1531  
1532 1532          ASSERT(MUTEX_HELD(&db->db_mtx));
1533 1533          ASSERT(refcount_is_zero(&db->db_holds));
1534 1534  
1535 1535          dbuf_evict_user(db);
1536 1536  
1537 1537          if (db->db_state == DB_CACHED) {
1538 1538                  ASSERT(db->db.db_data != NULL);
1539 1539                  if (db->db_blkid == DMU_BONUS_BLKID) {
1540 1540                          zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1541 1541                          arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1542 1542                  }
1543 1543                  db->db.db_data = NULL;
1544 1544                  db->db_state = DB_UNCACHED;
1545 1545          }
1546 1546  
1547 1547          ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1548 1548          ASSERT(db->db_data_pending == NULL);
1549 1549  
1550 1550          db->db_state = DB_EVICTING;
1551 1551          db->db_blkptr = NULL;
1552 1552  
1553 1553          DB_DNODE_ENTER(db);
1554 1554          dn = DB_DNODE(db);
1555 1555          dndb = dn->dn_dbuf;
1556 1556          if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1557 1557                  list_remove(&dn->dn_dbufs, db);
1558 1558                  (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1559 1559                  membar_producer();
1560 1560                  DB_DNODE_EXIT(db);
1561 1561                  /*
1562 1562                   * Decrementing the dbuf count means that the hold corresponding
1563 1563                   * to the removed dbuf is no longer discounted in dnode_move(),
1564 1564                   * so the dnode cannot be moved until after we release the hold.
1565 1565                   * The membar_producer() ensures visibility of the decremented
1566 1566                   * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1567 1567                   * release any lock.
1568 1568                   */
1569 1569                  dnode_rele(dn, db);
1570 1570                  db->db_dnode_handle = NULL;
1571 1571          } else {
1572 1572                  DB_DNODE_EXIT(db);
1573 1573          }
1574 1574  
1575 1575          if (db->db_buf)
1576 1576                  dbuf_gone = arc_buf_evict(db->db_buf);
1577 1577  
1578 1578          if (!dbuf_gone)
1579 1579                  mutex_exit(&db->db_mtx);
1580 1580  
1581 1581          /*
1582 1582           * If this dbuf is referenced from an indirect dbuf,
1583 1583           * decrement the ref count on the indirect dbuf.
1584 1584           */
1585 1585          if (parent && parent != dndb)
1586 1586                  dbuf_rele(parent, db);
1587 1587  }
1588 1588  
1589 1589  static int
1590 1590  dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1591 1591      dmu_buf_impl_t **parentp, blkptr_t **bpp)
1592 1592  {
1593 1593          int nlevels, epbs;
1594 1594  
1595 1595          *parentp = NULL;
1596 1596          *bpp = NULL;
1597 1597  
1598 1598          ASSERT(blkid != DMU_BONUS_BLKID);
1599 1599  
1600 1600          if (blkid == DMU_SPILL_BLKID) {
1601 1601                  mutex_enter(&dn->dn_mtx);
1602 1602                  if (dn->dn_have_spill &&
1603 1603                      (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1604 1604                          *bpp = &dn->dn_phys->dn_spill;
1605 1605                  else
1606 1606                          *bpp = NULL;
1607 1607                  dbuf_add_ref(dn->dn_dbuf, NULL);
1608 1608                  *parentp = dn->dn_dbuf;
1609 1609                  mutex_exit(&dn->dn_mtx);
1610 1610                  return (0);
1611 1611          }
1612 1612  
1613 1613          if (dn->dn_phys->dn_nlevels == 0)
1614 1614                  nlevels = 1;
1615 1615          else
1616 1616                  nlevels = dn->dn_phys->dn_nlevels;
1617 1617  
1618 1618          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1619 1619  
1620 1620          ASSERT3U(level * epbs, <, 64);
1621 1621          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1622 1622          if (level >= nlevels ||
1623 1623              (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1624 1624                  /* the buffer has no parent yet */
1625 1625                  return (ENOENT);
1626 1626          } else if (level < nlevels-1) {
1627 1627                  /* this block is referenced from an indirect block */
1628 1628                  int err = dbuf_hold_impl(dn, level+1,
1629 1629                      blkid >> epbs, fail_sparse, NULL, parentp);
1630 1630                  if (err)
1631 1631                          return (err);
1632 1632                  err = dbuf_read(*parentp, NULL,
1633 1633                      (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1634 1634                  if (err) {
1635 1635                          dbuf_rele(*parentp, NULL);
1636 1636                          *parentp = NULL;
1637 1637                          return (err);
1638 1638                  }
1639 1639                  *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1640 1640                      (blkid & ((1ULL << epbs) - 1));
1641 1641                  return (0);
1642 1642          } else {
1643 1643                  /* the block is referenced from the dnode */
1644 1644                  ASSERT3U(level, ==, nlevels-1);
1645 1645                  ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1646 1646                      blkid < dn->dn_phys->dn_nblkptr);
1647 1647                  if (dn->dn_dbuf) {
1648 1648                          dbuf_add_ref(dn->dn_dbuf, NULL);
1649 1649                          *parentp = dn->dn_dbuf;
1650 1650                  }
1651 1651                  *bpp = &dn->dn_phys->dn_blkptr[blkid];
1652 1652                  return (0);
1653 1653          }
1654 1654  }
1655 1655  
1656 1656  static dmu_buf_impl_t *
1657 1657  dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1658 1658      dmu_buf_impl_t *parent, blkptr_t *blkptr)
1659 1659  {
1660 1660          objset_t *os = dn->dn_objset;
1661 1661          dmu_buf_impl_t *db, *odb;
1662 1662  
1663 1663          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1664 1664          ASSERT(dn->dn_type != DMU_OT_NONE);
1665 1665  
1666 1666          db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1667 1667  
1668 1668          db->db_objset = os;
1669 1669          db->db.db_object = dn->dn_object;
1670 1670          db->db_level = level;
1671 1671          db->db_blkid = blkid;
1672 1672          db->db_last_dirty = NULL;
1673 1673          db->db_dirtycnt = 0;
1674 1674          db->db_dnode_handle = dn->dn_handle;
1675 1675          db->db_parent = parent;
1676 1676          db->db_blkptr = blkptr;
1677 1677  
1678 1678          db->db_user_ptr = NULL;
1679 1679          db->db_user_data_ptr_ptr = NULL;
1680 1680          db->db_evict_func = NULL;
1681 1681          db->db_immediate_evict = 0;
1682 1682          db->db_freed_in_flight = 0;
1683 1683  
1684 1684          if (blkid == DMU_BONUS_BLKID) {
1685 1685                  ASSERT3P(parent, ==, dn->dn_dbuf);
1686 1686                  db->db.db_size = DN_MAX_BONUSLEN -
1687 1687                      (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1688 1688                  ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1689 1689                  db->db.db_offset = DMU_BONUS_BLKID;
1690 1690                  db->db_state = DB_UNCACHED;
1691 1691                  /* the bonus dbuf is not placed in the hash table */
1692 1692                  arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1693 1693                  return (db);
1694 1694          } else if (blkid == DMU_SPILL_BLKID) {
1695 1695                  db->db.db_size = (blkptr != NULL) ?
1696 1696                      BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1697 1697                  db->db.db_offset = 0;
1698 1698          } else {
1699 1699                  int blocksize =
1700 1700                      db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1701 1701                  db->db.db_size = blocksize;
1702 1702                  db->db.db_offset = db->db_blkid * blocksize;
1703 1703          }
1704 1704  
1705 1705          /*
1706 1706           * Hold the dn_dbufs_mtx while we get the new dbuf
1707 1707           * in the hash table *and* added to the dbufs list.
1708 1708           * This prevents a possible deadlock with someone
1709 1709           * trying to look up this dbuf before it's added to the
1710 1710           * dn_dbufs list.
1711 1711           */
1712 1712          mutex_enter(&dn->dn_dbufs_mtx);
1713 1713          db->db_state = DB_EVICTING;
1714 1714          if ((odb = dbuf_hash_insert(db)) != NULL) {
1715 1715                  /* someone else inserted it first */
1716 1716                  kmem_cache_free(dbuf_cache, db);
1717 1717                  mutex_exit(&dn->dn_dbufs_mtx);
1718 1718                  return (odb);
1719 1719          }
1720 1720          list_insert_head(&dn->dn_dbufs, db);
1721 1721          db->db_state = DB_UNCACHED;
1722 1722          mutex_exit(&dn->dn_dbufs_mtx);
1723 1723          arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1724 1724  
1725 1725          if (parent && parent != dn->dn_dbuf)
1726 1726                  dbuf_add_ref(parent, db);
1727 1727  
1728 1728          ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1729 1729              refcount_count(&dn->dn_holds) > 0);
1730 1730          (void) refcount_add(&dn->dn_holds, db);
1731 1731          (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1732 1732  
1733 1733          dprintf_dbuf(db, "db=%p\n", db);
1734 1734  
1735 1735          return (db);
1736 1736  }
1737 1737  
1738 1738  static int
1739 1739  dbuf_do_evict(void *private)
1740 1740  {
1741 1741          arc_buf_t *buf = private;
1742 1742          dmu_buf_impl_t *db = buf->b_private;
1743 1743  
1744 1744          if (!MUTEX_HELD(&db->db_mtx))
1745 1745                  mutex_enter(&db->db_mtx);
1746 1746  
1747 1747          ASSERT(refcount_is_zero(&db->db_holds));
1748 1748  
1749 1749          if (db->db_state != DB_EVICTING) {
1750 1750                  ASSERT(db->db_state == DB_CACHED);
1751 1751                  DBUF_VERIFY(db);
1752 1752                  db->db_buf = NULL;
1753 1753                  dbuf_evict(db);
1754 1754          } else {
1755 1755                  mutex_exit(&db->db_mtx);
1756 1756                  dbuf_destroy(db);
1757 1757          }
1758 1758          return (0);
1759 1759  }
1760 1760  
1761 1761  static void
1762 1762  dbuf_destroy(dmu_buf_impl_t *db)
1763 1763  {
1764 1764          ASSERT(refcount_is_zero(&db->db_holds));
1765 1765  
1766 1766          if (db->db_blkid != DMU_BONUS_BLKID) {
1767 1767                  /*
1768 1768                   * If this dbuf is still on the dn_dbufs list,
1769 1769                   * remove it from that list.
1770 1770                   */
1771 1771                  if (db->db_dnode_handle != NULL) {
1772 1772                          dnode_t *dn;
1773 1773  
1774 1774                          DB_DNODE_ENTER(db);
1775 1775                          dn = DB_DNODE(db);
1776 1776                          mutex_enter(&dn->dn_dbufs_mtx);
1777 1777                          list_remove(&dn->dn_dbufs, db);
1778 1778                          (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1779 1779                          mutex_exit(&dn->dn_dbufs_mtx);
1780 1780                          DB_DNODE_EXIT(db);
1781 1781                          /*
1782 1782                           * Decrementing the dbuf count means that the hold
1783 1783                           * corresponding to the removed dbuf is no longer
1784 1784                           * discounted in dnode_move(), so the dnode cannot be
1785 1785                           * moved until after we release the hold.
1786 1786                           */
1787 1787                          dnode_rele(dn, db);
1788 1788                          db->db_dnode_handle = NULL;
1789 1789                  }
1790 1790                  dbuf_hash_remove(db);
1791 1791          }
1792 1792          db->db_parent = NULL;
1793 1793          db->db_buf = NULL;
1794 1794  
1795 1795          ASSERT(!list_link_active(&db->db_link));
1796 1796          ASSERT(db->db.db_data == NULL);
1797 1797          ASSERT(db->db_hash_next == NULL);
1798 1798          ASSERT(db->db_blkptr == NULL);
1799 1799          ASSERT(db->db_data_pending == NULL);
1800 1800  
1801 1801          kmem_cache_free(dbuf_cache, db);
1802 1802          arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1803 1803  }
1804 1804  
1805 1805  void
1806 1806  dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1807 1807  {
1808 1808          dmu_buf_impl_t *db = NULL;
1809 1809          blkptr_t *bp = NULL;
1810 1810  
1811 1811          ASSERT(blkid != DMU_BONUS_BLKID);
1812 1812          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1813 1813  
1814 1814          if (dnode_block_freed(dn, blkid))
1815 1815                  return;
1816 1816  
1817 1817          /* dbuf_find() returns with db_mtx held */
1818 1818          if (db = dbuf_find(dn, 0, blkid)) {
1819 1819                  /*
1820 1820                   * This dbuf is already in the cache.  We assume that
1821 1821                   * it is already CACHED, or else about to be either
1822 1822                   * read or filled.
1823 1823                   */
1824 1824                  mutex_exit(&db->db_mtx);
1825 1825                  return;
1826 1826          }
1827 1827  
1828 1828          if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1829 1829                  if (bp && !BP_IS_HOLE(bp)) {
1830 1830                          int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1831 1831                              ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1832 1832                          arc_buf_t *pbuf;
1833 1833                          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1834 1834                          uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1835 1835                          zbookmark_t zb;
1836 1836  
1837 1837                          SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1838 1838                              dn->dn_object, 0, blkid);
1839 1839  
1840 1840                          if (db)
1841 1841                                  pbuf = db->db_buf;
1842 1842                          else
1843 1843                                  pbuf = dn->dn_objset->os_phys_buf;
1844 1844  
1845 1845                          (void) dsl_read(NULL, dn->dn_objset->os_spa,
1846 1846                              bp, pbuf, NULL, NULL, priority,
1847 1847                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1848 1848                              &aflags, &zb);
1849 1849                  }
1850 1850                  if (db)
1851 1851                          dbuf_rele(db, NULL);
1852 1852          }
1853 1853  }
1854 1854  
1855 1855  /*
1856 1856   * Returns with db_holds incremented, and db_mtx not held.
1857 1857   * Note: dn_struct_rwlock must be held.
1858 1858   */
1859 1859  int
1860 1860  dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1861 1861      void *tag, dmu_buf_impl_t **dbp)
1862 1862  {
1863 1863          dmu_buf_impl_t *db, *parent = NULL;
1864 1864  
1865 1865          ASSERT(blkid != DMU_BONUS_BLKID);
1866 1866          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1867 1867          ASSERT3U(dn->dn_nlevels, >, level);
1868 1868  
1869 1869          *dbp = NULL;
1870 1870  top:
1871 1871          /* dbuf_find() returns with db_mtx held */
1872 1872          db = dbuf_find(dn, level, blkid);
1873 1873  
1874 1874          if (db == NULL) {
1875 1875                  blkptr_t *bp = NULL;
1876 1876                  int err;
1877 1877  
1878 1878                  ASSERT3P(parent, ==, NULL);
1879 1879                  err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1880 1880                  if (fail_sparse) {
1881 1881                          if (err == 0 && bp && BP_IS_HOLE(bp))
1882 1882                                  err = ENOENT;
1883 1883                          if (err) {
1884 1884                                  if (parent)
1885 1885                                          dbuf_rele(parent, NULL);
1886 1886                                  return (err);
1887 1887                          }
1888 1888                  }
1889 1889                  if (err && err != ENOENT)
1890 1890                          return (err);
1891 1891                  db = dbuf_create(dn, level, blkid, parent, bp);
1892 1892          }
1893 1893  
1894 1894          if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1895 1895                  arc_buf_add_ref(db->db_buf, db);
1896 1896                  if (db->db_buf->b_data == NULL) {
1897 1897                          dbuf_clear(db);
1898 1898                          if (parent) {
1899 1899                                  dbuf_rele(parent, NULL);
1900 1900                                  parent = NULL;
1901 1901                          }
1902 1902                          goto top;
1903 1903                  }
1904 1904                  ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1905 1905          }
1906 1906  
1907 1907          ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1908 1908  
1909 1909          /*
1910 1910           * If this buffer is currently syncing out, and we are
1911 1911           * still referencing it from db_data, we need to make a copy
1912 1912           * of it in case we decide we want to dirty it again in this txg.
1913 1913           */
1914 1914          if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1915 1915              dn->dn_object != DMU_META_DNODE_OBJECT &&
1916 1916              db->db_state == DB_CACHED && db->db_data_pending) {
1917 1917                  dbuf_dirty_record_t *dr = db->db_data_pending;
1918 1918  
1919 1919                  if (dr->dt.dl.dr_data == db->db_buf) {
1920 1920                          arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1921 1921  
1922 1922                          dbuf_set_data(db,
1923 1923                              arc_buf_alloc(dn->dn_objset->os_spa,
1924 1924                              db->db.db_size, db, type));
1925 1925                          bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1926 1926                              db->db.db_size);
1927 1927                  }
1928 1928          }
1929 1929  
1930 1930          (void) refcount_add(&db->db_holds, tag);
1931 1931          dbuf_update_data(db);
1932 1932          DBUF_VERIFY(db);
1933 1933          mutex_exit(&db->db_mtx);
1934 1934  
1935 1935          /* NOTE: we can't rele the parent until after we drop the db_mtx */
1936 1936          if (parent)
1937 1937                  dbuf_rele(parent, NULL);
1938 1938  
1939 1939          ASSERT3P(DB_DNODE(db), ==, dn);
1940 1940          ASSERT3U(db->db_blkid, ==, blkid);
1941 1941          ASSERT3U(db->db_level, ==, level);
1942 1942          *dbp = db;
1943 1943  
1944 1944          return (0);
1945 1945  }
1946 1946  
1947 1947  dmu_buf_impl_t *
1948 1948  dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1949 1949  {
1950 1950          dmu_buf_impl_t *db;
1951 1951          int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1952 1952          return (err ? NULL : db);
1953 1953  }
1954 1954  
1955 1955  dmu_buf_impl_t *
1956 1956  dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1957 1957  {
1958 1958          dmu_buf_impl_t *db;
1959 1959          int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1960 1960          return (err ? NULL : db);
1961 1961  }
1962 1962  
1963 1963  void
1964 1964  dbuf_create_bonus(dnode_t *dn)
1965 1965  {
1966 1966          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1967 1967  
1968 1968          ASSERT(dn->dn_bonus == NULL);
1969 1969          dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1970 1970  }
1971 1971  
1972 1972  int
1973 1973  dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1974 1974  {
1975 1975          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1976 1976          dnode_t *dn;
1977 1977  
1978 1978          if (db->db_blkid != DMU_SPILL_BLKID)
1979 1979                  return (ENOTSUP);
1980 1980          if (blksz == 0)
1981 1981                  blksz = SPA_MINBLOCKSIZE;
1982 1982          if (blksz > SPA_MAXBLOCKSIZE)
1983 1983                  blksz = SPA_MAXBLOCKSIZE;
1984 1984          else
1985 1985                  blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1986 1986  
1987 1987          DB_DNODE_ENTER(db);
1988 1988          dn = DB_DNODE(db);
1989 1989          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1990 1990          dbuf_new_size(db, blksz, tx);
1991 1991          rw_exit(&dn->dn_struct_rwlock);
1992 1992          DB_DNODE_EXIT(db);
1993 1993  
1994 1994          return (0);
1995 1995  }
1996 1996  
1997 1997  void
1998 1998  dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
1999 1999  {
2000 2000          dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2001 2001  }
2002 2002  
2003 2003  #pragma weak dmu_buf_add_ref = dbuf_add_ref
2004 2004  void
2005 2005  dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2006 2006  {
2007 2007          int64_t holds = refcount_add(&db->db_holds, tag);
2008 2008          ASSERT(holds > 1);
2009 2009  }
2010 2010  
2011 2011  /*
2012 2012   * If you call dbuf_rele() you had better not be referencing the dnode handle
2013 2013   * unless you have some other direct or indirect hold on the dnode. (An indirect
2014 2014   * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2015 2015   * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2016 2016   * dnode's parent dbuf evicting its dnode handles.
2017 2017   */
2018 2018  #pragma weak dmu_buf_rele = dbuf_rele
2019 2019  void
2020 2020  dbuf_rele(dmu_buf_impl_t *db, void *tag)
2021 2021  {
2022 2022          mutex_enter(&db->db_mtx);
2023 2023          dbuf_rele_and_unlock(db, tag);
2024 2024  }
2025 2025  
2026 2026  /*
2027 2027   * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2028 2028   * db_dirtycnt and db_holds to be updated atomically.
            *
            * The caller must hold db->db_mtx (asserted below).  The hold identified by
            * 'tag' is removed from db_holds, and what happens next depends on the
            * number of holds that remain.  Every path out of this function either
            * calls mutex_exit() on db_mtx or hands the still-locked dbuf to
            * dbuf_evict() / dbuf_clear().
            * NOTE(review): dbuf_evict() and dbuf_clear() are defined outside this
            * section; presumably they drop db_mtx themselves -- confirm.
2029 2029   */
2030 2030  void
2031 2031  dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2032 2032  {
2033 2033          int64_t holds;
2034 2034  
2035 2035          ASSERT(MUTEX_HELD(&db->db_mtx));
2036 2036          DBUF_VERIFY(db);
2037 2037  
2038 2038          /*
2039 2039           * Remove the reference to the dbuf before removing its hold on the
2040 2040           * dnode so we can guarantee in dnode_move() that a referenced bonus
2041 2041           * buffer has a corresponding dnode hold.
2042 2042           */
2043 2043          holds = refcount_remove(&db->db_holds, tag);
2044 2044          ASSERT(holds >= 0);
2045 2045  
2046 2046          /*
2047 2047           * We can't freeze indirects if there is a possibility that they
2048 2048           * may be modified in the current syncing context.
2049 2049           */
2050 2050          if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2051 2051                  arc_buf_freeze(db->db_buf);
2052 2052  
                   /*
                    * A level-0 dbuf flagged db_immediate_evict (set by
                    * dmu_buf_set_user_ie()) has its user data evicted as soon as the
                    * remaining hold count equals db_dirtycnt.
                    */
2053 2053          if (holds == db->db_dirtycnt &&
2054 2054              db->db_level == 0 && db->db_immediate_evict)
2055 2055                  dbuf_evict_user(db);
2056 2056  
                   /* Last hold dropped: decide what becomes of the dbuf itself. */
2057 2057          if (holds == 0) {
2058 2058                  if (db->db_blkid == DMU_BONUS_BLKID) {
2059 2059                          mutex_exit(&db->db_mtx);
2060 2060  
2061 2061                          /*
2062 2062                           * If the dnode moves here, we cannot cross this barrier
2063 2063                           * until the move completes.
2064 2064                           */
2065 2065                          DB_DNODE_ENTER(db);
2066 2066                          (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2067 2067                          DB_DNODE_EXIT(db);
2068 2068                          /*
2069 2069                           * The bonus buffer's dnode hold is no longer discounted
2070 2070                           * in dnode_move(). The dnode cannot move until after
2071 2071                           * the dnode_rele().
2072 2072                           */
2073 2073                          dnode_rele(DB_DNODE(db), db);
2074 2074                  } else if (db->db_buf == NULL) {
2075 2075                          /*
2076 2076                           * This is a special case: we never associated this
2077 2077                           * dbuf with any data allocated from the ARC.
2078 2078                           */
2079 2079                          ASSERT(db->db_state == DB_UNCACHED ||
2080 2080                              db->db_state == DB_NOFILL);
2081 2081                          dbuf_evict(db);
2082 2082                  } else if (arc_released(db->db_buf)) {
2083 2083                          arc_buf_t *buf = db->db_buf;
2084 2084                          /*
2085 2085                           * This dbuf has anonymous data associated with it.
2086 2086                           */
2087 2087                          dbuf_set_data(db, NULL);
2088 2088                          VERIFY(arc_buf_remove_ref(buf, db) == 1);
2089 2089                          dbuf_evict(db);
2090 2090                  } else {
                                   /*
                                    * db_buf exists and arc_released() is false.  Drop
                                    * our reference (which must return 0), then either
                                    * clear the dbuf when it is not cacheable or simply
                                    * unlock and leave it as it is.
                                    */
2091 2091                          VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
2092 2092                          if (!DBUF_IS_CACHEABLE(db))
2093 2093                                  dbuf_clear(db);
2094 2094                          else
2095 2095                                  mutex_exit(&db->db_mtx);
2096 2096                  }
2097 2097          } else {
2098 2098                  mutex_exit(&db->db_mtx);
2099 2099          }
2100 2100  }
2101 2101  
           /*
            * Return the current number of holds on 'db', as reported by
            * refcount_count() on db_holds.  No lock is taken here.  The pragma below
            * also exposes this function under the weak symbol dmu_buf_refcount.
            */
2102 2102  #pragma weak dmu_buf_refcount = dbuf_refcount
2103 2103  uint64_t
2104 2104  dbuf_refcount(dmu_buf_impl_t *db)
2105 2105  {
2106 2106          return (refcount_count(&db->db_holds));
2107 2107  }
2108 2108  
           /*
            * Attach user state to a dbuf that has none yet.  This is
            * dmu_buf_update_user() with an expected old user pointer of NULL, so the
            * return value follows that function: NULL when the new state was
            * installed, otherwise the user pointer that was already present (in
            * which case nothing is changed).
            */
2109 2109  void *
2110 2110  dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2111 2111      dmu_buf_evict_func_t *evict_func)
2112 2112  {
2113 2113          return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2114 2114              user_data_ptr_ptr, evict_func));
2115 2115  }
2116 2116  
           /*
            * Same as dmu_buf_set_user(), but additionally marks the dbuf for
            * immediate eviction by setting db_immediate_evict.  The flag is set
            * before dmu_buf_update_user() is called, without taking db_mtx in this
            * function, and regardless of whether the update ends up installing the
            * new user state.
            */
2117 2117  void *
2118 2118  dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2119 2119      dmu_buf_evict_func_t *evict_func)
2120 2120  {
2121 2121          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2122 2122  
2123 2123          db->db_immediate_evict = TRUE;
2124 2124          return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2125 2125              user_data_ptr_ptr, evict_func));
2126 2126  }
2127 2127  
           /*
            * Conditionally replace the user state of a level-0 dbuf.
            *
            * Under db_mtx, if the dbuf's current user pointer equals 'old_user_ptr',
            * then user_ptr, user_data_ptr_ptr and evict_func are stored in the dbuf
            * and dbuf_update_data() is called.  Otherwise the dbuf is left untouched.
            *
            * Returns 'old_user_ptr' when the replacement happened, or the dbuf's
            * actual current user pointer when it did not.  user_ptr and evict_func
            * must either both be NULL or both be non-NULL (asserted).
            */
2128 2128  void *
2129 2129  dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2130 2130      void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2131 2131  {
2132 2132          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2133 2133          ASSERT(db->db_level == 0);
2134 2134  
2135 2135          ASSERT((user_ptr == NULL) == (evict_func == NULL));
2136 2136  
2137 2137          mutex_enter(&db->db_mtx);
2138 2138  
2139 2139          if (db->db_user_ptr == old_user_ptr) {
2140 2140                  db->db_user_ptr = user_ptr;
2141 2141                  db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2142 2142                  db->db_evict_func = evict_func;
2143 2143  
2144 2144                  dbuf_update_data(db);
2145 2145          } else {
                           /* Mismatch: report who is there instead. */
2146 2146                  old_user_ptr = db->db_user_ptr;
2147 2147          }
2148 2148  
2149 2149          mutex_exit(&db->db_mtx);
2150 2150          return (old_user_ptr);
2151 2151  }
2152 2152  
           /*
            * Return the user pointer currently stored in the dbuf.  The dbuf must
            * have at least one hold (asserted); db_mtx is not taken here.
            */
2153 2153  void *
2154 2154  dmu_buf_get_user(dmu_buf_t *db_fake)
2155 2155  {
2156 2156          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2157 2157          ASSERT(!refcount_is_zero(&db->db_holds));
2158 2158  
2159 2159          return (db->db_user_ptr);
2160 2160  }
2161 2161  
           /*
            * Ask the dataset layer whether the block behind this dbuf is freeable.
            * Returns B_FALSE when the dbuf has no block pointer; otherwise returns
            * the result of dsl_dataset_block_freeable() for the dbuf's block pointer
            * and that block pointer's birth txg.
            */
2162 2162  boolean_t
2163 2163  dmu_buf_freeable(dmu_buf_t *dbuf)
2164 2164  {
2165 2165          boolean_t res = B_FALSE;
2166 2166          dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2167 2167  
2168 2168          if (db->db_blkptr)
2169 2169                  res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2170 2170                      db->db_blkptr, db->db_blkptr->blk_birth);
2171 2171  
2172 2172          return (res);
2173 2173  }
2174 2174  
           /*
            * Make sure db->db_blkptr is set.  db->db_mtx must be held on entry
            * (asserted) and is held again on return.  Nothing is done when db_blkptr
            * is already non-NULL.  Otherwise:
            *   - a spill block uses, and zeroes, dn_phys->dn_spill;
            *   - a dbuf at the dnode's top level (dn_nlevels - 1) uses the
            *     dn_blkptr[] slot indexed by db_blkid and takes dn->dn_dbuf as its
            *     parent;
            *   - any other dbuf points into its parent indirect block's data; if
            *     db_parent is NULL the parent is looked up with dbuf_hold_impl().
            * In the last case db_mtx is dropped and re-acquired around the lookup.
            */
2175 2175  static void
2176 2176  dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2177 2177  {
2178 2178          /* would ASSERT(dmu_tx_is_syncing(tx)), but no tx is passed in */
2179 2179          ASSERT(MUTEX_HELD(&db->db_mtx));
2180 2180  
2181 2181          if (db->db_blkptr != NULL)
2182 2182                  return;
2183 2183  
2184 2184          if (db->db_blkid == DMU_SPILL_BLKID) {
2185 2185                  db->db_blkptr = &dn->dn_phys->dn_spill;
2186 2186                  BP_ZERO(db->db_blkptr);
2187 2187                  return;
2188 2188          }
2189 2189          if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2190 2190                  /*
2191 2191                   * This buffer was allocated at a time when there was
2192 2192                   * no available blkptrs from the dnode, or it was
2193 2193                   * inappropriate to hook it in (i.e., nlevels mis-match).
2194 2194                   */
2195 2195                  ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2196 2196                  ASSERT(db->db_parent == NULL);
2197 2197                  db->db_parent = dn->dn_dbuf;
2198 2198                  db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2199 2199                  DBUF_VERIFY(db);
2200 2200          } else {
2201 2201                  dmu_buf_impl_t *parent = db->db_parent;
                           /* epbs: log2 of the number of block pointers per indirect */
2202 2202                  int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2203 2203  
2204 2204                  ASSERT(dn->dn_phys->dn_nlevels > 1);
2205 2205                  if (parent == NULL) {
                                   /*
                                    * NOTE(review): the result of dbuf_hold_impl() is
                                    * discarded and 'parent' is dereferenced below
                                    * without a NULL check; presumably the hold cannot
                                    * fail in this context -- confirm.
                                    */
2206 2206                          mutex_exit(&db->db_mtx);
2207 2207                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
2208 2208                          (void) dbuf_hold_impl(dn, db->db_level+1,
2209 2209                              db->db_blkid >> epbs, FALSE, db, &parent);
2210 2210                          rw_exit(&dn->dn_struct_rwlock);
2211 2211                          mutex_enter(&db->db_mtx);
2212 2212                          db->db_parent = parent;
2213 2213                  }
                           /* Index of this block's bp within the parent's bp array. */
2214 2214                  db->db_blkptr = (blkptr_t *)parent->db.db_data +
2215 2215                      (db->db_blkid & ((1ULL << epbs) - 1));
2216 2216                  DBUF_VERIFY(db);
2217 2217          }
2218 2218  }
2219 2219  
           /*
            * Sync one dirty record that belongs to an indirect (db_level > 0) dbuf.
            * 'tx' must be a syncing tx (asserted).
            *
            * If the dbuf has no db_buf it is first read in with DB_RF_MUST_SUCCEED,
            * dropping db_mtx around the read.  Once the block pointer has been
            * checked and 'dr' recorded as db_data_pending, dbuf_write() is called
            * for this block; the records on dr->dt.di.dr_children are then
            * synced under dr_mtx, and finally dr->dr_zio (saved in 'zio' before the
            * children are processed) is dispatched with zio_nowait().
            */
2220 2220  static void
2221 2221  dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2222 2222  {
2223 2223          dmu_buf_impl_t *db = dr->dr_dbuf;
2224 2224          dnode_t *dn;
2225 2225          zio_t *zio;
2226 2226  
2227 2227          ASSERT(dmu_tx_is_syncing(tx));
2228 2228  
2229 2229          dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2230 2230  
2231 2231          mutex_enter(&db->db_mtx);
2232 2232  
2233 2233          ASSERT(db->db_level > 0);
2234 2234          DBUF_VERIFY(db);
2235 2235  
2236 2236          if (db->db_buf == NULL) {
2237 2237                  mutex_exit(&db->db_mtx);
2238 2238                  (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2239 2239                  mutex_enter(&db->db_mtx);
2240 2240          }
2241 2241          ASSERT3U(db->db_state, ==, DB_CACHED);
2242 2242          ASSERT(db->db_buf != NULL);
2243 2243  
2244 2244          DB_DNODE_ENTER(db);
2245 2245          dn = DB_DNODE(db);
2246 2246          ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2247 2247          dbuf_check_blkptr(dn, db);
2248 2248          DB_DNODE_EXIT(db);
2249 2249  
2250 2250          db->db_data_pending = dr;
2251 2251  
2252 2252          mutex_exit(&db->db_mtx);
2253 2253          dbuf_write(dr, db->db_buf, tx);
2254 2254  
2255 2255          zio = dr->dr_zio;
2256 2256          mutex_enter(&dr->dt.di.dr_mtx);
2257 2257          dbuf_sync_list(&dr->dt.di.dr_children, tx);
2258 2258          ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2259 2259          mutex_exit(&dr->dt.di.dr_mtx);
2260 2260          zio_nowait(zio);
2261 2261  }
2262 2262  
           /*
            * Sync one dirty record that belongs to a level-0 dbuf.  'tx' must be a
            * syncing tx (asserted).
            *
            *   - Spill block: DNODE_FLAG_SPILL_BLKPTR is set in dn_phys->dn_flags
            *     under dn_mtx, and processing then continues as for a regular block.
            *   - Bonus buffer: the data is copied into the dnode's bonus area, the
            *     dirty record is unlinked from db_last_dirty and freed, and the hold
            *     tagged with the txg is dropped.  No zio is issued on this path.
            *   - Otherwise: the block pointer is set up, an in-progress dmu_sync of
            *     this record is waited for, the data is copied when other holders
            *     could still modify it, and dbuf_write() is called for the block.
            *     For the meta-dnode the record is appended to dn_dirty_records and
            *     the zio is not started here; for every other object the zio is
            *     started with zio_nowait().
            */
2263 2263  static void
2264 2264  dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2265 2265  {
2266 2266          arc_buf_t **datap = &dr->dt.dl.dr_data;
2267 2267          dmu_buf_impl_t *db = dr->dr_dbuf;
2268 2268          dnode_t *dn;
2269 2269          objset_t *os;
2270 2270          uint64_t txg = tx->tx_txg;
2271 2271  
2272 2272          ASSERT(dmu_tx_is_syncing(tx));
2273 2273  
2274 2274          dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2275 2275  
2276 2276          mutex_enter(&db->db_mtx);
2277 2277          /*
2278 2278           * To be synced, we must be dirtied.  But we
2279 2279           * might have been freed after the dirty.
2280 2280           */
2281 2281          if (db->db_state == DB_UNCACHED) {
2282 2282                  /* This buffer has been freed since it was dirtied */
2283 2283                  ASSERT(db->db.db_data == NULL);
2284 2284          } else if (db->db_state == DB_FILL) {
2285 2285                  /* This buffer was freed and is now being re-filled */
2286 2286                  ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2287 2287          } else {
2288 2288                  ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2289 2289          }
2290 2290          DBUF_VERIFY(db);
2291 2291  
2292 2292          DB_DNODE_ENTER(db);
2293 2293          dn = DB_DNODE(db);
2294 2294  
2295 2295          if (db->db_blkid == DMU_SPILL_BLKID) {
2296 2296                  mutex_enter(&dn->dn_mtx);
2297 2297                  dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2298 2298                  mutex_exit(&dn->dn_mtx);
2299 2299          }
2300 2300  
  
    | 
      ↓ open down ↓ | 
    1959 lines elided | 
    
      ↑ open up ↑ | 
  
2301 2301          /*
2302 2302           * If this is a bonus buffer, simply copy the bonus data into the
2303 2303           * dnode.  It will be written out when the dnode is synced (and it
2304 2304           * will be synced, since it must have been dirty for dbuf_sync to
2305 2305           * be called).
2306 2306           */
2307 2307          if (db->db_blkid == DMU_BONUS_BLKID) {
2308 2308                  dbuf_dirty_record_t **drp;
2309 2309  
2310 2310                  ASSERT(*datap != NULL);
2311      -                ASSERT3U(db->db_level, ==, 0);
     2311 +                ASSERT0(db->db_level);
2312 2312                  ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2313 2313                  bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2314 2314                  DB_DNODE_EXIT(db);
2315 2315  
                           /*
                            * A dirty copy that is distinct from the dbuf's current
                            * data is freed here, and its space returned to the ARC
                            * accounting.
                            */
2316 2316                  if (*datap != db->db.db_data) {
2317 2317                          zio_buf_free(*datap, DN_MAX_BONUSLEN);
2318 2318                          arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2319 2319                  }
2320 2320                  db->db_data_pending = NULL;
                           /* Find the link that points at 'dr' and unhook it. */
2321 2321                  drp = &db->db_last_dirty;
2322 2322                  while (*drp != dr)
2323 2323                          drp = &(*drp)->dr_next;
2324 2324                  ASSERT(dr->dr_next == NULL);
2325 2325                  ASSERT(dr->dr_dbuf == db);
2326 2326                  *drp = dr->dr_next;
2327 2327                  kmem_free(dr, sizeof (dbuf_dirty_record_t));
2328 2328                  ASSERT(db->db_dirtycnt > 0);
2329 2329                  db->db_dirtycnt -= 1;
2330 2330                  dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2331 2331                  return;
2332 2332          }
2333 2333  
2334 2334          os = dn->dn_objset;
2335 2335  
2336 2336          /*
2337 2337           * This function may have dropped the db_mtx lock allowing a dmu_sync
2338 2338           * operation to sneak in. As a result, we need to ensure that we
2339 2339           * don't check the dr_override_state until we have returned from
2340 2340           * dbuf_check_blkptr.
2341 2341           */
2342 2342          dbuf_check_blkptr(dn, db);
2343 2343  
2344 2344          /*
2345 2345           * If this buffer is in the middle of an immediate write,
2346 2346           * wait for the synchronous IO to complete.
2347 2347           */
2348 2348          while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2349 2349                  ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2350 2350                  cv_wait(&db->db_changed, &db->db_mtx);
2351 2351                  ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2352 2352          }
2353 2353  
2354 2354          if (db->db_state != DB_NOFILL &&
2355 2355              dn->dn_object != DMU_META_DNODE_OBJECT &&
2356 2356              refcount_count(&db->db_holds) > 1 &&
2357 2357              dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2358 2358              *datap == db->db_buf) {
2359 2359                  /*
2360 2360                   * If this buffer is currently "in use" (i.e., there
2361 2361                   * are active holds and db_data still references it),
2362 2362                   * then make a copy before we start the write so that
2363 2363                   * any modifications from the open txg will not leak
2364 2364                   * into this write.
2365 2365                   *
2366 2366                   * NOTE: this copy does not need to be made for
2367 2367                   * objects only modified in the syncing context (e.g.
2368 2368                   * blocks of the DMU_META_DNODE_OBJECT).
2369 2369                   */
2370 2370                  int blksz = arc_buf_size(*datap);
2371 2371                  arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2372 2372                  *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2373 2373                  bcopy(db->db.db_data, (*datap)->b_data, blksz);
2374 2374          }
2375 2375          db->db_data_pending = dr;
2376 2376  
2377 2377          mutex_exit(&db->db_mtx);
2378 2378  
2379 2379          dbuf_write(dr, *datap, tx);
2380 2380  
2381 2381          ASSERT(!list_link_active(&dr->dr_dirty_node));
2382 2382          if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2383 2383                  list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2384 2384                  DB_DNODE_EXIT(db);
2385 2385          } else {
2386 2386                  /*
2387 2387                   * Although zio_nowait() does not "wait for an IO", it does
2388 2388                   * initiate the IO. If this is an empty write it seems plausible
2389 2389                   * that the IO could actually be completed before the nowait
2390 2390                   * returns. We need to DB_DNODE_EXIT() first in case
2391 2391                   * zio_nowait() invalidates the dbuf.
2392 2392                   */
2393 2393                  DB_DNODE_EXIT(db);
2394 2394                  zio_nowait(dr->dr_zio);
2395 2395          }
2396 2396  }
2397 2397  
           /*
            * Sync dirty records taken from the head of 'list' until the list is
            * empty or a record that already has a zio is reached.  Each record is
            * removed from the list and handed to dbuf_sync_indirect() when its dbuf
            * has db_level > 0, or to dbuf_sync_leaf() otherwise.
            *
            * The loop condition compares the assignment result against NULL
            * explicitly so that it is not mistaken for an equality test.
            */
2398 2398  void
2399 2399  dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2400 2400  {
2401 2401          dbuf_dirty_record_t *dr;
2402 2402  
2403 2403          while ((dr = list_head(list)) != NULL) {
2404 2404                  if (dr->dr_zio != NULL) {
2405 2405                          /*
2406 2406                           * If we find an already initialized zio then we
2407 2407                           * are processing the meta-dnode, and we have finished.
2408 2408                           * The dbufs for all dnodes are put back on the list
2409 2409                           * during processing, so that we can zio_wait()
2410 2410                           * these IOs after initiating all child IOs.
2411 2411                           */
2412 2412                          ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2413 2413                              DMU_META_DNODE_OBJECT);
2414 2414                          break;
2415 2415                  }
2416 2416                  list_remove(list, dr);
2417 2417                  if (dr->dr_dbuf->db_level > 0)
2418 2418                          dbuf_sync_indirect(dr, tx);
2419 2419                  else
2420 2420                          dbuf_sync_leaf(dr, tx);
2421 2421          }
2422 2422  }
2423 2423  
           /*
            * "Ready" callback for dbuf writes.  'vdb' is the dmu_buf_impl_t being
            * written; 'buf' is not used.
            *
            * The dnode is charged for the change in on-disk size between bp_orig and
            * bp, relative to the delta already recorded in io_prev_space_delta.
            * Unless the bp is a hole, blk_fill is then computed and stored in the bp:
            *   - level 0 of a DMU_OT_DNODE object: the number of dnode slots in the
            *     block whose dn_type is not DMU_OT_NONE;
            *   - level 0 of any other object: 1;
            *   - indirect block: the sum of blk_fill over its non-hole child bps.
            * For a level-0 block other than the spill block, dn_maxblkid is raised
            * to db_blkid when db_blkid is larger.
            */
2424 2424  /* ARGSUSED */
2425 2425  static void
2426 2426  dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2427 2427  {
2428 2428          dmu_buf_impl_t *db = vdb;
2429 2429          dnode_t *dn;
2430 2430          blkptr_t *bp = zio->io_bp;
2431 2431          blkptr_t *bp_orig = &zio->io_bp_orig;
2432 2432          spa_t *spa = zio->io_spa;
2433 2433          int64_t delta;
2434 2434          uint64_t fill = 0;
2435 2435          int i;
2436 2436  
2437 2437          ASSERT(db->db_blkptr == bp);
2438 2438  
2439 2439          DB_DNODE_ENTER(db);
2440 2440          dn = DB_DNODE(db);
                   /* Only the part of the delta not yet accounted for is charged. */
2441 2441          delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2442 2442          dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2443 2443          zio->io_prev_space_delta = delta;
2444 2444  
2445 2445          if (BP_IS_HOLE(bp)) {
2446 2446                  ASSERT(bp->blk_fill == 0);
2447 2447                  DB_DNODE_EXIT(db);
2448 2448                  return;
2449 2449          }
2450 2450  
2451 2451          ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2452 2452              BP_GET_TYPE(bp) == dn->dn_type) ||
2453 2453              (db->db_blkid == DMU_SPILL_BLKID &&
2454 2454              BP_GET_TYPE(bp) == dn->dn_bonustype));
2455 2455          ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2456 2456  
2457 2457          mutex_enter(&db->db_mtx);
2458 2458  
2459 2459  #ifdef ZFS_DEBUG
2460 2460          if (db->db_blkid == DMU_SPILL_BLKID) {
2461 2461                  ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2462 2462                  ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2463 2463                      db->db_blkptr == &dn->dn_phys->dn_spill);
2464 2464          }
2465 2465  #endif
2466 2466  
2467 2467          if (db->db_level == 0) {
2468 2468                  mutex_enter(&dn->dn_mtx);
2469 2469                  if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2470 2470                      db->db_blkid != DMU_SPILL_BLKID)
2471 2471                          dn->dn_phys->dn_maxblkid = db->db_blkid;
2472 2472                  mutex_exit(&dn->dn_mtx);
2473 2473  
2474 2474                  if (dn->dn_type == DMU_OT_DNODE) {
2475 2475                          dnode_phys_t *dnp = db->db.db_data;
2476 2476                          for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2477 2477                              i--, dnp++) {
2478 2478                                  if (dnp->dn_type != DMU_OT_NONE)
2479 2479                                          fill++;
2480 2480                          }
2481 2481                  } else {
2482 2482                          fill = 1;
2483 2483                  }
2484 2484          } else {
2485 2485                  blkptr_t *ibp = db->db.db_data;
2486 2486                  ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2487 2487                  for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2488 2488                          if (BP_IS_HOLE(ibp))
2489 2489                                  continue;
2490 2490                          fill += ibp->blk_fill;
2491 2491                  }
2492 2492          }
2493 2493          DB_DNODE_EXIT(db);
2494 2494  
2495 2495          bp->blk_fill = fill;
2496 2496  
2497 2497          mutex_exit(&db->db_mtx);
2498 2498  }
2499 2499  
  
    | 
      ↓ open down ↓ | 
    178 lines elided | 
    
      ↑ open up ↑ | 
  
2500 2500  /* ARGSUSED */
           /*
            * "Done" callback for dbuf writes.  'vdb' is the dmu_buf_impl_t that was
            * written; 'buf' is not used.  The zio must have completed without error
            * (asserted).
            *
            * Unless the write was a rewrite (ZIO_FLAG_IO_REWRITE), the original bp
            * is passed to dsl_dataset_block_kill() and the new bp to
            * dsl_dataset_block_born().  Then, under db_mtx, the dirty record equal
            * to db_data_pending is unlinked from the db_last_dirty chain and freed,
            * waiters on db_changed are woken, db_dirtycnt is decremented and
            * db_data_pending cleared.  The function finishes by dropping the hold
            * tagged with the txg through dbuf_rele_and_unlock(), which is entered
            * with db_mtx still held.
            */
2501 2501  static void
2502 2502  dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2503 2503  {
2504 2504          dmu_buf_impl_t *db = vdb;
2505 2505          blkptr_t *bp = zio->io_bp;
2506 2506          blkptr_t *bp_orig = &zio->io_bp_orig;
2507 2507          uint64_t txg = zio->io_txg;
2508 2508          dbuf_dirty_record_t **drp, *dr;
2509 2509  
2510      -        ASSERT3U(zio->io_error, ==, 0);
     2510 +        ASSERT0(zio->io_error);
2511 2511          ASSERT(db->db_blkptr == bp);
2512 2512  
2513 2513          if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
2514 2514                  ASSERT(BP_EQUAL(bp, bp_orig));
2515 2515          } else {
2516 2516                  objset_t *os;
2517 2517                  dsl_dataset_t *ds;
2518 2518                  dmu_tx_t *tx;
2519 2519  
2520 2520                  DB_GET_OBJSET(&os, db);
2521 2521                  ds = os->os_dsl_dataset;
2522 2522                  tx = os->os_synctx;
2523 2523  
2524 2524                  (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2525 2525                  dsl_dataset_block_born(ds, bp, tx);
2526 2526          }
2527 2527  
2528 2528          mutex_enter(&db->db_mtx);
2529 2529  
2530 2530          DBUF_VERIFY(db);
2531 2531  
                   /* Walk the dirty chain to the link holding the pending record. */
2532 2532          drp = &db->db_last_dirty;
2533 2533          while ((dr = *drp) != db->db_data_pending)
2534 2534                  drp = &dr->dr_next;
2535 2535          ASSERT(!list_link_active(&dr->dr_dirty_node));
2536 2536          ASSERT(dr->dr_txg == txg);
2537 2537          ASSERT(dr->dr_dbuf == db);
2538 2538          ASSERT(dr->dr_next == NULL);
2539 2539          *drp = dr->dr_next;
2540 2540  
2541 2541  #ifdef ZFS_DEBUG
2542 2542          if (db->db_blkid == DMU_SPILL_BLKID) {
2543 2543                  dnode_t *dn;
2544 2544  
2545 2545                  DB_DNODE_ENTER(db);
2546 2546                  dn = DB_DNODE(db);
2547 2547                  ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2548 2548                  ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2549 2549                      db->db_blkptr == &dn->dn_phys->dn_spill);
2550 2550                  DB_DNODE_EXIT(db);
2551 2551          }
2552 2552  #endif
2553 2553  
2554 2554          if (db->db_level == 0) {
2555 2555                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2556 2556                  ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2557 2557                  if (db->db_state != DB_NOFILL) {
                                   /*
                                    * A written buffer that differs from db_buf has
                                    * its reference dropped (which must return 1).
                                    * Otherwise, if db_buf is not released, the
                                    * eviction callback is registered on it.
                                    */
2558 2558                          if (dr->dt.dl.dr_data != db->db_buf)
2559 2559                                  VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2560 2560                                      db) == 1);
2561 2561                          else if (!arc_released(db->db_buf))
2562 2562                                  arc_set_callback(db->db_buf, dbuf_do_evict, db);
2563 2563                  }
2564 2564          } else {
2565 2565                  dnode_t *dn;
2566 2566  
2567 2567                  DB_DNODE_ENTER(db);
2568 2568                  dn = DB_DNODE(db);
2569 2569                  ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2570 2570                  ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2571 2571                  if (!BP_IS_HOLE(db->db_blkptr)) {
2572 2572                          int epbs =
2573 2573                              dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2574 2574                          ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2575 2575                              db->db.db_size);
2576 2576                          ASSERT3U(dn->dn_phys->dn_maxblkid
2577 2577                              >> (db->db_level * epbs), >=, db->db_blkid);
2578 2578                          arc_set_callback(db->db_buf, dbuf_do_evict, db);
2579 2579                  }
2580 2580                  DB_DNODE_EXIT(db);
                           /* Indirect records own a mutex and a child list. */
2581 2581                  mutex_destroy(&dr->dt.di.dr_mtx);
2582 2582                  list_destroy(&dr->dt.di.dr_children);
2583 2583          }
2584 2584          kmem_free(dr, sizeof (dbuf_dirty_record_t));
2585 2585  
2586 2586          cv_broadcast(&db->db_changed);
2587 2587          ASSERT(db->db_dirtycnt > 0);
2588 2588          db->db_dirtycnt -= 1;
2589 2589          db->db_data_pending = NULL;
2590 2590          dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2591 2591  }
2592 2592  
           /*
            * zio "ready" callback taking only the zio: forwards to dbuf_write_ready()
            * with a NULL buf, passing zio->io_private as the dbuf.
            */
2593 2593  static void
2594 2594  dbuf_write_nofill_ready(zio_t *zio)
2595 2595  {
2596 2596          dbuf_write_ready(zio, NULL, zio->io_private);
2597 2597  }
2598 2598  
           /*
            * zio "done" callback taking only the zio: forwards to dbuf_write_done()
            * with a NULL buf, passing zio->io_private as the dbuf.
            */
2599 2599  static void
2600 2600  dbuf_write_nofill_done(zio_t *zio)
2601 2601  {
2602 2602          dbuf_write_done(zio, NULL, zio->io_private);
2603 2603  }
2604 2604  
           /*
            * zio "ready" callback whose io_private is a dirty record rather than a
            * dbuf: forwards to dbuf_write_ready() with a NULL buf and the record's
            * dr_dbuf.
            */
2605 2605  static void
2606 2606  dbuf_write_override_ready(zio_t *zio)
2607 2607  {
2608 2608          dbuf_dirty_record_t *dr = zio->io_private;
2609 2609          dmu_buf_impl_t *db = dr->dr_dbuf;
2610 2610  
2611 2611          dbuf_write_ready(zio, NULL, db);
2612 2612  }
2613 2613  
           /*
            * zio "done" callback whose io_private is a dirty record.  Under db_mtx,
            * if the bp the zio ended up with differs from the record's
            * dr_overridden_by bp, that overriding bp is freed with dsl_free() (unless
            * it is a hole) and arc_release() is called on the record's data buffer.
            * db_mtx is then dropped and dbuf_write_done() is called for the
            * record's dbuf with a NULL buf.
            */
2614 2614  static void
2615 2615  dbuf_write_override_done(zio_t *zio)
2616 2616  {
2617 2617          dbuf_dirty_record_t *dr = zio->io_private;
2618 2618          dmu_buf_impl_t *db = dr->dr_dbuf;
2619 2619          blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2620 2620  
2621 2621          mutex_enter(&db->db_mtx);
2622 2622          if (!BP_EQUAL(zio->io_bp, obp)) {
2623 2623                  if (!BP_IS_HOLE(obp))
2624 2624                          dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2625 2625                  arc_release(dr->dt.dl.dr_data, db);
2626 2626          }
2627 2627          mutex_exit(&db->db_mtx);
2628 2628  
2629 2629          dbuf_write_done(zio, NULL, db);
2630 2630  }
2631 2631  
           /*
            * Create the write zio for dirty record 'dr' and store it in dr->dr_zio.
            * The zio is created as a child of either the parent dbuf's pending zio
            * or, when the parent is the dnode's own dbuf, dn->dn_zio.  This function
            * does not call zio_nowait() or zio_wait() on the new zio.
            *
            * 'data' is the ARC buffer to write; it is not referenced on the NOFILL
            * path.  'tx' supplies the txg.
            *
            * One of three kinds of write is set up:
            *   - level-0 record in DR_OVERRIDDEN state: zio_write() followed by
            *     zio_write_override() with the bp saved in dr_overridden_by; the
            *     state is reset to DR_NOT_OVERRIDDEN under db_mtx;
            *   - DB_NOFILL dbuf: zio_write() with no data and ZIO_FLAG_NODATA;
            *   - anything else: arc_write() of 'data', which must already be
            *     released (asserted).
            */
2632 2632  static void
2633 2633  dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2634 2634  {
2635 2635          dmu_buf_impl_t *db = dr->dr_dbuf;
2636 2636          dnode_t *dn;
2637 2637          objset_t *os;
2638 2638          dmu_buf_impl_t *parent = db->db_parent;
2639 2639          uint64_t txg = tx->tx_txg;
2640 2640          zbookmark_t zb;
2641 2641          zio_prop_t zp;
2642 2642          zio_t *zio;
2643 2643          int wp_flag = 0;
2644 2644  
2645 2645          DB_DNODE_ENTER(db);
2646 2646          dn = DB_DNODE(db);
2647 2647          os = dn->dn_objset;
2648 2648  
2649 2649          if (db->db_state != DB_NOFILL) {
2650 2650                  if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2651 2651                          /*
2652 2652                           * Private object buffers are released here rather
2653 2653                           * than in dbuf_dirty() since they are only modified
2654 2654                           * in the syncing context and we don't want the
2655 2655                           * overhead of making multiple copies of the data.
2656 2656                           */
2657 2657                          if (BP_IS_HOLE(db->db_blkptr)) {
2658 2658                                  arc_buf_thaw(data);
2659 2659                          } else {
2660 2660                                  dbuf_release_bp(db);
2661 2661                          }
2662 2662                  }
2663 2663          }
2664 2664  
                   /* Pick the parent zio that the new write will hang off. */
2665 2665          if (parent != dn->dn_dbuf) {
2666 2666                  ASSERT(parent && parent->db_data_pending);
2667 2667                  ASSERT(db->db_level == parent->db_level-1);
2668 2668                  ASSERT(arc_released(parent->db_buf));
2669 2669                  zio = parent->db_data_pending->dr_zio;
2670 2670          } else {
2671 2671                  ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2672 2672                      db->db_blkid != DMU_SPILL_BLKID) ||
2673 2673                      (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2674 2674                  if (db->db_blkid != DMU_SPILL_BLKID)
2675 2675                          ASSERT3P(db->db_blkptr, ==,
2676 2676                              &dn->dn_phys->dn_blkptr[db->db_blkid]);
2677 2677                  zio = dn->dn_zio;
2678 2678          }
2679 2679  
2680 2680          ASSERT(db->db_level == 0 || data == db->db_buf);
2681 2681          ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2682 2682          ASSERT(zio);
2683 2683  
                   /* With no dsl dataset, the bookmark uses DMU_META_OBJSET. */
2684 2684          SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2685 2685              os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2686 2686              db->db.db_object, db->db_level, db->db_blkid);
2687 2687  
2688 2688          if (db->db_blkid == DMU_SPILL_BLKID)
2689 2689                  wp_flag = WP_SPILL;
2690 2690          wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2691 2691  
2692 2692          dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2693 2693          DB_DNODE_EXIT(db);
2694 2694  
2695 2695          if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2696 2696                  ASSERT(db->db_state != DB_NOFILL);
2697 2697                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
2698 2698                      db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2699 2699                      dbuf_write_override_ready, dbuf_write_override_done, dr,
2700 2700                      ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2701 2701                  mutex_enter(&db->db_mtx);
2702 2702                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2703 2703                  zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2704 2704                      dr->dt.dl.dr_copies);
2705 2705                  mutex_exit(&db->db_mtx);
2706 2706          } else if (db->db_state == DB_NOFILL) {
2707 2707                  ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2708 2708                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
2709 2709                      db->db_blkptr, NULL, db->db.db_size, &zp,
2710 2710                      dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2711 2711                      ZIO_PRIORITY_ASYNC_WRITE,
2712 2712                      ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2713 2713          } else {
2714 2714                  ASSERT(arc_released(data));
2715 2715                  dr->dr_zio = arc_write(zio, os->os_spa, txg,
2716 2716                      db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
2717 2717                      dbuf_write_ready, dbuf_write_done, db,
2718 2718                      ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2719 2719          }
2720 2720  }
  
    | 
      ↓ open down ↓ | 
    200 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX