Print this page
3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>


 621         } else if (db->db_state == DB_UNCACHED) {
 622                 spa_t *spa = dn->dn_objset->os_spa;
 623 
 624                 if (zio == NULL)
 625                         zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 626                 dbuf_read_impl(db, zio, &flags);
 627 
 628                 /* dbuf_read_impl has dropped db_mtx for us */
 629 
 630                 if (prefetch)
 631                         dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 632                             db->db.db_size, flags & DB_RF_CACHED);
 633 
 634                 if ((flags & DB_RF_HAVESTRUCT) == 0)
 635                         rw_exit(&dn->dn_struct_rwlock);
 636                 DB_DNODE_EXIT(db);
 637 
 638                 if (!havepzio)
 639                         err = zio_wait(zio);
 640         } else {








 641                 mutex_exit(&db->db_mtx);
 642                 if (prefetch)
 643                         dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 644                             db->db.db_size, TRUE);
 645                 if ((flags & DB_RF_HAVESTRUCT) == 0)
 646                         rw_exit(&dn->dn_struct_rwlock);
 647                 DB_DNODE_EXIT(db);
 648 

 649                 mutex_enter(&db->db_mtx);
 650                 if ((flags & DB_RF_NEVERWAIT) == 0) {
 651                         while (db->db_state == DB_READ ||
 652                             db->db_state == DB_FILL) {
 653                                 ASSERT(db->db_state == DB_READ ||
 654                                     (flags & DB_RF_HAVESTRUCT) == 0);
 655                                 cv_wait(&db->db_changed, &db->db_mtx);
 656                         }
 657                         if (db->db_state == DB_UNCACHED)
 658                                 err = SET_ERROR(EIO);
 659                 }
 660                 mutex_exit(&db->db_mtx);
 661         }
 662 
 663         ASSERT(err || havepzio || db->db_state == DB_CACHED);
 664         return (err);
 665 }
 666 
 667 static void
 668 dbuf_noread(dmu_buf_impl_t *db)


1244                 }
1245                 mutex_exit(&db->db_mtx);
1246         } else {
1247                 ASSERT(db->db_level+1 == dn->dn_nlevels);
1248                 ASSERT(db->db_blkid < dn->dn_nblkptr);
1249                 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1250                 mutex_enter(&dn->dn_mtx);
1251                 ASSERT(!list_link_active(&dr->dr_dirty_node));
1252                 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1253                 mutex_exit(&dn->dn_mtx);
1254                 if (drop_struct_lock)
1255                         rw_exit(&dn->dn_struct_rwlock);
1256         }
1257 
1258         dnode_setdirty(dn, tx);
1259         DB_DNODE_EXIT(db);
1260         return (dr);
1261 }
1262 
1263 /*
1264  * Return TRUE if this evicted the dbuf.

1265  */
1266 static boolean_t
1267 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1268 {
1269         dnode_t *dn;
1270         uint64_t txg = tx->tx_txg;
1271         dbuf_dirty_record_t *dr, **drp;
1272 
1273         ASSERT(txg != 0);
1274         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1275         ASSERT0(db->db_level);
1276         ASSERT(MUTEX_HELD(&db->db_mtx));
1277 
1278         /*
1279          * If this buffer is not dirty, we're done.
1280          */
1281         for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1282                 if (dr->dr_txg <= txg)
1283                         break;
1284         if (dr == NULL || dr->dr_txg < txg)


2205                 DBUF_VERIFY(db);
2206         }
2207 }
2208 
2209 static void
2210 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2211 {
2212         dmu_buf_impl_t *db = dr->dr_dbuf;
2213         dnode_t *dn;
2214         zio_t *zio;
2215 
2216         ASSERT(dmu_tx_is_syncing(tx));
2217 
2218         dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2219 
2220         mutex_enter(&db->db_mtx);
2221 
2222         ASSERT(db->db_level > 0);
2223         DBUF_VERIFY(db);
2224 

2225         if (db->db_buf == NULL) {
2226                 mutex_exit(&db->db_mtx);
2227                 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2228                 mutex_enter(&db->db_mtx);
2229         }
2230         ASSERT3U(db->db_state, ==, DB_CACHED);
2231         ASSERT(db->db_buf != NULL);
2232 
2233         DB_DNODE_ENTER(db);
2234         dn = DB_DNODE(db);

2235         ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2236         dbuf_check_blkptr(dn, db);
2237         DB_DNODE_EXIT(db);
2238 

2239         db->db_data_pending = dr;
2240 
2241         mutex_exit(&db->db_mtx);
2242         dbuf_write(dr, db->db_buf, tx);
2243 
2244         zio = dr->dr_zio;
2245         mutex_enter(&dr->dt.di.dr_mtx);
2246         dbuf_sync_list(&dr->dt.di.dr_children, tx);
2247         ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2248         mutex_exit(&dr->dt.di.dr_mtx);
2249         zio_nowait(zio);
2250 }
2251 
2252 static void
2253 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2254 {
2255         arc_buf_t **datap = &dr->dt.dl.dr_data;
2256         dmu_buf_impl_t *db = dr->dr_dbuf;
2257         dnode_t *dn;
2258         objset_t *os;


2605 }
2606 
2607 static void
2608 dbuf_write_override_done(zio_t *zio)
2609 {
2610         dbuf_dirty_record_t *dr = zio->io_private;
2611         dmu_buf_impl_t *db = dr->dr_dbuf;
2612         blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2613 
2614         mutex_enter(&db->db_mtx);
2615         if (!BP_EQUAL(zio->io_bp, obp)) {
2616                 if (!BP_IS_HOLE(obp))
2617                         dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2618                 arc_release(dr->dt.dl.dr_data, db);
2619         }
2620         mutex_exit(&db->db_mtx);
2621 
2622         dbuf_write_done(zio, NULL, db);
2623 }
2624 

2625 static void
2626 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2627 {
2628         dmu_buf_impl_t *db = dr->dr_dbuf;
2629         dnode_t *dn;
2630         objset_t *os;
2631         dmu_buf_impl_t *parent = db->db_parent;
2632         uint64_t txg = tx->tx_txg;
2633         zbookmark_t zb;
2634         zio_prop_t zp;
2635         zio_t *zio;
2636         int wp_flag = 0;
2637 
2638         DB_DNODE_ENTER(db);
2639         dn = DB_DNODE(db);
2640         os = dn->dn_objset;
2641 
2642         if (db->db_state != DB_NOFILL) {
2643                 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2644                         /*
2645                          * Private object buffers are released here rather
2646                          * than in dbuf_dirty() since they are only modified
2647                          * in the syncing context and we don't want the
2648                          * overhead of making multiple copies of the data.
2649                          */
2650                         if (BP_IS_HOLE(db->db_blkptr)) {
2651                                 arc_buf_thaw(data);
2652                         } else {
2653                                 dbuf_release_bp(db);
2654                         }
2655                 }
2656         }
2657 
2658         if (parent != dn->dn_dbuf) {


2659                 ASSERT(parent && parent->db_data_pending);

2660                 ASSERT(db->db_level == parent->db_level-1);




2661                 ASSERT(arc_released(parent->db_buf));
2662                 zio = parent->db_data_pending->dr_zio;
2663         } else {

2664                 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2665                     db->db_blkid != DMU_SPILL_BLKID) ||
2666                     (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2667                 if (db->db_blkid != DMU_SPILL_BLKID)
2668                         ASSERT3P(db->db_blkptr, ==,
2669                             &dn->dn_phys->dn_blkptr[db->db_blkid]);
2670                 zio = dn->dn_zio;
2671         }
2672 
2673         ASSERT(db->db_level == 0 || data == db->db_buf);
2674         ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2675         ASSERT(zio);
2676 
2677         SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2678             os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2679             db->db.db_object, db->db_level, db->db_blkid);
2680 
2681         if (db->db_blkid == DMU_SPILL_BLKID)
2682                 wp_flag = WP_SPILL;
2683         wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;




 621         } else if (db->db_state == DB_UNCACHED) {
 622                 spa_t *spa = dn->dn_objset->os_spa;
 623 
 624                 if (zio == NULL)
 625                         zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 626                 dbuf_read_impl(db, zio, &flags);
 627 
 628                 /* dbuf_read_impl has dropped db_mtx for us */
 629 
 630                 if (prefetch)
 631                         dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 632                             db->db.db_size, flags & DB_RF_CACHED);
 633 
 634                 if ((flags & DB_RF_HAVESTRUCT) == 0)
 635                         rw_exit(&dn->dn_struct_rwlock);
 636                 DB_DNODE_EXIT(db);
 637 
 638                 if (!havepzio)
 639                         err = zio_wait(zio);
 640         } else {
 641                 /*
 642                  * Another reader came in while the dbuf was in flight
 643                  * between UNCACHED and CACHED.  Either a writer will finish
 644                  * writing the buffer (sending the dbuf to CACHED) or the
 645                  * first reader's request will reach the read_done callback
 646                  * and send the dbuf to CACHED.  Otherwise, a failure
 647                  * occurred and the dbuf went to UNCACHED.
 648                  */
 649                 mutex_exit(&db->db_mtx);
 650                 if (prefetch)
 651                         dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 652                             db->db.db_size, TRUE);
 653                 if ((flags & DB_RF_HAVESTRUCT) == 0)
 654                         rw_exit(&dn->dn_struct_rwlock);
 655                 DB_DNODE_EXIT(db);
 656 
 657                 /* Skip the wait per the caller's request. */
 658                 mutex_enter(&db->db_mtx);
 659                 if ((flags & DB_RF_NEVERWAIT) == 0) {
 660                         while (db->db_state == DB_READ ||
 661                             db->db_state == DB_FILL) {
 662                                 ASSERT(db->db_state == DB_READ ||
 663                                     (flags & DB_RF_HAVESTRUCT) == 0);
 664                                 cv_wait(&db->db_changed, &db->db_mtx);
 665                         }
 666                         if (db->db_state == DB_UNCACHED)
 667                                 err = SET_ERROR(EIO);
 668                 }
 669                 mutex_exit(&db->db_mtx);
 670         }
 671 
 672         ASSERT(err || havepzio || db->db_state == DB_CACHED);
 673         return (err);
 674 }
 675 
 676 static void
 677 dbuf_noread(dmu_buf_impl_t *db)


1253                 }
1254                 mutex_exit(&db->db_mtx);
1255         } else {
1256                 ASSERT(db->db_level+1 == dn->dn_nlevels);
1257                 ASSERT(db->db_blkid < dn->dn_nblkptr);
1258                 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1259                 mutex_enter(&dn->dn_mtx);
1260                 ASSERT(!list_link_active(&dr->dr_dirty_node));
1261                 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1262                 mutex_exit(&dn->dn_mtx);
1263                 if (drop_struct_lock)
1264                         rw_exit(&dn->dn_struct_rwlock);
1265         }
1266 
1267         dnode_setdirty(dn, tx);
1268         DB_DNODE_EXIT(db);
1269         return (dr);
1270 }
1271 
1272 /*
1273  * Undirty a buffer in the transaction group referenced by the given
1274  * transaction.  Return whether this evicted the dbuf.
1275  */
1276 static boolean_t
1277 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1278 {
1279         dnode_t *dn;
1280         uint64_t txg = tx->tx_txg;
1281         dbuf_dirty_record_t *dr, **drp;
1282 
1283         ASSERT(txg != 0);
1284         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1285         ASSERT0(db->db_level);
1286         ASSERT(MUTEX_HELD(&db->db_mtx));
1287 
1288         /*
1289          * If this buffer is not dirty, we're done.
1290          */
1291         for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1292                 if (dr->dr_txg <= txg)
1293                         break;
1294         if (dr == NULL || dr->dr_txg < txg)


2215                 DBUF_VERIFY(db);
2216         }
2217 }
2218 
2219 static void
2220 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2221 {
2222         dmu_buf_impl_t *db = dr->dr_dbuf;
2223         dnode_t *dn;
2224         zio_t *zio;
2225 
2226         ASSERT(dmu_tx_is_syncing(tx));
2227 
2228         dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2229 
2230         mutex_enter(&db->db_mtx);
2231 
2232         ASSERT(db->db_level > 0);
2233         DBUF_VERIFY(db);
2234 
2235         /* Read the block if it hasn't been read yet. */
2236         if (db->db_buf == NULL) {
2237                 mutex_exit(&db->db_mtx);
2238                 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2239                 mutex_enter(&db->db_mtx);
2240         }
2241         ASSERT3U(db->db_state, ==, DB_CACHED);
2242         ASSERT(db->db_buf != NULL);
2243 
2244         DB_DNODE_ENTER(db);
2245         dn = DB_DNODE(db);
2246         /* Indirect block size must match what the dnode thinks it is. */
2247         ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2248         dbuf_check_blkptr(dn, db);
2249         DB_DNODE_EXIT(db);
2250 
2251         /* Provide the pending dirty record to child dbufs */
2252         db->db_data_pending = dr;
2253 
2254         mutex_exit(&db->db_mtx);
2255         dbuf_write(dr, db->db_buf, tx);
2256 
2257         zio = dr->dr_zio;
2258         mutex_enter(&dr->dt.di.dr_mtx);
2259         dbuf_sync_list(&dr->dt.di.dr_children, tx);
2260         ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2261         mutex_exit(&dr->dt.di.dr_mtx);
2262         zio_nowait(zio);
2263 }
2264 
2265 static void
2266 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2267 {
2268         arc_buf_t **datap = &dr->dt.dl.dr_data;
2269         dmu_buf_impl_t *db = dr->dr_dbuf;
2270         dnode_t *dn;
2271         objset_t *os;


2618 }
2619 
2620 static void
2621 dbuf_write_override_done(zio_t *zio)
2622 {
2623         dbuf_dirty_record_t *dr = zio->io_private;
2624         dmu_buf_impl_t *db = dr->dr_dbuf;
2625         blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2626 
2627         mutex_enter(&db->db_mtx);
2628         if (!BP_EQUAL(zio->io_bp, obp)) {
2629                 if (!BP_IS_HOLE(obp))
2630                         dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2631                 arc_release(dr->dt.dl.dr_data, db);
2632         }
2633         mutex_exit(&db->db_mtx);
2634 
2635         dbuf_write_done(zio, NULL, db);
2636 }
2637 
2638 /* Issue I/O to commit a dirty buffer to disk. */
2639 static void
2640 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2641 {
2642         dmu_buf_impl_t *db = dr->dr_dbuf;
2643         dnode_t *dn;
2644         objset_t *os;
2645         dmu_buf_impl_t *parent = db->db_parent;
2646         uint64_t txg = tx->tx_txg;
2647         zbookmark_t zb;
2648         zio_prop_t zp;
2649         zio_t *zio;
2650         int wp_flag = 0;
2651 
2652         DB_DNODE_ENTER(db);
2653         dn = DB_DNODE(db);
2654         os = dn->dn_objset;
2655 
2656         if (db->db_state != DB_NOFILL) {
2657                 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2658                         /*
2659                          * Private object buffers are released here rather
2660                          * than in dbuf_dirty() since they are only modified
2661                          * in the syncing context and we don't want the
2662                          * overhead of making multiple copies of the data.
2663                          */
2664                         if (BP_IS_HOLE(db->db_blkptr)) {
2665                                 arc_buf_thaw(data);
2666                         } else {
2667                                 dbuf_release_bp(db);
2668                         }
2669                 }
2670         }
2671 
2672         if (parent != dn->dn_dbuf) {
2673                 /* Our parent is an indirect block. */
2674                 /* We have a dirty parent that has been scheduled for write. */
2675                 ASSERT(parent && parent->db_data_pending);
2676                 /* Our parent's buffer is one level closer to the dnode. */
2677                 ASSERT(db->db_level == parent->db_level-1);
2678                 /*
2679                  * We're about to modify our parent's db_data by modifying
2680                  * our block pointer, so the parent must be released.
2681                  */
2682                 ASSERT(arc_released(parent->db_buf));
2683                 zio = parent->db_data_pending->dr_zio;
2684         } else {
2685                 /* Our parent is the dnode itself. */
2686                 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2687                     db->db_blkid != DMU_SPILL_BLKID) ||
2688                     (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2689                 if (db->db_blkid != DMU_SPILL_BLKID)
2690                         ASSERT3P(db->db_blkptr, ==,
2691                             &dn->dn_phys->dn_blkptr[db->db_blkid]);
2692                 zio = dn->dn_zio;
2693         }
2694 
2695         ASSERT(db->db_level == 0 || data == db->db_buf);
2696         ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2697         ASSERT(zio);
2698 
2699         SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2700             os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2701             db->db.db_object, db->db_level, db->db_blkid);
2702 
2703         if (db->db_blkid == DMU_SPILL_BLKID)
2704                 wp_flag = WP_SPILL;
2705         wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;