621 } else if (db->db_state == DB_UNCACHED) {
622 spa_t *spa = dn->dn_objset->os_spa;
623
624 if (zio == NULL)
625 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
626 dbuf_read_impl(db, zio, &flags);
627
628 /* dbuf_read_impl has dropped db_mtx for us */
629
630 if (prefetch)
631 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
632 db->db.db_size, flags & DB_RF_CACHED);
633
634 if ((flags & DB_RF_HAVESTRUCT) == 0)
635 rw_exit(&dn->dn_struct_rwlock);
636 DB_DNODE_EXIT(db);
637
638 if (!havepzio)
639 err = zio_wait(zio);
640 } else {
641 mutex_exit(&db->db_mtx);
642 if (prefetch)
643 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
644 db->db.db_size, TRUE);
645 if ((flags & DB_RF_HAVESTRUCT) == 0)
646 rw_exit(&dn->dn_struct_rwlock);
647 DB_DNODE_EXIT(db);
648
649 mutex_enter(&db->db_mtx);
650 if ((flags & DB_RF_NEVERWAIT) == 0) {
651 while (db->db_state == DB_READ ||
652 db->db_state == DB_FILL) {
653 ASSERT(db->db_state == DB_READ ||
654 (flags & DB_RF_HAVESTRUCT) == 0);
655 cv_wait(&db->db_changed, &db->db_mtx);
656 }
657 if (db->db_state == DB_UNCACHED)
658 err = SET_ERROR(EIO);
659 }
660 mutex_exit(&db->db_mtx);
661 }
662
663 ASSERT(err || havepzio || db->db_state == DB_CACHED);
664 return (err);
665 }
666
667 static void
668 dbuf_noread(dmu_buf_impl_t *db)
1244 }
1245 mutex_exit(&db->db_mtx);
1246 } else {
1247 ASSERT(db->db_level+1 == dn->dn_nlevels);
1248 ASSERT(db->db_blkid < dn->dn_nblkptr);
1249 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1250 mutex_enter(&dn->dn_mtx);
1251 ASSERT(!list_link_active(&dr->dr_dirty_node));
1252 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1253 mutex_exit(&dn->dn_mtx);
1254 if (drop_struct_lock)
1255 rw_exit(&dn->dn_struct_rwlock);
1256 }
1257
1258 dnode_setdirty(dn, tx);
1259 DB_DNODE_EXIT(db);
1260 return (dr);
1261 }
1262
1263 /*
1264 * Return TRUE if this evicted the dbuf.
1265 */
1266 static boolean_t
1267 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1268 {
1269 dnode_t *dn;
1270 uint64_t txg = tx->tx_txg;
1271 dbuf_dirty_record_t *dr, **drp;
1272
1273 ASSERT(txg != 0);
1274 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1275 ASSERT0(db->db_level);
1276 ASSERT(MUTEX_HELD(&db->db_mtx));
1277
1278 /*
1279 * If this buffer is not dirty, we're done.
1280 */
1281 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1282 if (dr->dr_txg <= txg)
1283 break;
1284 if (dr == NULL || dr->dr_txg < txg)
2205 DBUF_VERIFY(db);
2206 }
2207 }
2208
/*
 * Sync an indirect-block dirty record: issue the write for this block,
 * then sync any dirty child records hanging off of it.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs. */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/*
	 * Sync the dirty children under dr_mtx, then launch the write;
	 * dr_children must be empty once the list has been synced.
	 */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2251
2252 static void
2253 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2254 {
2255 arc_buf_t **datap = &dr->dt.dl.dr_data;
2256 dmu_buf_impl_t *db = dr->dr_dbuf;
2257 dnode_t *dn;
2258 objset_t *os;
2605 }
2606
/*
 * Completion callback for an overridden dbuf write. If the zio's block
 * pointer differs from the override block pointer, free the old block
 * (unless it is a hole) and release the ARC buffer, then run the
 * normal write-done handling.
 */
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		/* Only free the override bp if it points at real data. */
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}
2624
2625 static void
2626 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2627 {
2628 dmu_buf_impl_t *db = dr->dr_dbuf;
2629 dnode_t *dn;
2630 objset_t *os;
2631 dmu_buf_impl_t *parent = db->db_parent;
2632 uint64_t txg = tx->tx_txg;
2633 zbookmark_t zb;
2634 zio_prop_t zp;
2635 zio_t *zio;
2636 int wp_flag = 0;
2637
2638 DB_DNODE_ENTER(db);
2639 dn = DB_DNODE(db);
2640 os = dn->dn_objset;
2641
2642 if (db->db_state != DB_NOFILL) {
2643 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2644 /*
2645 * Private object buffers are released here rather
2646 * than in dbuf_dirty() since they are only modified
2647 * in the syncing context and we don't want the
2648 * overhead of making multiple copies of the data.
2649 */
2650 if (BP_IS_HOLE(db->db_blkptr)) {
2651 arc_buf_thaw(data);
2652 } else {
2653 dbuf_release_bp(db);
2654 }
2655 }
2656 }
2657
2658 if (parent != dn->dn_dbuf) {
2659 ASSERT(parent && parent->db_data_pending);
2660 ASSERT(db->db_level == parent->db_level-1);
2661 ASSERT(arc_released(parent->db_buf));
2662 zio = parent->db_data_pending->dr_zio;
2663 } else {
2664 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2665 db->db_blkid != DMU_SPILL_BLKID) ||
2666 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2667 if (db->db_blkid != DMU_SPILL_BLKID)
2668 ASSERT3P(db->db_blkptr, ==,
2669 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2670 zio = dn->dn_zio;
2671 }
2672
2673 ASSERT(db->db_level == 0 || data == db->db_buf);
2674 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2675 ASSERT(zio);
2676
2677 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2678 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2679 db->db.db_object, db->db_level, db->db_blkid);
2680
2681 if (db->db_blkid == DMU_SPILL_BLKID)
2682 wp_flag = WP_SPILL;
2683 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
|
621 } else if (db->db_state == DB_UNCACHED) {
622 spa_t *spa = dn->dn_objset->os_spa;
623
624 if (zio == NULL)
625 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
626 dbuf_read_impl(db, zio, &flags);
627
628 /* dbuf_read_impl has dropped db_mtx for us */
629
630 if (prefetch)
631 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
632 db->db.db_size, flags & DB_RF_CACHED);
633
634 if ((flags & DB_RF_HAVESTRUCT) == 0)
635 rw_exit(&dn->dn_struct_rwlock);
636 DB_DNODE_EXIT(db);
637
638 if (!havepzio)
639 err = zio_wait(zio);
640 } else {
641 /*
642 * Another reader came in while the dbuf was in flight
643 * between UNCACHED and CACHED. Either a writer will finish
644 * writing the buffer (sending the dbuf to CACHED) or the
645 * first reader's request will reach the read_done callback
646 * and send the dbuf to CACHED. Otherwise, a failure
647 * occurred and the dbuf went to UNCACHED.
648 */
649 mutex_exit(&db->db_mtx);
650 if (prefetch)
651 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
652 db->db.db_size, TRUE);
653 if ((flags & DB_RF_HAVESTRUCT) == 0)
654 rw_exit(&dn->dn_struct_rwlock);
655 DB_DNODE_EXIT(db);
656
657 /* Skip the wait per the caller's request. */
658 mutex_enter(&db->db_mtx);
659 if ((flags & DB_RF_NEVERWAIT) == 0) {
660 while (db->db_state == DB_READ ||
661 db->db_state == DB_FILL) {
662 ASSERT(db->db_state == DB_READ ||
663 (flags & DB_RF_HAVESTRUCT) == 0);
664 cv_wait(&db->db_changed, &db->db_mtx);
665 }
666 if (db->db_state == DB_UNCACHED)
667 err = SET_ERROR(EIO);
668 }
669 mutex_exit(&db->db_mtx);
670 }
671
672 ASSERT(err || havepzio || db->db_state == DB_CACHED);
673 return (err);
674 }
675
676 static void
677 dbuf_noread(dmu_buf_impl_t *db)
1253 }
1254 mutex_exit(&db->db_mtx);
1255 } else {
1256 ASSERT(db->db_level+1 == dn->dn_nlevels);
1257 ASSERT(db->db_blkid < dn->dn_nblkptr);
1258 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1259 mutex_enter(&dn->dn_mtx);
1260 ASSERT(!list_link_active(&dr->dr_dirty_node));
1261 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1262 mutex_exit(&dn->dn_mtx);
1263 if (drop_struct_lock)
1264 rw_exit(&dn->dn_struct_rwlock);
1265 }
1266
1267 dnode_setdirty(dn, tx);
1268 DB_DNODE_EXIT(db);
1269 return (dr);
1270 }
1271
1272 /*
1273 * Undirty a buffer in the transaction group referenced by the given
1274 * transaction. Return whether this evicted the dbuf.
1275 */
1276 static boolean_t
1277 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1278 {
1279 dnode_t *dn;
1280 uint64_t txg = tx->tx_txg;
1281 dbuf_dirty_record_t *dr, **drp;
1282
1283 ASSERT(txg != 0);
1284 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1285 ASSERT0(db->db_level);
1286 ASSERT(MUTEX_HELD(&db->db_mtx));
1287
1288 /*
1289 * If this buffer is not dirty, we're done.
1290 */
1291 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1292 if (dr->dr_txg <= txg)
1293 break;
1294 if (dr == NULL || dr->dr_txg < txg)
2215 DBUF_VERIFY(db);
2216 }
2217 }
2218
/*
 * Sync an indirect-block dirty record: issue the write for this block,
 * then sync any dirty child records hanging off of it.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/* Sync the dirty children under dr_mtx, then launch the write. */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2264
2265 static void
2266 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2267 {
2268 arc_buf_t **datap = &dr->dt.dl.dr_data;
2269 dmu_buf_impl_t *db = dr->dr_dbuf;
2270 dnode_t *dn;
2271 objset_t *os;
2618 }
2619
/*
 * Completion callback for an overridden dbuf write. If the zio's block
 * pointer differs from the override block pointer, free the old block
 * (unless it is a hole) and release the ARC buffer, then run the
 * normal write-done handling.
 */
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		/* Only free the override bp if it points at real data. */
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}
2637
2638 /* Issue I/O to commit a dirty buffer to disk. */
2639 static void
2640 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2641 {
2642 dmu_buf_impl_t *db = dr->dr_dbuf;
2643 dnode_t *dn;
2644 objset_t *os;
2645 dmu_buf_impl_t *parent = db->db_parent;
2646 uint64_t txg = tx->tx_txg;
2647 zbookmark_t zb;
2648 zio_prop_t zp;
2649 zio_t *zio;
2650 int wp_flag = 0;
2651
2652 DB_DNODE_ENTER(db);
2653 dn = DB_DNODE(db);
2654 os = dn->dn_objset;
2655
2656 if (db->db_state != DB_NOFILL) {
2657 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2658 /*
2659 * Private object buffers are released here rather
2660 * than in dbuf_dirty() since they are only modified
2661 * in the syncing context and we don't want the
2662 * overhead of making multiple copies of the data.
2663 */
2664 if (BP_IS_HOLE(db->db_blkptr)) {
2665 arc_buf_thaw(data);
2666 } else {
2667 dbuf_release_bp(db);
2668 }
2669 }
2670 }
2671
2672 if (parent != dn->dn_dbuf) {
2673 /* Our parent is an indirect block. */
2674 /* We have a dirty parent that has been scheduled for write. */
2675 ASSERT(parent && parent->db_data_pending);
2676 /* Our parent's buffer is one level closer to the dnode. */
2677 ASSERT(db->db_level == parent->db_level-1);
2678 /*
2679 * We're about to modify our parent's db_data by modifying
2680 * our block pointer, so the parent must be released.
2681 */
2682 ASSERT(arc_released(parent->db_buf));
2683 zio = parent->db_data_pending->dr_zio;
2684 } else {
2685 /* Our parent is the dnode itself. */
2686 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2687 db->db_blkid != DMU_SPILL_BLKID) ||
2688 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2689 if (db->db_blkid != DMU_SPILL_BLKID)
2690 ASSERT3P(db->db_blkptr, ==,
2691 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2692 zio = dn->dn_zio;
2693 }
2694
2695 ASSERT(db->db_level == 0 || data == db->db_buf);
2696 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2697 ASSERT(zio);
2698
2699 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2700 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2701 db->db.db_object, db->db_level, db->db_blkid);
2702
2703 if (db->db_blkid == DMU_SPILL_BLKID)
2704 wp_flag = WP_SPILL;
2705 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
|