187 */
188 ASSERT(refcount_is_zero(&db->db_holds));
189 ASSERT(db->db_state == DB_EVICTING);
190 ASSERT(!MUTEX_HELD(&db->db_mtx));
191
192 mutex_enter(DBUF_HASH_MUTEX(h, idx));
193 dbp = &h->hash_table[idx];
194 while ((dbf = *dbp) != db) {
195 dbp = &dbf->db_hash_next;
196 ASSERT(dbf != NULL);
197 }
198 *dbp = db->db_hash_next;
199 db->db_hash_next = NULL;
200 mutex_exit(DBUF_HASH_MUTEX(h, idx));
201 atomic_add_64(&dbuf_hash_count, -1);
202 }
203
204 static arc_evict_func_t dbuf_do_evict;
205
206 static void
207 dbuf_evict_user(dmu_buf_impl_t *db)
208 {
209 ASSERT(MUTEX_HELD(&db->db_mtx));
210
211 if (db->db_level != 0 || db->db_evict_func == NULL)
212 return;
213
214 if (db->db_user_data_ptr_ptr)
215 *db->db_user_data_ptr_ptr = db->db.db_data;
216 db->db_evict_func(&db->db, db->db_user_ptr);
217 db->db_user_ptr = NULL;
218 db->db_user_data_ptr_ptr = NULL;
219 db->db_evict_func = NULL;
220 }
221
222 boolean_t
223 dbuf_is_metadata(dmu_buf_impl_t *db)
224 {
225 if (db->db_level > 0) {
226 return (B_TRUE);
227 } else {
228 boolean_t is_metadata;
229
230 DB_DNODE_ENTER(db);
231 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
232 DB_DNODE_EXIT(db);
233
234 return (is_metadata);
235 }
236 }
237
238 void
239 dbuf_evict(dmu_buf_impl_t *db)
240 {
241 ASSERT(MUTEX_HELD(&db->db_mtx));
242 ASSERT(db->db_buf == NULL);
243 ASSERT(db->db_data_pending == NULL);
244
245 dbuf_clear(db);
246 dbuf_destroy(db);
247 }
248
249 void
250 dbuf_init(void)
251 {
252 uint64_t hsize = 1ULL << 16;
253 dbuf_hash_table_t *h = &dbuf_hash_table;
254 int i;
255
256 /*
257 * The hash table is big enough to fill all of physical memory
258 * with an average 4K block size. The table will take up
259 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
260 */
261 while (hsize * 4096 < physmem * PAGESIZE)
262 hsize <<= 1;
263
264 retry:
265 h->hash_table_mask = hsize - 1;
388 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
389 db->db_state != DB_FILL && !dn->dn_free_txg) {
390 /*
391 * If the blkptr isn't set but they have nonzero data,
392 * it had better be dirty, otherwise we'll lose that
393 * data when we evict this buffer.
394 */
395 if (db->db_dirtycnt == 0) {
396 uint64_t *buf = db->db.db_data;
397 int i;
398
399 for (i = 0; i < db->db.db_size >> 3; i++) {
400 ASSERT(buf[i] == 0);
401 }
402 }
403 }
404 DB_DNODE_EXIT(db);
405 }
406 #endif
407
408 static void
409 dbuf_update_data(dmu_buf_impl_t *db)
410 {
411 ASSERT(MUTEX_HELD(&db->db_mtx));
412 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
413 ASSERT(!refcount_is_zero(&db->db_holds));
414 *db->db_user_data_ptr_ptr = db->db.db_data;
415 }
416 }
417
418 static void
419 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
420 {
421 ASSERT(MUTEX_HELD(&db->db_mtx));
422 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
423 db->db_buf = buf;
424 if (buf != NULL) {
425 ASSERT(buf->b_data != NULL);
426 db->db.db_data = buf->b_data;
427 if (!arc_released(buf))
428 arc_set_callback(buf, dbuf_do_evict, db);
429 dbuf_update_data(db);
430 } else {
431 dbuf_evict_user(db);
432 db->db.db_data = NULL;
433 if (db->db_state != DB_NOFILL)
434 db->db_state = DB_UNCACHED;
435 }
436 }
437
438 /*
439 * Loan out an arc_buf for read. Return the loaned arc_buf.
440 */
441 arc_buf_t *
442 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
443 {
444 arc_buf_t *abuf;
445
446 mutex_enter(&db->db_mtx);
447 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
448 int blksz = db->db.db_size;
449 spa_t *spa;
450
451 mutex_exit(&db->db_mtx);
452 DB_GET_SPA(&spa, db);
453 abuf = arc_loan_buf(spa, blksz);
454 bcopy(db->db.db_data, abuf->b_data, blksz);
455 } else {
456 abuf = db->db_buf;
457 arc_loan_inuse_buf(abuf, db);
458 dbuf_set_data(db, NULL);
459 mutex_exit(&db->db_mtx);
460 }
461 return (abuf);
462 }
463
464 uint64_t
465 dbuf_whichblock(dnode_t *dn, uint64_t offset)
466 {
467 if (dn->dn_datablkshift) {
468 return (offset >> dn->dn_datablkshift);
469 } else {
470 ASSERT3U(offset, <, dn->dn_datablksz);
471 return (0);
472 }
473 }
474
475 static void
476 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
477 {
478 dmu_buf_impl_t *db = vdb;
479
480 mutex_enter(&db->db_mtx);
517 DB_DNODE_ENTER(db);
518 dn = DB_DNODE(db);
519 ASSERT(!refcount_is_zero(&db->db_holds));
520 /* We need the struct_rwlock to prevent db_blkptr from changing. */
521 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
522 ASSERT(MUTEX_HELD(&db->db_mtx));
523 ASSERT(db->db_state == DB_UNCACHED);
524 ASSERT(db->db_buf == NULL);
525
526 if (db->db_blkid == DMU_BONUS_BLKID) {
527 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
528
529 ASSERT3U(bonuslen, <=, db->db.db_size);
530 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
531 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
532 if (bonuslen < DN_MAX_BONUSLEN)
533 bzero(db->db.db_data, DN_MAX_BONUSLEN);
534 if (bonuslen)
535 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
536 DB_DNODE_EXIT(db);
537 dbuf_update_data(db);
538 db->db_state = DB_CACHED;
539 mutex_exit(&db->db_mtx);
540 return;
541 }
542
543 /*
544 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
545 * processes the delete record and clears the bp while we are waiting
546 * for the dn_mtx (resulting in a "no" from block_freed).
547 */
548 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
549 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
550 BP_IS_HOLE(db->db_blkptr)))) {
551 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
552
553 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
554 db->db.db_size, db, type));
555 DB_DNODE_EXIT(db);
556 bzero(db->db.db_data, db->db.db_size);
557 db->db_state = DB_CACHED;
650 if ((flags & DB_RF_NEVERWAIT) == 0) {
651 while (db->db_state == DB_READ ||
652 db->db_state == DB_FILL) {
653 ASSERT(db->db_state == DB_READ ||
654 (flags & DB_RF_HAVESTRUCT) == 0);
655 cv_wait(&db->db_changed, &db->db_mtx);
656 }
657 if (db->db_state == DB_UNCACHED)
658 err = SET_ERROR(EIO);
659 }
660 mutex_exit(&db->db_mtx);
661 }
662
663 ASSERT(err || havepzio || db->db_state == DB_CACHED);
664 return (err);
665 }
666
667 static void
668 dbuf_noread(dmu_buf_impl_t *db)
669 {
670 ASSERT(!refcount_is_zero(&db->db_holds));
671 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
672 mutex_enter(&db->db_mtx);
673 while (db->db_state == DB_READ || db->db_state == DB_FILL)
674 cv_wait(&db->db_changed, &db->db_mtx);
675 if (db->db_state == DB_UNCACHED) {
676 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
677 spa_t *spa;
678
679 ASSERT(db->db_buf == NULL);
680 ASSERT(db->db.db_data == NULL);
681 DB_GET_SPA(&spa, db);
682 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
683 db->db_state = DB_FILL;
684 } else if (db->db_state == DB_NOFILL) {
685 dbuf_set_data(db, NULL);
686 } else {
687 ASSERT3U(db->db_state, ==, DB_CACHED);
688 }
689 mutex_exit(&db->db_mtx);
690 }
691
692 /*
693 * This is our just-in-time copy function. It makes a copy of
694 * buffers that have been modified in a previous transaction
695 * group before we modify them in the current active group.
696 *
697 * This function is used in two places: when we are dirtying a
698 * buffer for the first time in a txg, and when we are freeing
699 * a range in a dnode that includes this buffer.
700 *
701 * Note that when we are called from dbuf_free_range() we do
702 * not put a hold on the buffer, we just traverse the active
703 * dbuf list for the dnode.
704 */
705 static void
706 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
707 {
708 dbuf_dirty_record_t *dr = db->db_last_dirty;
709
710 ASSERT(MUTEX_HELD(&db->db_mtx));
711 ASSERT(db->db.db_data != NULL);
712 ASSERT(db->db_level == 0);
713 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
714
715 if (dr == NULL ||
716 (dr->dt.dl.dr_data !=
717 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
718 return;
719
720 /*
721 * If the last dirty record for this dbuf has not yet synced
722 * and it's referencing the dbuf data, either:
723 * reset the reference to point to a new copy,
724 * or (if there are no active holders)
725 * just null out the current db_data pointer.
726 */
727 ASSERT(dr->dr_txg >= txg - 2);
728 if (db->db_blkid == DMU_BONUS_BLKID) {
729 /* Note that the data bufs here are zio_bufs */
730 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
731 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
732 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
733 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
734 int size = db->db.db_size;
735 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
736 spa_t *spa;
737
738 DB_GET_SPA(&spa, db);
739 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
740 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
741 } else {
742 dbuf_set_data(db, NULL);
743 }
744 }
745
746 void
747 dbuf_unoverride(dbuf_dirty_record_t *dr)
748 {
749 dmu_buf_impl_t *db = dr->dr_dbuf;
750 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
751 uint64_t txg = dr->dr_txg;
752
753 ASSERT(MUTEX_HELD(&db->db_mtx));
754 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
755 ASSERT(db->db_level == 0);
756
757 if (db->db_blkid == DMU_BONUS_BLKID ||
758 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
759 return;
760
761 ASSERT(db->db_data_pending != dr);
762
779 * immediately re-thawing it.
780 */
781 arc_release(dr->dt.dl.dr_data, db);
782 }
783
784 /*
785 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
786 * data blocks in the free range, so that any future readers will find
787 * empty blocks. Also, if we happen across any level-1 dbufs in the
788 * range that have not already been marked dirty, mark them dirty so
789 * they stay in memory.
790 */
791 void
792 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
793 {
794 dmu_buf_impl_t *db, *db_next;
795 uint64_t txg = tx->tx_txg;
796 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
797 uint64_t first_l1 = start >> epbs;
798 uint64_t last_l1 = end >> epbs;
799
800 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
801 end = dn->dn_maxblkid;
802 last_l1 = end >> epbs;
803 }
804 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
805 mutex_enter(&dn->dn_dbufs_mtx);
806 for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
807 db_next = list_next(&dn->dn_dbufs, db);
808 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
809
810 if (db->db_level == 1 &&
811 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
812 mutex_enter(&db->db_mtx);
813 if (db->db_last_dirty &&
814 db->db_last_dirty->dr_txg < txg) {
815 dbuf_add_ref(db, FTAG);
816 mutex_exit(&db->db_mtx);
817 dbuf_will_dirty(db, tx);
818 dbuf_rele(db, FTAG);
832 if (dbuf_undirty(db, tx)) {
833 /* mutex has been dropped and dbuf destroyed */
834 continue;
835 }
836
837 if (db->db_state == DB_UNCACHED ||
838 db->db_state == DB_NOFILL ||
839 db->db_state == DB_EVICTING) {
840 ASSERT(db->db.db_data == NULL);
841 mutex_exit(&db->db_mtx);
842 continue;
843 }
844 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
845 /* will be handled in dbuf_read_done or dbuf_rele */
846 db->db_freed_in_flight = TRUE;
847 mutex_exit(&db->db_mtx);
848 continue;
849 }
850 if (refcount_count(&db->db_holds) == 0) {
851 ASSERT(db->db_buf);
852 dbuf_clear(db);
853 continue;
854 }
855 /* The dbuf is referenced */
856
857 if (db->db_last_dirty != NULL) {
858 dbuf_dirty_record_t *dr = db->db_last_dirty;
859
860 if (dr->dr_txg == txg) {
861 /*
862 * This buffer is "in-use", re-adjust the file
863 * size to reflect that this buffer may
864 * contain new data when we sync.
865 */
866 if (db->db_blkid != DMU_SPILL_BLKID &&
867 db->db_blkid > dn->dn_maxblkid)
868 dn->dn_maxblkid = db->db_blkid;
869 dbuf_unoverride(dr);
870 } else {
871 /*
872 * This dbuf is not dirty in the open context.
873 * Either uncache it (if it's not referenced in
874 * the open context) or reset its contents to
875 * empty.
876 */
877 dbuf_fix_old_data(db, txg);
878 }
879 }
880 /* clear the contents if it's cached */
881 if (db->db_state == DB_CACHED) {
882 ASSERT(db->db.db_data != NULL);
883 arc_release(db->db_buf, db);
884 bzero(db->db.db_data, db->db.db_size);
885 arc_buf_freeze(db->db_buf);
886 }
887
888 mutex_exit(&db->db_mtx);
889 }
890 mutex_exit(&dn->dn_dbufs_mtx);
891 }
892
893 static int
894 dbuf_block_freeable(dmu_buf_impl_t *db)
895 {
896 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
897 uint64_t birth_txg = 0;
898
899 /*
900 * We don't need any locking to protect db_blkptr:
901 * If it's syncing, then db_last_dirty will be set
902 * so we'll ignore db_blkptr.
903 */
904 ASSERT(MUTEX_HELD(&db->db_mtx));
905 if (db->db_last_dirty)
906 birth_txg = db->db_last_dirty->dr_txg;
907 else if (db->db_blkptr)
908 birth_txg = db->db_blkptr->blk_birth;
909
910 /*
979 objset_t *os;
980
981 DB_GET_OBJSET(&os, db);
982 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
983 ASSERT(arc_released(os->os_phys_buf) ||
984 list_link_active(&os->os_dsl_dataset->ds_synced_link));
985 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
986
987 (void) arc_release(db->db_buf, db);
988 }
989
990 dbuf_dirty_record_t *
991 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
992 {
993 dnode_t *dn;
994 objset_t *os;
995 dbuf_dirty_record_t **drp, *dr;
996 int drop_struct_lock = FALSE;
997 boolean_t do_free_accounting = B_FALSE;
998 int txgoff = tx->tx_txg & TXG_MASK;
999
1000 ASSERT(tx->tx_txg != 0);
1001 ASSERT(!refcount_is_zero(&db->db_holds));
1002 DMU_TX_DIRTY_BUF(tx, db);
1003
1004 DB_DNODE_ENTER(db);
1005 dn = DB_DNODE(db);
1006 /*
1007 * Shouldn't dirty a regular buffer in syncing context. Private
1008 * objects may be dirtied in syncing context, but only if they
1009 * were already pre-dirtied in open context.
1010 */
1011 ASSERT(!dmu_tx_is_syncing(tx) ||
1012 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1013 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1014 dn->dn_objset->os_dsl_dataset == NULL);
1015 /*
1016 * We make this assert for private objects as well, but after we
1017 * check if we're already dirty. They are allowed to re-dirty
1018 * in syncing context.
1053 */
1054 drp = &db->db_last_dirty;
1055 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1056 db->db.db_object == DMU_META_DNODE_OBJECT);
1057 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1058 drp = &dr->dr_next;
1059 if (dr && dr->dr_txg == tx->tx_txg) {
1060 DB_DNODE_EXIT(db);
1061
1062 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1063 /*
1064 * If this buffer has already been written out,
1065 * we now need to reset its state.
1066 */
1067 dbuf_unoverride(dr);
1068 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1069 db->db_state != DB_NOFILL)
1070 arc_buf_thaw(db->db_buf);
1071 }
1072 mutex_exit(&db->db_mtx);
1073 return (dr);
1074 }
1075
1076 /*
1077 * Only valid if not already dirty.
1078 */
1079 ASSERT(dn->dn_object == 0 ||
1080 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1081 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1082
1083 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1084 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1085 dn->dn_phys->dn_nlevels > db->db_level ||
1086 dn->dn_next_nlevels[txgoff] > db->db_level ||
1087 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1088 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1089
1090 /*
1091 * We should only be dirtying in syncing context if it's the
1092 * mos or we're initializing the os or it's a special object.
1107 * Note: we delay "free accounting" until after we drop
1108 * the db_mtx. This keeps us from grabbing other locks
1109 * (and possibly deadlocking) in bp_get_dsize() while
1110 * also holding the db_mtx.
1111 */
1112 dnode_willuse_space(dn, db->db.db_size, tx);
1113 do_free_accounting = dbuf_block_freeable(db);
1114 }
1115
1116 /*
1117 * If this buffer is dirty in an old transaction group we need
1118 * to make a copy of it so that the changes we make in this
1119 * transaction group won't leak out when we sync the older txg.
1120 */
1121 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1122 if (db->db_level == 0) {
1123 void *data_old = db->db_buf;
1124
1125 if (db->db_state != DB_NOFILL) {
1126 if (db->db_blkid == DMU_BONUS_BLKID) {
1127 dbuf_fix_old_data(db, tx->tx_txg);
1128 data_old = db->db.db_data;
1129 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1130 /*
1131 * Release the data buffer from the cache so
1132 * that we can modify it without impacting
1133 * possible other users of this cached data
1134 * block. Note that indirect blocks and
1135 * private objects are not released until the
1136 * syncing state (since they are only modified
1137 * then).
1138 */
1139 arc_release(db->db_buf, db);
1140 dbuf_fix_old_data(db, tx->tx_txg);
1141 data_old = db->db_buf;
1142 }
1143 ASSERT(data_old != NULL);
1144 }
1145 dr->dt.dl.dr_data = data_old;
1146 } else {
1147 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1148 list_create(&dr->dt.di.dr_children,
1149 sizeof (dbuf_dirty_record_t),
1150 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1151 }
1152 dr->dr_dbuf = db;
1153 dr->dr_txg = tx->tx_txg;
1154 dr->dr_next = *drp;
1155 *drp = dr;
1156
1157 /*
1158 * We could have been freed_in_flight between the dbuf_noread
1159 * and dbuf_dirty. We win, as though the dbuf_noread() had
1160 * happened after the free.
1161 */
1162 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1163 db->db_blkid != DMU_SPILL_BLKID) {
1164 mutex_enter(&dn->dn_mtx);
1165 dnode_clear_range(dn, db->db_blkid, 1, tx);
1166 mutex_exit(&dn->dn_mtx);
1167 db->db_freed_in_flight = FALSE;
1168 }
1169
1170 /*
1171 * This buffer is now part of this txg
1172 */
1173 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1174 db->db_dirtycnt += 1;
1175 ASSERT3U(db->db_dirtycnt, <=, 3);
1176
1177 mutex_exit(&db->db_mtx);
1178
1179 if (db->db_blkid == DMU_BONUS_BLKID ||
1180 db->db_blkid == DMU_SPILL_BLKID) {
1181 mutex_enter(&dn->dn_mtx);
1182 ASSERT(!list_link_active(&dr->dr_dirty_node));
1183 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1184 mutex_exit(&dn->dn_mtx);
1185 dnode_setdirty(dn, tx);
1186 DB_DNODE_EXIT(db);
1187 return (dr);
1188 } else if (do_free_accounting) {
1189 blkptr_t *bp = db->db_blkptr;
1190 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1191 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1192 /*
1193 * This is only a guess -- if the dbuf is dirty
1194 * in a previous txg, we don't know how much
1195 * space it will use on disk yet. We should
1196 * really have the struct_rwlock to access
1197 * db_blkptr, but since this is just a guess,
1252 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1253 mutex_exit(&dn->dn_mtx);
1254 if (drop_struct_lock)
1255 rw_exit(&dn->dn_struct_rwlock);
1256 }
1257
1258 dnode_setdirty(dn, tx);
1259 DB_DNODE_EXIT(db);
1260 return (dr);
1261 }
1262
1263 /*
1264 * Return TRUE if this evicted the dbuf.
1265 */
1266 static boolean_t
1267 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1268 {
1269 dnode_t *dn;
1270 uint64_t txg = tx->tx_txg;
1271 dbuf_dirty_record_t *dr, **drp;
1272
1273 ASSERT(txg != 0);
1274 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1275 ASSERT0(db->db_level);
1276 ASSERT(MUTEX_HELD(&db->db_mtx));
1277
1278 /*
1279 * If this buffer is not dirty, we're done.
1280 */
1281 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1282 if (dr->dr_txg <= txg)
1283 break;
1284 if (dr == NULL || dr->dr_txg < txg)
1285 return (B_FALSE);
1286 ASSERT(dr->dr_txg == txg);
1287 ASSERT(dr->dr_dbuf == db);
1288
1289 DB_DNODE_ENTER(db);
1290 dn = DB_DNODE(db);
1291
1292 /*
1293 * Note: This code will probably work even if there are concurrent
1294 * holders, but it is untested in that scenario, as the ZPL and
1295 * ztest have additional locking (the range locks) that prevents
1296 * that type of concurrent access.
1297 */
1298 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1299
1300 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1301
1302 ASSERT(db->db.db_size != 0);
1303
1304 /* XXX would be nice to fix up dn_towrite_space[] */
1305
1306 *drp = dr->dr_next;
1307
1308 /*
1324 }
1325 DB_DNODE_EXIT(db);
1326
1327 if (db->db_state != DB_NOFILL) {
1328 dbuf_unoverride(dr);
1329
1330 ASSERT(db->db_buf != NULL);
1331 ASSERT(dr->dt.dl.dr_data != NULL);
1332 if (dr->dt.dl.dr_data != db->db_buf)
1333 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1334 }
1335 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1336
1337 ASSERT(db->db_dirtycnt > 0);
1338 db->db_dirtycnt -= 1;
1339
1340 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1341 arc_buf_t *buf = db->db_buf;
1342
1343 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1344 dbuf_set_data(db, NULL);
1345 VERIFY(arc_buf_remove_ref(buf, db));
1346 dbuf_evict(db);
1347 return (B_TRUE);
1348 }
1349
1350 return (B_FALSE);
1351 }
1352
1353 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1354 void
1355 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1356 {
1357 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1358
1359 ASSERT(tx->tx_txg != 0);
1360 ASSERT(!refcount_is_zero(&db->db_holds));
1361
1362 DB_DNODE_ENTER(db);
1363 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1364 rf |= DB_RF_HAVESTRUCT;
1365 DB_DNODE_EXIT(db);
1366 (void) dbuf_read(db, NULL, rf);
1367 (void) dbuf_dirty(db, tx);
1368 }
1369
1476 db->db_state = DB_FILL;
1477 mutex_exit(&db->db_mtx);
1478 (void) dbuf_dirty(db, tx);
1479 dbuf_fill_done(db, tx);
1480 }
1481
1482 /*
1483 * "Clear" the contents of this dbuf. This will mark the dbuf
1484 * EVICTING and clear *most* of its references. Unfortunately,
1485 * when we are not holding the dn_dbufs_mtx, we can't clear the
1486 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1487 * in this case. For callers from the DMU we will usually see:
1488 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1489 * For the arc callback, we will usually see:
1490 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1491 * Sometimes, though, we will get a mix of these two:
1492 * DMU: dbuf_clear()->arc_buf_evict()
1493 * ARC: dbuf_do_evict()->dbuf_destroy()
1494 */
1495 void
1496 dbuf_clear(dmu_buf_impl_t *db)
1497 {
1498 dnode_t *dn;
1499 dmu_buf_impl_t *parent = db->db_parent;
1500 dmu_buf_impl_t *dndb;
1501 int dbuf_gone = FALSE;
1502
1503 ASSERT(MUTEX_HELD(&db->db_mtx));
1504 ASSERT(refcount_is_zero(&db->db_holds));
1505
1506 dbuf_evict_user(db);
1507
1508 if (db->db_state == DB_CACHED) {
1509 ASSERT(db->db.db_data != NULL);
1510 if (db->db_blkid == DMU_BONUS_BLKID) {
1511 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1512 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1513 }
1514 db->db.db_data = NULL;
1515 db->db_state = DB_UNCACHED;
1516 }
1517
1518 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1519 ASSERT(db->db_data_pending == NULL);
1520
1521 db->db_state = DB_EVICTING;
1522 db->db_blkptr = NULL;
1523
1524 DB_DNODE_ENTER(db);
1525 dn = DB_DNODE(db);
1526 dndb = dn->dn_dbuf;
1629 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1630 {
1631 objset_t *os = dn->dn_objset;
1632 dmu_buf_impl_t *db, *odb;
1633
1634 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1635 ASSERT(dn->dn_type != DMU_OT_NONE);
1636
1637 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1638
1639 db->db_objset = os;
1640 db->db.db_object = dn->dn_object;
1641 db->db_level = level;
1642 db->db_blkid = blkid;
1643 db->db_last_dirty = NULL;
1644 db->db_dirtycnt = 0;
1645 db->db_dnode_handle = dn->dn_handle;
1646 db->db_parent = parent;
1647 db->db_blkptr = blkptr;
1648
1649 db->db_user_ptr = NULL;
1650 db->db_user_data_ptr_ptr = NULL;
1651 db->db_evict_func = NULL;
1652 db->db_immediate_evict = 0;
1653 db->db_freed_in_flight = 0;
1654
1655 if (blkid == DMU_BONUS_BLKID) {
1656 ASSERT3P(parent, ==, dn->dn_dbuf);
1657 db->db.db_size = DN_MAX_BONUSLEN -
1658 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1659 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1660 db->db.db_offset = DMU_BONUS_BLKID;
1661 db->db_state = DB_UNCACHED;
1662 /* the bonus dbuf is not placed in the hash table */
1663 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1664 return (db);
1665 } else if (blkid == DMU_SPILL_BLKID) {
1666 db->db.db_size = (blkptr != NULL) ?
1667 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1668 db->db.db_offset = 0;
1669 } else {
1670 int blocksize =
1671 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1694 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1695
1696 if (parent && parent != dn->dn_dbuf)
1697 dbuf_add_ref(parent, db);
1698
1699 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1700 refcount_count(&dn->dn_holds) > 0);
1701 (void) refcount_add(&dn->dn_holds, db);
1702 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1703
1704 dprintf_dbuf(db, "db=%p\n", db);
1705
1706 return (db);
1707 }
1708
1709 static int
1710 dbuf_do_evict(void *private)
1711 {
1712 arc_buf_t *buf = private;
1713 dmu_buf_impl_t *db = buf->b_private;
1714
1715 if (!MUTEX_HELD(&db->db_mtx))
1716 mutex_enter(&db->db_mtx);
1717
1718 ASSERT(refcount_is_zero(&db->db_holds));
1719
1720 if (db->db_state != DB_EVICTING) {
1721 ASSERT(db->db_state == DB_CACHED);
1722 DBUF_VERIFY(db);
1723 db->db_buf = NULL;
1724 dbuf_evict(db);
1725 } else {
1726 mutex_exit(&db->db_mtx);
1727 dbuf_destroy(db);
1728 }
1729 return (0);
1730 }
1731
1732 static void
1733 dbuf_destroy(dmu_buf_impl_t *db)
1734 {
1735 ASSERT(refcount_is_zero(&db->db_holds));
1736
1737 if (db->db_blkid != DMU_BONUS_BLKID) {
1738 /*
1739 * If this dbuf is still on the dn_dbufs list,
1740 * remove it from that list.
1741 */
1742 if (db->db_dnode_handle != NULL) {
1743 dnode_t *dn;
1744
1745 DB_DNODE_ENTER(db);
1746 dn = DB_DNODE(db);
1747 mutex_enter(&dn->dn_dbufs_mtx);
1748 list_remove(&dn->dn_dbufs, db);
1809
1810 (void) arc_read(NULL, dn->dn_objset->os_spa,
1811 bp, NULL, NULL, priority,
1812 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1813 &aflags, &zb);
1814 }
1815 if (db)
1816 dbuf_rele(db, NULL);
1817 }
1818 }
1819
1820 /*
1821 * Returns with db_holds incremented, and db_mtx not held.
1822 * Note: dn_struct_rwlock must be held.
1823 */
1824 int
1825 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1826 void *tag, dmu_buf_impl_t **dbp)
1827 {
1828 dmu_buf_impl_t *db, *parent = NULL;
1829
1830 ASSERT(blkid != DMU_BONUS_BLKID);
1831 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1832 ASSERT3U(dn->dn_nlevels, >, level);
1833
1834 *dbp = NULL;
1835 top:
1836 /* dbuf_find() returns with db_mtx held */
1837 db = dbuf_find(dn, level, blkid);
1838
1839 if (db == NULL) {
1840 blkptr_t *bp = NULL;
1841 int err;
1842
1843 ASSERT3P(parent, ==, NULL);
1844 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1845 if (fail_sparse) {
1846 if (err == 0 && bp && BP_IS_HOLE(bp))
1847 err = SET_ERROR(ENOENT);
1848 if (err) {
1849 if (parent)
1850 dbuf_rele(parent, NULL);
1851 return (err);
1852 }
1853 }
1854 if (err && err != ENOENT)
1855 return (err);
1856 db = dbuf_create(dn, level, blkid, parent, bp);
1857 }
1858
1859 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1860 arc_buf_add_ref(db->db_buf, db);
1861 if (db->db_buf->b_data == NULL) {
1862 dbuf_clear(db);
1863 if (parent) {
1864 dbuf_rele(parent, NULL);
1865 parent = NULL;
1866 }
1867 goto top;
1868 }
1869 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1870 }
1871
1872 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1873
1874 /*
1875 * If this buffer is currently syncing out, and we are
1876 * still referencing it from db_data, we need to make a copy
1877 * of it in case we decide we want to dirty it again in this txg.
1878 */
1879 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1880 dn->dn_object != DMU_META_DNODE_OBJECT &&
1881 db->db_state == DB_CACHED && db->db_data_pending) {
1882 dbuf_dirty_record_t *dr = db->db_data_pending;
1883
1884 if (dr->dt.dl.dr_data == db->db_buf) {
1885 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1886
1887 dbuf_set_data(db,
1888 arc_buf_alloc(dn->dn_objset->os_spa,
1889 db->db.db_size, db, type));
1890 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1891 db->db.db_size);
1892 }
1893 }
1894
1895 (void) refcount_add(&db->db_holds, tag);
1896 dbuf_update_data(db);
1897 DBUF_VERIFY(db);
1898 mutex_exit(&db->db_mtx);
1899
1900 /* NOTE: we can't rele the parent until after we drop the db_mtx */
1901 if (parent)
1902 dbuf_rele(parent, NULL);
1903
1904 ASSERT3P(DB_DNODE(db), ==, dn);
1905 ASSERT3U(db->db_blkid, ==, blkid);
1906 ASSERT3U(db->db_level, ==, level);
1907 *dbp = db;
1908
1909 return (0);
1910 }
1911
1912 dmu_buf_impl_t *
1913 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1914 {
1915 dmu_buf_impl_t *db;
1916 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1917 return (err ? NULL : db);
1918 }
1919
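/*
 * Illustrative usage sketch (not part of this file): callers pair
 * dbuf_hold() with dbuf_rele() using the same tag, and per the note above
 * dbuf_hold_impl(), dn_struct_rwlock must be held while taking the hold.
 * A minimal sketch:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	db = dbuf_hold(dn, blkid, FTAG);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		... use db->db.db_data while the hold is outstanding ...
 *		dbuf_rele(db, FTAG);
 *	}
 *
 * The tag passed to dbuf_rele() must match the one used for the hold.
 */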
1979 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
1980 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
1981 * dnode's parent dbuf evicting its dnode handles.
1982 */
1983 #pragma weak dmu_buf_rele = dbuf_rele
1984 void
1985 dbuf_rele(dmu_buf_impl_t *db, void *tag)
1986 {
1987 mutex_enter(&db->db_mtx);
1988 dbuf_rele_and_unlock(db, tag);
1989 }
1990
1991 /*
1992 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
1993 * db_dirtycnt and db_holds to be updated atomically.
1994 */
1995 void
1996 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
1997 {
1998 int64_t holds;
1999
2000 ASSERT(MUTEX_HELD(&db->db_mtx));
2001 DBUF_VERIFY(db);
2002
2003 /*
2004 * Remove the reference to the dbuf before removing its hold on the
2005 * dnode so we can guarantee in dnode_move() that a referenced bonus
2006 * buffer has a corresponding dnode hold.
2007 */
2008 holds = refcount_remove(&db->db_holds, tag);
2009 ASSERT(holds >= 0);
2010
2011 /*
2012 * We can't freeze indirects if there is a possibility that they
2013 * may be modified in the current syncing context.
2014 */
2015 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2016 arc_buf_freeze(db->db_buf);
2017
2018 if (holds == db->db_dirtycnt &&
2019 db->db_level == 0 && db->db_immediate_evict)
2020 dbuf_evict_user(db);
2021
2022 if (holds == 0) {
2023 if (db->db_blkid == DMU_BONUS_BLKID) {
2024 mutex_exit(&db->db_mtx);
2025
2026 /*
2027 * If the dnode moves here, we cannot cross this barrier
2028 * until the move completes.
2029 */
2030 DB_DNODE_ENTER(db);
2031 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2032 DB_DNODE_EXIT(db);
2033 /*
2034 * The bonus buffer's dnode hold is no longer discounted
2035 * in dnode_move(). The dnode cannot move until after
2036 * the dnode_rele().
2037 */
2038 dnode_rele(DB_DNODE(db), db);
2039 } else if (db->db_buf == NULL) {
2040 /*
2041 * This is a special case: we never associated this
2042 * dbuf with any data allocated from the ARC.
2043 */
2044 ASSERT(db->db_state == DB_UNCACHED ||
2045 db->db_state == DB_NOFILL);
2046 dbuf_evict(db);
2047 } else if (arc_released(db->db_buf)) {
2048 arc_buf_t *buf = db->db_buf;
2049 /*
2050 * This dbuf has anonymous data associated with it.
2051 */
2052 dbuf_set_data(db, NULL);
2053 VERIFY(arc_buf_remove_ref(buf, db));
2054 dbuf_evict(db);
2055 } else {
2056 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2057
2058 /*
2059 * A dbuf will be eligible for eviction if either the
2060 * 'primarycache' property is set or a duplicate
2061 * copy of this buffer is already cached in the arc.
2062 *
2063 * In the case of the 'primarycache' property, a buffer
2064 * is considered for eviction if it matches the
2065 * criteria set in the property.
2066 *
2067 * To decide if our buffer is considered a
2068 * duplicate, we must call into the arc to determine
2069 * if multiple buffers are referencing the same
2070 * block on-disk. If so, then we simply evict
2071 * ourselves.
2072 */
2073 if (!DBUF_IS_CACHEABLE(db) ||
2074 arc_buf_eviction_needed(db->db_buf))
2075 dbuf_clear(db);
2076 else
2077 mutex_exit(&db->db_mtx);
2078 }
2079 } else {
2080 mutex_exit(&db->db_mtx);
2081 }
2082 }
2083
2084 #pragma weak dmu_buf_refcount = dbuf_refcount
2085 uint64_t
2086 dbuf_refcount(dmu_buf_impl_t *db)
2087 {
2088 return (refcount_count(&db->db_holds));
2089 }
2090
2091 void *
2092 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2093 dmu_buf_evict_func_t *evict_func)
2094 {
2095 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2096 user_data_ptr_ptr, evict_func));
2097 }
2098
2099 void *
2100 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2101 dmu_buf_evict_func_t *evict_func)
2102 {
2103 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2104
2105 db->db_immediate_evict = TRUE;
2106 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2107 user_data_ptr_ptr, evict_func));
2108 }
2109
2110 void *
2111 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2112 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2113 {
2114 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2115 ASSERT(db->db_level == 0);
2116
2117 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2118
2119 mutex_enter(&db->db_mtx);
2120
2121 if (db->db_user_ptr == old_user_ptr) {
2122 db->db_user_ptr = user_ptr;
2123 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2124 db->db_evict_func = evict_func;
2125
2126 dbuf_update_data(db);
2127 } else {
2128 old_user_ptr = db->db_user_ptr;
2129 }
2130
2131 mutex_exit(&db->db_mtx);
2132 return (old_user_ptr);
2133 }
2134
2135 void *
2136 dmu_buf_get_user(dmu_buf_t *db_fake)
2137 {
2138 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2139 ASSERT(!refcount_is_zero(&db->db_holds));
2140
2141 return (db->db_user_ptr);
2142 }
2143
2144 boolean_t
2145 dmu_buf_freeable(dmu_buf_t *dbuf)
2146 {
2147 boolean_t res = B_FALSE;
2148 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2149
2150 if (db->db_blkptr)
2151 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2152 db->db_blkptr, db->db_blkptr->blk_birth);
2153
2154 return (res);
2155 }
2156
2157 blkptr_t *
2158 dmu_buf_get_blkptr(dmu_buf_t *db)
2159 {
2160 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2161 return (dbi->db_blkptr);
2162 }
2163
|
187 */
188 ASSERT(refcount_is_zero(&db->db_holds));
189 ASSERT(db->db_state == DB_EVICTING);
190 ASSERT(!MUTEX_HELD(&db->db_mtx));
191
192 mutex_enter(DBUF_HASH_MUTEX(h, idx));
193 dbp = &h->hash_table[idx];
194 while ((dbf = *dbp) != db) {
195 dbp = &dbf->db_hash_next;
196 ASSERT(dbf != NULL);
197 }
198 *dbp = db->db_hash_next;
199 db->db_hash_next = NULL;
200 mutex_exit(DBUF_HASH_MUTEX(h, idx));
201 atomic_add_64(&dbuf_hash_count, -1);
202 }
203
204 static arc_evict_func_t dbuf_do_evict;
205
206 static void
207 dbuf_verify_user(dmu_buf_impl_t *db, boolean_t evicting)
208 {
209 #ifdef ZFS_DEBUG
210
211 if (db->db_level != 0)
212 ASSERT(db->db_user == NULL);
213
214 if (db->db_user == NULL)
215 return;
216
217 /* Clients must resolve a dbuf before attaching user data. */
218 ASSERT(db->db.db_data != NULL && db->db_state == DB_CACHED);
219 /*
220 * We can't check the hold count here, because holds are modified
221 * independently of the dbuf mutex. But it would be nice to ensure
222 * that the user has the appropriate number.
223 */
224 #endif
225 }
226
227 /*
228 * Evict the dbuf's user, either immediately or via a provided queue.
229 *
230 * Call dmu_buf_process_user_evicts or dmu_buf_destroy_user_evict_list
231 * on the list when finished generating it.
232 *
233 * NOTE: If db->db_immediate_evict is FALSE, evict_list_p must be provided.
234 * NOTE: See dmu_buf_user_t about how this process works.
235 */
236 static void
237 dbuf_evict_user(dmu_buf_impl_t *db, list_t *evict_list_p)
238 {
239 ASSERT(MUTEX_HELD(&db->db_mtx));
240 ASSERT(evict_list_p != NULL);
241 dbuf_verify_user(db, /*evicting*/B_TRUE);
242
243 if (db->db_user == NULL)
244 return;
245
246 ASSERT(!list_link_active(&db->db_user->evict_queue_link));
247 list_insert_head(evict_list_p, db->db_user);
248 db->db_user = NULL;
249 }
250
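/*
 * Caller-side sketch of the deferred-eviction pattern assumed above (see
 * dbuf_loan_arcbuf() and dbuf_noread() below for real callers): the evict
 * list is created before taking db_mtx, dbuf_evict_user() queues the user
 * onto it while the mutex is held, and the queued evictions are processed
 * only after the mutex is dropped, so that user eviction callbacks need
 * not run while db_mtx is held.
 *
 *	list_t evict_list;
 *
 *	dmu_buf_create_user_evict_list(&evict_list);
 *	mutex_enter(&db->db_mtx);
 *	...	(may call dbuf_evict_user(db, &evict_list))
 *	mutex_exit(&db->db_mtx);
 *	dmu_buf_destroy_user_evict_list(&evict_list);
 */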
251 /*
252 * Replace the current user of the dbuf. Requires that the caller knows who
253 * the old user is. Returns the old user, which is not necessarily
254 * the same old_user provided by the caller.
255 */
256 dmu_buf_user_t *
257 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
258 dmu_buf_user_t *new_user)
259 {
260 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
261
262 mutex_enter(&db->db_mtx);
263 dbuf_verify_user(db, /*evicting*/B_FALSE);
264 if (db->db_user == old_user)
265 db->db_user = new_user;
266 else
267 old_user = db->db_user;
268 dbuf_verify_user(db, /*evicting*/B_FALSE);
269 mutex_exit(&db->db_mtx);
270
271 return (old_user);
272 }
273
274 /*
275 * Set the user eviction data for the DMU buffer. Returns NULL on success,
276 * or the existing user if another user currently owns the buffer.
277 */
278 dmu_buf_user_t *
279 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
280 {
281 return (dmu_buf_replace_user(db_fake, NULL, user));
282 }
283
284 dmu_buf_user_t *
285 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
286 {
287 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
288
289 db->db_immediate_evict = TRUE;
290 return (dmu_buf_set_user(db_fake, user));
291 }
292
293 /*
294 * Remove the user eviction data for the DMU buffer.
295 */
296 dmu_buf_user_t *
297 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
298 {
299 return (dmu_buf_replace_user(db_fake, user, NULL));
300 }
301
302 /*
303 * Returns the db_user set with dmu_buf_set_user(), or NULL if not set.
304 */
305 dmu_buf_user_t *
306 dmu_buf_get_user(dmu_buf_t *db_fake)
307 {
308 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
309
310 dbuf_verify_user(db, /*evicting*/B_FALSE);
311 return (db->db_user);
312 }
313
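/*
 * Client-side sketch (illustrative only; "my_client_t" is hypothetical and
 * the dmu_buf_user_t initialization details live elsewhere): a user embeds
 * a dmu_buf_user_t, attaches it with dmu_buf_set_user(), and backs off if
 * the dbuf already has a different user.
 *
 *	typedef struct my_client {
 *		dmu_buf_user_t	mc_dbu;
 *		...
 *	} my_client_t;
 *
 *	if (dmu_buf_set_user(dbuf, &mc->mc_dbu) != NULL) {
 *		... another user already owns this dbuf; use theirs ...
 *	}
 *
 * dmu_buf_remove_user() detaches the user again, and dmu_buf_get_user()
 * returns whatever user is currently attached, or NULL.
 */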
314 static void
315 dbuf_clear_data(dmu_buf_impl_t *db, list_t *evict_list_p)
316 {
317 ASSERT(MUTEX_HELD(&db->db_mtx));
318 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
319 dbuf_evict_user(db, evict_list_p);
320 db->db_buf = NULL;
321 db->db.db_data = NULL;
322 if (db->db_state != DB_NOFILL)
323 db->db_state = DB_UNCACHED;
324 }
325
326 static void
327 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
328 {
329 ASSERT(MUTEX_HELD(&db->db_mtx));
330 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
331 ASSERT(buf != NULL);
332
333 db->db_buf = buf;
334 ASSERT(buf->b_data != NULL);
335 db->db.db_data = buf->b_data;
336 if (!arc_released(buf))
337 arc_set_callback(buf, dbuf_do_evict, db);
338 }
339
340 boolean_t
341 dbuf_is_metadata(dmu_buf_impl_t *db)
342 {
343 if (db->db_level > 0) {
344 return (B_TRUE);
345 } else {
346 boolean_t is_metadata;
347
348 DB_DNODE_ENTER(db);
349 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
350 DB_DNODE_EXIT(db);
351
352 return (is_metadata);
353 }
354 }
355
356 void
357 dbuf_evict(dmu_buf_impl_t *db, list_t *evict_list_p)
358 {
359 ASSERT(MUTEX_HELD(&db->db_mtx));
360 ASSERT(db->db_buf == NULL);
361 ASSERT(db->db_data_pending == NULL);
362
363 dbuf_clear(db, evict_list_p);
364 dbuf_destroy(db);
365 }
366
367 void
368 dbuf_init(void)
369 {
370 uint64_t hsize = 1ULL << 16;
371 dbuf_hash_table_t *h = &dbuf_hash_table;
372 int i;
373
374 /*
375 * The hash table is big enough to fill all of physical memory
376 * with an average 4K block size. The table will take up
377 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
378 */
379 while (hsize * 4096 < physmem * PAGESIZE)
380 hsize <<= 1;
381
382 retry:
383 h->hash_table_mask = hsize - 1;
506 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
507 db->db_state != DB_FILL && !dn->dn_free_txg) {
508 /*
509 * If the blkptr isn't set but they have nonzero data,
510 * it had better be dirty, otherwise we'll lose that
511 * data when we evict this buffer.
512 */
513 if (db->db_dirtycnt == 0) {
514 uint64_t *buf = db->db.db_data;
515 int i;
516
517 for (i = 0; i < db->db.db_size >> 3; i++) {
518 ASSERT(buf[i] == 0);
519 }
520 }
521 }
522 DB_DNODE_EXIT(db);
523 }
524 #endif
525
526 /*
527 * Loan out an arc_buf for read. Return the loaned arc_buf.
528 */
529 arc_buf_t *
530 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
531 {
532 arc_buf_t *abuf;
533 list_t evict_list;
534
535 dmu_buf_create_user_evict_list(&evict_list);
536
537 mutex_enter(&db->db_mtx);
538 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
539 int blksz = db->db.db_size;
540 spa_t *spa;
541
542 mutex_exit(&db->db_mtx);
543 DB_GET_SPA(&spa, db);
544 abuf = arc_loan_buf(spa, blksz);
545 bcopy(db->db.db_data, abuf->b_data, blksz);
546 } else {
547 abuf = db->db_buf;
548 arc_loan_inuse_buf(abuf, db);
549 dbuf_clear_data(db, &evict_list);
550 mutex_exit(&db->db_mtx);
551 }
552 dmu_buf_destroy_user_evict_list(&evict_list);
553 return (abuf);
554 }
555
556 uint64_t
557 dbuf_whichblock(dnode_t *dn, uint64_t offset)
558 {
559 if (dn->dn_datablkshift) {
560 return (offset >> dn->dn_datablkshift);
561 } else {
562 ASSERT3U(offset, <, dn->dn_datablksz);
563 return (0);
564 }
565 }
566
567 static void
568 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
569 {
570 dmu_buf_impl_t *db = vdb;
571
572 mutex_enter(&db->db_mtx);
609 DB_DNODE_ENTER(db);
610 dn = DB_DNODE(db);
611 ASSERT(!refcount_is_zero(&db->db_holds));
612 /* We need the struct_rwlock to prevent db_blkptr from changing. */
613 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
614 ASSERT(MUTEX_HELD(&db->db_mtx));
615 ASSERT(db->db_state == DB_UNCACHED);
616 ASSERT(db->db_buf == NULL);
617
618 if (db->db_blkid == DMU_BONUS_BLKID) {
619 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
620
621 ASSERT3U(bonuslen, <=, db->db.db_size);
622 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
623 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
624 if (bonuslen < DN_MAX_BONUSLEN)
625 bzero(db->db.db_data, DN_MAX_BONUSLEN);
626 if (bonuslen)
627 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
628 DB_DNODE_EXIT(db);
629 db->db_state = DB_CACHED;
630 mutex_exit(&db->db_mtx);
631 return;
632 }
633
634 /*
635 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
636 * processes the delete record and clears the bp while we are waiting
637 * for the dn_mtx (resulting in a "no" from block_freed).
638 */
639 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
640 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
641 BP_IS_HOLE(db->db_blkptr)))) {
642 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
643
644 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
645 db->db.db_size, db, type));
646 DB_DNODE_EXIT(db);
647 bzero(db->db.db_data, db->db.db_size);
648 db->db_state = DB_CACHED;
741 if ((flags & DB_RF_NEVERWAIT) == 0) {
742 while (db->db_state == DB_READ ||
743 db->db_state == DB_FILL) {
744 ASSERT(db->db_state == DB_READ ||
745 (flags & DB_RF_HAVESTRUCT) == 0);
746 cv_wait(&db->db_changed, &db->db_mtx);
747 }
748 if (db->db_state == DB_UNCACHED)
749 err = SET_ERROR(EIO);
750 }
751 mutex_exit(&db->db_mtx);
752 }
753
754 ASSERT(err || havepzio || db->db_state == DB_CACHED);
755 return (err);
756 }
757
758 static void
759 dbuf_noread(dmu_buf_impl_t *db)
760 {
761 list_t evict_list;
762
763 ASSERT(!refcount_is_zero(&db->db_holds));
764 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
765 dmu_buf_create_user_evict_list(&evict_list);
766
767 mutex_enter(&db->db_mtx);
768 while (db->db_state == DB_READ || db->db_state == DB_FILL)
769 cv_wait(&db->db_changed, &db->db_mtx);
770 if (db->db_state == DB_UNCACHED) {
771 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
772 spa_t *spa;
773
774 ASSERT(db->db_buf == NULL);
775 ASSERT(db->db.db_data == NULL);
776 DB_GET_SPA(&spa, db);
777 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
778 db->db_state = DB_FILL;
779 } else if (db->db_state == DB_NOFILL) {
780 dbuf_clear_data(db, &evict_list);
781 } else {
782 ASSERT3U(db->db_state, ==, DB_CACHED);
783 }
784 mutex_exit(&db->db_mtx);
785 dmu_buf_destroy_user_evict_list(&evict_list);
786 }
787
788 /*
789 * This is our just-in-time copy function. It makes a copy of
790 * buffers that have been modified in a previous transaction
791 * group before we modify them in the current active group.
792 *
793 * This function is used in two places: when we are dirtying a
794 * buffer for the first time in a txg, and when we are freeing
795 * a range in a dnode that includes this buffer.
796 *
797 * Note that when we are called from dbuf_free_range() we do
798 * not put a hold on the buffer, we just traverse the active
799 * dbuf list for the dnode.
800 */
801 static void
802 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg, list_t *evict_list_p)
803 {
804 dbuf_dirty_record_t *dr = db->db_last_dirty;
805
806 ASSERT(MUTEX_HELD(&db->db_mtx));
807 ASSERT(db->db.db_data != NULL);
808 ASSERT(db->db_level == 0);
809 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
810
811 if (dr == NULL ||
812 (dr->dt.dl.dr_data !=
813 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
814 return;
815
816 /*
817 * If the last dirty record for this dbuf has not yet synced
818 * and it's referencing the dbuf data, either:
819 * reset the reference to point to a new copy,
820 * or (if there are no active holders)
821 * just null out the current db_data pointer.
822 */
823 ASSERT(dr->dr_txg >= txg - 2);
824 if (db->db_blkid == DMU_BONUS_BLKID) {
825 /* Note that the data bufs here are zio_bufs */
826 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
827 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
828 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
829 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
830 int size = db->db.db_size;
831 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
832 spa_t *spa;
833
834 DB_GET_SPA(&spa, db);
835 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
836 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
837 } else {
838 dbuf_clear_data(db, evict_list_p);
839 }
840 }
841
842 void
843 dbuf_unoverride(dbuf_dirty_record_t *dr)
844 {
845 dmu_buf_impl_t *db = dr->dr_dbuf;
846 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
847 uint64_t txg = dr->dr_txg;
848
849 ASSERT(MUTEX_HELD(&db->db_mtx));
850 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
851 ASSERT(db->db_level == 0);
852
853 if (db->db_blkid == DMU_BONUS_BLKID ||
854 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
855 return;
856
857 ASSERT(db->db_data_pending != dr);
858
875 * immediately re-thawing it.
876 */
877 arc_release(dr->dt.dl.dr_data, db);
878 }
879
880 /*
881 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
882 * data blocks in the free range, so that any future readers will find
883 * empty blocks. Also, if we happen across any level-1 dbufs in the
884 * range that have not already been marked dirty, mark them dirty so
885 * they stay in memory.
886 */
887 void
888 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
889 {
890 dmu_buf_impl_t *db, *db_next;
891 uint64_t txg = tx->tx_txg;
892 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
893 uint64_t first_l1 = start >> epbs;
894 uint64_t last_l1 = end >> epbs;
895 list_t evict_list;
896
897 dmu_buf_create_user_evict_list(&evict_list);
898
899 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
900 end = dn->dn_maxblkid;
901 last_l1 = end >> epbs;
902 }
903 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
904 mutex_enter(&dn->dn_dbufs_mtx);
905 for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
906 db_next = list_next(&dn->dn_dbufs, db);
907 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
908
909 if (db->db_level == 1 &&
910 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
911 mutex_enter(&db->db_mtx);
912 if (db->db_last_dirty &&
913 db->db_last_dirty->dr_txg < txg) {
914 dbuf_add_ref(db, FTAG);
915 mutex_exit(&db->db_mtx);
916 dbuf_will_dirty(db, tx);
917 dbuf_rele(db, FTAG);
931 if (dbuf_undirty(db, tx)) {
932 /* mutex has been dropped and dbuf destroyed */
933 continue;
934 }
935
936 if (db->db_state == DB_UNCACHED ||
937 db->db_state == DB_NOFILL ||
938 db->db_state == DB_EVICTING) {
939 ASSERT(db->db.db_data == NULL);
940 mutex_exit(&db->db_mtx);
941 continue;
942 }
943 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
944 /* will be handled in dbuf_read_done or dbuf_rele */
945 db->db_freed_in_flight = TRUE;
946 mutex_exit(&db->db_mtx);
947 continue;
948 }
949 if (refcount_count(&db->db_holds) == 0) {
950 ASSERT(db->db_buf);
951 dbuf_clear(db, &evict_list);
952 continue;
953 }
954 /* The dbuf is referenced */
955
956 if (db->db_last_dirty != NULL) {
957 dbuf_dirty_record_t *dr = db->db_last_dirty;
958
959 if (dr->dr_txg == txg) {
960 /*
961 * This buffer is "in-use", re-adjust the file
962 * size to reflect that this buffer may
963 * contain new data when we sync.
964 */
965 if (db->db_blkid != DMU_SPILL_BLKID &&
966 db->db_blkid > dn->dn_maxblkid)
967 dn->dn_maxblkid = db->db_blkid;
968 dbuf_unoverride(dr);
969 } else {
970 /*
971 * This dbuf is not dirty in the open context.
972 * Either uncache it (if it's not referenced in
973 * the open context) or reset its contents to
974 * empty.
975 */
976 dbuf_fix_old_data(db, txg, &evict_list);
977 }
978 }
979 /* clear the contents if it's cached */
980 if (db->db_state == DB_CACHED) {
981 ASSERT(db->db.db_data != NULL);
982 arc_release(db->db_buf, db);
983 bzero(db->db.db_data, db->db.db_size);
984 arc_buf_freeze(db->db_buf);
985 }
986
987 mutex_exit(&db->db_mtx);
988 dmu_buf_process_user_evicts(&evict_list);
989 }
990 mutex_exit(&dn->dn_dbufs_mtx);
991 dmu_buf_destroy_user_evict_list(&evict_list);
992 }
993
994 static int
995 dbuf_block_freeable(dmu_buf_impl_t *db)
996 {
997 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
998 uint64_t birth_txg = 0;
999
1000 /*
1001 * We don't need any locking to protect db_blkptr:
1002 * If it's syncing, then db_last_dirty will be set
1003 * so we'll ignore db_blkptr.
1004 */
1005 ASSERT(MUTEX_HELD(&db->db_mtx));
1006 if (db->db_last_dirty)
1007 birth_txg = db->db_last_dirty->dr_txg;
1008 else if (db->db_blkptr)
1009 birth_txg = db->db_blkptr->blk_birth;
1010
1011 /*
1080 objset_t *os;
1081
1082 DB_GET_OBJSET(&os, db);
1083 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1084 ASSERT(arc_released(os->os_phys_buf) ||
1085 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1086 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1087
1088 (void) arc_release(db->db_buf, db);
1089 }
1090
1091 dbuf_dirty_record_t *
1092 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1093 {
1094 dnode_t *dn;
1095 objset_t *os;
1096 dbuf_dirty_record_t **drp, *dr;
1097 int drop_struct_lock = FALSE;
1098 boolean_t do_free_accounting = B_FALSE;
1099 int txgoff = tx->tx_txg & TXG_MASK;
1100 list_t evict_list;
1101
1102 dmu_buf_create_user_evict_list(&evict_list);
1103
1104 ASSERT(tx->tx_txg != 0);
1105 ASSERT(!refcount_is_zero(&db->db_holds));
1106 DMU_TX_DIRTY_BUF(tx, db);
1107
1108 DB_DNODE_ENTER(db);
1109 dn = DB_DNODE(db);
1110 /*
1111 * Shouldn't dirty a regular buffer in syncing context. Private
1112 * objects may be dirtied in syncing context, but only if they
1113 * were already pre-dirtied in open context.
1114 */
1115 ASSERT(!dmu_tx_is_syncing(tx) ||
1116 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1117 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1118 dn->dn_objset->os_dsl_dataset == NULL);
1119 /*
1120 * We make this assert for private objects as well, but after we
1121 * check if we're already dirty. They are allowed to re-dirty
1122 * in syncing context.
1157 */
1158 drp = &db->db_last_dirty;
1159 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1160 db->db.db_object == DMU_META_DNODE_OBJECT);
1161 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1162 drp = &dr->dr_next;
1163 if (dr && dr->dr_txg == tx->tx_txg) {
1164 DB_DNODE_EXIT(db);
1165
1166 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1167 /*
1168 * If this buffer has already been written out,
1169 * we now need to reset its state.
1170 */
1171 dbuf_unoverride(dr);
1172 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1173 db->db_state != DB_NOFILL)
1174 arc_buf_thaw(db->db_buf);
1175 }
1176 mutex_exit(&db->db_mtx);
1177 dmu_buf_destroy_user_evict_list(&evict_list);
1178 return (dr);
1179 }
1180
1181 /*
1182 * Only valid if not already dirty.
1183 */
1184 ASSERT(dn->dn_object == 0 ||
1185 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1186 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1187
1188 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1189 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1190 dn->dn_phys->dn_nlevels > db->db_level ||
1191 dn->dn_next_nlevels[txgoff] > db->db_level ||
1192 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1193 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1194
1195 /*
1196 * We should only be dirtying in syncing context if it's the
1197 * mos or we're initializing the os or it's a special object.
1212 * Note: we delay "free accounting" until after we drop
1213 * the db_mtx. This keeps us from grabbing other locks
1214 * (and possibly deadlocking) in bp_get_dsize() while
1215 * also holding the db_mtx.
1216 */
1217 dnode_willuse_space(dn, db->db.db_size, tx);
1218 do_free_accounting = dbuf_block_freeable(db);
1219 }
1220
1221 /*
1222 * If this buffer is dirty in an old transaction group we need
1223 * to make a copy of it so that the changes we make in this
1224 * transaction group won't leak out when we sync the older txg.
1225 */
1226 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1227 if (db->db_level == 0) {
1228 void *data_old = db->db_buf;
1229
1230 if (db->db_state != DB_NOFILL) {
1231 if (db->db_blkid == DMU_BONUS_BLKID) {
1232 dbuf_fix_old_data(db, tx->tx_txg, &evict_list);
1233 data_old = db->db.db_data;
1234 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1235 /*
1236 * Release the data buffer from the cache so
1237 * that we can modify it without impacting
1238 * possible other users of this cached data
1239 * block. Note that indirect blocks and
1240 * private objects are not released until the
1241 * syncing state (since they are only modified
1242 * then).
1243 */
1244 arc_release(db->db_buf, db);
1245 dbuf_fix_old_data(db, tx->tx_txg, &evict_list);
1246 data_old = db->db_buf;
1247 }
1248 ASSERT(data_old != NULL);
1249 }
1250 dr->dt.dl.dr_data = data_old;
1251 } else {
1252 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1253 list_create(&dr->dt.di.dr_children,
1254 sizeof (dbuf_dirty_record_t),
1255 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1256 }
1257 dr->dr_dbuf = db;
1258 dr->dr_txg = tx->tx_txg;
1259 dr->dr_next = *drp;
1260 *drp = dr;
1261
1262 /*
1263 * We could have been freed_in_flight between the dbuf_noread
1264 * and dbuf_dirty. We win, as though the dbuf_noread() had
1265 * happened after the free.
1266 */
1267 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1268 db->db_blkid != DMU_SPILL_BLKID) {
1269 mutex_enter(&dn->dn_mtx);
1270 dnode_clear_range(dn, db->db_blkid, 1, tx);
1271 mutex_exit(&dn->dn_mtx);
1272 db->db_freed_in_flight = FALSE;
1273 }
1274
1275 /*
1276 * This buffer is now part of this txg
1277 */
1278 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1279 db->db_dirtycnt += 1;
1280 ASSERT3U(db->db_dirtycnt, <=, 3);
1281
1282 mutex_exit(&db->db_mtx);
1283 dmu_buf_destroy_user_evict_list(&evict_list);
1284
1285 if (db->db_blkid == DMU_BONUS_BLKID ||
1286 db->db_blkid == DMU_SPILL_BLKID) {
1287 mutex_enter(&dn->dn_mtx);
1288 ASSERT(!list_link_active(&dr->dr_dirty_node));
1289 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1290 mutex_exit(&dn->dn_mtx);
1291 dnode_setdirty(dn, tx);
1292 DB_DNODE_EXIT(db);
1293 return (dr);
1294 } else if (do_free_accounting) {
1295 blkptr_t *bp = db->db_blkptr;
1296 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1297 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1298 /*
1299 * This is only a guess -- if the dbuf is dirty
1300 * in a previous txg, we don't know how much
1301 * space it will use on disk yet. We should
1302 * really have the struct_rwlock to access
1303 * db_blkptr, but since this is just a guess,
1358 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1359 mutex_exit(&dn->dn_mtx);
1360 if (drop_struct_lock)
1361 rw_exit(&dn->dn_struct_rwlock);
1362 }
1363
1364 dnode_setdirty(dn, tx);
1365 DB_DNODE_EXIT(db);
1366 return (dr);
1367 }
1368
1369 /*
1370 * Return TRUE if this evicted the dbuf.
1371 */
1372 static boolean_t
1373 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1374 {
1375 dnode_t *dn;
1376 uint64_t txg = tx->tx_txg;
1377 dbuf_dirty_record_t *dr, **drp;
1378 list_t evict_list;
1379
1380 ASSERT(txg != 0);
1381 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1382 ASSERT0(db->db_level);
1383 ASSERT(MUTEX_HELD(&db->db_mtx));
1384
1385 /*
1386 * If this buffer is not dirty, we're done.
1387 */
1388 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1389 if (dr->dr_txg <= txg)
1390 break;
1391 if (dr == NULL || dr->dr_txg < txg)
1392 return (B_FALSE);
1393 ASSERT(dr->dr_txg == txg);
1394 ASSERT(dr->dr_dbuf == db);
1395
1396 dmu_buf_create_user_evict_list(&evict_list);
1397
1398 DB_DNODE_ENTER(db);
1399 dn = DB_DNODE(db);
1400
1401 /*
1402 * Note: This code will probably work even if there are concurrent
1403 * holders, but it is untested in that scenario, as the ZPL and
1404 * ztest have additional locking (the range locks) that prevents
1405 * that type of concurrent access.
1406 */
1407 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1408
1409 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1410
1411 ASSERT(db->db.db_size != 0);
1412
1413 /* XXX would be nice to fix up dn_towrite_space[] */
1414
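/* Unlink the dirty record for this txg from db_last_dirty. */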
1415 *drp = dr->dr_next;
1416
1417 /*
1433 }
1434 DB_DNODE_EXIT(db);
1435
1436 if (db->db_state != DB_NOFILL) {
1437 dbuf_unoverride(dr);
1438
1439 ASSERT(db->db_buf != NULL);
1440 ASSERT(dr->dt.dl.dr_data != NULL);
1441 if (dr->dt.dl.dr_data != db->db_buf)
1442 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1443 }
1444 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1445
1446 ASSERT(db->db_dirtycnt > 0);
1447 db->db_dirtycnt -= 1;
1448
1449 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1450 arc_buf_t *buf = db->db_buf;
1451
1452 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1453 dbuf_clear_data(db, &evict_list);
1454 VERIFY(arc_buf_remove_ref(buf, db));
1455 dbuf_evict(db, &evict_list);
1456 dmu_buf_destroy_user_evict_list(&evict_list);
1457 return (B_TRUE);
1458 }
1459
1460 dmu_buf_destroy_user_evict_list(&evict_list);
1461 return (B_FALSE);
1462 }
1463
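/*
 * Ensure the dbuf's contents are read in and mark it dirty in the given
 * transaction, so the caller may modify db_data in open context.  A
 * minimal caller sketch (illustrative only; assumes the caller created
 * and assigned the tx via dmu_tx_create()/dmu_tx_assign() and holds the
 * dbuf, and src is a hypothetical caller buffer):
 *
 *	dmu_buf_will_dirty(dbuf, tx);
 *	bcopy(src, dbuf->db_data, dbuf->db_size);
 */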
1464 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1465 void
1466 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1467 {
1468 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1469
1470 ASSERT(tx->tx_txg != 0);
1471 ASSERT(!refcount_is_zero(&db->db_holds));
1472
1473 DB_DNODE_ENTER(db);
1474 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1475 rf |= DB_RF_HAVESTRUCT;
1476 DB_DNODE_EXIT(db);
1477 (void) dbuf_read(db, NULL, rf);
1478 (void) dbuf_dirty(db, tx);
1479 }
1480
1587 db->db_state = DB_FILL;
1588 mutex_exit(&db->db_mtx);
1589 (void) dbuf_dirty(db, tx);
1590 dbuf_fill_done(db, tx);
1591 }
1592
1593 /*
1594 * "Clear" the contents of this dbuf. This will mark the dbuf
1595 * EVICTING and clear *most* of its references. Unfortunately,
1596 * when we are not holding the dn_dbufs_mtx, we can't clear the
1597 * entry in the dn_dbufs list; we have to wait for dbuf_destroy()
1598 * to do that in this case. For callers from the DMU we will usually see:
1599 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1600 * For the arc callback, we will usually see:
1601 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1602 * Sometimes, though, we will get a mix of these two:
1603 * DMU: dbuf_clear()->arc_buf_evict()
1604 * ARC: dbuf_do_evict()->dbuf_destroy()
1605 */
1606 void
1607 dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list_p)
1608 {
1609 dnode_t *dn;
1610 dmu_buf_impl_t *parent = db->db_parent;
1611 dmu_buf_impl_t *dndb;
1612 int dbuf_gone = FALSE;
1613
1614 ASSERT(MUTEX_HELD(&db->db_mtx));
1615 ASSERT(refcount_is_zero(&db->db_holds));
1616
1617 dbuf_evict_user(db, evict_list_p);
1618
1619 if (db->db_state == DB_CACHED) {
1620 ASSERT(db->db.db_data != NULL);
1621 if (db->db_blkid == DMU_BONUS_BLKID) {
1622 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1623 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1624 }
1625 db->db.db_data = NULL;
1626 db->db_state = DB_UNCACHED;
1627 }
1628
1629 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1630 ASSERT(db->db_data_pending == NULL);
1631
1632 db->db_state = DB_EVICTING;
1633 db->db_blkptr = NULL;
1634
1635 DB_DNODE_ENTER(db);
1636 dn = DB_DNODE(db);
1637 dndb = dn->dn_dbuf;
1740 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1741 {
1742 objset_t *os = dn->dn_objset;
1743 dmu_buf_impl_t *db, *odb;
1744
1745 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1746 ASSERT(dn->dn_type != DMU_OT_NONE);
1747
1748 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1749
1750 db->db_objset = os;
1751 db->db.db_object = dn->dn_object;
1752 db->db_level = level;
1753 db->db_blkid = blkid;
1754 db->db_last_dirty = NULL;
1755 db->db_dirtycnt = 0;
1756 db->db_dnode_handle = dn->dn_handle;
1757 db->db_parent = parent;
1758 db->db_blkptr = blkptr;
1759
1760 db->db_user = NULL;
1761 db->db_immediate_evict = 0;
1762 db->db_freed_in_flight = 0;
1763
1764 if (blkid == DMU_BONUS_BLKID) {
1765 ASSERT3P(parent, ==, dn->dn_dbuf);
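/*
 * The bonus buffer shares the dnode's on-disk space with the block
 * pointer array, so each blkptr beyond the first reduces the usable
 * bonus length below DN_MAX_BONUSLEN.
 */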
1766 db->db.db_size = DN_MAX_BONUSLEN -
1767 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1768 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1769 db->db.db_offset = DMU_BONUS_BLKID;
1770 db->db_state = DB_UNCACHED;
1771 /* the bonus dbuf is not placed in the hash table */
1772 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1773 return (db);
1774 } else if (blkid == DMU_SPILL_BLKID) {
1775 db->db.db_size = (blkptr != NULL) ?
1776 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1777 db->db.db_offset = 0;
1778 } else {
1779 int blocksize =
1780 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1803 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1804
1805 if (parent && parent != dn->dn_dbuf)
1806 dbuf_add_ref(parent, db);
1807
1808 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1809 refcount_count(&dn->dn_holds) > 0);
1810 (void) refcount_add(&dn->dn_holds, db);
1811 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1812
1813 dprintf_dbuf(db, "db=%p\n", db);
1814
1815 return (db);
1816 }
1817
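/*
 * ARC eviction callback for dbufs, registered on the dbuf's arc_buf via
 * arc_set_callback(): evict or destroy the dbuf now that the ARC wants
 * the buffer back.
 */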
1818 static int
1819 dbuf_do_evict(void *private)
1820 {
1821 arc_buf_t *buf = private;
1822 dmu_buf_impl_t *db = buf->b_private;
1823 list_t evict_list;
1824
1825 dmu_buf_create_user_evict_list(&evict_list);
1826
1827 if (!MUTEX_HELD(&db->db_mtx))
1828 mutex_enter(&db->db_mtx);
1829
1830 ASSERT(refcount_is_zero(&db->db_holds));
1831
1832 if (db->db_state != DB_EVICTING) {
1833 ASSERT(db->db_state == DB_CACHED);
1834 DBUF_VERIFY(db);
1835 db->db_buf = NULL;
1836 dbuf_evict(db, &evict_list);
1837 } else {
1838 mutex_exit(&db->db_mtx);
1839 dbuf_destroy(db);
1840 }
1841 dmu_buf_destroy_user_evict_list(&evict_list);
1842 return (0);
1843 }
1844
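/*
 * Free the dbuf itself: remove it from the dnode's dn_dbufs list and the
 * dbuf hash table (if it is still linked there) and release its memory.
 * Called only once the dbuf has no holds and has already been cleared.
 */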
1845 static void
1846 dbuf_destroy(dmu_buf_impl_t *db)
1847 {
1848 ASSERT(refcount_is_zero(&db->db_holds));
1849
1850 if (db->db_blkid != DMU_BONUS_BLKID) {
1851 /*
1852 * If this dbuf is still on the dn_dbufs list,
1853 * remove it from that list.
1854 */
1855 if (db->db_dnode_handle != NULL) {
1856 dnode_t *dn;
1857
1858 DB_DNODE_ENTER(db);
1859 dn = DB_DNODE(db);
1860 mutex_enter(&dn->dn_dbufs_mtx);
1861 list_remove(&dn->dn_dbufs, db);
1922
1923 (void) arc_read(NULL, dn->dn_objset->os_spa,
1924 bp, NULL, NULL, priority,
1925 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1926 &aflags, &zb);
1927 }
1928 if (db)
1929 dbuf_rele(db, NULL);
1930 }
1931 }
1932
1933 /*
1934 * Returns with db_holds incremented, and db_mtx not held.
1935 * Note: dn_struct_rwlock must be held.
1936 */
1937 int
1938 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1939 void *tag, dmu_buf_impl_t **dbp)
1940 {
1941 dmu_buf_impl_t *db, *parent = NULL;
1942 list_t evict_list;
1943
1944 ASSERT(blkid != DMU_BONUS_BLKID);
1945 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1946 ASSERT3U(dn->dn_nlevels, >, level);
1947
1948 dmu_buf_create_user_evict_list(&evict_list);
1949
1950 *dbp = NULL;
1951 top:
1952 /* dbuf_find() returns with db_mtx held */
1953 db = dbuf_find(dn, level, blkid);
1954
1955 if (db == NULL) {
1956 blkptr_t *bp = NULL;
1957 int err;
1958
1959 ASSERT3P(parent, ==, NULL);
1960 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1961 if (fail_sparse) {
1962 if (err == 0 && bp && BP_IS_HOLE(bp))
1963 err = SET_ERROR(ENOENT);
1964 if (err) {
1965 if (parent)
1966 dbuf_rele(parent, NULL);
1967 return (err);
1968 }
1969 }
1970 if (err && err != ENOENT)
1971 return (err);
1972 db = dbuf_create(dn, level, blkid, parent, bp);
1973 }
1974
1975 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1976 arc_buf_add_ref(db->db_buf, db);
1977 if (db->db_buf->b_data == NULL) {
1978 dbuf_clear(db, &evict_list);
1979 if (parent) {
1980 dbuf_rele(parent, NULL);
1981 parent = NULL;
1982 }
1983 goto top;
1984 }
1985 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1986 }
1987
1988 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1989
1990 /*
1991 * If this buffer is currently syncing out, and we are
1992 * still referencing it from db_data, we need to make a copy
1993 * of it in case we decide we want to dirty it again in this txg.
1994 */
1995 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1996 dn->dn_object != DMU_META_DNODE_OBJECT &&
1997 db->db_state == DB_CACHED && db->db_data_pending) {
1998 dbuf_dirty_record_t *dr = db->db_data_pending;
1999
2000 if (dr->dt.dl.dr_data == db->db_buf) {
2001 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2002
2003 dbuf_set_data(db,
2004 arc_buf_alloc(dn->dn_objset->os_spa,
2005 db->db.db_size, db, type));
2006 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2007 db->db.db_size);
2008 }
2009 }
2010
2011 (void) refcount_add(&db->db_holds, tag);
2012 DBUF_VERIFY(db);
2013 mutex_exit(&db->db_mtx);
2014
2015 dmu_buf_destroy_user_evict_list(&evict_list);
2016
2017 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2018 if (parent)
2019 dbuf_rele(parent, NULL);
2020
2021 ASSERT3P(DB_DNODE(db), ==, dn);
2022 ASSERT3U(db->db_blkid, ==, blkid);
2023 ASSERT3U(db->db_level, ==, level);
2024 *dbp = db;
2025
2026 return (0);
2027 }
2028
2029 dmu_buf_impl_t *
2030 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2031 {
2032 dmu_buf_impl_t *db;
2033 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
2034 return (err ? NULL : db);
2035 }
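/*
 * Illustrative hold/release pairing (a sketch only; assumes the caller
 * already holds dn_struct_rwlock as dbuf_hold_impl() requires):
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	if (db != NULL) {
 *		... use db ...
 *		dbuf_rele(db, FTAG);
 *	}
 */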
2036
2096 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2097 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2098 * dnode's parent dbuf evicting its dnode handles.
2099 */
2100 #pragma weak dmu_buf_rele = dbuf_rele
2101 void
2102 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2103 {
2104 mutex_enter(&db->db_mtx);
2105 dbuf_rele_and_unlock(db, tag);
2106 }
2107
2108 /*
2109 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2110 * db_dirtycnt and db_holds to be updated atomically.
2111 */
2112 void
2113 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2114 {
2115 int64_t holds;
2116 list_t evict_list;
2117
2118 ASSERT(MUTEX_HELD(&db->db_mtx));
2119 DBUF_VERIFY(db);
2120
2121 dmu_buf_create_user_evict_list(&evict_list);
2122
2123 /*
2124 * Remove the reference to the dbuf before removing its hold on the
2125 * dnode so we can guarantee in dnode_move() that a referenced bonus
2126 * buffer has a corresponding dnode hold.
2127 */
2128 holds = refcount_remove(&db->db_holds, tag);
2129 ASSERT(holds >= 0);
2130
2131 /*
2132 * We can't freeze indirects if there is a possibility that they
2133 * may be modified in the current syncing context.
2134 */
2135 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2136 arc_buf_freeze(db->db_buf);
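/*
 * The freeze check above relies on the fact that each dirty record takes
 * a hold tagged with its txg (see dbuf_dirty()/dbuf_undirty()), so for a
 * level-0 dbuf holds == db_dirtycnt means only dirty records still
 * reference the buffer and open context can no longer change it.
 */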
2137
2138 if (holds == db->db_dirtycnt &&
2139 db->db_level == 0 && db->db_immediate_evict)
2140 dbuf_evict_user(db, &evict_list);
2141
2142 if (holds == 0) {
2143 if (db->db_blkid == DMU_BONUS_BLKID) {
2144 mutex_exit(&db->db_mtx);
2145
2146 /*
2147 * If the dnode moves here, we cannot cross this barrier
2148 * until the move completes.
2149 */
2150 DB_DNODE_ENTER(db);
2151 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2152 DB_DNODE_EXIT(db);
2153 /*
2154 * The bonus buffer's dnode hold is no longer discounted
2155 * in dnode_move(). The dnode cannot move until after
2156 * the dnode_rele().
2157 */
2158 dnode_rele(DB_DNODE(db), db);
2159 } else if (db->db_buf == NULL) {
2160 /*
2161 * This is a special case: we never associated this
2162 * dbuf with any data allocated from the ARC.
2163 */
2164 ASSERT(db->db_state == DB_UNCACHED ||
2165 db->db_state == DB_NOFILL);
2166 dbuf_evict(db, &evict_list);
2167 } else if (arc_released(db->db_buf)) {
2168 arc_buf_t *buf = db->db_buf;
2169 /*
2170 * This dbuf has anonymous data associated with it.
2171 */
2172 dbuf_clear_data(db, &evict_list);
2173 VERIFY(arc_buf_remove_ref(buf, db));
2174 dbuf_evict(db, &evict_list);
2175 } else {
2176 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2177
2178 /*
2179 * A dbuf will be eligible for eviction if either the
2180 * 'primarycache' property is set or a duplicate
2181 * copy of this buffer is already cached in the arc.
2182 *
2183 * In the case of the 'primarycache' property, a buffer
2184 * is considered for eviction if it matches the criteria
2185 * set by that property.
2186 *
2187 * To decide if our buffer is considered a
2188 * duplicate, we must call into the arc to determine
2189 * if multiple buffers are referencing the same
2190 * block on-disk. If so, then we simply evict
2191 * ourselves.
2192 */
2193 if (!DBUF_IS_CACHEABLE(db) ||
2194 arc_buf_eviction_needed(db->db_buf))
2195 dbuf_clear(db, &evict_list);
2196 else
2197 mutex_exit(&db->db_mtx);
2198 }
2199 } else {
2200 mutex_exit(&db->db_mtx);
2201 }
2202 dmu_buf_destroy_user_evict_list(&evict_list);
2203 }
2204
2205 #pragma weak dmu_buf_refcount = dbuf_refcount
2206 uint64_t
2207 dbuf_refcount(dmu_buf_impl_t *db)
2208 {
2209 return (refcount_count(&db->db_holds));
2210 }
2211
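/*
 * Report whether freeing this dbuf's on-disk block would actually
 * release space, i.e. whether the block was born after the most recent
 * snapshot and is therefore not still referenced by one.
 */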
2212 boolean_t
2213 dmu_buf_freeable(dmu_buf_t *dbuf)
2214 {
2215 boolean_t res = B_FALSE;
2216 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2217
2218 if (db->db_blkptr)
2219 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2220 db->db_blkptr, db->db_blkptr->blk_birth);
2221
2222 return (res);
2223 }
2224
2225 blkptr_t *
2226 dmu_buf_get_blkptr(dmu_buf_t *db)
2227 {
2228 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2229 return (dbi->db_blkptr);
2230 }
2231
|