Print this page
3752 want more verifiable dbuf user eviction
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Will Andrews <willa@spectralogic.com>

@@ -48,14 +48,12 @@
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 
-
 void
 fzap_byteswap(void *vbuf, size_t size)
 {
         uint64_t block_type;
 

@@ -78,17 +76,16 @@
         zap_phys_t *zp;
 
         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
         zap->zap_ismicro = FALSE;
 
-        (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
-            &zap->zap_f.zap_phys, zap_evict);
+        zap->db_evict.evict_func = zap_evict;
 
         mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
         zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
 
-        zp = zap->zap_f.zap_phys;
+        zp = zap->zap_f_phys;
         /*
          * explicitly zero it since it might be coming from an
          * initialized microzap
          */
         bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);

@@ -115,11 +112,10 @@
             1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
         dmu_buf_will_dirty(db, tx);
 
         l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
         l->l_dbuf = db;
-        l->l_phys = db->db_data;
 
         zap_leaf_init(l, zp->zap_normflags != 0);
 
         kmem_free(l, sizeof (zap_leaf_t));
         dmu_buf_rele(db, FTAG);

@@ -322,26 +318,26 @@
          * The pointer table should never use more hash bits than we
          * have (otherwise we'd be using useless zero bits to index it).
          * If we are within 2 bits of running out, stop growing, since
          * this is already an aberrant condition.
          */
-        if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+        if (zap->zap_f_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
                 return (SET_ERROR(ENOSPC));
 
-        if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+        if (zap->zap_f_phys->zap_ptrtbl.zt_numblks == 0) {
                 /*
                  * We are outgrowing the "embedded" ptrtbl (the one
                  * stored in the header block).  Give it its own entire
                  * block, which will double the size of the ptrtbl.
                  */
                 uint64_t newblk;
                 dmu_buf_t *db_new;
                 int err;
 
-                ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+                ASSERT3U(zap->zap_f_phys->zap_ptrtbl.zt_shift, ==,
                     ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-                ASSERT0(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk);
+                ASSERT0(zap->zap_f_phys->zap_ptrtbl.zt_blk);
 
                 newblk = zap_allocate_blocks(zap, 1);
                 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
                     newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
                     DMU_READ_NO_PREFETCH);

@@ -350,45 +346,54 @@
                 dmu_buf_will_dirty(db_new, tx);
                 zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
                     db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
                 dmu_buf_rele(db_new, FTAG);
 
-                zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
-                zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
-                zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+                zap->zap_f_phys->zap_ptrtbl.zt_blk = newblk;
+                zap->zap_f_phys->zap_ptrtbl.zt_numblks = 1;
+                zap->zap_f_phys->zap_ptrtbl.zt_shift++;
 
-                ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
-                    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+                ASSERT3U(1ULL << zap->zap_f_phys->zap_ptrtbl.zt_shift, ==,
+                    zap->zap_f_phys->zap_ptrtbl.zt_numblks <<
                     (FZAP_BLOCK_SHIFT(zap)-3));
 
                 return (0);
         } else {
-                return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+                return (zap_table_grow(zap, &zap->zap_f_phys->zap_ptrtbl,
                     zap_ptrtbl_transfer, tx));
         }
 }
 
 static void
 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 {
         dmu_buf_will_dirty(zap->zap_dbuf, tx);
         mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-        ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
-        zap->zap_f.zap_phys->zap_num_entries += delta;
+        ASSERT(delta > 0 || zap->zap_f_phys->zap_num_entries >= -delta);
+        zap->zap_f_phys->zap_num_entries += delta; /* under num_entries_mtx */
         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 }
 
 static uint64_t
 zap_allocate_blocks(zap_t *zap, int nblocks)
 {
         uint64_t newblk;
         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-        newblk = zap->zap_f.zap_phys->zap_freeblk;
-        zap->zap_f.zap_phys->zap_freeblk += nblocks;
+        newblk = zap->zap_f_phys->zap_freeblk; /* first blkid handed out */
+        zap->zap_f_phys->zap_freeblk += nblocks; /* bump past the new range */
         return (newblk);
 }
 
+static void
+zap_leaf_pageout(dmu_buf_user_t *dbu)
+{
+        zap_leaf_t *l = (zap_leaf_t *)dbu; /* relies on db_evict at offset 0 */
+
+        rw_destroy(&l->l_rwlock);
+        kmem_free(l, sizeof (zap_leaf_t));
+}
+
 static zap_leaf_t *
 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 {
         void *winner;
         zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);

@@ -397,32 +402,32 @@
 
         rw_init(&l->l_rwlock, 0, 0, 0);
         rw_enter(&l->l_rwlock, RW_WRITER);
         l->l_blkid = zap_allocate_blocks(zap, 1);
         l->l_dbuf = NULL;
-        l->l_phys = NULL;
 
         VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
             l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
             DMU_READ_NO_PREFETCH));
-        winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+        dmu_buf_init_user(&l->db_evict, zap_leaf_pageout);
+        winner = (zap_leaf_t *)dmu_buf_set_user(l->l_dbuf, &l->db_evict);
         ASSERT(winner == NULL);
         dmu_buf_will_dirty(l->l_dbuf, tx);
 
         zap_leaf_init(l, zap->zap_normflags != 0);
 
-        zap->zap_f.zap_phys->zap_num_leafs++;
+        zap->zap_f_phys->zap_num_leafs++;
 
         return (l);
 }
 
 int
 fzap_count(zap_t *zap, uint64_t *count)
 {
         ASSERT(!zap->zap_ismicro);
         mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-        *count = zap->zap_f.zap_phys->zap_num_entries;
+        *count = zap->zap_f_phys->zap_num_entries; /* snapshot under mutex */
         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
         return (0);
 }
 
 /*

@@ -434,20 +439,10 @@
 {
         rw_exit(&l->l_rwlock);
         dmu_buf_rele(l->l_dbuf, NULL);
 }
 
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
-        zap_leaf_t *l = vl;
-
-        rw_destroy(&l->l_rwlock);
-        kmem_free(l, sizeof (zap_leaf_t));
-}
-
 static zap_leaf_t *
 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 {
         zap_leaf_t *l, *winner;
 

@@ -457,18 +452,18 @@
         rw_init(&l->l_rwlock, 0, 0, 0);
         rw_enter(&l->l_rwlock, RW_WRITER);
         l->l_blkid = blkid;
         l->l_bs = highbit(db->db_size)-1;
         l->l_dbuf = db;
-        l->l_phys = NULL;
 
-        winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+        dmu_buf_init_user(&l->db_evict, zap_leaf_pageout);
+        winner = (zap_leaf_t *)dmu_buf_set_user(db, &l->db_evict);
 
         rw_exit(&l->l_rwlock);
         if (winner != NULL) {
                 /* someone else set it first */
-                zap_leaf_pageout(NULL, l);
+                zap_leaf_pageout(&l->db_evict);
                 l = winner;
         }
 
         /*
          * lhr_pad was previously used for the next leaf in the leaf

@@ -513,11 +508,11 @@
         ASSERT3U(db->db_object, ==, zap->zap_object);
         ASSERT3U(db->db_offset, ==, blkid << bs);
         ASSERT3U(db->db_size, ==, 1 << bs);
         ASSERT(blkid != 0);
 
-        l = dmu_buf_get_user(db);
+        l = (zap_leaf_t *)dmu_buf_get_user(db);
 
         if (l == NULL)
                 l = zap_open_leaf(blkid, db);
 
         rw_enter(&l->l_rwlock, lt);

@@ -540,32 +535,32 @@
 static int
 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 {
         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-        if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+        if (zap->zap_f_phys->zap_ptrtbl.zt_numblks == 0) { /* embedded table */
                 ASSERT3U(idx, <,
-                    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+                    (1ULL << zap->zap_f_phys->zap_ptrtbl.zt_shift));
                 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
                 return (0);
         } else {
-                return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+                return (zap_table_load(zap, &zap->zap_f_phys->zap_ptrtbl,
                     idx, valp));
         }
 }
 
 static int
 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 {
         ASSERT(tx != NULL);
         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
-        if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+        if (zap->zap_f_phys->zap_ptrtbl.zt_blk == 0) { /* no external table */
                 ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
                 return (0);
         } else {
-                return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+                return (zap_table_store(zap, &zap->zap_f_phys->zap_ptrtbl,
                     idx, blk, tx));
         }
 }
 
 static int

@@ -573,13 +568,13 @@
 {
         uint64_t idx, blk;
         int err;
 
         ASSERT(zap->zap_dbuf == NULL ||
-            zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
-        ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
-        idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+            zap->zap_f_phys == zap->zap_dbuf->db_data);
+        ASSERT3U(zap->zap_f_phys->zap_magic, ==, ZAP_MAGIC);
+        idx = ZAP_HASH_IDX(h, zap->zap_f_phys->zap_ptrtbl.zt_shift);
         err = zap_idx_to_blk(zap, idx, &blk);
         if (err != 0)
                 return (err);
         err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 

@@ -596,18 +591,18 @@
         zap_leaf_t *nl;
         int prefix_diff, i, err;
         uint64_t sibling;
         int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
 
-        ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+        ASSERT3U(old_prefix_len, <=, zap->zap_f_phys->zap_ptrtbl.zt_shift);
         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
         ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
             l->l_phys->l_hdr.lh_prefix);
 
         if (zap_tryupgradedir(zap, tx) == 0 ||
-            old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+            old_prefix_len == zap->zap_f_phys->zap_ptrtbl.zt_shift) {
                 /* We failed to upgrade, or need to grow the pointer table */
                 objset_t *os = zap->zap_objset;
                 uint64_t object = zap->zap_object;
 
                 zap_put_leaf(l);

@@ -618,11 +613,11 @@
                 if (err)
                         return (err);
                 ASSERT(!zap->zap_ismicro);
 
                 while (old_prefix_len ==
-                    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+                    zap->zap_f_phys->zap_ptrtbl.zt_shift) {
                         err = zap_grow_ptrtbl(zap, tx);
                         if (err)
                                 return (err);
                 }
 

@@ -635,15 +630,15 @@
                         *lp = l;
                         return (0);
                 }
         }
         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-        ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+        ASSERT3U(old_prefix_len, <, zap->zap_f_phys->zap_ptrtbl.zt_shift);
         ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
             l->l_phys->l_hdr.lh_prefix);
 
-        prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+        prefix_diff = zap->zap_f_phys->zap_ptrtbl.zt_shift -
             (old_prefix_len + 1);
         sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 
         /* check for i/o errors before doing zap_leaf_split */
         for (i = 0; i < (1ULL<<prefix_diff); i++) {

@@ -677,17 +672,17 @@
 
 static void
 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 {
         zap_t *zap = zn->zn_zap;
-        int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+        int shift = zap->zap_f_phys->zap_ptrtbl.zt_shift;
         int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
             l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 
         zap_put_leaf(l);
 
-        if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+        if (leaffull || zap->zap_f_phys->zap_ptrtbl.zt_nextblk) {
                 int err;
 
                 /*
                  * We are in the middle of growing the pointer table, or
                  * this leaf will soon make us grow it.

@@ -703,11 +698,11 @@
                         if (err)
                                 return;
                 }
 
                 /* could have finished growing while our locks were down */
-                if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+                if (zap->zap_f_phys->zap_ptrtbl.zt_shift == shift)
                         (void) zap_grow_ptrtbl(zap, tx);
         }
 }
 
 static int

@@ -934,11 +929,11 @@
         uint64_t idx, blk;
         zap_t *zap = zn->zn_zap;
         int bs;
 
         idx = ZAP_HASH_IDX(zn->zn_hash,
-            zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+            zap->zap_f_phys->zap_ptrtbl.zt_shift);
         if (zap_idx_to_blk(zap, idx, &blk) != 0)
                 return;
         bs = FZAP_BLOCK_SHIFT(zap);
         dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
 }

@@ -1274,46 +1269,46 @@
         zs->zs_blocksize = 1ULL << bs;
 
         /*
          * Set zap_phys_t fields
          */
-        zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
-        zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
-        zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
-        zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
-        zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
-        zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+        zs->zs_num_leafs = zap->zap_f_phys->zap_num_leafs;
+        zs->zs_num_entries = zap->zap_f_phys->zap_num_entries;
+        zs->zs_num_blocks = zap->zap_f_phys->zap_freeblk;
+        zs->zs_block_type = zap->zap_f_phys->zap_block_type;
+        zs->zs_magic = zap->zap_f_phys->zap_magic;
+        zs->zs_salt = zap->zap_f_phys->zap_salt;
 
         /*
          * Set zap_ptrtbl fields
          */
-        zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-        zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+        zs->zs_ptrtbl_len = 1ULL << zap->zap_f_phys->zap_ptrtbl.zt_shift;
+        zs->zs_ptrtbl_nextblk = zap->zap_f_phys->zap_ptrtbl.zt_nextblk;
         zs->zs_ptrtbl_blks_copied =
-            zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
-        zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
-        zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
-        zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+            zap->zap_f_phys->zap_ptrtbl.zt_blks_copied;
+        zs->zs_ptrtbl_zt_blk = zap->zap_f_phys->zap_ptrtbl.zt_blk;
+        zs->zs_ptrtbl_zt_numblks = zap->zap_f_phys->zap_ptrtbl.zt_numblks;
+        zs->zs_ptrtbl_zt_shift = zap->zap_f_phys->zap_ptrtbl.zt_shift;
 
-        if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+        if (zap->zap_f_phys->zap_ptrtbl.zt_numblks == 0) {
                 /* the ptrtbl is entirely in the header block. */
                 zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
                     1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
         } else {
                 int b;
 
                 dmu_prefetch(zap->zap_objset, zap->zap_object,
-                    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
-                    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+                    zap->zap_f_phys->zap_ptrtbl.zt_blk << bs,
+                    zap->zap_f_phys->zap_ptrtbl.zt_numblks << bs);
 
-                for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+                for (b = 0; b < zap->zap_f_phys->zap_ptrtbl.zt_numblks;
                     b++) {
                         dmu_buf_t *db;
                         int err;
 
                         err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-                            (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+                            (zap->zap_f_phys->zap_ptrtbl.zt_blk + b) << bs,
                             FTAG, &db, DMU_READ_NO_PREFETCH);
                         if (err == 0) {
                                 zap_stats_ptrtbl(zap, db->db_data,
                                     1<<(bs-3), zs);
                                 dmu_buf_rele(db, FTAG);

@@ -1346,11 +1341,11 @@
          *   external pointer table.
          * - If this already has an external pointer table this operation
          *   could extend the table.
          */
         if (add) {
-                if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
+                if (zap->zap_f_phys->zap_ptrtbl.zt_blk == 0)
                         *towrite += zap->zap_dbuf->db_size;
                 else
                         *towrite += (zap->zap_dbuf->db_size * 3);
         }