Print this page
3752 want more verifiable dbuf user eviction
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Will Andrews <willa@spectralogic.com>


  28 #include <sys/dmu.h>
  29 #include <sys/zfs_context.h>
  30 #include <sys/zap.h>
  31 #include <sys/refcount.h>
  32 #include <sys/zap_impl.h>
  33 #include <sys/zap_leaf.h>
  34 #include <sys/avl.h>
  35 #include <sys/arc.h>
  36 
  37 #ifdef _KERNEL
  38 #include <sys/sunddi.h>
  39 #endif
  40 
  41 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
  42 
  /*
   * Return the flags word of a ZAP.
   *
   * When zap_ismicro is set the function returns 0 without touching the
   * fat-zap state; otherwise it returns the zap_flags field read through
   * the fat-zap phys pointer (zap_u.zap_fat.zap_phys).
   */
  43 uint64_t
  44 zap_getflags(zap_t *zap)
  45 {
  46         if (zap->zap_ismicro)
  47                 return (0);
  48         return (zap->zap_u.zap_fat.zap_phys->zap_flags);
  49 }
  50 
  /*
   * Return 48 when ZAP_FLAG_HASH64 is set in zap_getflags(zap), and 28
   * otherwise.  zap_getflags() yields 0 when zap_ismicro is set, so such a
   * ZAP always gets 28.
   *
   * NOTE(review): going by the name, the result is the number of hash bits
   * in use; no caller is visible in this excerpt to confirm.
   */
  51 int
  52 zap_hashbits(zap_t *zap)
  53 {
  54         if (zap_getflags(zap) & ZAP_FLAG_HASH64)
  55                 return (48);
  56         else
  57                 return (28);
  58 }
  59 
  /*
   * Return the limit that "cd" values are checked against: mzap_addent()
   * asserts that the cd it picked is strictly below this value.
   *
   * With ZAP_FLAG_HASH64 set the limit is (1<<16)-1; otherwise it is -1U,
   * i.e. the all-ones unsigned value.
   *
   * NOTE(review): "cd" is presumably the collision differentiator -- confirm
   * against the on-disk format description.
   */
  60 uint32_t
  61 zap_maxcd(zap_t *zap)
  62 {
  63         if (zap_getflags(zap) & ZAP_FLAG_HASH64)
  64                 return ((1<<16)-1);
  65         else
  66                 return (-1U);
  67 }
  68 


 367 
 368         zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 369         rw_init(&zap->zap_rwlock, 0, 0, 0);
 370         rw_enter(&zap->zap_rwlock, RW_WRITER);
 371         zap->zap_objset = os;
 372         zap->zap_object = obj;
 373         zap->zap_dbuf = db;
 374 
 375         if (*(uint64_t *)db->db_data != ZBT_MICRO) {
 376                 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
 377                 zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
 378         } else {
 379                 zap->zap_ismicro = TRUE;
 380         }
 381 
 382         /*
 383          * Make sure that zap_ismicro is set before we let others see
 384          * it, because zap_lockdir() checks zap_ismicro without the lock
 385          * held.
 386          */
 387         winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);

 388 
 389         if (winner != NULL) {
 390                 rw_exit(&zap->zap_rwlock);
 391                 rw_destroy(&zap->zap_rwlock);
 392                 if (!zap->zap_ismicro)
 393                         mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 394                 kmem_free(zap, sizeof (zap_t));
 395                 return (winner);
 396         }
 397 
 398         if (zap->zap_ismicro) {
 399                 zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
 400                 zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
 401                 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 402                 avl_create(&zap->zap_m.zap_avl, mze_compare,
 403                     sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 404 
 405                 for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 406                         mzap_ent_phys_t *mze =
 407                             &zap->zap_m.zap_phys->mz_chunk[i];
 408                         if (mze->mze_name[0]) {
 409                                 zap_name_t *zn;
 410 
 411                                 zap->zap_m.zap_num_entries++;
 412                                 zn = zap_name_alloc(zap, mze->mze_name,
 413                                     MT_EXACT);
 414                                 mze_insert(zap, i, zn->zn_hash);
 415                                 zap_name_free(zn);
 416                         }
 417                 }
 418         } else {
 419                 zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
 420                 zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
 421 
 422                 ASSERT3U(sizeof (struct zap_leaf_header), ==,
 423                     2*ZAP_LEAF_CHUNKSIZE);
 424 
 425                 /*
 426                  * The embedded pointer table should not overlap the
 427                  * other members.
 428                  */
 429                 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
 430                     &zap->zap_f.zap_phys->zap_salt);
 431 
 432                 /*
 433                  * The embedded pointer table should end at the end of
 434                  * the block
 435                  */
 436                 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 437                     1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
 438                     (uintptr_t)zap->zap_f.zap_phys, ==,
 439                     zap->zap_dbuf->db_size);
 440         }
 441         rw_exit(&zap->zap_rwlock);
 442         return (zap);
 443 }
 444 
 445 int
 446 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 447     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 448 {
 449         zap_t *zap;
 450         dmu_buf_t *db;
 451         krw_t lt;
 452         int err;
 453 
 454         *zapp = NULL;
 455 
 456         err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
 457         if (err)
 458                 return (err);
 459 
 460 #ifdef ZFS_DEBUG
 461         {
 462                 dmu_object_info_t doi;
 463                 dmu_object_info_from_db(db, &doi);
 464                 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 465         }
 466 #endif
 467 
 468         zap = dmu_buf_get_user(db);
 469         if (zap == NULL)
 470                 zap = mzap_open(os, obj, db);
 471 
 472         /*
 473          * We're checking zap_ismicro without the lock held, in order to
 474          * tell what type of lock we want.  Once we have some sort of
 475          * lock, see if it really is the right type.  In practice this
 476          * can only be different if it was upgraded from micro to fat,
 477          * and micro wanted WRITER but fat only needs READER.
 478          */
 479         lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
 480         rw_enter(&zap->zap_rwlock, lt);
 481         if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
 482                 /* it was upgraded, now we only need reader */
 483                 ASSERT(lt == RW_WRITER);
 484                 ASSERT(RW_READER ==
 485                     (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
 486                 rw_downgrade(&zap->zap_rwlock);
 487                 lt = RW_READER;
 488         }


 659 
 660         VERIFY(dmu_object_set_blocksize(os, obj,
 661             1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
 662 
 663         mzap_create_impl(os, obj, normflags, flags, tx);
 664         return (obj);
 665 }
 666 
 /*
  * Destroy the ZAP object zapobj in objset os.
  *
  * This is a thin wrapper: os, zapobj and tx are handed straight to
  * dmu_object_free() and that call's return value is returned unchanged.
  */
 667 int
 668 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 669 {
 670         /*
 671          * dmu_object_free will free the object number and free the
 672          * data.  Freeing the data will cause our pageout function to be
 673          * called, which will destroy our data (zap_leaf_t's and zap_t).
 674          */
 675 
 676         return (dmu_object_free(os, zapobj, tx));
 677 }
 678 
 /*
  * Tear down and free an in-core zap_t.
  *
  * This is the function handed to dmu_buf_set_user() when the zap_t is
  * attached to its dbuf.  db is not used (hence the ARGSUSED note); vzap is
  * the zap_t to release.  After this returns the zap_t memory has been
  * freed and must not be referenced.
  */
 679 _NOTE(ARGSUSED(0))
 680 void
 681 zap_evict(dmu_buf_t *db, void *vzap)
 682 {
 683         zap_t *zap = vzap;
 684 
 685         rw_destroy(&zap->zap_rwlock);
 686 
             /*
              * Only the non-micro form owns zap_num_entries_mtx (it is
              * initialised solely on that path when the zap_t is set up);
              * the micro form instead releases its state via mze_destroy().
              */
 687         if (zap->zap_ismicro)
 688                 mze_destroy(zap);
 689         else
 690                 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 691 
 692         kmem_free(zap, sizeof (zap_t));
 693 }
 694 
 695 int
 696 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 697 {
 698         zap_t *zap;
 699         int err;
 700 
 701         err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 702         if (err)
 703                 return (err);


 921                 return (SET_ERROR(ENOTSUP));
 922         }
 923         err = fzap_length(zn, integer_size, num_integers);
 924         zap_name_free(zn);
 925         zap_unlockdir(zap);
 926         return (err);
 927 }
 928 
 /*
  * Store a new name/value entry in a free chunk of a microzap.
  *
  * zn    - name being added.  zn->zn_zap is the target ZAP, zn->zn_key_orig
  *         is copied into the chunk's mze_name, and zn->zn_hash is passed
  *         to mze_find_unused_cd() and mze_insert().
  * value - written to the chunk's mze_value.
  *
  * The caller must hold zap_rwlock as writer (asserted).  Nothing is
  * returned; if no chunk is free the only reaction is the trailing ASSERT,
  * so callers presumably guarantee beforehand that a free chunk exists.
  */
 929 static void
 930 mzap_addent(zap_name_t *zn, uint64_t value)
 931 {
 932         int i;
 933         zap_t *zap = zn->zn_zap;
 934         int start = zap->zap_m.zap_alloc_next;
 935         uint32_t cd;
 936 
 937         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 938 
             /* Debug-only check: no chunk may already carry this name. */
 939 #ifdef ZFS_DEBUG
 940         for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 941                 mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
 942                 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 943         }
 944 #endif
 945 
 946         cd = mze_find_unused_cd(zap, zn->zn_hash);
 947         /* given the limited size of the microzap, this can't happen */
 948         ASSERT(cd < zap_maxcd(zap));
 949 
             /*
              * Scan chunks [start, zap_num_chunks) for one whose name is
              * empty.  start begins at zap_alloc_next, the hint left behind
              * by the previous insertion.
              */
 950 again:
 951         for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
 952                 mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
 953                 if (mze->mze_name[0] == 0) {
 954                         mze->mze_value = value;
 955                         mze->mze_cd = cd;
 956                         (void) strcpy(mze->mze_name, zn->zn_key_orig);
 957                         zap->zap_m.zap_num_entries++;
                             /* Move the hint past this chunk, wrapping to 0. */
 958                         zap->zap_m.zap_alloc_next = i+1;
 959                         if (zap->zap_m.zap_alloc_next ==
 960                             zap->zap_m.zap_num_chunks)
 961                                 zap->zap_m.zap_alloc_next = 0;
 962                         mze_insert(zap, i, zn->zn_hash);
 963                         return;
 964                 }
 965         }
             /* Nothing free at or after the hint: retry once from chunk 0. */
 966         if (start != 0) {
 967                 start = 0;
 968                 goto again;
 969         }
 970         ASSERT(!"out of entries!");
 971 }
 972 


1133         int err;
1134         mzap_ent_t *mze;
1135         zap_name_t *zn;
1136 
1137         err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1138         if (err)
1139                 return (err);
1140         zn = zap_name_alloc(zap, name, mt);
1141         if (zn == NULL) {
1142                 zap_unlockdir(zap);
1143                 return (SET_ERROR(ENOTSUP));
1144         }
1145         if (!zap->zap_ismicro) {
1146                 err = fzap_remove(zn, tx);
1147         } else {
1148                 mze = mze_find(zn);
1149                 if (mze == NULL) {
1150                         err = SET_ERROR(ENOENT);
1151                 } else {
1152                         zap->zap_m.zap_num_entries--;
1153                         bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
1154                             sizeof (mzap_ent_phys_t));
1155                         mze_remove(zap, mze);
1156                 }
1157         }
1158         zap_name_free(zn);
1159         zap_unlockdir(zap);
1160         return (err);
1161 }
1162 
1163 int
1164 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1165     int key_numints, dmu_tx_t *tx)
1166 {
1167         zap_t *zap;
1168         int err;
1169         zap_name_t *zn;
1170 
1171         err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1172         if (err)
1173                 return (err);




  28 #include <sys/dmu.h>
  29 #include <sys/zfs_context.h>
  30 #include <sys/zap.h>
  31 #include <sys/refcount.h>
  32 #include <sys/zap_impl.h>
  33 #include <sys/zap_leaf.h>
  34 #include <sys/avl.h>
  35 #include <sys/arc.h>
  36 
  37 #ifdef _KERNEL
  38 #include <sys/sunddi.h>
  39 #endif
  40 
  41 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
  42 
  /*
   * Return the flags word of a ZAP.
   *
   * When zap_ismicro is set the function returns 0 without touching the
   * fat-zap state; otherwise it returns the zap_flags field read through
   * zap_f_phys.
   */
  43 uint64_t
  44 zap_getflags(zap_t *zap)
  45 {
  46         if (zap->zap_ismicro)
  47                 return (0);
  48         return (zap->zap_f_phys->zap_flags);
  49 }
  50 
  /*
   * Return 48 when ZAP_FLAG_HASH64 is set in zap_getflags(zap), and 28
   * otherwise.  zap_getflags() yields 0 when zap_ismicro is set, so such a
   * ZAP always gets 28.
   *
   * NOTE(review): going by the name, the result is the number of hash bits
   * in use; no caller is visible in this excerpt to confirm.
   */
  51 int
  52 zap_hashbits(zap_t *zap)
  53 {
  54         if (zap_getflags(zap) & ZAP_FLAG_HASH64)
  55                 return (48);
  56         else
  57                 return (28);
  58 }
  59 
  /*
   * Return the limit that "cd" values are checked against: mzap_addent()
   * asserts that the cd it picked is strictly below this value.
   *
   * With ZAP_FLAG_HASH64 set the limit is (1<<16)-1; otherwise it is -1U,
   * i.e. the all-ones unsigned value.
   *
   * NOTE(review): "cd" is presumably the collision differentiator -- confirm
   * against the on-disk format description.
   */
  60 uint32_t
  61 zap_maxcd(zap_t *zap)
  62 {
  63         if (zap_getflags(zap) & ZAP_FLAG_HASH64)
  64                 return ((1<<16)-1);
  65         else
  66                 return (-1U);
  67 }
  68 


 367 
 368         zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 369         rw_init(&zap->zap_rwlock, 0, 0, 0);
 370         rw_enter(&zap->zap_rwlock, RW_WRITER);
 371         zap->zap_objset = os;
 372         zap->zap_object = obj;
 373         zap->zap_dbuf = db;
 374 
 375         if (*(uint64_t *)db->db_data != ZBT_MICRO) {
 376                 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
 377                 zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
 378         } else {
 379                 zap->zap_ismicro = TRUE;
 380         }
 381 
 382         /*
 383          * Make sure that zap_ismicro is set before we let others see
 384          * it, because zap_lockdir() checks zap_ismicro without the lock
 385          * held.
 386          */
 387         dmu_buf_init_user(&zap->db_evict, zap_evict);
 388         winner = (zap_t *)dmu_buf_set_user(db, &zap->db_evict);
 389 
 390         if (winner != NULL) {
 391                 rw_exit(&zap->zap_rwlock);
 392                 rw_destroy(&zap->zap_rwlock);
 393                 if (!zap->zap_ismicro)
 394                         mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 395                 kmem_free(zap, sizeof (zap_t));
 396                 return (winner);
 397         }
 398 
 399         if (zap->zap_ismicro) {
 400                 zap->zap_salt = zap->zap_m_phys->mz_salt;
 401                 zap->zap_normflags = zap->zap_m_phys->mz_normflags;
 402                 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 403                 avl_create(&zap->zap_m.zap_avl, mze_compare,
 404                     sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 405 
 406                 for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 407                         mzap_ent_phys_t *mze =
 408                             &zap->zap_m_phys->mz_chunk[i];
 409                         if (mze->mze_name[0]) {
 410                                 zap_name_t *zn;
 411 
 412                                 zap->zap_m.zap_num_entries++;
 413                                 zn = zap_name_alloc(zap, mze->mze_name,
 414                                     MT_EXACT);
 415                                 mze_insert(zap, i, zn->zn_hash);
 416                                 zap_name_free(zn);
 417                         }
 418                 }
 419         } else {
 420                 zap->zap_salt = zap->zap_f_phys->zap_salt;
 421                 zap->zap_normflags = zap->zap_f_phys->zap_normflags;
 422 
 423                 ASSERT3U(sizeof (struct zap_leaf_header), ==,
 424                     2*ZAP_LEAF_CHUNKSIZE);
 425 
 426                 /*
 427                  * The embedded pointer table should not overlap the
 428                  * other members.
 429                  */
 430                 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
 431                     &zap->zap_f_phys->zap_salt);
 432 
 433                 /*
 434                  * The embedded pointer table should end at the end of
 435                  * the block
 436                  */
 437                 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 438                     1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
 439                     (uintptr_t)zap->zap_f_phys, ==,
 440                     zap->zap_dbuf->db_size);
 441         }
 442         rw_exit(&zap->zap_rwlock);
 443         return (zap);
 444 }
 445 
 446 int
 447 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 448     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 449 {
 450         zap_t *zap;
 451         dmu_buf_t *db;
 452         krw_t lt;
 453         int err;
 454 
 455         *zapp = NULL;
 456 
 457         err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
 458         if (err)
 459                 return (err);
 460 
 461 #ifdef ZFS_DEBUG
 462         {
 463                 dmu_object_info_t doi;
 464                 dmu_object_info_from_db(db, &doi);
 465                 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 466         }
 467 #endif
 468 
 469         zap = (zap_t *)dmu_buf_get_user(db);
 470         if (zap == NULL)
 471                 zap = mzap_open(os, obj, db);
 472 
 473         /*
 474          * We're checking zap_ismicro without the lock held, in order to
 475          * tell what type of lock we want.  Once we have some sort of
 476          * lock, see if it really is the right type.  In practice this
 477          * can only be different if it was upgraded from micro to fat,
 478          * and micro wanted WRITER but fat only needs READER.
 479          */
 480         lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
 481         rw_enter(&zap->zap_rwlock, lt);
 482         if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
 483                 /* it was upgraded, now we only need reader */
 484                 ASSERT(lt == RW_WRITER);
 485                 ASSERT(RW_READER ==
 486                     (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
 487                 rw_downgrade(&zap->zap_rwlock);
 488                 lt = RW_READER;
 489         }


 660 
 661         VERIFY(dmu_object_set_blocksize(os, obj,
 662             1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
 663 
 664         mzap_create_impl(os, obj, normflags, flags, tx);
 665         return (obj);
 666 }
 667 
 /*
  * Destroy the ZAP object zapobj in objset os.
  *
  * This is a thin wrapper: os, zapobj and tx are handed straight to
  * dmu_object_free() and that call's return value is returned unchanged.
  */
 668 int
 669 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 670 {
 671         /*
 672          * dmu_object_free will free the object number and free the
 673          * data.  Freeing the data will cause our pageout function to be
 674          * called, which will destroy our data (zap_leaf_t's and zap_t).
 675          */
 676 
 677         return (dmu_object_free(os, zapobj, tx));
 678 }
 679 

 /*
  * Tear down and free an in-core zap_t.
  *
  * This is the function registered through dmu_buf_init_user() on the
  * zap_t's db_evict member before that member is attached to the dbuf.
  * dbu is the dmu_buf_user_t passed back by the caller; it is cast
  * directly to the owning zap_t.  After this returns the zap_t memory has
  * been freed and must not be referenced.
  *
  * NOTE(review): the cast is only valid if db_evict sits at offset 0 of
  * zap_t -- confirm against the zap_t definition.
  */
 680 void
 681 zap_evict(dmu_buf_user_t *dbu)
 682 {
 683         zap_t *zap = (zap_t *)dbu;
 684 
 685         rw_destroy(&zap->zap_rwlock);
 686 
             /*
              * Only the non-micro form owns zap_num_entries_mtx (it is
              * initialised solely on that path when the zap_t is set up);
              * the micro form instead releases its state via mze_destroy().
              */
 687         if (zap->zap_ismicro)
 688                 mze_destroy(zap);
 689         else
 690                 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 691 
 692         kmem_free(zap, sizeof (zap_t));
 693 }
 694 
 695 int
 696 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 697 {
 698         zap_t *zap;
 699         int err;
 700 
 701         err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 702         if (err)
 703                 return (err);


 921                 return (SET_ERROR(ENOTSUP));
 922         }
 923         err = fzap_length(zn, integer_size, num_integers);
 924         zap_name_free(zn);
 925         zap_unlockdir(zap);
 926         return (err);
 927 }
 928 
 /*
  * Store a new name/value entry in a free chunk of a microzap.
  *
  * zn    - name being added.  zn->zn_zap is the target ZAP, zn->zn_key_orig
  *         is copied into the chunk's mze_name, and zn->zn_hash is passed
  *         to mze_find_unused_cd() and mze_insert().
  * value - written to the chunk's mze_value.
  *
  * The caller must hold zap_rwlock as writer (asserted).  Nothing is
  * returned; if no chunk is free the only reaction is the trailing ASSERT,
  * so callers presumably guarantee beforehand that a free chunk exists.
  */
 929 static void
 930 mzap_addent(zap_name_t *zn, uint64_t value)
 931 {
 932         int i;
 933         zap_t *zap = zn->zn_zap;
 934         int start = zap->zap_m.zap_alloc_next;
 935         uint32_t cd;
 936 
 937         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 938 
             /* Debug-only check: no chunk may already carry this name. */
 939 #ifdef ZFS_DEBUG
 940         for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 941                 mzap_ent_phys_t *mze = &zap->zap_m_phys->mz_chunk[i];
 942                 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 943         }
 944 #endif
 945 
 946         cd = mze_find_unused_cd(zap, zn->zn_hash);
 947         /* given the limited size of the microzap, this can't happen */
 948         ASSERT(cd < zap_maxcd(zap));
 949 
             /*
              * Scan chunks [start, zap_num_chunks) for one whose name is
              * empty.  start begins at zap_alloc_next, the hint left behind
              * by the previous insertion.
              */
 950 again:
 951         for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
 952                 mzap_ent_phys_t *mze = &zap->zap_m_phys->mz_chunk[i];
 953                 if (mze->mze_name[0] == 0) {
 954                         mze->mze_value = value;
 955                         mze->mze_cd = cd;
 956                         (void) strcpy(mze->mze_name, zn->zn_key_orig);
 957                         zap->zap_m.zap_num_entries++;
                             /* Move the hint past this chunk, wrapping to 0. */
 958                         zap->zap_m.zap_alloc_next = i+1;
 959                         if (zap->zap_m.zap_alloc_next ==
 960                             zap->zap_m.zap_num_chunks)
 961                                 zap->zap_m.zap_alloc_next = 0;
 962                         mze_insert(zap, i, zn->zn_hash);
 963                         return;
 964                 }
 965         }
             /* Nothing free at or after the hint: retry once from chunk 0. */
 966         if (start != 0) {
 967                 start = 0;
 968                 goto again;
 969         }
 970         ASSERT(!"out of entries!");
 971 }
 972 


1133         int err;
1134         mzap_ent_t *mze;
1135         zap_name_t *zn;
1136 
1137         err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1138         if (err)
1139                 return (err);
1140         zn = zap_name_alloc(zap, name, mt);
1141         if (zn == NULL) {
1142                 zap_unlockdir(zap);
1143                 return (SET_ERROR(ENOTSUP));
1144         }
1145         if (!zap->zap_ismicro) {
1146                 err = fzap_remove(zn, tx);
1147         } else {
1148                 mze = mze_find(zn);
1149                 if (mze == NULL) {
1150                         err = SET_ERROR(ENOENT);
1151                 } else {
1152                         zap->zap_m.zap_num_entries--;
1153                         bzero(&zap->zap_m_phys->mz_chunk[mze->mze_chunkid],
1154                             sizeof (mzap_ent_phys_t));
1155                         mze_remove(zap, mze);
1156                 }
1157         }
1158         zap_name_free(zn);
1159         zap_unlockdir(zap);
1160         return (err);
1161 }
1162 
1163 int
1164 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1165     int key_numints, dmu_tx_t *tx)
1166 {
1167         zap_t *zap;
1168         int err;
1169         zap_name_t *zn;
1170 
1171         err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1172         if (err)
1173                 return (err);