1862 incremental zfs receive fails for sparse file > 8PB
dmu_tx_count_free does a horrible over-estimation of used memory: it
assumes that the file is fully non-sparse and calculates a worst-case
estimate of how much memory is needed to hold all of its metadata. If a
large hole needs to be freed, the estimate runs into the TB range, which
then makes the transaction fail later on.
This patch calculates a more realistic estimate by counting the actual
level-1 indirect blocks (the loop for this is already present) and
assuming a worst-case distribution of those blocks over the full length
of the region to free.
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Simon Klinkert <klinkert@webgods.de>
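
To see the scale of the problem, take some illustrative numbers (the
block sizes here are assumptions, not from the report): freeing an
8 PB (2^53-byte) hole in a file with 8 KiB data blocks and 16 KiB
indirect blocks gives epbs = 14 - 7 = 7, i.e. 128 block pointers per
indirect block. The hole spans 2^40 data blocks, so a fully non-sparse
worst case charges 1 + (2^40 >> 14) ≈ 2^26 level-2 indirect blocks of
16 KiB each, about 1 TiB of memory, even though the hole may contain no
allocated blocks at all.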

Before:
 412         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 413 
 414         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 415             object, THT_WRITE, off, len);
 416         if (txh == NULL)
 417                 return;
 418 
 419         dmu_tx_count_write(txh, off, len);
 420         dmu_tx_count_dnode(txh);
 421 }
 422 
 423 static void
 424 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 425 {
 426         uint64_t blkid, nblks, lastblk;
 427         uint64_t space = 0, unref = 0, skipped = 0;
 428         dnode_t *dn = txh->txh_dnode;
 429         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 430         spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 431         int epbs;
 432 
 433         if (dn->dn_nlevels == 0)
 434                 return;
 435 
 436         /*
 437          * The struct_rwlock protects us against dn_nlevels
 438          * changing, in case (against all odds) we manage to dirty &
 439          * sync out the changes after we check for being dirty.
 440          * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 441          */
 442         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 443         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 444         if (dn->dn_maxblkid == 0) {
 445                 if (off == 0 && len >= dn->dn_datablksz) {
 446                         blkid = 0;
 447                         nblks = 1;
 448                 } else {
 449                         rw_exit(&dn->dn_struct_rwlock);
 450                         return;
 451                 }
 452         } else {
 453                 blkid = off >> dn->dn_datablkshift;
 454                 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 455 
 456                 if (blkid >= dn->dn_maxblkid) {
 457                         rw_exit(&dn->dn_struct_rwlock);
 458                         return;
 459                 }
 460                 if (blkid + nblks > dn->dn_maxblkid)
 461                         nblks = dn->dn_maxblkid - blkid;
 462 
 463         }
 464         if (dn->dn_nlevels == 1) {
 465                 int i;
 466                 for (i = 0; i < nblks; i++) {
 467                         blkptr_t *bp = dn->dn_phys->dn_blkptr;
 468                         ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 469                         bp += blkid + i;
 470                         if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 471                                 dprintf_bp(bp, "can free old%s", "");
 472                                 space += bp_get_dsize(spa, bp);
 473                         }
 474                         unref += BP_GET_ASIZE(bp);
 475                 }
 476                 nblks = 0;
 477         }
 478 
 479         /*
 480          * Add in memory requirements of higher-level indirects.
 481          * This assumes a worst-possible scenario for dn_nlevels.
 482          */
 483         {
 484                 uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
 485                 int level = (dn->dn_nlevels > 1) ? 2 : 1;
 486 
 487                 while (level++ < DN_MAX_LEVELS) {
 488                         txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
 489                         blkcnt = 1 + (blkcnt >> epbs);
 490                 }
 491                 ASSERT(blkcnt <= dn->dn_nblkptr);
 492         }
 493 
 494         lastblk = blkid + nblks - 1;
 495         while (nblks) {
 496                 dmu_buf_impl_t *dbuf;
 497                 uint64_t ibyte, new_blkid;
 498                 int epb = 1 << epbs;
 499                 int err, i, blkoff, tochk;
 500                 blkptr_t *bp;
 501 
 502                 ibyte = blkid << dn->dn_datablkshift;
 503                 err = dnode_next_offset(dn,
 504                     DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 505                 new_blkid = ibyte >> dn->dn_datablkshift;
 506                 if (err == ESRCH) {
 507                         skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 508                         break;
 509                 }
 510                 if (err) {
 511                         txh->txh_tx->tx_err = err;
 512                         break;
 513                 }
                [ lines 514-543 not shown ]
 544                 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 545                 if (err != 0) {
 546                         txh->txh_tx->tx_err = err;
 547                         dbuf_rele(dbuf, FTAG);
 548                         break;
 549                 }
 550 
 551                 bp = dbuf->db.db_data;
 552                 bp += blkoff;
 553 
 554                 for (i = 0; i < tochk; i++) {
 555                         if (dsl_dataset_block_freeable(ds, &bp[i],
 556                             bp[i].blk_birth)) {
 557                                 dprintf_bp(&bp[i], "can free old%s", "");
 558                                 space += bp_get_dsize(spa, &bp[i]);
 559                         }
 560                         unref += BP_GET_ASIZE(bp);
 561                 }
 562                 dbuf_rele(dbuf, FTAG);
 563 
 564                 blkid += tochk;
 565                 nblks -= tochk;
 566         }
 567         rw_exit(&dn->dn_struct_rwlock);
 568 
 569         /* account for new level 1 indirect blocks that might show up */
 570         if (skipped > 0) {
 571                 txh->txh_fudge += skipped << dn->dn_indblkshift;
 572                 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 573                 txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 574         }
 575         txh->txh_space_tofree += space;
 576         txh->txh_space_tounref += unref;
 577 }
 578 
 579 void
 580 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 581 {
 582         dmu_tx_hold_t *txh;
 583         dnode_t *dn;
 584         uint64_t start, end, i;
 585         int err, shift;
 586         zio_t *zio;
 587 
 588         ASSERT(tx->tx_txg == 0);
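
For comparison, the charging logic of the old block above (old lines
483-492) can be lifted into a standalone sketch; the function name, the
driver, and the level bound of 19 are illustrative, not from the source.
Note that the charge depends only on the length of the region to free,
never on how much of it is actually allocated:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for the old worst-case loop (names hypothetical). */
static uint64_t
old_tohold(uint64_t nblks, int epbs, int indblkshift, int max_levels)
{
	uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); /* worst-case L2 count */
	uint64_t tohold = 0;
	int level = 2;

	while (level++ < max_levels) {
		tohold += blkcnt << indblkshift;
		blkcnt = 1 + (blkcnt >> epbs);
	}
	return (tohold);
}

int
main(void)
{
	/* 8 PB hole, 8 KiB data blocks, 16 KiB indirects (epbs = 14 - 7) */
	uint64_t nblks = 1ULL << (53 - 13);

	printf("%llu GiB\n", (unsigned long long)
	    (old_tohold(nblks, 7, 14, 19) >> 30));	/* prints 1032 */
	return (0);
}

With those assumed block sizes the level-2 term alone accounts for a
full TiB, matching the TB-range estimates described above.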

After:
 412         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 413 
 414         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 415             object, THT_WRITE, off, len);
 416         if (txh == NULL)
 417                 return;
 418 
 419         dmu_tx_count_write(txh, off, len);
 420         dmu_tx_count_dnode(txh);
 421 }
 422 
 423 static void
 424 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 425 {
 426         uint64_t blkid, nblks, lastblk;
 427         uint64_t space = 0, unref = 0, skipped = 0;
 428         dnode_t *dn = txh->txh_dnode;
 429         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 430         spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 431         int epbs;
 432         uint64_t l0span = 0, nl1blks = 0;
 433 
 434         if (dn->dn_nlevels == 0)
 435                 return;
 436 
 437         /*
 438          * The struct_rwlock protects us against dn_nlevels
 439          * changing, in case (against all odds) we manage to dirty &
 440          * sync out the changes after we check for being dirty.
 441          * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 442          */
 443         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 444         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 445         if (dn->dn_maxblkid == 0) {
 446                 if (off == 0 && len >= dn->dn_datablksz) {
 447                         blkid = 0;
 448                         nblks = 1;
 449                 } else {
 450                         rw_exit(&dn->dn_struct_rwlock);
 451                         return;
 452                 }
 453         } else {
 454                 blkid = off >> dn->dn_datablkshift;
 455                 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 456 
 457                 if (blkid >= dn->dn_maxblkid) {
 458                         rw_exit(&dn->dn_struct_rwlock);
 459                         return;
 460                 }
 461                 if (blkid + nblks > dn->dn_maxblkid)
 462                         nblks = dn->dn_maxblkid - blkid;
 463 
 464         }
 465         l0span = nblks;    /* save for later use to calc level > 1 overhead */
 466         if (dn->dn_nlevels == 1) {
 467                 int i;
 468                 for (i = 0; i < nblks; i++) {
 469                         blkptr_t *bp = dn->dn_phys->dn_blkptr;
 470                         ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 471                         bp += blkid + i;
 472                         if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 473                                 dprintf_bp(bp, "can free old%s", "");
 474                                 space += bp_get_dsize(spa, bp);
 475                         }
 476                         unref += BP_GET_ASIZE(bp);
 477                 }
 478                 nl1blks = 1;
 479                 nblks = 0;
 480         }
 481 
 482         lastblk = blkid + nblks - 1;
 483         while (nblks) {
 484                 dmu_buf_impl_t *dbuf;
 485                 uint64_t ibyte, new_blkid;
 486                 int epb = 1 << epbs;
 487                 int err, i, blkoff, tochk;
 488                 blkptr_t *bp;
 489 
 490                 ibyte = blkid << dn->dn_datablkshift;
 491                 err = dnode_next_offset(dn,
 492                     DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 493                 new_blkid = ibyte >> dn->dn_datablkshift;
 494                 if (err == ESRCH) {
 495                         skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 496                         break;
 497                 }
 498                 if (err) {
 499                         txh->txh_tx->tx_err = err;
 500                         break;
 501                 }
                [ lines 502-531 not shown ]
 532                 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 533                 if (err != 0) {
 534                         txh->txh_tx->tx_err = err;
 535                         dbuf_rele(dbuf, FTAG);
 536                         break;
 537                 }
 538 
 539                 bp = dbuf->db.db_data;
 540                 bp += blkoff;
 541 
 542                 for (i = 0; i < tochk; i++) {
 543                         if (dsl_dataset_block_freeable(ds, &bp[i],
 544                             bp[i].blk_birth)) {
 545                                 dprintf_bp(&bp[i], "can free old%s", "");
 546                                 space += bp_get_dsize(spa, &bp[i]);
 547                         }
 548                         unref += BP_GET_ASIZE(bp);
 549                 }
 550                 dbuf_rele(dbuf, FTAG);
 551 
 552                 ++nl1blks;
 553                 blkid += tochk;
 554                 nblks -= tochk;
 555         }
 556         rw_exit(&dn->dn_struct_rwlock);
 557 
 558         /*
 559          * Add in memory requirements of higher-level indirects.
 560          * This assumes a worst-possible scenario for dn_nlevels and a
 561          * worst-possible distribution of l1-blocks over the region to free.
 562          */
 563         {
 564                 uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 565                 int level = 2;
 566                 /*
 567                  * Here we don't use DN_MAX_LEVELS, but calculate it with the
 568                  * given datablkshift and indblkshift. This makes the
 569                  * difference between 19 and 8 on large files.
 570                  */
 571                 int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 572                     (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 573 
 574                 while (level++ < maxlevel) {
 575                         txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 576                             << dn->dn_indblkshift;
 577                         blkcnt = 1 + (blkcnt >> epbs);
 578                 }
 579         }
 580 
 581         /* account for new level 1 indirect blocks that might show up */
 582         if (skipped > 0) {
 583                 txh->txh_fudge += skipped << dn->dn_indblkshift;
 584                 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 585                 txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 586         }
 587         txh->txh_space_tofree += space;
 588         txh->txh_space_tounref += unref;
 589 }
 590 
 591 void
 592 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 593 {
 594         dmu_tx_hold_t *txh;
 595         dnode_t *dn;
 596         uint64_t start, end, i;
 597         int err, shift;
 598         zio_t *zio;
 599 
 600         ASSERT(tx->tx_txg == 0);
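
The effect of the new accounting can be seen with the same standalone
treatment. This is a sketch, not the source: the wrapper name is
hypothetical, DN_MAX_OFFSET_SHIFT = 64 and SPA_BLKPTRSHIFT = 7 are the
values from the ZFS headers, and the file is assumed to contain only
four real level-1 indirect blocks inside the same 8 PB hole:

#include <stdio.h>
#include <stdint.h>

#define	DN_MAX_OFFSET_SHIFT	64	/* as in the ZFS headers */
#define	SPA_BLKPTRSHIFT		7	/* 128-byte block pointers */
#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* Illustrative stand-in for the new loop (names hypothetical). */
static uint64_t
new_tohold(uint64_t l0span, uint64_t nl1blks, int datablkshift,
    int indblkshift)
{
	int epbs = indblkshift - SPA_BLKPTRSHIFT;
	uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
	uint64_t tohold = 0;
	int level = 2;
	/* 2 + (64 - 13) / 7 = 9 levels here instead of a fixed maximum */
	int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - datablkshift) /
	    (indblkshift - SPA_BLKPTRSHIFT);

	while (level++ < maxlevel) {
		/* never charge more higher-level blocks than actual L1 blocks */
		tohold += MAX(MIN(blkcnt, nl1blks), 1) << indblkshift;
		blkcnt = 1 + (blkcnt >> epbs);
	}
	return (tohold);
}

int
main(void)
{
	/* same 8 PB span of 8 KiB data blocks, but only 4 real L1 blocks */
	uint64_t l0span = 1ULL << (53 - 13);

	printf("%llu KiB\n", (unsigned long long)
	    (new_tohold(l0span, 4, 13, 14) >> 10));	/* prints 304 */
	return (0);
}

A few hundred KiB instead of a TiB for the same hole. With 128 KiB data
blocks the same formula gives maxlevel = 2 + (64 - 17) / 7 = 8, the
figure mentioned in the patch comment.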