Print this page
1862 incremental zfs receive fails for sparse file > 8PB
dmu_tx_count_free() grossly over-estimates the amount of memory required. It
assumes the file is fully non-sparse and calculates a worst-case estimate of
how much memory is needed to hold all metadata for the file. If a large
hole needs to be freed, the estimate reaches into the TB range, which then
causes the operation to fail later on.
This patch calculates a more realistic estimate by counting the L1 blocks
(the loop for this is already present) and assuming a worst-case
distribution of those blocks over the full length given.
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Simon Klinkert <klinkert@webgods.de>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c
↓ open down ↓ 421 lines elided ↑ open up ↑
 422  422  
 423  423  static void
 424  424  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 425  425  {
 426  426          uint64_t blkid, nblks, lastblk;
 427  427          uint64_t space = 0, unref = 0, skipped = 0;
 428  428          dnode_t *dn = txh->txh_dnode;
 429  429          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 430  430          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 431  431          int epbs;
      432 +        uint64_t l0span = 0, nl1blks = 0;
 432  433  
 433  434          if (dn->dn_nlevels == 0)
 434  435                  return;
 435  436  
 436  437          /*
 437  438           * The struct_rwlock protects us against dn_nlevels
 438  439           * changing, in case (against all odds) we manage to dirty &
 439  440           * sync out the changes after we check for being dirty.
 440  441           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 441  442           */
↓ open down ↓ 12 lines elided ↑ open up ↑
 454  455                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 455  456  
 456  457                  if (blkid >= dn->dn_maxblkid) {
 457  458                          rw_exit(&dn->dn_struct_rwlock);
 458  459                          return;
 459  460                  }
 460  461                  if (blkid + nblks > dn->dn_maxblkid)
 461  462                          nblks = dn->dn_maxblkid - blkid;
 462  463  
 463  464          }
       465 +        l0span = nblks;    /* save for later: calculate level > 1 overhead */
 464  466          if (dn->dn_nlevels == 1) {
 465  467                  int i;
 466  468                  for (i = 0; i < nblks; i++) {
 467  469                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 468  470                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 469  471                          bp += blkid + i;
 470  472                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 471  473                                  dprintf_bp(bp, "can free old%s", "");
 472  474                                  space += bp_get_dsize(spa, bp);
 473  475                          }
 474  476                          unref += BP_GET_ASIZE(bp);
 475  477                  }
      478 +                nl1blks = 1;
 476  479                  nblks = 0;
 477  480          }
 478  481  
 479      -        /*
 480      -         * Add in memory requirements of higher-level indirects.
 481      -         * This assumes a worst-possible scenario for dn_nlevels.
 482      -         */
 483      -        {
 484      -                uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
 485      -                int level = (dn->dn_nlevels > 1) ? 2 : 1;
 486      -
 487      -                while (level++ < DN_MAX_LEVELS) {
 488      -                        txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
 489      -                        blkcnt = 1 + (blkcnt >> epbs);
 490      -                }
 491      -                ASSERT(blkcnt <= dn->dn_nblkptr);
 492      -        }
 493      -
 494  482          lastblk = blkid + nblks - 1;
 495  483          while (nblks) {
 496  484                  dmu_buf_impl_t *dbuf;
 497  485                  uint64_t ibyte, new_blkid;
 498  486                  int epb = 1 << epbs;
 499  487                  int err, i, blkoff, tochk;
 500  488                  blkptr_t *bp;
 501  489  
 502  490                  ibyte = blkid << dn->dn_datablkshift;
 503  491                  err = dnode_next_offset(dn,
↓ open down ↓ 50 lines elided ↑ open up ↑
 554  542                  for (i = 0; i < tochk; i++) {
 555  543                          if (dsl_dataset_block_freeable(ds, &bp[i],
 556  544                              bp[i].blk_birth)) {
 557  545                                  dprintf_bp(&bp[i], "can free old%s", "");
 558  546                                  space += bp_get_dsize(spa, &bp[i]);
 559  547                          }
 560  548                          unref += BP_GET_ASIZE(bp);
 561  549                  }
 562  550                  dbuf_rele(dbuf, FTAG);
 563  551  
      552 +                ++nl1blks;
 564  553                  blkid += tochk;
 565  554                  nblks -= tochk;
 566  555          }
 567  556          rw_exit(&dn->dn_struct_rwlock);
 568  557  
      558 +        /*
      559 +         * Add in memory requirements of higher-level indirects.
      560 +         * This assumes a worst-possible scenario for dn_nlevels and a
      561 +         * worst-possible distribution of l1-blocks over the region to free.
      562 +         */
      563 +        {
      564 +                uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
      565 +                int level = 2;
       566 +                /*
       567 +                 * Instead of using DN_MAX_LEVELS, compute the maximum level
       568 +                 * from the given datablkshift and indblkshift. On large files
       569 +                 * this makes the difference between 19 and 8 levels.
       570 +                 */
      571 +                int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
      572 +                    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
      573 +
      574 +                while (level++ < maxlevel) {
      575 +                        txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs))
      576 +                            << dn->dn_indblkshift;
      577 +                        blkcnt = 1 + (blkcnt >> epbs);
      578 +                }
      579 +        }
      580 +
 569  581          /* account for new level 1 indirect blocks that might show up */
 570  582          if (skipped > 0) {
 571  583                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 572  584                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 573  585                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 574  586          }
 575  587          txh->txh_space_tofree += space;
 576  588          txh->txh_space_tounref += unref;
 577  589  }
 578  590  
↓ open down ↓ 807 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX