Print this page
1862 incremental zfs receive fails for sparse file > 8PB
dmu_tx_count_free grossly over-estimates the memory it needs. It assumes
that the file is fully non-sparse and calculates a worst-case estimate of how
much memory is needed to hold all metadata for the file. If a large hole needs
to be freed, the estimate goes into the TB range, which obviously fails later
on.
This patch tries to calculate a more realistic estimate by counting the l1
blocks (the loop for this is already present) and assuming a worst-case
distribution of those blocks over the full length given.
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Simon Klinkert <klinkert@webgods.de>

@@ -427,10 +427,11 @@
         uint64_t space = 0, unref = 0, skipped = 0;
         dnode_t *dn = txh->txh_dnode;
         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
         spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
         int epbs;
+        uint64_t l0span = 0, nl1blks = 0;
 
         if (dn->dn_nlevels == 0)
                 return;
 
         /*

@@ -459,10 +460,11 @@
                 }
                 if (blkid + nblks > dn->dn_maxblkid)
                         nblks = dn->dn_maxblkid - blkid;
 
         }
+        l0span = nblks;    /* save for later use to calc level > 1 overhead */
         if (dn->dn_nlevels == 1) {
                 int i;
                 for (i = 0; i < nblks; i++) {
                         blkptr_t *bp = dn->dn_phys->dn_blkptr;
                         ASSERT3U(blkid + i, <, dn->dn_nblkptr);

@@ -471,28 +473,14 @@
                                 dprintf_bp(bp, "can free old%s", "");
                                 space += bp_get_dsize(spa, bp);
                         }
                         unref += BP_GET_ASIZE(bp);
                 }
+                nl1blks = 1;
                 nblks = 0;
         }
 
-        /*
-         * Add in memory requirements of higher-level indirects.
-         * This assumes a worst-possible scenario for dn_nlevels.
-         */
-        {
-                uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
-                int level = (dn->dn_nlevels > 1) ? 2 : 1;
-
-                while (level++ < DN_MAX_LEVELS) {
-                        txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
-                        blkcnt = 1 + (blkcnt >> epbs);
-                }
-                ASSERT(blkcnt <= dn->dn_nblkptr);
-        }
-
         lastblk = blkid + nblks - 1;
         while (nblks) {
                 dmu_buf_impl_t *dbuf;
                 uint64_t ibyte, new_blkid;
                 int epb = 1 << epbs;

@@ -559,15 +547,39 @@
                         }
                         unref += BP_GET_ASIZE(bp);
                 }
                 dbuf_rele(dbuf, FTAG);
 
+                ++nl1blks;
                 blkid += tochk;
                 nblks -= tochk;
         }
         rw_exit(&dn->dn_struct_rwlock);
 
+        /*
+         * Add in memory requirements of higher-level indirects.
+         * This assumes a worst-possible scenario for dn_nlevels and a
+         * worst-possible distribution of l1-blocks over the region to free.
+         */
+        {
+                uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
+                int level = 2;
+                /*
+                 * Here we don't use DN_MAX_LEVELS, but calculate the maximum
+                 * level from the given datablkshift and indblkshift. This
+                 * makes the difference between 19 and 8 on large files.
+                 */
+                int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
+                    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+
+                while (level++ < maxlevel) {
+                        txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs))
+                            << dn->dn_indblkshift;
+                        blkcnt = 1 + (blkcnt >> epbs);
+                }
+        }
+
         /* account for new level 1 indirect blocks that might show up */
         if (skipped > 0) {
                 txh->txh_fudge += skipped << dn->dn_indblkshift;
                 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
                 txh->txh_memory_tohold += skipped << dn->dn_indblkshift;