Print this page
3740 Poor ZFS send / receive performance due to snapshot hold / release processing
Submitted by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

@@ -21,10 +21,11 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>

@@ -791,10 +792,11 @@
         boolean_t seenfrom, seento, replicate, doall, fromorigin;
         boolean_t verbose, dryrun, parsable, progress;
         int outfd;
         boolean_t err;
         nvlist_t *fss;
+        nvlist_t *snapholds;
         avl_tree_t *fsavl;
         snapfilter_cb_t *filter_cb;
         void *filter_cb_arg;
         nvlist_t *debugnv;
         char holdtag[ZFS_MAXNAMELEN];

@@ -940,45 +942,23 @@
         nvlist_free(thisdbg);
 
         return (0);
 }
 
-static int
-hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd)
+static void
+gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd)
 {
-        zfs_handle_t *pzhp;
-        int error = 0;
-        char *thissnap;
-
         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
-        if (sdd->dryrun)
-                return (0);
-
         /*
-         * zfs_send() only opens a cleanup_fd for sends that need it,
+         * zfs_send() only sets snapholds for sends that need them,
          * e.g. replication and doall.
          */
-        if (sdd->cleanup_fd == -1)
-                return (0);
-
-        thissnap = strchr(zhp->zfs_name, '@') + 1;
-        *(thissnap - 1) = '\0';
-        pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET);
-        *(thissnap - 1) = '@';
-
-        /*
-         * It's OK if the parent no longer exists.  The send code will
-         * handle that error.
-         */
-        if (pzhp) {
-                error = zfs_hold(pzhp, thissnap, sdd->holdtag,
-                    B_FALSE, B_TRUE, sdd->cleanup_fd);
-                zfs_close(pzhp);
-        }
+        if (sdd->snapholds == NULL)
+                return;
 
-        return (error);
+        fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag);
 }
 
 static void *
 send_progress_thread(void *arg)
 {

@@ -1030,32 +1010,27 @@
 dump_snapshot(zfs_handle_t *zhp, void *arg)
 {
         send_dump_data_t *sdd = arg;
         progress_arg_t pa = { 0 };
         pthread_t tid;
-
         char *thissnap;
         int err;
         boolean_t isfromsnap, istosnap, fromorigin;
         boolean_t exclude = B_FALSE;
 
+        err = 0;
         thissnap = strchr(zhp->zfs_name, '@') + 1;
         isfromsnap = (sdd->fromsnap != NULL &&
             strcmp(sdd->fromsnap, thissnap) == 0);
 
         if (!sdd->seenfrom && isfromsnap) {
-                err = hold_for_send(zhp, sdd);
-                if (err == 0) {
+                gather_holds(zhp, sdd);
                         sdd->seenfrom = B_TRUE;
                         (void) strcpy(sdd->prevsnap, thissnap);
-                        sdd->prevsnap_obj = zfs_prop_get_int(zhp,
-                            ZFS_PROP_OBJSETID);
-                } else if (err == ENOENT) {
-                        err = 0;
-                }
+                sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
                 zfs_close(zhp);
-                return (err);
+                return (0);
         }
 
         if (sdd->seento || !sdd->seenfrom) {
                 zfs_close(zhp);
                 return (0);

@@ -1102,18 +1077,11 @@
                  */
                 zfs_close(zhp);
                 return (0);
         }
 
-        err = hold_for_send(zhp, sdd);
-        if (err) {
-                if (err == ENOENT)
-                        err = 0;
-                zfs_close(zhp);
-                return (err);
-        }
-
+        gather_holds(zhp, sdd);
         fromorigin = sdd->prevsnap[0] == '\0' &&
             (sdd->fromorigin || sdd->replicate);
 
         if (sdd->verbose) {
                 uint64_t size;

@@ -1377,11 +1345,11 @@
         int err = 0;
         nvlist_t *fss = NULL;
         avl_tree_t *fsavl = NULL;
         static uint64_t holdseq;
         int spa_version;
-        pthread_t tid;
+        pthread_t tid = 0;
         int pipefd[2];
         dedup_arg_t dda = { 0 };
         int featureflags = 0;
 
         (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,

@@ -1450,16 +1418,13 @@
                             NV_ENCODE_XDR, 0);
                         if (debugnvp)
                                 *debugnvp = hdrnv;
                         else
                                 nvlist_free(hdrnv);
-                        if (err) {
-                                fsavl_destroy(fsavl);
-                                nvlist_free(fss);
+                        if (err)
                                 goto stderr_out;
                         }
-                }
 
                 if (!flags->dryrun) {
                         /* write first begin record */
                         drr.drr_type = DRR_BEGIN;
                         drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;

@@ -1478,24 +1443,20 @@
                                 err = cksum_and_write(packbuf, buflen, &zc,
                                     outfd);
                         }
                         free(packbuf);
                         if (err == -1) {
-                                fsavl_destroy(fsavl);
-                                nvlist_free(fss);
                                 err = errno;
                                 goto stderr_out;
                         }
 
                         /* write end record */
                         bzero(&drr, sizeof (drr));
                         drr.drr_type = DRR_END;
                         drr.drr_u.drr_end.drr_checksum = zc;
                         err = write(outfd, &drr, sizeof (drr));
                         if (err == -1) {
-                                fsavl_destroy(fsavl);
-                                nvlist_free(fss);
                                 err = errno;
                                 goto stderr_out;
                         }
 
                         err = 0;

@@ -1503,11 +1464,11 @@
         }
 
         /* dump each stream */
         sdd.fromsnap = fromsnap;
         sdd.tosnap = tosnap;
-        if (flags->dedup)
+        if (tid != 0)
                 sdd.outfd = pipefd[0];
         else
                 sdd.outfd = outfd;
         sdd.replicate = flags->replicate;
         sdd.doall = flags->doall;

@@ -1540,40 +1501,75 @@
                 sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
                 if (sdd.cleanup_fd < 0) {
                         err = errno;
                         goto stderr_out;
                 }
+                sdd.snapholds = fnvlist_alloc();
         } else {
                 sdd.cleanup_fd = -1;
+                sdd.snapholds = NULL;
         }
-        if (flags->verbose) {
+        if (flags->verbose || sdd.snapholds != NULL) {
                 /*
                  * Do a verbose no-op dry run to get all the verbose output
-                 * before generating any data.  Then do a non-verbose real
-                 * run to generate the streams.
+                 * or to gather snapshot holds before generating any data,
+                 * then do a non-verbose real run to generate the streams.
                  */
                 sdd.dryrun = B_TRUE;
                 err = dump_filesystems(zhp, &sdd);
-                sdd.dryrun = flags->dryrun;
-                sdd.verbose = B_FALSE;
+
+                if (err != 0)
+                        goto stderr_out;
+
+                if (flags->verbose) {
                 if (flags->parsable) {
                         (void) fprintf(stderr, "size\t%llu\n",
                             (longlong_t)sdd.size);
                 } else {
                         char buf[16];
                         zfs_nicenum(sdd.size, buf, sizeof (buf));
                         (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
                             "total estimated size is %s\n"), buf);
                 }
         }
+
+                /* Ensure no snaps found is treated as an error. */
+                if (!sdd.seento) {
+                        err = ENOENT;
+                        goto err_out;
+                }
+
+                /* Skip the second run if dryrun was requested. */
+                if (flags->dryrun)
+                        goto err_out;
+
+                if (sdd.snapholds != NULL) {
+                        err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds);
+                        if (err != 0)
+                                goto stderr_out;
+
+                        fnvlist_free(sdd.snapholds);
+                        sdd.snapholds = NULL;
+                }
+
+                sdd.dryrun = B_FALSE;
+                sdd.verbose = B_FALSE;
+        }
+
         err = dump_filesystems(zhp, &sdd);
         fsavl_destroy(fsavl);
         nvlist_free(fss);
 
-        if (flags->dedup) {
-                (void) close(pipefd[0]);
+        /* Ensure no snaps found is treated as an error. */
+        if (err == 0 && !sdd.seento)
+                err = ENOENT;
+
+        if (tid != 0) {
+                if (err != 0)
+                        (void) pthread_cancel(tid);
                 (void) pthread_join(tid, NULL);
+                (void) close(pipefd[0]);
         }
 
         if (sdd.cleanup_fd != -1) {
                 VERIFY(0 == close(sdd.cleanup_fd));
                 sdd.cleanup_fd = -1;

@@ -1597,13 +1593,17 @@
         return (err || sdd.err);
 
 stderr_out:
         err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
 err_out:
+        fsavl_destroy(fsavl);
+        nvlist_free(fss);
+        fnvlist_free(sdd.snapholds);
+
         if (sdd.cleanup_fd != -1)
                 VERIFY(0 == close(sdd.cleanup_fd));
-        if (flags->dedup) {
+        if (tid != 0) {
                 (void) pthread_cancel(tid);
                 (void) pthread_join(tid, NULL);
                 (void) close(pipefd[0]);
         }
         return (err);