Print this page
3740 Poor ZFS send / receive performance due to snapshot hold / release processing
Submitted by: Steven Hartland <steven.hartland@multiplay.co.uk>


 776         }
 777 
 778         *nvlp = sd.fss;
 779         return (0);
 780 }
 781 
 782 /*
 783  * Routines specific to "zfs send"
 784  */
 785 typedef struct send_dump_data {
             /*
              * State carried through a "zfs send" traversal; zfs_send() fills
              * it in and dump_snapshot() receives it as its callback argument.
              */
 786         /* these are all just the short snapname (the part after the @) */
 787         const char *fromsnap;
 788         const char *tosnap;
             /*
              * prevsnap is compared against the empty string to decide whether
              * the next stream is full or incremental; prevsnap_obj holds the
              * ZFS_PROP_OBJSETID value read for that same snapshot.
              */
 789         char prevsnap[ZFS_MAXNAMELEN];
 790         uint64_t prevsnap_obj;
             /*
              * seenfrom and seento are set by dump_snapshot() once the from and
              * to snapshots have been encountered.
              */
 791         boolean_t seenfrom, seento, replicate, doall, fromorigin;
 792         boolean_t verbose, dryrun, parsable, progress;
 793         int outfd;
 794         boolean_t err;
 795         nvlist_t *fss;

 796         avl_tree_t *fsavl;
             /* optional per-snapshot filter; B_FALSE from it skips the snapshot */
 797         snapfilter_cb_t *filter_cb;
 798         void *filter_cb_arg;
 799         nvlist_t *debugnv;
             /*
              * holdtag is the user-hold tag (zfs_send() formats it as
              * ".send-<pid>-<seq>").  cleanup_fd is an fd opened on ZFS_DEV and
              * passed to zfs_hold(); it is -1 when no holds are needed.
              */
 800         char holdtag[ZFS_MAXNAMELEN];
 801         int cleanup_fd;
             /* reported by zfs_send() as the total estimated size */
 802         uint64_t size;
 803 } send_dump_data_t;
 804 
 805 static int
 806 estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
 807     boolean_t fromorigin, uint64_t *sizep)
 808 {
 809         zfs_cmd_t zc = { 0 };
 810         libzfs_handle_t *hdl = zhp->zfs_hdl;
 811 
 812         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 813         assert(fromsnap_obj == 0 || !fromorigin);
 814 
 815         (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));


 925                 case ENXIO:
 926                 case EPIPE:
 927                 case ERANGE:
 928                 case EFAULT:
 929                 case EROFS:
 930                         zfs_error_aux(hdl, strerror(errno));
 931                         return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 932 
 933                 default:
 934                         return (zfs_standard_error(hdl, errno, errbuf));
 935                 }
 936         }
 937 
 938         if (debugnv)
 939                 VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
 940         nvlist_free(thisdbg);
 941 
 942         return (0);
 943 }
 944 
 945 static int
 946 hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd)
 947 {
             /*
              * Place a user hold tagged sdd->holdtag on the snapshot zhp, going
              * through a handle on its parent dataset.
              *
              * Returns 0 without doing anything for a dry run or when
              * sdd->cleanup_fd is -1.  Otherwise returns the result of
              * zfs_hold(), or 0 if the parent dataset could not be opened.
              */
 948         zfs_handle_t *pzhp;
 949         int error = 0;
 950         char *thissnap;
 951 
 952         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 953 
 954         if (sdd->dryrun)
 955                 return (0);
 956 
 957         /*
 958          * zfs_send() only opens a cleanup_fd for sends that need it,
 959          * e.g. replication and doall.
 960          */
 961         if (sdd->cleanup_fd == -1)
 962                 return (0);
 963 
             /*
              * Temporarily cut the name at the '@' so that zhp->zfs_name reads
              * as the parent dataset name for zfs_open(), then put the '@'
              * back.  thissnap keeps pointing at the short snapshot name.
              */
 964         thissnap = strchr(zhp->zfs_name, '@') + 1;
 965         *(thissnap - 1) = '\0';
 966         pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET);
 967         *(thissnap - 1) = '@';
 968 
 969         /*
 970          * It's OK if the parent no longer exists.  The send code will
 971          * handle that error.
 972          */
 973         if (pzhp) {
 974                 error = zfs_hold(pzhp, thissnap, sdd->holdtag,
 975                     B_FALSE, B_TRUE, sdd->cleanup_fd);
 976                 zfs_close(pzhp);
 977         }
 978 
 979         return (error);
 980 }
 981 
 982 static void *
 983 send_progress_thread(void *arg)
 984 {
 985         progress_arg_t *pa = arg;
 986 
 987         zfs_cmd_t zc = { 0 };
 988         zfs_handle_t *zhp = pa->pa_zhp;
 989         libzfs_handle_t *hdl = zhp->zfs_hdl;
 990         unsigned long long bytes;
 991         char buf[16];
 992 
 993         time_t t;
 994         struct tm *tm;
 995 
 996         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 997         (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 998 
 999         if (!pa->pa_parsable)


1026         }
1027 }
1028 
1029 static int
1030 dump_snapshot(zfs_handle_t *zhp, void *arg)
1031 {
1032         send_dump_data_t *sdd = arg;
1033         progress_arg_t pa = { 0 };
1034         pthread_t tid;
1035 
1036         char *thissnap;
1037         int err;
1038         boolean_t isfromsnap, istosnap, fromorigin;
1039         boolean_t exclude = B_FALSE;
1040 
1041         thissnap = strchr(zhp->zfs_name, '@') + 1;
1042         isfromsnap = (sdd->fromsnap != NULL &&
1043             strcmp(sdd->fromsnap, thissnap) == 0);
1044 
1045         if (!sdd->seenfrom && isfromsnap) {
1046                 err = hold_for_send(zhp, sdd);
1047                 if (err == 0) {
1048                         sdd->seenfrom = B_TRUE;
1049                         (void) strcpy(sdd->prevsnap, thissnap);
1050                         sdd->prevsnap_obj = zfs_prop_get_int(zhp,
1051                             ZFS_PROP_OBJSETID);
1052                 } else if (err == ENOENT) {
1053                         err = 0;
1054                 }
1055                 zfs_close(zhp);
1056                 return (err);
1057         }
1058 
1059         if (sdd->seento || !sdd->seenfrom) {
1060                 zfs_close(zhp);
1061                 return (0);
1062         }
1063 
1064         istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
1065         if (istosnap)
1066                 sdd->seento = B_TRUE;
1067 
1068         if (!sdd->doall && !isfromsnap && !istosnap) {
1069                 if (sdd->replicate) {
1070                         char *snapname;
1071                         nvlist_t *snapprops;
1072                         /*
1073                          * Filter out all intermediate snapshots except origin
1074                          * snapshots needed to replicate clones.
1075                          */
1076                         nvlist_t *nvfs = fsavl_find(sdd->fsavl,


1087         }
1088 
1089         /*
1090          * If a filter function exists, call it to determine whether
1091          * this snapshot will be sent.
1092          */
1093         if (exclude || (sdd->filter_cb != NULL &&
1094             sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
1095                 /*
1096                  * This snapshot is filtered out.  Don't send it, and don't
1097                  * set prevsnap_obj, so it will be as if this snapshot didn't
1098                  * exist, and the next accepted snapshot will be sent as
1099                  * an incremental from the last accepted one, or as the
1100                  * first (and full) snapshot in the case of a replication,
1101                  * non-incremental send.
1102                  */
1103                 zfs_close(zhp);
1104                 return (0);
1105         }
1106 
1107         err = hold_for_send(zhp, sdd);
1108         if (err) {
1109                 if (err == ENOENT)
1110                         err = 0;
1111                 zfs_close(zhp);
1112                 return (err);
1113         }
1114 
1115         fromorigin = sdd->prevsnap[0] == '\0' &&
1116             (sdd->fromorigin || sdd->replicate);
1117 
1118         if (sdd->verbose) {
1119                 uint64_t size;
1120                 err = estimate_ioctl(zhp, sdd->prevsnap_obj,
1121                     fromorigin, &size);
1122 
1123                 if (sdd->parsable) {
1124                         if (sdd->prevsnap[0] != '\0') {
1125                                 (void) fprintf(stderr, "incremental\t%s\t%s",
1126                                     sdd->prevsnap, zhp->zfs_name);
1127                         } else {
1128                                 (void) fprintf(stderr, "full\t%s",
1129                                     zhp->zfs_name);
1130                         }
1131                 } else {
1132                         (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
1133                             "send from @%s to %s"),
1134                             sdd->prevsnap, zhp->zfs_name);


1525 
1526         /*
1527          * Some flags require that we place user holds on the datasets that are
1528          * being sent so they don't get destroyed during the send. We can skip
1529          * this step if the pool is imported read-only since the datasets cannot
1530          * be destroyed.
1531          */
1532         if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp),
1533             ZPOOL_PROP_READONLY, NULL) &&
1534             zfs_spa_version(zhp, &spa_version) == 0 &&
1535             spa_version >= SPA_VERSION_USERREFS &&
1536             (flags->doall || flags->replicate)) {
1537                 ++holdseq;
1538                 (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
1539                     ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
1540                 sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
1541                 if (sdd.cleanup_fd < 0) {
1542                         err = errno;
1543                         goto stderr_out;
1544                 }

1545         } else {
1546                 sdd.cleanup_fd = -1;

1547         }
1548         if (flags->verbose) {
1549                 /*
1550                  * Do a verbose no-op dry run to get all the verbose output
1551                  * before generating any data.  Then do a non-verbose real
1552                  * run to generate the streams.
1553                  */
1554                 sdd.dryrun = B_TRUE;
1555                 err = dump_filesystems(zhp, &sdd);
1556                 sdd.dryrun = flags->dryrun;
1557                 sdd.verbose = B_FALSE;
1558                 if (flags->parsable) {
1559                         (void) fprintf(stderr, "size\t%llu\n",
1560                             (longlong_t)sdd.size);
1561                 } else {
1562                         char buf[16];
1563                         zfs_nicenum(sdd.size, buf, sizeof (buf));
1564                         (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
1565                             "total estimated size is %s\n"), buf);
1566                 }
1567         }
























1568         err = dump_filesystems(zhp, &sdd);
1569         fsavl_destroy(fsavl);
1570         nvlist_free(fss);
1571 
1572         if (flags->dedup) {
1573                 (void) close(pipefd[0]);
1574                 (void) pthread_join(tid, NULL);
1575         }
1576 
1577         if (sdd.cleanup_fd != -1) {
1578                 VERIFY(0 == close(sdd.cleanup_fd));
1579                 sdd.cleanup_fd = -1;
1580         }
1581 
1582         if (!flags->dryrun && (flags->replicate || flags->doall ||
1583             flags->props)) {
1584                 /*
1585                  * write final end record.  NB: want to do this even if
1586                  * there was some error, because it might not be totally
1587                  * failed.




 776         }
 777 
 778         *nvlp = sd.fss;
 779         return (0);
 780 }
 781 
 782 /*
 783  * Routines specific to "zfs send"
 784  */
 785 typedef struct send_dump_data {
             /*
              * State carried through a "zfs send" traversal; zfs_send() fills
              * it in and dump_snapshot() receives it as its callback argument.
              */
 786         /* these are all just the short snapname (the part after the @) */
 787         const char *fromsnap;
 788         const char *tosnap;
             /*
              * prevsnap is compared against the empty string to decide whether
              * the next stream is full or incremental; prevsnap_obj holds the
              * ZFS_PROP_OBJSETID value read for that same snapshot.
              */
 789         char prevsnap[ZFS_MAXNAMELEN];
 790         uint64_t prevsnap_obj;
             /*
              * seenfrom and seento are set by dump_snapshot() once the from and
              * to snapshots have been encountered.
              */
 791         boolean_t seenfrom, seento, replicate, doall, fromorigin;
 792         boolean_t verbose, dryrun, parsable, progress;
 793         int outfd;
 794         boolean_t err;
 795         nvlist_t *fss;
             /*
              * Holds still to be placed: gather_holds() adds one entry per
              * snapshot (full snapshot name -> holdtag) and zfs_send() passes
              * the list to zfs_hold_nvl().  NULL when no holds are required.
              */
 796         nvlist_t *snapholds;
 797         avl_tree_t *fsavl;
             /* optional per-snapshot filter; B_FALSE from it skips the snapshot */
 798         snapfilter_cb_t *filter_cb;
 799         void *filter_cb_arg;
 800         nvlist_t *debugnv;
             /*
              * holdtag is the user-hold tag (zfs_send() formats it as
              * ".send-<pid>-<seq>").  cleanup_fd is an fd opened on ZFS_DEV and
              * passed to zfs_hold_nvl(); it is -1 when no holds are needed.
              */
 801         char holdtag[ZFS_MAXNAMELEN];
 802         int cleanup_fd;
             /* reported by zfs_send() as the total estimated size */
 803         uint64_t size;
 804 } send_dump_data_t;
 805 
 806 static int
 807 estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
 808     boolean_t fromorigin, uint64_t *sizep)
 809 {
 810         zfs_cmd_t zc = { 0 };
 811         libzfs_handle_t *hdl = zhp->zfs_hdl;
 812 
 813         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 814         assert(fromsnap_obj == 0 || !fromorigin);
 815 
 816         (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));


 926                 case ENXIO:
 927                 case EPIPE:
 928                 case ERANGE:
 929                 case EFAULT:
 930                 case EROFS:
 931                         zfs_error_aux(hdl, strerror(errno));
 932                         return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 933 
 934                 default:
 935                         return (zfs_standard_error(hdl, errno, errbuf));
 936                 }
 937         }
 938 
 939         if (debugnv)
 940                 VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
 941         nvlist_free(thisdbg);
 942 
 943         return (0);
 944 }
 945 
 946 static void
 947 gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd)
 948 {
             /*
              * Record that the snapshot zhp needs a user hold by adding an
              * entry (full snapshot name -> sdd->holdtag) to sdd->snapholds.
              * No hold is placed here; zfs_send() later passes the accumulated
              * list to zfs_hold_nvl() in one call.
              *
              * There is no dry-run check: zfs_send() relies on its dry-run pass
              * over the snapshots to populate the list.
              *
              * NOTE(review): zfs_send() frees sdd->snapholds after calling
              * zfs_hold_nvl() but does not appear to reset it to NULL before
              * the final dump_filesystems() pass.  If that pass reaches this
              * function, the NULL check below will not catch the freed list --
              * confirm and clear the pointer after freeing.
              */




 949         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 950 



 951         /*
 952          * zfs_send() only sets snapholds for sends that need them,
 953          * e.g. replication and doall.
 954          */
 955         if (sdd->snapholds == NULL)
 956                 return;















 957 
 958         fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag);
 959 }
 960 
 961 static void *
 962 send_progress_thread(void *arg)
 963 {
 964         progress_arg_t *pa = arg;
 965 
 966         zfs_cmd_t zc = { 0 };
 967         zfs_handle_t *zhp = pa->pa_zhp;
 968         libzfs_handle_t *hdl = zhp->zfs_hdl;
 969         unsigned long long bytes;
 970         char buf[16];
 971 
 972         time_t t;
 973         struct tm *tm;
 974 
 975         assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 976         (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 977 
 978         if (!pa->pa_parsable)


1005         }
1006 }
1007 
1008 static int
1009 dump_snapshot(zfs_handle_t *zhp, void *arg)
1010 {
1011         send_dump_data_t *sdd = arg;
1012         progress_arg_t pa = { 0 };
1013         pthread_t tid;
1014 
1015         char *thissnap;
1016         int err;
1017         boolean_t isfromsnap, istosnap, fromorigin;
1018         boolean_t exclude = B_FALSE;
1019 
1020         thissnap = strchr(zhp->zfs_name, '@') + 1;
1021         isfromsnap = (sdd->fromsnap != NULL &&
1022             strcmp(sdd->fromsnap, thissnap) == 0);
1023 
1024         if (!sdd->seenfrom && isfromsnap) {
1025                 gather_holds(zhp, sdd);

1026                 sdd->seenfrom = B_TRUE;
1027                 (void) strcpy(sdd->prevsnap, thissnap);
1028                 sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);




1029                 zfs_close(zhp);
1030                 return (0);
1031         }
1032 
1033         if (sdd->seento || !sdd->seenfrom) {
1034                 zfs_close(zhp);
1035                 return (0);
1036         }
1037 
1038         istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
1039         if (istosnap)
1040                 sdd->seento = B_TRUE;
1041 
1042         if (!sdd->doall && !isfromsnap && !istosnap) {
1043                 if (sdd->replicate) {
1044                         char *snapname;
1045                         nvlist_t *snapprops;
1046                         /*
1047                          * Filter out all intermediate snapshots except origin
1048                          * snapshots needed to replicate clones.
1049                          */
1050                         nvlist_t *nvfs = fsavl_find(sdd->fsavl,


1061         }
1062 
1063         /*
1064          * If a filter function exists, call it to determine whether
1065          * this snapshot will be sent.
1066          */
1067         if (exclude || (sdd->filter_cb != NULL &&
1068             sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
1069                 /*
1070                  * This snapshot is filtered out.  Don't send it, and don't
1071                  * set prevsnap_obj, so it will be as if this snapshot didn't
1072                  * exist, and the next accepted snapshot will be sent as
1073                  * an incremental from the last accepted one, or as the
1074                  * first (and full) snapshot in the case of a replication,
1075                  * non-incremental send.
1076                  */
1077                 zfs_close(zhp);
1078                 return (0);
1079         }
1080 
1081         gather_holds(zhp, sdd);







1082         fromorigin = sdd->prevsnap[0] == '\0' &&
1083             (sdd->fromorigin || sdd->replicate);
1084 
1085         if (sdd->verbose) {
1086                 uint64_t size;
1087                 err = estimate_ioctl(zhp, sdd->prevsnap_obj,
1088                     fromorigin, &size);
1089 
1090                 if (sdd->parsable) {
1091                         if (sdd->prevsnap[0] != '\0') {
1092                                 (void) fprintf(stderr, "incremental\t%s\t%s",
1093                                     sdd->prevsnap, zhp->zfs_name);
1094                         } else {
1095                                 (void) fprintf(stderr, "full\t%s",
1096                                     zhp->zfs_name);
1097                         }
1098                 } else {
1099                         (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
1100                             "send from @%s to %s"),
1101                             sdd->prevsnap, zhp->zfs_name);


1492 
1493         /*
1494          * Some flags require that we place user holds on the datasets that are
1495          * being sent so they don't get destroyed during the send. We can skip
1496          * this step if the pool is imported read-only since the datasets cannot
1497          * be destroyed.
1498          */
1499         if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp),
1500             ZPOOL_PROP_READONLY, NULL) &&
1501             zfs_spa_version(zhp, &spa_version) == 0 &&
1502             spa_version >= SPA_VERSION_USERREFS &&
1503             (flags->doall || flags->replicate)) {
1504                 ++holdseq;
1505                 (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
1506                     ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
1507                 sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
1508                 if (sdd.cleanup_fd < 0) {
1509                         err = errno;
1510                         goto stderr_out;
1511                 }
1512                 sdd.snapholds = fnvlist_alloc();
1513         } else {
1514                 sdd.cleanup_fd = -1;
1515                 sdd.snapholds = NULL;
1516         }
1517         if (flags->verbose) {
1518                 /*
1519                  * Do a verbose no-op dry run to get all the verbose output
1520                  * before generating any data.  Then do a non-verbose real
1521                  * run to generate the streams.
1522                  */
1523                 sdd.dryrun = B_TRUE;
1524                 err = dump_filesystems(zhp, &sdd);
1525                 sdd.dryrun = flags->dryrun;
1526                 sdd.verbose = B_FALSE;
1527                 if (flags->parsable) {
1528                         (void) fprintf(stderr, "size\t%llu\n",
1529                             (longlong_t)sdd.size);
1530                 } else {
1531                         char buf[16];
1532                         zfs_nicenum(sdd.size, buf, sizeof (buf));
1533                         (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
1534                             "total estimated size is %s\n"), buf);
1535                 }
1536         }
1537 
1538         if (sdd.snapholds != NULL) {
1539                 /* Holds are required. */
1540                 if (!flags->verbose) {
1541                         /*
1542                          * A verbose dry run wasn't done so do a non-verbose
1543                          * dry run to gather snapshot holds.
1544                          */
1545                         sdd.dryrun = B_TRUE;
1546                         err = dump_filesystems(zhp, &sdd);
1547                         sdd.dryrun = flags->dryrun;
1548                 }
1549 
1550                 if (err != 0) {
1551                         fnvlist_free(sdd.snapholds);
1552                         goto stderr_out;
1553                 }
1554 
1555                 err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds);
1556                 fnvlist_free(sdd.snapholds);
1557                 if (err != 0)
1558                         goto stderr_out;
1559         }
1560         
1561         err = dump_filesystems(zhp, &sdd);
1562         fsavl_destroy(fsavl);
1563         nvlist_free(fss);
1564 
1565         if (flags->dedup) {
1566                 (void) close(pipefd[0]);
1567                 (void) pthread_join(tid, NULL);
1568         }
1569 
1570         if (sdd.cleanup_fd != -1) {
1571                 VERIFY(0 == close(sdd.cleanup_fd));
1572                 sdd.cleanup_fd = -1;
1573         }
1574 
1575         if (!flags->dryrun && (flags->replicate || flags->doall ||
1576             flags->props)) {
1577                 /*
1578                  * write final end record.  NB: want to do this even if
1579                  * there was some error, because it might not be totally
1580                  * failed.