Print this page
3740 Poor ZFS send / receive performance due to snapshot hold / release processing
Submitted by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/lib/libzfs/common/libzfs_sendrecv.c
          +++ new/usr/src/lib/libzfs/common/libzfs_sendrecv.c
↓ open down ↓ 15 lines elided ↑ open up ↑
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2012 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
       26 + * Copyright (c) 2013 Steven Hartland. All rights reserved.
  26   27   */
  27   28  
  28   29  #include <assert.h>
  29   30  #include <ctype.h>
  30   31  #include <errno.h>
  31   32  #include <libintl.h>
  32   33  #include <stdio.h>
  33   34  #include <stdlib.h>
  34   35  #include <strings.h>
  35   36  #include <unistd.h>
↓ open down ↓ 750 lines elided ↑ open up ↑
 786  787          /* these are all just the short snapname (the part after the @) */
 787  788          const char *fromsnap;
 788  789          const char *tosnap;
 789  790          char prevsnap[ZFS_MAXNAMELEN];
 790  791          uint64_t prevsnap_obj;
 791  792          boolean_t seenfrom, seento, replicate, doall, fromorigin;
 792  793          boolean_t verbose, dryrun, parsable, progress;
 793  794          int outfd;
 794  795          boolean_t err;
 795  796          nvlist_t *fss;
      797 +        nvlist_t *snapholds;
 796  798          avl_tree_t *fsavl;
 797  799          snapfilter_cb_t *filter_cb;
 798  800          void *filter_cb_arg;
 799  801          nvlist_t *debugnv;
 800  802          char holdtag[ZFS_MAXNAMELEN];
 801  803          int cleanup_fd;
 802  804          uint64_t size;
 803  805  } send_dump_data_t;
 804  806  
 805  807  static int
↓ open down ↓ 129 lines elided ↑ open up ↑
 935  937                  }
 936  938          }
 937  939  
 938  940          if (debugnv)
 939  941                  VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
 940  942          nvlist_free(thisdbg);
 941  943  
 942  944          return (0);
 943  945  }
 944  946  
 945      -static int
 946      -hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd)
      947 +static void
      948 +gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd)
 947  949  {
 948      -        zfs_handle_t *pzhp;
 949      -        int error = 0;
 950      -        char *thissnap;
 951      -
 952  950          assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 953  951  
 954      -        if (sdd->dryrun)
 955      -                return (0);
 956      -
 957  952          /*
 958      -         * zfs_send() only opens a cleanup_fd for sends that need it,
      953 +         * zfs_send() only sets snapholds for sends that need them,
 959  954           * e.g. replication and doall.
 960  955           */
 961      -        if (sdd->cleanup_fd == -1)
 962      -                return (0);
 963      -
 964      -        thissnap = strchr(zhp->zfs_name, '@') + 1;
 965      -        *(thissnap - 1) = '\0';
 966      -        pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET);
 967      -        *(thissnap - 1) = '@';
 968      -
 969      -        /*
 970      -         * It's OK if the parent no longer exists.  The send code will
 971      -         * handle that error.
 972      -         */
 973      -        if (pzhp) {
 974      -                error = zfs_hold(pzhp, thissnap, sdd->holdtag,
 975      -                    B_FALSE, B_TRUE, sdd->cleanup_fd);
 976      -                zfs_close(pzhp);
 977      -        }
      956 +        if (sdd->snapholds == NULL)
      957 +                return;
 978  958  
 979      -        return (error);
      959 +        fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag);
 980  960  }
 981  961  
 982  962  static void *
 983  963  send_progress_thread(void *arg)
 984  964  {
 985  965          progress_arg_t *pa = arg;
 986  966  
 987  967          zfs_cmd_t zc = { 0 };
 988  968          zfs_handle_t *zhp = pa->pa_zhp;
 989  969          libzfs_handle_t *hdl = zhp->zfs_hdl;
↓ open down ↓ 35 lines elided ↑ open up ↑
1025 1005                  }
1026 1006          }
1027 1007  }
1028 1008  
1029 1009  static int
1030 1010  dump_snapshot(zfs_handle_t *zhp, void *arg)
1031 1011  {
1032 1012          send_dump_data_t *sdd = arg;
1033 1013          progress_arg_t pa = { 0 };
1034 1014          pthread_t tid;
1035      -
1036 1015          char *thissnap;
1037 1016          int err;
1038 1017          boolean_t isfromsnap, istosnap, fromorigin;
1039 1018          boolean_t exclude = B_FALSE;
1040 1019  
     1020 +        err = 0;
1041 1021          thissnap = strchr(zhp->zfs_name, '@') + 1;
1042 1022          isfromsnap = (sdd->fromsnap != NULL &&
1043 1023              strcmp(sdd->fromsnap, thissnap) == 0);
1044 1024  
1045 1025          if (!sdd->seenfrom && isfromsnap) {
1046      -                err = hold_for_send(zhp, sdd);
1047      -                if (err == 0) {
1048      -                        sdd->seenfrom = B_TRUE;
1049      -                        (void) strcpy(sdd->prevsnap, thissnap);
1050      -                        sdd->prevsnap_obj = zfs_prop_get_int(zhp,
1051      -                            ZFS_PROP_OBJSETID);
1052      -                } else if (err == ENOENT) {
1053      -                        err = 0;
1054      -                }
     1026 +                gather_holds(zhp, sdd);
     1027 +                sdd->seenfrom = B_TRUE;
     1028 +                (void) strcpy(sdd->prevsnap, thissnap);
     1029 +                sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
1055 1030                  zfs_close(zhp);
1056      -                return (err);
     1031 +                return (0);
1057 1032          }
1058 1033  
1059 1034          if (sdd->seento || !sdd->seenfrom) {
1060 1035                  zfs_close(zhp);
1061 1036                  return (0);
1062 1037          }
1063 1038  
1064 1039          istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
1065 1040          if (istosnap)
1066 1041                  sdd->seento = B_TRUE;
↓ open down ↓ 30 lines elided ↑ open up ↑
1097 1072                   * set prevsnap_obj, so it will be as if this snapshot didn't
1098 1073                   * exist, and the next accepted snapshot will be sent as
1099 1074                   * an incremental from the last accepted one, or as the
1100 1075                   * first (and full) snapshot in the case of a replication,
1101 1076                   * non-incremental send.
1102 1077                   */
1103 1078                  zfs_close(zhp);
1104 1079                  return (0);
1105 1080          }
1106 1081  
1107      -        err = hold_for_send(zhp, sdd);
1108      -        if (err) {
1109      -                if (err == ENOENT)
1110      -                        err = 0;
1111      -                zfs_close(zhp);
1112      -                return (err);
1113      -        }
1114      -
     1082 +        gather_holds(zhp, sdd);
1115 1083          fromorigin = sdd->prevsnap[0] == '\0' &&
1116 1084              (sdd->fromorigin || sdd->replicate);
1117 1085  
1118 1086          if (sdd->verbose) {
1119 1087                  uint64_t size;
1120 1088                  err = estimate_ioctl(zhp, sdd->prevsnap_obj,
1121 1089                      fromorigin, &size);
1122 1090  
1123 1091                  if (sdd->parsable) {
1124 1092                          if (sdd->prevsnap[0] != '\0') {
↓ open down ↓ 247 lines elided ↑ open up ↑
1372 1340      sendflags_t *flags, int outfd, snapfilter_cb_t filter_func,
1373 1341      void *cb_arg, nvlist_t **debugnvp)
1374 1342  {
1375 1343          char errbuf[1024];
1376 1344          send_dump_data_t sdd = { 0 };
1377 1345          int err = 0;
1378 1346          nvlist_t *fss = NULL;
1379 1347          avl_tree_t *fsavl = NULL;
1380 1348          static uint64_t holdseq;
1381 1349          int spa_version;
1382      -        pthread_t tid;
     1350 +        pthread_t tid = 0;
1383 1351          int pipefd[2];
1384 1352          dedup_arg_t dda = { 0 };
1385 1353          int featureflags = 0;
1386 1354  
1387 1355          (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
1388 1356              "cannot send '%s'"), zhp->zfs_name);
1389 1357  
1390 1358          if (fromsnap && fromsnap[0] == '\0') {
1391 1359                  zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
1392 1360                      "zero-length incremental source"));
↓ open down ↓ 52 lines elided ↑ open up ↑
1445 1413                              fromsnap, tosnap, flags->replicate, &fss, &fsavl);
1446 1414                          if (err)
1447 1415                                  goto err_out;
1448 1416                          VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
1449 1417                          err = nvlist_pack(hdrnv, &packbuf, &buflen,
1450 1418                              NV_ENCODE_XDR, 0);
1451 1419                          if (debugnvp)
1452 1420                                  *debugnvp = hdrnv;
1453 1421                          else
1454 1422                                  nvlist_free(hdrnv);
1455      -                        if (err) {
1456      -                                fsavl_destroy(fsavl);
1457      -                                nvlist_free(fss);
     1423 +                        if (err)
1458 1424                                  goto stderr_out;
1459      -                        }
1460 1425                  }
1461 1426  
1462 1427                  if (!flags->dryrun) {
1463 1428                          /* write first begin record */
1464 1429                          drr.drr_type = DRR_BEGIN;
1465 1430                          drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
1466 1431                          DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.
1467 1432                              drr_versioninfo, DMU_COMPOUNDSTREAM);
1468 1433                          DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.
1469 1434                              drr_versioninfo, featureflags);
↓ open down ↓ 3 lines elided ↑ open up ↑
1473 1438                          drr.drr_payloadlen = buflen;
1474 1439                          err = cksum_and_write(&drr, sizeof (drr), &zc, outfd);
1475 1440  
1476 1441                          /* write header nvlist */
1477 1442                          if (err != -1 && packbuf != NULL) {
1478 1443                                  err = cksum_and_write(packbuf, buflen, &zc,
1479 1444                                      outfd);
1480 1445                          }
1481 1446                          free(packbuf);
1482 1447                          if (err == -1) {
1483      -                                fsavl_destroy(fsavl);
1484      -                                nvlist_free(fss);
1485 1448                                  err = errno;
1486 1449                                  goto stderr_out;
1487 1450                          }
1488 1451  
1489 1452                          /* write end record */
1490 1453                          bzero(&drr, sizeof (drr));
1491 1454                          drr.drr_type = DRR_END;
1492 1455                          drr.drr_u.drr_end.drr_checksum = zc;
1493 1456                          err = write(outfd, &drr, sizeof (drr));
1494 1457                          if (err == -1) {
1495      -                                fsavl_destroy(fsavl);
1496      -                                nvlist_free(fss);
1497 1458                                  err = errno;
1498 1459                                  goto stderr_out;
1499 1460                          }
1500 1461  
1501 1462                          err = 0;
1502 1463                  }
1503 1464          }
1504 1465  
1505 1466          /* dump each stream */
1506 1467          sdd.fromsnap = fromsnap;
1507 1468          sdd.tosnap = tosnap;
1508      -        if (flags->dedup)
     1469 +        if (tid != 0)
1509 1470                  sdd.outfd = pipefd[0];
1510 1471          else
1511 1472                  sdd.outfd = outfd;
1512 1473          sdd.replicate = flags->replicate;
1513 1474          sdd.doall = flags->doall;
1514 1475          sdd.fromorigin = flags->fromorigin;
1515 1476          sdd.fss = fss;
1516 1477          sdd.fsavl = fsavl;
1517 1478          sdd.verbose = flags->verbose;
1518 1479          sdd.parsable = flags->parsable;
↓ open down ↓ 16 lines elided ↑ open up ↑
1535 1496              spa_version >= SPA_VERSION_USERREFS &&
1536 1497              (flags->doall || flags->replicate)) {
1537 1498                  ++holdseq;
1538 1499                  (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
1539 1500                      ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
1540 1501                  sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
1541 1502                  if (sdd.cleanup_fd < 0) {
1542 1503                          err = errno;
1543 1504                          goto stderr_out;
1544 1505                  }
     1506 +                sdd.snapholds = fnvlist_alloc();
1545 1507          } else {
1546 1508                  sdd.cleanup_fd = -1;
     1509 +                sdd.snapholds = NULL;
1547 1510          }
1548      -        if (flags->verbose) {
     1511 +        if (flags->verbose || sdd.snapholds != NULL) {
1549 1512                  /*
1550 1513                   * Do a verbose no-op dry run to get all the verbose output
1551      -                 * before generating any data.  Then do a non-verbose real
1552      -                 * run to generate the streams.
     1514 +                 * or to gather snapshot holds before generating any data,
     1515 +                 * then do a non-verbose real run to generate the streams.
1553 1516                   */
1554 1517                  sdd.dryrun = B_TRUE;
1555 1518                  err = dump_filesystems(zhp, &sdd);
1556      -                sdd.dryrun = flags->dryrun;
1557      -                sdd.verbose = B_FALSE;
1558      -                if (flags->parsable) {
1559      -                        (void) fprintf(stderr, "size\t%llu\n",
1560      -                            (longlong_t)sdd.size);
1561      -                } else {
1562      -                        char buf[16];
1563      -                        zfs_nicenum(sdd.size, buf, sizeof (buf));
1564      -                        (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
1565      -                            "total estimated size is %s\n"), buf);
     1519 +
     1520 +                if (err != 0)
     1521 +                        goto stderr_out;
     1522 +
     1523 +                if (flags->verbose) {
     1524 +                        if (flags->parsable) {
     1525 +                                (void) fprintf(stderr, "size\t%llu\n",
     1526 +                                    (longlong_t)sdd.size);
     1527 +                        } else {
     1528 +                                char buf[16];
     1529 +                                zfs_nicenum(sdd.size, buf, sizeof (buf));
     1530 +                                (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
     1531 +                                    "total estimated size is %s\n"), buf);
     1532 +                        }
     1533 +                }
     1534 +
     1535 +                /* Ensure no snaps found is treated as an error. */
     1536 +                if (!sdd.seento) {
     1537 +                        err = ENOENT;
     1538 +                        goto err_out;
1566 1539                  }
     1540 +
     1541 +                /* Skip the second run if dryrun was requested. */
     1542 +                if (flags->dryrun)
     1543 +                        goto err_out;
     1544 +
     1545 +                if (sdd.snapholds != NULL) {
     1546 +                        err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds);
     1547 +                        if (err != 0)
     1548 +                                goto stderr_out;
     1549 +
     1550 +                        fnvlist_free(sdd.snapholds);
     1551 +                        sdd.snapholds = NULL;
     1552 +                }
     1553 +
     1554 +                sdd.dryrun = B_FALSE;
     1555 +                sdd.verbose = B_FALSE;
1567 1556          }
     1557 +
1568 1558          err = dump_filesystems(zhp, &sdd);
1569 1559          fsavl_destroy(fsavl);
1570 1560          nvlist_free(fss);
1571 1561  
1572      -        if (flags->dedup) {
1573      -                (void) close(pipefd[0]);
     1562 +        /* Ensure no snaps found is treated as an error. */
     1563 +        if (err == 0 && !sdd.seento)
     1564 +                err = ENOENT;
     1565 +
     1566 +        if (tid != 0) {
     1567 +                if (err != 0)
     1568 +                        (void) pthread_cancel(tid);
1574 1569                  (void) pthread_join(tid, NULL);
     1570 +                (void) close(pipefd[0]);
1575 1571          }
1576 1572  
1577 1573          if (sdd.cleanup_fd != -1) {
1578 1574                  VERIFY(0 == close(sdd.cleanup_fd));
1579 1575                  sdd.cleanup_fd = -1;
1580 1576          }
1581 1577  
1582 1578          if (!flags->dryrun && (flags->replicate || flags->doall ||
1583 1579              flags->props)) {
1584 1580                  /*
↓ open down ↓ 7 lines elided ↑ open up ↑
1592 1588                          return (zfs_standard_error(zhp->zfs_hdl,
1593 1589                              errno, errbuf));
1594 1590                  }
1595 1591          }
1596 1592  
1597 1593          return (err || sdd.err);
1598 1594  
1599 1595  stderr_out:
1600 1596          err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
1601 1597  err_out:
     1598 +        fsavl_destroy(fsavl);
     1599 +        nvlist_free(fss);
     1600 +        fnvlist_free(sdd.snapholds);
     1601 +
1602 1602          if (sdd.cleanup_fd != -1)
1603 1603                  VERIFY(0 == close(sdd.cleanup_fd));
1604      -        if (flags->dedup) {
     1604 +        if (tid != 0) {
1605 1605                  (void) pthread_cancel(tid);
1606 1606                  (void) pthread_join(tid, NULL);
1607 1607                  (void) close(pipefd[0]);
1608 1608          }
1609 1609          return (err);
1610 1610  }
1611 1611  
1612 1612  /*
1613 1613   * Routines specific to "zfs recv"
1614 1614   */
↓ open down ↓ 1581 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX