Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>


  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/fm/fs/zfs.h>
  29 #include <sys/spa.h>
  30 #include <sys/txg.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio_impl.h>
  34 #include <sys/zio_compress.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/arc.h>
  38 #include <sys/ddt.h>
  39 
  40 /*
  41  * ==========================================================================
  42  * I/O priority table
  43  * ==========================================================================
  44  */
  45 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  46         0,      /* ZIO_PRIORITY_NOW             */
  47         0,      /* ZIO_PRIORITY_SYNC_READ       */
  48         0,      /* ZIO_PRIORITY_SYNC_WRITE      */
  49         0,      /* ZIO_PRIORITY_LOG_WRITE       */
  50         1,      /* ZIO_PRIORITY_CACHE_FILL      */
  51         1,      /* ZIO_PRIORITY_AGG             */
  52         4,      /* ZIO_PRIORITY_FREE            */
  53         4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
  54         6,      /* ZIO_PRIORITY_ASYNC_READ      */
  55         10,     /* ZIO_PRIORITY_RESILVER        */
  56         20,     /* ZIO_PRIORITY_SCRUB           */
  57         2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
  58 };
  59 
  60 /*
  61  * ==========================================================================
  62  * I/O type descriptions
  63  * ==========================================================================
  64  */
  65 char *zio_type_name[ZIO_TYPES] = {
  66         "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
  67         "zio_ioctl"
  68 };
  69 
  70 /*
  71  * ==========================================================================
  72  * I/O kmem caches
  73  * ==========================================================================
  74  */
  75 kmem_cache_t *zio_cache;
  76 kmem_cache_t *zio_link_cache;
  77 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  78 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  79 
  80 #ifdef _KERNEL
  81 extern vmem_t *zio_alloc_arena;
  82 #endif
  83 extern int zfs_mg_alloc_failures;
  84 
  85 /*


 469                 zio->io_stage >>= 1;
 470                 zio->io_stall = countp;
 471                 waiting = B_TRUE;
 472         }
 473         mutex_exit(&zio->io_lock);
 474 
 475         return (waiting);
 476 }
 477 
 478 static void
/*
 * Report completion of child 'zio' (for wait class 'wait') to parent 'pio':
 * fold the child's error into the parent's per-child-type error (unless the
 * child set ZIO_FLAG_DONT_PROPAGATE), inherit any reexecute flags, and if
 * this was the last outstanding child the parent was stalled on, clear
 * io_stall and resume the parent's pipeline via zio_execute().
 */
 479 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 480 {
 481         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 482         int *errorp = &pio->io_child_error[zio->io_child_type];
 483 
 484         mutex_enter(&pio->io_lock);
 485         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 486                 *errorp = zio_worst_error(*errorp, zio->io_error);
 487         pio->io_reexecute |= zio->io_reexecute;
 488         ASSERT3U(*countp, >, 0);
             /*
              * Decrement the outstanding-child count; only re-execute the
              * parent when it is stalled on exactly this counter.
              */
 489         if (--*countp == 0 && pio->io_stall == countp) {



 490                 pio->io_stall = NULL;
                     /* Drop the lock before re-entering the pipeline. */
 491                 mutex_exit(&pio->io_lock);
 492                 zio_execute(pio);
 493         } else {
 494                 mutex_exit(&pio->io_lock);
 495         }
 496 }
 497 
 498 static void
/*
 * Adopt the aggregated error of child class 'c' as this zio's own error,
 * but only if the zio has not already recorded an error of its own.
 */
 499 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 500 {
 501         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 502                 zio->io_error = zio->io_child_error[c];
 503 }
 504 
 505 /*
 506  * ==========================================================================
 507  * Create the various types of I/O (read, write, free, etc)
 508  * ==========================================================================
 509  */
 510 static zio_t *
 511 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 512     void *data, uint64_t size, zio_done_func_t *done, void *private,
 513     zio_type_t type, int priority, enum zio_flag flags,
 514     vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
 515     enum zio_stage stage, enum zio_stage pipeline)
 516 {
 517         zio_t *zio;
 518 
 519         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 520         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 521         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 522 
 523         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 524         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 525         ASSERT(vd || stage == ZIO_STAGE_OPEN);
 526 
 527         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 528         bzero(zio, sizeof (zio_t));
 529 
 530         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 531         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 532 
 533         list_create(&zio->io_parent_list, sizeof (zio_link_t),


 603     void *private, enum zio_flag flags)
 604 {
 605         zio_t *zio;
 606 
 607         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 608             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 609             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 610 
 611         return (zio);
 612 }
 613 
 614 zio_t *
/*
 * Create a parentless root zio (a null zio with no vdev) that callers use
 * to group and wait on a set of child I/Os.
 */
 615 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 616 {
 617         return (zio_null(NULL, spa, NULL, done, private, flags));
 618 }
 619 
 620 zio_t *
/*
 * Create a logical read of block pointer 'bp' into 'data'.  The zio's txg is
 * the bp's physical birth txg, and the pipeline is the DDT child variant
 * when ZIO_FLAG_DDT_CHILD is set.
 */
 621 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 622     void *data, uint64_t size, zio_done_func_t *done, void *private,
 623     int priority, enum zio_flag flags, const zbookmark_t *zb)
 624 {
 625         zio_t *zio;
 626 
 627         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 628             data, size, done, private,
 629             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 630             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 631             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 632 
 633         return (zio);
 634 }
 635 
 636 zio_t *
/*
 * Create a logical write of 'data' for txg 'txg'.  'zp' supplies the write
 * policy (checksum, compression, dmu type, level, copies); it is validated
 * here and copied into the zio.  'ready' is invoked when the block pointer
 * has been filled in and the write is ready to issue.
 */
 637 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 638     void *data, uint64_t size, const zio_prop_t *zp,
 639     zio_done_func_t *ready, zio_done_func_t *done, void *private,
 640     int priority, enum zio_flag flags, const zbookmark_t *zb)

 641 {
 642         zio_t *zio;
 643 
             /* Sanity-check the caller-supplied write policy. */
 644         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 645             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 646             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 647             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 648             DMU_OT_IS_VALID(zp->zp_type) &&
 649             zp->zp_level < 32 &&
 650             zp->zp_copies > 0 &&
 651             zp->zp_copies <= spa_max_replication(spa));
 652 
 653         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 654             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 655             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 656             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 657 
 658         zio->io_ready = ready;

 659         zio->io_prop = *zp;
 660 
 661         return (zio);
 662 }
 663 
 664 zio_t *
/*
 * Create a write that rewrites an existing block in place (same bp), using
 * the dedicated rewrite pipeline rather than allocating a new block.
 */
 665 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 666     uint64_t size, zio_done_func_t *done, void *private, int priority,
 667     enum zio_flag flags, zbookmark_t *zb)
 668 {
 669         zio_t *zio;
 670 
 671         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 672             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 673             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 674 
 675         return (zio);
 676 }
 677 
 678 void
 679 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 680 {
 681         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 682         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 683         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 684         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 685 
 686         /*
 687          * We must reset the io_prop to match the values that existed


 723 
 724         dprintf_bp(bp, "freeing in txg %llu, pass %u",
 725             (longlong_t)txg, spa->spa_sync_pass);
 726 
 727         ASSERT(!BP_IS_HOLE(bp));
 728         ASSERT(spa_syncing_txg(spa) == txg);
 729         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 730 
 731         metaslab_check_free(spa, bp);
 732         arc_freed(spa, bp);
 733 
 734         /*
 735          * GANG and DEDUP blocks can induce a read (for the gang block header,
 736          * or the DDT), so issue them asynchronously so that this thread is
 737          * not tied up.
 738          */
 739         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 740                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 741 
 742         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 743             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
 744             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 745 
 746 
 747         return (zio);
 748 }
 749 
 750 zio_t *
/*
 * Create a claim zio for 'bp' (see the block comment below for why claims
 * exist).  The zio carries no data buffer; it runs the claim pipeline at
 * ZIO_PRIORITY_NOW.
 */
 751 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 752     zio_done_func_t *done, void *private, enum zio_flag flags)
 753 {
 754         zio_t *zio;
 755 
 756         /*
 757          * A claim is an allocation of a specific block.  Claims are needed
 758          * to support immediate writes in the intent log.  The issue is that
 759          * immediate writes contain committed data, but in a txg that was
 760          * *not* committed.  Upon opening the pool after an unclean shutdown,
 761          * the intent log claims all blocks that contain immediate write data
 762          * so that the SPA knows they're in use.
 763          *
 764          * All claims *must* be resolved in the first txg -- before the SPA
 765          * starts allocating blocks -- so that nothing is allocated twice.
 766          * If txg == 0 we just verify that the block is claimable.
 767          */
 768         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 769         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 770         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 771 
 772         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 773             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 774             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 775 
 776         return (zio);
 777 }
 778 
 779 zio_t *
/*
 * Create an ioctl zio for vdev 'vd'.  For a leaf vdev, a single ioctl zio
 * carrying 'cmd' is created; for an interior vdev, a null grouping zio is
 * created and the ioctl is fanned out recursively to every child vdev.
 */
 780 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 781     zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
 782 {
 783         zio_t *zio;
 784         int c;
 785 
 786         if (vd->vdev_children == 0) {
 787                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 788                     ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
 789                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 790 
 791                 zio->io_cmd = cmd;
 792         } else {
 793                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 794 
 795                 for (c = 0; c < vd->vdev_children; c++)
 796                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 797                             done, private, priority, flags));
 798         }
 799 
 800         return (zio);
 801 }
 802 
 803 zio_t *
/*
 * Create a physical read of 'size' bytes at 'offset' on leaf vdev 'vd',
 * bypassing block-pointer translation.  If 'labels' is set, the range must
 * fall entirely within the front or back vdev label region.
 */
 804 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 805     void *data, int checksum, zio_done_func_t *done, void *private,
 806     int priority, enum zio_flag flags, boolean_t labels)
 807 {
 808         zio_t *zio;
 809 
 810         ASSERT(vd->vdev_children == 0);
 811         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 812             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 813         ASSERT3U(offset + size, <=, vd->vdev_psize);
 814 
 815         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 816             ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
 817             ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 818 
             /* The checksum to verify is supplied by the caller, not a bp. */
 819         zio->io_prop.zp_checksum = checksum;
 820 
 821         return (zio);
 822 }
 823 
 824 zio_t *
/*
 * Create a physical write of 'size' bytes at 'offset' on leaf vdev 'vd',
 * bypassing block-pointer translation.  Label writes are range-checked the
 * same way as in zio_read_phys().  For embedded-checksum (zec) algorithms
 * the data is copied first, since the checksum stage rewrites the buffer.
 */
 825 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 826     void *data, int checksum, zio_done_func_t *done, void *private,
 827     int priority, enum zio_flag flags, boolean_t labels)
 828 {
 829         zio_t *zio;
 830 
 831         ASSERT(vd->vdev_children == 0);
 832         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 833             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 834         ASSERT3U(offset + size, <=, vd->vdev_psize);
 835 
 836         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 837             ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
 838             ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 839 
 840         zio->io_prop.zp_checksum = checksum;
 841 
 842         if (zio_checksum_table[checksum].ci_eck) {
 843                 /*
 844                  * zec checksums are necessarily destructive -- they modify
 845                  * the end of the write buffer to hold the verifier/checksum.
 846                  * Therefore, we must make a local copy in case the data is
 847                  * being written to multiple places in parallel.
 848                  */
 849                 void *wbuf = zio_buf_alloc(size);
 850                 bcopy(data, wbuf, size);
 851                 zio_push_transform(zio, wbuf, size, size, NULL);
 852         }
 853 
 854         return (zio);
 855 }
 856 
 857 /*
 858  * Create a child I/O to do some work for us.
 859  */
 860 zio_t *
/*
 * Create a vdev-level child I/O on behalf of parent 'pio', targeting 'vd'
 * (which must be a child of pio's vdev, or of the root vdev).  For reads
 * with a bp, checksum verification moves from the parent to the child.
 * The child starts its pipeline at the stage just before VDEV_IO_START.
 */
 861 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 862         void *data, uint64_t size, int type, int priority, enum zio_flag flags,
 863         zio_done_func_t *done, void *private)
 864 {
 865         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 866         zio_t *zio;
 867 
 868         ASSERT(vd->vdev_parent ==
 869             (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 870 
 871         if (type == ZIO_TYPE_READ && bp != NULL) {
 872                 /*
 873                  * If we have the bp, then the child should perform the
 874                  * checksum and the parent need not.  This pushes error
 875                  * detection as close to the leaves as possible and
 876                  * eliminates redundant checksums in the interior nodes.
 877                  */
 878                 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 879                 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 880         }
 881 
             /* Leaf vdevs address past the front label region. */
 882         if (vd->vdev_children == 0)
 883                 offset += VDEV_LABEL_START_SIZE;
 884 
 885         flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 886 
 887         /*
 888          * If we've decided to do a repair, the write is not speculative --
 889          * even if the original read was.
 890          */
 891         if (flags & ZIO_FLAG_IO_REPAIR)
 892                 flags &= ~ZIO_FLAG_SPECULATIVE;
 893 
 894         zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 895             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 896             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 897 




 898         return (zio);
 899 }
 900 
 901 zio_t *
/*
 * Create a parentless I/O delegated directly to leaf vdev 'vd' (used by
 * vdev code such as the I/O aggregator).  The zio is marked CANFAIL and
 * DONT_RETRY and begins just before the VDEV_IO_START stage.
 */
 902 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 903         int type, int priority, enum zio_flag flags,
 904         zio_done_func_t *done, void *private)
 905 {
 906         zio_t *zio;
 907 
 908         ASSERT(vd->vdev_ops->vdev_op_leaf);
 909 
 910         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 911             data, size, done, private, type, priority,
 912             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
 913             vd, offset, NULL,
 914             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 915 
 916         return (zio);
 917 }
 918 
 919 void
/*
 * Issue (and do not wait for) a write-cache-flush ioctl to vdev 'vd' as a
 * child of 'zio'.  Best-effort: failures are allowed and not propagated.
 */
 920 zio_flush(zio_t *zio, vdev_t *vd)
 921 {
 922         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 923             NULL, NULL, ZIO_PRIORITY_NOW,
 924             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 925 }
 926 
 927 void
/*
 * Shrink a not-yet-issued zio's I/O size to 'size' (must not exceed the
 * current size).  Only valid before the zio has an executor and while
 * io_orig_size still equals io_size; raidz blocks are left unchanged.
 */
 928 zio_shrink(zio_t *zio, uint64_t size)
 929 {
 930         ASSERT(zio->io_executor == NULL);
 931         ASSERT(zio->io_orig_size == zio->io_size);
 932         ASSERT(size <= zio->io_size);
 933 
 934         /*
 935          * We don't shrink for raidz because of problems with the
 936          * reconstruction when reading back less than the block size.
 937          * Note, BP_IS_RAIDZ() assumes no compression.
 938          */
 939         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 940         if (!BP_IS_RAIDZ(zio->io_bp))
 941                 zio->io_orig_size = zio->io_size = size;
 942 }
 943 


1804 
1805         /*
1806          * Create and nowait the gang children.
1807          */
1808         for (int g = 0; resid != 0; resid -= lsize, g++) {
1809                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1810                     SPA_MINBLOCKSIZE);
1811                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1812 
1813                 zp.zp_checksum = gio->io_prop.zp_checksum;
1814                 zp.zp_compress = ZIO_COMPRESS_OFF;
1815                 zp.zp_type = DMU_OT_NONE;
1816                 zp.zp_level = 0;
1817                 zp.zp_copies = gio->io_prop.zp_copies;
1818                 zp.zp_dedup = B_FALSE;
1819                 zp.zp_dedup_verify = B_FALSE;
1820                 zp.zp_nopwrite = B_FALSE;
1821 
1822                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1823                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1824                     zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1825                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1826                     &pio->io_bookmark));
1827         }
1828 
1829         /*
1830          * Set pio's pipeline to just wait for zio to finish.
1831          */
1832         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1833 
1834         zio_nowait(zio);
1835 
1836         return (ZIO_PIPELINE_CONTINUE);
1837 }
1838 
1839 /*
1840  * The zio_nop_write stage in the pipeline determines if allocating
1841  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1842  * such as SHA256, we can compare the checksums of the new data and the old
1843  * to determine if allocating a new block is required.  The nopwrite
1844  * feature can handle writes in either syncing or open context (i.e. zil


2181                 czp.zp_copies = ditto_copies;
2182 
2183                 /*
2184                  * If we arrived here with an override bp, we won't have run
2185                  * the transform stack, so we won't have the data we need to
2186                  * generate a child i/o.  So, toss the override bp and restart.
2187                  * This is safe, because using the override bp is just an
2188                  * optimization; and it's rare, so the cost doesn't matter.
2189                  */
2190                 if (zio->io_bp_override) {
2191                         zio_pop_transforms(zio);
2192                         zio->io_stage = ZIO_STAGE_OPEN;
2193                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
2194                         zio->io_bp_override = NULL;
2195                         BP_ZERO(bp);
2196                         ddt_exit(ddt);
2197                         return (ZIO_PIPELINE_CONTINUE);
2198                 }
2199 
2200                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2201                     zio->io_orig_size, &czp, NULL,
2202                     zio_ddt_ditto_write_done, dde, zio->io_priority,
2203                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2204 
2205                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2206                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2207         }
2208 
2209         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2210                 if (ddp->ddp_phys_birth != 0)
2211                         ddt_bp_fill(ddp, bp, txg);
2212                 if (dde->dde_lead_zio[p] != NULL)
2213                         zio_add_child(zio, dde->dde_lead_zio[p]);
2214                 else
2215                         ddt_phys_addref(ddp);
2216         } else if (zio->io_bp_override) {
2217                 ASSERT(bp->blk_birth == txg);
2218                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2219                 ddt_phys_fill(ddp, bp);
2220                 ddt_phys_addref(ddp);
2221         } else {
2222                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2223                     zio->io_orig_size, zp, zio_ddt_child_write_ready,
2224                     zio_ddt_child_write_done, dde, zio->io_priority,
2225                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2226 
2227                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2228                 dde->dde_lead_zio[p] = cio;
2229         }
2230 
2231         ddt_exit(ddt);
2232 
2233         if (cio)
2234                 zio_nowait(cio);
2235         if (dio)
2236                 zio_nowait(dio);
2237 
2238         return (ZIO_PIPELINE_CONTINUE);
2239 }
2240 
2241 ddt_entry_t *freedde; /* for debugging */
2242 
2243 static int


2620         /*
2621          * If we got an error on a leaf device, convert it to ENXIO
2622          * if the device is not accessible at all.
2623          */
2624         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2625             !vdev_accessible(vd, zio))
2626                 zio->io_error = SET_ERROR(ENXIO);
2627 
2628         /*
2629          * If we can't write to an interior vdev (mirror or RAID-Z),
2630          * set vdev_cant_write so that we stop trying to allocate from it.
2631          */
2632         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2633             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2634                 vd->vdev_cant_write = B_TRUE;
2635         }
2636 
2637         if (zio->io_error)
2638                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2639 







2640         return (ZIO_PIPELINE_CONTINUE);
2641 }
2642 
 643 void
/*
 * Rewind an error-free zio so that the VDEV_IO_START stage will run again
 * (io_stage >>= 1 steps back to the stage preceding the current one).
 */
 644 zio_vdev_io_reissue(zio_t *zio)
 645 {
 646         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 647         ASSERT(zio->io_error == 0);
 648 
 649         zio->io_stage >>= 1;
 650 }
2651 
 652 void
/*
 * Rewind a zio sitting in VDEV_IO_DONE so that the done stage will be
 * re-entered on the next pipeline advance.
 */
 653 zio_vdev_io_redone(zio_t *zio)
 654 {
 655         ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 656 
 657         zio->io_stage >>= 1;
 658 }
2659 




  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/fm/fs/zfs.h>
  29 #include <sys/spa.h>
  30 #include <sys/txg.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio_impl.h>
  34 #include <sys/zio_compress.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/arc.h>
  38 #include <sys/ddt.h>
  39 
  40 /*
  41  * ==========================================================================




















  42  * I/O type descriptions
  43  * ==========================================================================
  44  */
  45 const char *zio_type_name[ZIO_TYPES] = {
  46         "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
  47         "zio_ioctl"
  48 };
  49 
  50 /*
  51  * ==========================================================================
  52  * I/O kmem caches
  53  * ==========================================================================
  54  */
  55 kmem_cache_t *zio_cache;
  56 kmem_cache_t *zio_link_cache;
  57 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  58 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  59 
  60 #ifdef _KERNEL
  61 extern vmem_t *zio_alloc_arena;
  62 #endif
  63 extern int zfs_mg_alloc_failures;
  64 
  65 /*


 449                 zio->io_stage >>= 1;
 450                 zio->io_stall = countp;
 451                 waiting = B_TRUE;
 452         }
 453         mutex_exit(&zio->io_lock);
 454 
 455         return (waiting);
 456 }
 457 
 458 static void
/*
 * (Post-change version.)  Report completion of child 'zio' to parent 'pio':
 * fold the child's error into the parent's per-child-type error (unless
 * ZIO_FLAG_DONT_PROPAGATE), inherit reexecute flags, decrement the
 * outstanding-child count for 'wait', and resume the parent's pipeline if
 * it was stalled on exactly this counter.
 */
 459 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 460 {
 461         uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 462         int *errorp = &pio->io_child_error[zio->io_child_type];
 463 
 464         mutex_enter(&pio->io_lock);
 465         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 466                 *errorp = zio_worst_error(*errorp, zio->io_error);
 467         pio->io_reexecute |= zio->io_reexecute;
 468         ASSERT3U(*countp, >, 0);
 469 
 470         (*countp)--;
 471 
 472         if (*countp == 0 && pio->io_stall == countp) {
 473                 pio->io_stall = NULL;
                     /* Release the lock before re-entering the pipeline. */
 474                 mutex_exit(&pio->io_lock);
 475                 zio_execute(pio);
 476         } else {
 477                 mutex_exit(&pio->io_lock);
 478         }
 479 }
 480 
 481 static void
/*
 * Adopt the aggregated error of child class 'c' as this zio's own error,
 * but only if the zio has not already recorded an error of its own.
 */
 482 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 483 {
 484         if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 485                 zio->io_error = zio->io_child_error[c];
 486 }
 487 
 488 /*
 489  * ==========================================================================
 490  * Create the various types of I/O (read, write, free, etc)
 491  * ==========================================================================
 492  */
 493 static zio_t *
 494 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 495     void *data, uint64_t size, zio_done_func_t *done, void *private,
 496     zio_type_t type, zio_priority_t priority, enum zio_flag flags,
 497     vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
 498     enum zio_stage stage, enum zio_stage pipeline)
 499 {
 500         zio_t *zio;
 501 
 502         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 503         ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 504         ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 505 
 506         ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 507         ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 508         ASSERT(vd || stage == ZIO_STAGE_OPEN);
 509 
 510         zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 511         bzero(zio, sizeof (zio_t));
 512 
 513         mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 514         cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 515 
 516         list_create(&zio->io_parent_list, sizeof (zio_link_t),


 586     void *private, enum zio_flag flags)
 587 {
 588         zio_t *zio;
 589 
 590         zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 591             ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 592             ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 593 
 594         return (zio);
 595 }
 596 
 597 zio_t *
/*
 * Create a parentless root zio (a null zio with no vdev) that callers use
 * to group and wait on a set of child I/Os.
 */
 598 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 599 {
 600         return (zio_null(NULL, spa, NULL, done, private, flags));
 601 }
 602 
 603 zio_t *
/*
 * (Post-change version: priority is now typed zio_priority_t.)  Create a
 * logical read of 'bp' into 'data'; uses the DDT child pipeline when
 * ZIO_FLAG_DDT_CHILD is set.
 */
 604 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 605     void *data, uint64_t size, zio_done_func_t *done, void *private,
 606     zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 607 {
 608         zio_t *zio;
 609 
 610         zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 611             data, size, done, private,
 612             ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 613             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 614             ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 615 
 616         return (zio);
 617 }
 618 
 619 zio_t *
/*
 * (Post-change version.)  Create a logical write of 'data' for txg 'txg'.
 * 'zp' supplies the validated write policy.  'ready' fires when the bp has
 * been filled in; the new 'physdone' callback fires when the physical I/O
 * completes (added by this change for write-throttle accounting).
 */
 620 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 621     void *data, uint64_t size, const zio_prop_t *zp,
 622     zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 623     void *private,
 624     zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 625 {
 626         zio_t *zio;
 627 
             /* Sanity-check the caller-supplied write policy. */
 628         ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 629             zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 630             zp->zp_compress >= ZIO_COMPRESS_OFF &&
 631             zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 632             DMU_OT_IS_VALID(zp->zp_type) &&
 633             zp->zp_level < 32 &&
 634             zp->zp_copies > 0 &&
 635             zp->zp_copies <= spa_max_replication(spa));
 636 
 637         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 638             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 639             ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 640             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 641 
 642         zio->io_ready = ready;
 643         zio->io_physdone = physdone;
 644         zio->io_prop = *zp;
 645 
 646         return (zio);
 647 }
 648 
 649 zio_t *
/*
 * (Post-change version: priority is now typed zio_priority_t.)  Create a
 * write that rewrites an existing block in place via the rewrite pipeline.
 */
 650 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 651     uint64_t size, zio_done_func_t *done, void *private,
 652     zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
 653 {
 654         zio_t *zio;
 655 
 656         zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 657             ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 658             ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 659 
 660         return (zio);
 661 }
 662 
 663 void
 664 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 665 {
 666         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 667         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 668         ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 669         ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 670 
 671         /*
 672          * We must reset the io_prop to match the values that existed


 708 
 709         dprintf_bp(bp, "freeing in txg %llu, pass %u",
 710             (longlong_t)txg, spa->spa_sync_pass);
 711 
 712         ASSERT(!BP_IS_HOLE(bp));
 713         ASSERT(spa_syncing_txg(spa) == txg);
 714         ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 715 
 716         metaslab_check_free(spa, bp);
 717         arc_freed(spa, bp);
 718 
 719         /*
 720          * GANG and DEDUP blocks can induce a read (for the gang block header,
 721          * or the DDT), so issue them asynchronously so that this thread is
 722          * not tied up.
 723          */
 724         if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 725                 stage |= ZIO_STAGE_ISSUE_ASYNC;
 726 
 727         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 728             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 729             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 730 
 731 
 732         return (zio);
 733 }
 734 
 735 zio_t *
/*
 * Create a claim zio for 'bp' (see the block comment below for why claims
 * exist).  The zio carries no data buffer; it runs the claim pipeline at
 * ZIO_PRIORITY_NOW.
 */
 736 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 737     zio_done_func_t *done, void *private, enum zio_flag flags)
 738 {
 739         zio_t *zio;
 740 
 741         /*
 742          * A claim is an allocation of a specific block.  Claims are needed
 743          * to support immediate writes in the intent log.  The issue is that
 744          * immediate writes contain committed data, but in a txg that was
 745          * *not* committed.  Upon opening the pool after an unclean shutdown,
 746          * the intent log claims all blocks that contain immediate write data
 747          * so that the SPA knows they're in use.
 748          *
 749          * All claims *must* be resolved in the first txg -- before the SPA
 750          * starts allocating blocks -- so that nothing is allocated twice.
 751          * If txg == 0 we just verify that the block is claimable.
 752          */
 753         ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 754         ASSERT(txg == spa_first_txg(spa) || txg == 0);
 755         ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 756 
 757         zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 758             done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 759             NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 760 
 761         return (zio);
 762 }
 763 
 764 zio_t *
 765 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 766     zio_done_func_t *done, void *private, enum zio_flag flags)
 767 {
 768         zio_t *zio;
 769         int c;
 770 
 771         if (vd->vdev_children == 0) {
 772                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 773                     ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 774                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 775 
 776                 zio->io_cmd = cmd;
 777         } else {
 778                 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 779 
 780                 for (c = 0; c < vd->vdev_children; c++)
 781                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 782                             done, private, flags));
 783         }
 784 
 785         return (zio);
 786 }
 787 
 788 zio_t *
 789 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 790     void *data, int checksum, zio_done_func_t *done, void *private,
 791     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 792 {
 793         zio_t *zio;
 794 
 795         ASSERT(vd->vdev_children == 0);
 796         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 797             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 798         ASSERT3U(offset + size, <=, vd->vdev_psize);
 799 
 800         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 801             ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
 802             ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 803 
 804         zio->io_prop.zp_checksum = checksum;
 805 
 806         return (zio);
 807 }
 808 
 809 zio_t *
 810 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 811     void *data, int checksum, zio_done_func_t *done, void *private,
 812     zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 813 {
 814         zio_t *zio;
 815 
 816         ASSERT(vd->vdev_children == 0);
 817         ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 818             offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 819         ASSERT3U(offset + size, <=, vd->vdev_psize);
 820 
 821         zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 822             ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
 823             ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 824 
 825         zio->io_prop.zp_checksum = checksum;
 826 
 827         if (zio_checksum_table[checksum].ci_eck) {
 828                 /*
 829                  * zec checksums are necessarily destructive -- they modify
 830                  * the end of the write buffer to hold the verifier/checksum.
 831                  * Therefore, we must make a local copy in case the data is
 832                  * being written to multiple places in parallel.
 833                  */
 834                 void *wbuf = zio_buf_alloc(size);
 835                 bcopy(data, wbuf, size);
 836                 zio_push_transform(zio, wbuf, size, size, NULL);
 837         }
 838 
 839         return (zio);
 840 }
 841 
/*
 * Create a child I/O to do some work for us.
 *
 * The child inherits the parent's spa, txg, and bookmark, and starts
 * its pipeline just before the vdev-io-start stage.  The parent's
 * physdone callback is propagated so per-leaf completion can be
 * observed by the logical zio.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, zio_priority_t priority,
	enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	/* vd must be a direct child of the parent's vdev (or the root). */
	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/*
	 * For a leaf vdev, convert the logical offset to a physical one
	 * by skipping the front label region of the device.
	 */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	/*
	 * Start the child one stage before ZIO_STAGE_VDEV_IO_START so
	 * that advancing the pipeline executes vdev-io-start next.
	 */
	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	/* Count this leaf child against the logical zio's physdone total. */
	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}
 889 
 890 zio_t *
 891 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 892         int type, zio_priority_t priority, enum zio_flag flags,
 893         zio_done_func_t *done, void *private)
 894 {
 895         zio_t *zio;
 896 
 897         ASSERT(vd->vdev_ops->vdev_op_leaf);
 898 
 899         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 900             data, size, done, private, type, priority,
 901             flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 902             vd, offset, NULL,
 903             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 904 
 905         return (zio);
 906 }
 907 
 908 void
 909 zio_flush(zio_t *zio, vdev_t *vd)
 910 {
 911         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 912             NULL, NULL,
 913             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 914 }
 915 
 916 void
 917 zio_shrink(zio_t *zio, uint64_t size)
 918 {
 919         ASSERT(zio->io_executor == NULL);
 920         ASSERT(zio->io_orig_size == zio->io_size);
 921         ASSERT(size <= zio->io_size);
 922 
 923         /*
 924          * We don't shrink for raidz because of problems with the
 925          * reconstruction when reading back less than the block size.
 926          * Note, BP_IS_RAIDZ() assumes no compression.
 927          */
 928         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 929         if (!BP_IS_RAIDZ(zio->io_bp))
 930                 zio->io_orig_size = zio->io_size = size;
 931 }
 932 


1793 
1794         /*
1795          * Create and nowait the gang children.
1796          */
1797         for (int g = 0; resid != 0; resid -= lsize, g++) {
1798                 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1799                     SPA_MINBLOCKSIZE);
1800                 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1801 
1802                 zp.zp_checksum = gio->io_prop.zp_checksum;
1803                 zp.zp_compress = ZIO_COMPRESS_OFF;
1804                 zp.zp_type = DMU_OT_NONE;
1805                 zp.zp_level = 0;
1806                 zp.zp_copies = gio->io_prop.zp_copies;
1807                 zp.zp_dedup = B_FALSE;
1808                 zp.zp_dedup_verify = B_FALSE;
1809                 zp.zp_nopwrite = B_FALSE;
1810 
1811                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1812                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1813                     zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
1814                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1815                     &pio->io_bookmark));
1816         }
1817 
1818         /*
1819          * Set pio's pipeline to just wait for zio to finish.
1820          */
1821         pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1822 
1823         zio_nowait(zio);
1824 
1825         return (ZIO_PIPELINE_CONTINUE);
1826 }
1827 
1828 /*
1829  * The zio_nop_write stage in the pipeline determines if allocating
1830  * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1831  * such as SHA256, we can compare the checksums of the new data and the old
1832  * to determine if allocating a new block is required.  The nopwrite
1833  * feature can handle writes in either syncing or open context (i.e. zil


2170                 czp.zp_copies = ditto_copies;
2171 
2172                 /*
2173                  * If we arrived here with an override bp, we won't have run
2174                  * the transform stack, so we won't have the data we need to
2175                  * generate a child i/o.  So, toss the override bp and restart.
2176                  * This is safe, because using the override bp is just an
2177                  * optimization; and it's rare, so the cost doesn't matter.
2178                  */
2179                 if (zio->io_bp_override) {
2180                         zio_pop_transforms(zio);
2181                         zio->io_stage = ZIO_STAGE_OPEN;
2182                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
2183                         zio->io_bp_override = NULL;
2184                         BP_ZERO(bp);
2185                         ddt_exit(ddt);
2186                         return (ZIO_PIPELINE_CONTINUE);
2187                 }
2188 
2189                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2190                     zio->io_orig_size, &czp, NULL, NULL,
2191                     zio_ddt_ditto_write_done, dde, zio->io_priority,
2192                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2193 
2194                 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2195                 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2196         }
2197 
2198         if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2199                 if (ddp->ddp_phys_birth != 0)
2200                         ddt_bp_fill(ddp, bp, txg);
2201                 if (dde->dde_lead_zio[p] != NULL)
2202                         zio_add_child(zio, dde->dde_lead_zio[p]);
2203                 else
2204                         ddt_phys_addref(ddp);
2205         } else if (zio->io_bp_override) {
2206                 ASSERT(bp->blk_birth == txg);
2207                 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2208                 ddt_phys_fill(ddp, bp);
2209                 ddt_phys_addref(ddp);
2210         } else {
2211                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2212                     zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2213                     zio_ddt_child_write_done, dde, zio->io_priority,
2214                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2215 
2216                 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2217                 dde->dde_lead_zio[p] = cio;
2218         }
2219 
2220         ddt_exit(ddt);
2221 
2222         if (cio)
2223                 zio_nowait(cio);
2224         if (dio)
2225                 zio_nowait(dio);
2226 
2227         return (ZIO_PIPELINE_CONTINUE);
2228 }
2229 
2230 ddt_entry_t *freedde; /* for debugging */
2231 
2232 static int


2609         /*
2610          * If we got an error on a leaf device, convert it to ENXIO
2611          * if the device is not accessible at all.
2612          */
2613         if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2614             !vdev_accessible(vd, zio))
2615                 zio->io_error = SET_ERROR(ENXIO);
2616 
2617         /*
2618          * If we can't write to an interior vdev (mirror or RAID-Z),
2619          * set vdev_cant_write so that we stop trying to allocate from it.
2620          */
2621         if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2622             vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2623                 vd->vdev_cant_write = B_TRUE;
2624         }
2625 
2626         if (zio->io_error)
2627                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2628 
2629         if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2630             zio->io_physdone != NULL) {
2631                 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2632                 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2633                 zio->io_physdone(zio->io_logical);
2634         }
2635 
2636         return (ZIO_PIPELINE_CONTINUE);
2637 }
2638 
/*
 * Rewind a zio that is sitting at the vdev-io-start stage so that
 * vdev-io-start is executed again the next time the pipeline advances.
 * Only valid for an error-free zio at that exact stage.
 */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	/* Stages are one-bit values; shifting right backs up one stage. */
	zio->io_stage >>= 1;
}
2647 
/*
 * Rewind a zio that is sitting at the vdev-io-done stage so that
 * vdev-io-done is executed again the next time the pipeline advances.
 */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	/* Stages are one-bit values; shifting right backs up one stage. */
	zio->io_stage >>= 1;
}
2655