Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/zio.c
          +++ new/usr/src/uts/common/fs/zfs/zio.c
↓ open down ↓ 31 lines elided ↑ open up ↑
  32   32  #include <sys/vdev_impl.h>
  33   33  #include <sys/zio_impl.h>
  34   34  #include <sys/zio_compress.h>
  35   35  #include <sys/zio_checksum.h>
  36   36  #include <sys/dmu_objset.h>
  37   37  #include <sys/arc.h>
  38   38  #include <sys/ddt.h>
  39   39  
  40   40  /*
  41   41   * ==========================================================================
  42      - * I/O priority table
  43      - * ==========================================================================
  44      - */
  45      -uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  46      -        0,      /* ZIO_PRIORITY_NOW             */
  47      -        0,      /* ZIO_PRIORITY_SYNC_READ       */
  48      -        0,      /* ZIO_PRIORITY_SYNC_WRITE      */
  49      -        0,      /* ZIO_PRIORITY_LOG_WRITE       */
  50      -        1,      /* ZIO_PRIORITY_CACHE_FILL      */
  51      -        1,      /* ZIO_PRIORITY_AGG             */
  52      -        4,      /* ZIO_PRIORITY_FREE            */
  53      -        4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
  54      -        6,      /* ZIO_PRIORITY_ASYNC_READ      */
  55      -        10,     /* ZIO_PRIORITY_RESILVER        */
  56      -        20,     /* ZIO_PRIORITY_SCRUB           */
  57      -        2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
  58      -};
  59      -
  60      -/*
  61      - * ==========================================================================
  62   42   * I/O type descriptions
  63   43   * ==========================================================================
  64   44   */
  65      -char *zio_type_name[ZIO_TYPES] = {
       45 +const char *zio_type_name[ZIO_TYPES] = {
  66   46          "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
  67   47          "zio_ioctl"
  68   48  };
  69   49  
  70   50  /*
  71   51   * ==========================================================================
  72   52   * I/O kmem caches
  73   53   * ==========================================================================
  74   54   */
  75   55  kmem_cache_t *zio_cache;
↓ open down ↓ 403 lines elided ↑ open up ↑
 479  459  zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 480  460  {
 481  461          uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 482  462          int *errorp = &pio->io_child_error[zio->io_child_type];
 483  463  
 484  464          mutex_enter(&pio->io_lock);
 485  465          if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 486  466                  *errorp = zio_worst_error(*errorp, zio->io_error);
 487  467          pio->io_reexecute |= zio->io_reexecute;
 488  468          ASSERT3U(*countp, >, 0);
 489      -        if (--*countp == 0 && pio->io_stall == countp) {
      469 +
      470 +        (*countp)--;
      471 +
      472 +        if (*countp == 0 && pio->io_stall == countp) {
 490  473                  pio->io_stall = NULL;
 491  474                  mutex_exit(&pio->io_lock);
 492  475                  zio_execute(pio);
 493  476          } else {
 494  477                  mutex_exit(&pio->io_lock);
 495  478          }
 496  479  }
 497  480  
 498  481  static void
 499  482  zio_inherit_child_errors(zio_t *zio, enum zio_child c)
↓ open down ↓ 3 lines elided ↑ open up ↑
 503  486  }
 504  487  
 505  488  /*
 506  489   * ==========================================================================
 507  490   * Create the various types of I/O (read, write, free, etc)
 508  491   * ==========================================================================
 509  492   */
 510  493  static zio_t *
 511  494  zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 512  495      void *data, uint64_t size, zio_done_func_t *done, void *private,
 513      -    zio_type_t type, int priority, enum zio_flag flags,
      496 +    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
 514  497      vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
 515  498      enum zio_stage stage, enum zio_stage pipeline)
 516  499  {
 517  500          zio_t *zio;
 518  501  
 519  502          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 520  503          ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 521  504          ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 522  505  
 523  506          ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
↓ open down ↓ 89 lines elided ↑ open up ↑
 613  596  
 614  597  zio_t *
 615  598  zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 616  599  {
 617  600          return (zio_null(NULL, spa, NULL, done, private, flags));
 618  601  }
 619  602  
 620  603  zio_t *
 621  604  zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 622  605      void *data, uint64_t size, zio_done_func_t *done, void *private,
 623      -    int priority, enum zio_flag flags, const zbookmark_t *zb)
      606 +    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 624  607  {
 625  608          zio_t *zio;
 626  609  
 627  610          zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 628  611              data, size, done, private,
 629  612              ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 630  613              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 631  614              ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 632  615  
 633  616          return (zio);
 634  617  }
 635  618  
 636  619  zio_t *
 637  620  zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 638  621      void *data, uint64_t size, const zio_prop_t *zp,
 639      -    zio_done_func_t *ready, zio_done_func_t *done, void *private,
 640      -    int priority, enum zio_flag flags, const zbookmark_t *zb)
      622 +    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
      623 +    void *private,
      624 +    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
 641  625  {
 642  626          zio_t *zio;
 643  627  
 644  628          ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 645  629              zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 646  630              zp->zp_compress >= ZIO_COMPRESS_OFF &&
 647  631              zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 648  632              DMU_OT_IS_VALID(zp->zp_type) &&
 649  633              zp->zp_level < 32 &&
 650  634              zp->zp_copies > 0 &&
 651  635              zp->zp_copies <= spa_max_replication(spa));
 652  636  
 653  637          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 654  638              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 655  639              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 656  640              ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 657  641  
 658  642          zio->io_ready = ready;
      643 +        zio->io_physdone = physdone;
 659  644          zio->io_prop = *zp;
 660  645  
 661  646          return (zio);
 662  647  }
 663  648  
 664  649  zio_t *
 665  650  zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 666      -    uint64_t size, zio_done_func_t *done, void *private, int priority,
 667      -    enum zio_flag flags, zbookmark_t *zb)
      651 +    uint64_t size, zio_done_func_t *done, void *private,
      652 +    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
 668  653  {
 669  654          zio_t *zio;
 670  655  
 671  656          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 672  657              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 673  658              ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 674  659  
 675  660          return (zio);
 676  661  }
 677  662  
↓ open down ↓ 55 lines elided ↑ open up ↑
 733  718  
 734  719          /*
 735  720           * GANG and DEDUP blocks can induce a read (for the gang block header,
 736  721           * or the DDT), so issue them asynchronously so that this thread is
 737  722           * not tied up.
 738  723           */
 739  724          if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 740  725                  stage |= ZIO_STAGE_ISSUE_ASYNC;
 741  726  
 742  727          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 743      -            NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
      728 +            NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 744  729              NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 745  730  
 746  731  
 747  732          return (zio);
 748  733  }
 749  734  
 750  735  zio_t *
 751  736  zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 752  737      zio_done_func_t *done, void *private, enum zio_flag flags)
 753  738  {
↓ open down ↓ 17 lines elided ↑ open up ↑
 771  756  
 772  757          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 773  758              done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 774  759              NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 775  760  
 776  761          return (zio);
 777  762  }
 778  763  
 779  764  zio_t *
 780  765  zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 781      -    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
      766 +    zio_done_func_t *done, void *private, enum zio_flag flags)
 782  767  {
 783  768          zio_t *zio;
 784  769          int c;
 785  770  
 786  771          if (vd->vdev_children == 0) {
 787  772                  zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 788      -                    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
      773 +                    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 789  774                      ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 790  775  
 791  776                  zio->io_cmd = cmd;
 792  777          } else {
 793  778                  zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 794  779  
 795  780                  for (c = 0; c < vd->vdev_children; c++)
 796  781                          zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 797      -                            done, private, priority, flags));
      782 +                            done, private, flags));
 798  783          }
 799  784  
 800  785          return (zio);
 801  786  }
 802  787  
 803  788  zio_t *
 804  789  zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 805  790      void *data, int checksum, zio_done_func_t *done, void *private,
 806      -    int priority, enum zio_flag flags, boolean_t labels)
      791 +    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 807  792  {
 808  793          zio_t *zio;
 809  794  
 810  795          ASSERT(vd->vdev_children == 0);
 811  796          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 812  797              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 813  798          ASSERT3U(offset + size, <=, vd->vdev_psize);
 814  799  
 815  800          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 816  801              ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
 817  802              ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 818  803  
 819  804          zio->io_prop.zp_checksum = checksum;
 820  805  
 821  806          return (zio);
 822  807  }
 823  808  
 824  809  zio_t *
 825  810  zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 826  811      void *data, int checksum, zio_done_func_t *done, void *private,
 827      -    int priority, enum zio_flag flags, boolean_t labels)
      812 +    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 828  813  {
 829  814          zio_t *zio;
 830  815  
 831  816          ASSERT(vd->vdev_children == 0);
 832  817          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 833  818              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 834  819          ASSERT3U(offset + size, <=, vd->vdev_psize);
 835  820  
 836  821          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 837  822              ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
↓ open down ↓ 14 lines elided ↑ open up ↑
 852  837          }
 853  838  
 854  839          return (zio);
 855  840  }
 856  841  
 857  842  /*
 858  843   * Create a child I/O to do some work for us.
 859  844   */
 860  845  zio_t *
 861  846  zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 862      -        void *data, uint64_t size, int type, int priority, enum zio_flag flags,
 863      -        zio_done_func_t *done, void *private)
      847 +        void *data, uint64_t size, int type, zio_priority_t priority,
      848 +        enum zio_flag flags, zio_done_func_t *done, void *private)
 864  849  {
 865  850          enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 866  851          zio_t *zio;
 867  852  
 868  853          ASSERT(vd->vdev_parent ==
 869  854              (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 870  855  
 871  856          if (type == ZIO_TYPE_READ && bp != NULL) {
 872  857                  /*
 873  858                   * If we have the bp, then the child should perform the
↓ open down ↓ 14 lines elided ↑ open up ↑
 888  873           * If we've decided to do a repair, the write is not speculative --
 889  874           * even if the original read was.
 890  875           */
 891  876          if (flags & ZIO_FLAG_IO_REPAIR)
 892  877                  flags &= ~ZIO_FLAG_SPECULATIVE;
 893  878  
 894  879          zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 895  880              done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 896  881              ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 897  882  
      883 +        zio->io_physdone = pio->io_physdone;
      884 +        if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
      885 +                zio->io_logical->io_phys_children++;
      886 +
 898  887          return (zio);
 899  888  }
 900  889  
 901  890  zio_t *
 902  891  zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 903      -        int type, int priority, enum zio_flag flags,
      892 +        int type, zio_priority_t priority, enum zio_flag flags,
 904  893          zio_done_func_t *done, void *private)
 905  894  {
 906  895          zio_t *zio;
 907  896  
 908  897          ASSERT(vd->vdev_ops->vdev_op_leaf);
 909  898  
 910  899          zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 911  900              data, size, done, private, type, priority,
 912      -            flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
      901 +            flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 913  902              vd, offset, NULL,
 914  903              ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 915  904  
 916  905          return (zio);
 917  906  }
 918  907  
 919  908  void
 920  909  zio_flush(zio_t *zio, vdev_t *vd)
 921  910  {
 922  911          zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 923      -            NULL, NULL, ZIO_PRIORITY_NOW,
      912 +            NULL, NULL,
 924  913              ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 925  914  }
 926  915  
 927  916  void
 928  917  zio_shrink(zio_t *zio, uint64_t size)
 929  918  {
 930  919          ASSERT(zio->io_executor == NULL);
 931  920          ASSERT(zio->io_orig_size == zio->io_size);
 932  921          ASSERT(size <= zio->io_size);
 933  922  
↓ open down ↓ 880 lines elided ↑ open up ↑
1814 1803                  zp.zp_compress = ZIO_COMPRESS_OFF;
1815 1804                  zp.zp_type = DMU_OT_NONE;
1816 1805                  zp.zp_level = 0;
1817 1806                  zp.zp_copies = gio->io_prop.zp_copies;
1818 1807                  zp.zp_dedup = B_FALSE;
1819 1808                  zp.zp_dedup_verify = B_FALSE;
1820 1809                  zp.zp_nopwrite = B_FALSE;
1821 1810  
1822 1811                  zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1823 1812                      (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1824      -                    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
     1813 +                    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
1825 1814                      pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1826 1815                      &pio->io_bookmark));
1827 1816          }
1828 1817  
1829 1818          /*
1830 1819           * Set pio's pipeline to just wait for zio to finish.
1831 1820           */
1832 1821          pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1833 1822  
1834 1823          zio_nowait(zio);
↓ open down ↓ 356 lines elided ↑ open up ↑
2191 2180                          zio_pop_transforms(zio);
2192 2181                          zio->io_stage = ZIO_STAGE_OPEN;
2193 2182                          zio->io_pipeline = ZIO_WRITE_PIPELINE;
2194 2183                          zio->io_bp_override = NULL;
2195 2184                          BP_ZERO(bp);
2196 2185                          ddt_exit(ddt);
2197 2186                          return (ZIO_PIPELINE_CONTINUE);
2198 2187                  }
2199 2188  
2200 2189                  dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2201      -                    zio->io_orig_size, &czp, NULL,
     2190 +                    zio->io_orig_size, &czp, NULL, NULL,
2202 2191                      zio_ddt_ditto_write_done, dde, zio->io_priority,
2203 2192                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2204 2193  
2205 2194                  zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2206 2195                  dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2207 2196          }
2208 2197  
2209 2198          if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2210 2199                  if (ddp->ddp_phys_birth != 0)
2211 2200                          ddt_bp_fill(ddp, bp, txg);
↓ open down ↓ 1 lines elided ↑ open up ↑
2213 2202                          zio_add_child(zio, dde->dde_lead_zio[p]);
2214 2203                  else
2215 2204                          ddt_phys_addref(ddp);
2216 2205          } else if (zio->io_bp_override) {
2217 2206                  ASSERT(bp->blk_birth == txg);
2218 2207                  ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2219 2208                  ddt_phys_fill(ddp, bp);
2220 2209                  ddt_phys_addref(ddp);
2221 2210          } else {
2222 2211                  cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2223      -                    zio->io_orig_size, zp, zio_ddt_child_write_ready,
     2212 +                    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2224 2213                      zio_ddt_child_write_done, dde, zio->io_priority,
2225 2214                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2226 2215  
2227 2216                  zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2228 2217                  dde->dde_lead_zio[p] = cio;
2229 2218          }
2230 2219  
2231 2220          ddt_exit(ddt);
2232 2221  
2233 2222          if (cio)
↓ open down ↓ 396 lines elided ↑ open up ↑
2630 2619           * set vdev_cant_write so that we stop trying to allocate from it.
2631 2620           */
2632 2621          if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2633 2622              vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2634 2623                  vd->vdev_cant_write = B_TRUE;
2635 2624          }
2636 2625  
2637 2626          if (zio->io_error)
2638 2627                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2639 2628  
     2629 +        if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
     2630 +            zio->io_physdone != NULL) {
     2631 +                ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
     2632 +                ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
     2633 +                zio->io_physdone(zio->io_logical);
     2634 +        }
     2635 +
2640 2636          return (ZIO_PIPELINE_CONTINUE);
2641 2637  }
2642 2638  
2643 2639  void
2644 2640  zio_vdev_io_reissue(zio_t *zio)
2645 2641  {
2646 2642          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2647 2643          ASSERT(zio->io_error == 0);
2648 2644  
2649 2645          zio->io_stage >>= 1;
↓ open down ↓ 531 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX