3748 zfs headers should be C++ compatible
Submitted by: Justin Gibbs <justing@spectralogic.com>
Submitted by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
--- old/usr/src/uts/common/fs/zfs/zio.c
+++ new/usr/src/uts/common/fs/zfs/zio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/fm/fs/zfs.h>
29 29 #include <sys/spa.h>
30 30 #include <sys/txg.h>
31 31 #include <sys/spa_impl.h>
32 32 #include <sys/vdev_impl.h>
33 33 #include <sys/zio_impl.h>
34 34 #include <sys/zio_compress.h>
35 35 #include <sys/zio_checksum.h>
36 36 #include <sys/dmu_objset.h>
37 37 #include <sys/arc.h>
38 38 #include <sys/ddt.h>
39 39
40 40 /*
41 41 * ==========================================================================
42 42 * I/O priority table
43 43 * ==========================================================================
44 44 */
45 45 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
46 46 0, /* ZIO_PRIORITY_NOW */
47 47 0, /* ZIO_PRIORITY_SYNC_READ */
48 48 0, /* ZIO_PRIORITY_SYNC_WRITE */
49 49 0, /* ZIO_PRIORITY_LOG_WRITE */
50 50 1, /* ZIO_PRIORITY_CACHE_FILL */
51 51 1, /* ZIO_PRIORITY_AGG */
52 52 4, /* ZIO_PRIORITY_FREE */
53 53 4, /* ZIO_PRIORITY_ASYNC_WRITE */
54 54 6, /* ZIO_PRIORITY_ASYNC_READ */
55 55 10, /* ZIO_PRIORITY_RESILVER */
56 56 20, /* ZIO_PRIORITY_SCRUB */
57 57 2, /* ZIO_PRIORITY_DDT_PREFETCH */
58 58 };
59 59
60 60 /*
61 61 * ==========================================================================
62 62 * I/O type descriptions
63 63 * ==========================================================================
64 64 */
65 65 char *zio_type_name[ZIO_TYPES] = {
66 66 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
67 67 "zio_ioctl"
68 68 };
69 69
70 70 /*
71 71 * ==========================================================================
72 72 * I/O kmem caches
73 73 * ==========================================================================
74 74 */
75 75 kmem_cache_t *zio_cache;
76 76 kmem_cache_t *zio_link_cache;
77 77 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
78 78 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
79 79
80 80 #ifdef _KERNEL
81 81 extern vmem_t *zio_alloc_arena;
82 82 #endif
83 83 extern int zfs_mg_alloc_failures;
84 84
85 85 /*
  86  86 	 * The following actions directly affect the spa's sync-to-convergence logic.
87 87 * The values below define the sync pass when we start performing the action.
88 88 * Care should be taken when changing these values as they directly impact
89 89 * spa_sync() performance. Tuning these values may introduce subtle performance
90 90 * pathologies and should only be done in the context of performance analysis.
91 91 * These tunables will eventually be removed and replaced with #defines once
92 92 * enough analysis has been done to determine optimal values.
93 93 *
94 94 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
95 95 * regular blocks are not deferred.
96 96 */
97 97 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
98 98 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
99 99 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
100 100
101 101 /*
102 102 * An allocating zio is one that either currently has the DVA allocate
103 103 * stage set or will have it later in its lifetime.
104 104 */
105 105 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
106 106
107 107 boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
108 108
109 109 #ifdef ZFS_DEBUG
110 110 int zio_buf_debug_limit = 16384;
111 111 #else
112 112 int zio_buf_debug_limit = 0;
113 113 #endif
114 114
115 115 void
116 116 zio_init(void)
117 117 {
118 118 size_t c;
119 119 vmem_t *data_alloc_arena = NULL;
120 120
121 121 #ifdef _KERNEL
122 122 data_alloc_arena = zio_alloc_arena;
123 123 #endif
124 124 zio_cache = kmem_cache_create("zio_cache",
125 125 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
126 126 zio_link_cache = kmem_cache_create("zio_link_cache",
127 127 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
128 128
129 129 /*
130 130 * For small buffers, we want a cache for each multiple of
131 131 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
132 132 * for each quarter-power of 2. For large buffers, we want
133 133 * a cache for each multiple of PAGESIZE.
134 134 */
135 135 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
136 136 size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
137 137 size_t p2 = size;
138 138 size_t align = 0;
139 139 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
140 140
141 141 while (p2 & (p2 - 1))
142 142 p2 &= p2 - 1;
143 143
144 144 #ifndef _KERNEL
145 145 /*
146 146 * If we are using watchpoints, put each buffer on its own page,
147 147 * to eliminate the performance overhead of trapping to the
148 148 * kernel when modifying a non-watched buffer that shares the
149 149 * page with a watched buffer.
150 150 */
151 151 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
152 152 continue;
153 153 #endif
154 154 if (size <= 4 * SPA_MINBLOCKSIZE) {
155 155 align = SPA_MINBLOCKSIZE;
156 156 } else if (IS_P2ALIGNED(size, PAGESIZE)) {
157 157 align = PAGESIZE;
158 158 } else if (IS_P2ALIGNED(size, p2 >> 2)) {
159 159 align = p2 >> 2;
160 160 }
161 161
162 162 if (align != 0) {
163 163 char name[36];
164 164 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
165 165 zio_buf_cache[c] = kmem_cache_create(name, size,
166 166 align, NULL, NULL, NULL, NULL, NULL, cflags);
167 167
168 168 /*
169 169 * Since zio_data bufs do not appear in crash dumps, we
170 170 * pass KMC_NOTOUCH so that no allocator metadata is
171 171 * stored with the buffers.
172 172 */
173 173 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
174 174 zio_data_buf_cache[c] = kmem_cache_create(name, size,
175 175 align, NULL, NULL, NULL, NULL, data_alloc_arena,
176 176 cflags | KMC_NOTOUCH);
177 177 }
178 178 }
179 179
180 180 while (--c != 0) {
181 181 ASSERT(zio_buf_cache[c] != NULL);
182 182 if (zio_buf_cache[c - 1] == NULL)
183 183 zio_buf_cache[c - 1] = zio_buf_cache[c];
184 184
185 185 ASSERT(zio_data_buf_cache[c] != NULL);
186 186 if (zio_data_buf_cache[c - 1] == NULL)
187 187 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
188 188 }
189 189
190 190 /*
 191 191 	 * The zio write taskqs have 1 thread per cpu; allow half of those
 192 192 	 * threads to fail 3 times each per txg, or 8 failures, whichever is greater.
193 193 */
194 194 zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
195 195
196 196 zio_inject_init();
197 197 }
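
A minimal user-space sketch of the alignment selection above, for cross-checking the sizing comment (small buffers per SPA_MINBLOCKSIZE multiple, medium per quarter-power of 2, large per PAGESIZE). SPA_MINBLOCKSHIFT == 9 matches ZFS; PAGESIZE == 4096 is only an illustrative stand-in:

	#include <stdio.h>

	#define	MINBLOCKSHIFT	9		/* SPA_MINBLOCKSHIFT */
	#define	MINBLOCKSIZE	(1 << MINBLOCKSHIFT)
	#define	PAGESIZE	4096		/* illustrative stand-in */

	static size_t
	cache_align(size_t size)
	{
		size_t p2 = size;

		/* Strip low bits: largest power of 2 <= size. */
		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * MINBLOCKSIZE)
			return (MINBLOCKSIZE);
		else if (size % PAGESIZE == 0)
			return (PAGESIZE);
		else if (size % (p2 >> 2) == 0)
			return (p2 >> 2);
		return (0);	/* no dedicated cache; back-filled later */
	}

	int
	main(void)
	{
		size_t sizes[] = { 512, 2048, 2560, 8192, 12288 };

		for (int i = 0; i < 5; i++)
			printf("%6zu -> align %zu\n", sizes[i],
			    cache_align(sizes[i]));
		return (0);
	}
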
198 198
199 199 void
200 200 zio_fini(void)
201 201 {
202 202 size_t c;
203 203 kmem_cache_t *last_cache = NULL;
204 204 kmem_cache_t *last_data_cache = NULL;
205 205
206 206 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
207 207 if (zio_buf_cache[c] != last_cache) {
208 208 last_cache = zio_buf_cache[c];
209 209 kmem_cache_destroy(zio_buf_cache[c]);
210 210 }
211 211 zio_buf_cache[c] = NULL;
212 212
213 213 if (zio_data_buf_cache[c] != last_data_cache) {
214 214 last_data_cache = zio_data_buf_cache[c];
215 215 kmem_cache_destroy(zio_data_buf_cache[c]);
216 216 }
217 217 zio_data_buf_cache[c] = NULL;
218 218 }
219 219
220 220 kmem_cache_destroy(zio_link_cache);
221 221 kmem_cache_destroy(zio_cache);
222 222
223 223 zio_inject_fini();
224 224 }
225 225
226 226 /*
227 227 * ==========================================================================
228 228 * Allocate and free I/O buffers
229 229 * ==========================================================================
230 230 */
231 231
232 232 /*
233 233 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
234 234 * crashdump if the kernel panics, so use it judiciously. Obviously, it's
235 235 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
236 236 * excess / transient data in-core during a crashdump.
237 237 */
238 238 void *
239 239 zio_buf_alloc(size_t size)
240 240 {
241 241 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
242 242
243 243 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
244 244
245 245 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
246 246 }
247 247
248 248 /*
249 249 * Use zio_data_buf_alloc to allocate data. The data will not appear in a
 250 250 	 * crashdump if the kernel panics. This exists so that we can limit the amount
 251 251 	 * of ZFS data that shows up in a kernel crashdump, thus reducing the amount
 252 252 	 * of kernel heap dumped to disk when the kernel panics.
253 253 */
254 254 void *
255 255 zio_data_buf_alloc(size_t size)
256 256 {
257 257 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
258 258
259 259 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
260 260
261 261 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
262 262 }
263 263
264 264 void
265 265 zio_buf_free(void *buf, size_t size)
266 266 {
267 267 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
268 268
269 269 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
270 270
271 271 kmem_cache_free(zio_buf_cache[c], buf);
272 272 }
273 273
274 274 void
275 275 zio_data_buf_free(void *buf, size_t size)
276 276 {
277 277 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
278 278
279 279 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
280 280
281 281 kmem_cache_free(zio_data_buf_cache[c], buf);
282 282 }
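
The (size - 1) >> SPA_MINBLOCKSHIFT bucket math used by all four routines above rounds each request up to the next SPA_MINBLOCKSIZE multiple, and the back-fill loop at the end of zio_init() guarantees every bucket points at the nearest cache of equal or larger size. A small sanity sketch, assuming SPA_MINBLOCKSHIFT == 9:

	#include <assert.h>
	#include <stddef.h>

	#define	MINBLOCKSHIFT	9	/* SPA_MINBLOCKSHIFT */

	static size_t
	bucket(size_t size)
	{
		return ((size - 1) >> MINBLOCKSHIFT);
	}

	int
	main(void)
	{
		assert(bucket(1) == 0);		/* 1..512 share cache 0 */
		assert(bucket(512) == 0);
		assert(bucket(513) == 1);	/* 513..1024 use cache 1 */
		assert(bucket(4096) == 7);
		return (0);
	}
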
283 283
284 284 /*
285 285 * ==========================================================================
286 286 * Push and pop I/O transform buffers
287 287 * ==========================================================================
288 288 */
289 289 static void
290 290 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
291 291 zio_transform_func_t *transform)
292 292 {
293 293 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
294 294
295 295 zt->zt_orig_data = zio->io_data;
296 296 zt->zt_orig_size = zio->io_size;
297 297 zt->zt_bufsize = bufsize;
298 298 zt->zt_transform = transform;
299 299
300 300 zt->zt_next = zio->io_transform_stack;
301 301 zio->io_transform_stack = zt;
302 302
303 303 zio->io_data = data;
304 304 zio->io_size = size;
305 305 }
306 306
307 307 static void
308 308 zio_pop_transforms(zio_t *zio)
309 309 {
310 310 zio_transform_t *zt;
311 311
312 312 while ((zt = zio->io_transform_stack) != NULL) {
313 313 if (zt->zt_transform != NULL)
314 314 zt->zt_transform(zio,
315 315 zt->zt_orig_data, zt->zt_orig_size);
316 316
317 317 if (zt->zt_bufsize != 0)
318 318 zio_buf_free(zio->io_data, zt->zt_bufsize);
319 319
320 320 zio->io_data = zt->zt_orig_data;
321 321 zio->io_size = zt->zt_orig_size;
322 322 zio->io_transform_stack = zt->zt_next;
323 323
324 324 kmem_free(zt, sizeof (zio_transform_t));
325 325 }
326 326 }
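
zio_push_transform() swaps a staging buffer into io_data and records how to undo it; zio_pop_transforms() unwinds the stack newest-first, applying each recorded callback (e.g. zio_decompress below) and freeing the staging buffer when zt_bufsize is nonzero. A stripped-down model of that LIFO discipline, with the ZFS specifics removed (not ZFS code):

	#include <stdlib.h>

	typedef void xform_fn(void *orig, size_t orig_size);

	typedef struct xform {
		void		*xf_orig;	/* restored on pop */
		size_t		xf_orig_size;
		xform_fn	*xf_func;	/* e.g. a decompress step */
		struct xform	*xf_next;
	} xform_t;

	static xform_t *xf_stack;

	static void
	push_xform(void *orig, size_t orig_size, xform_fn *func)
	{
		xform_t *xf = malloc(sizeof (*xf));

		xf->xf_orig = orig;
		xf->xf_orig_size = orig_size;
		xf->xf_func = func;
		xf->xf_next = xf_stack;
		xf_stack = xf;
	}

	static void
	pop_xforms(void)
	{
		xform_t *xf;

		/* Newest-first, so nested transforms unwind correctly. */
		while ((xf = xf_stack) != NULL) {
			if (xf->xf_func != NULL)
				xf->xf_func(xf->xf_orig, xf->xf_orig_size);
			xf_stack = xf->xf_next;
			free(xf);
		}
	}
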
327 327
328 328 /*
329 329 * ==========================================================================
330 330 * I/O transform callbacks for subblocks and decompression
331 331 * ==========================================================================
332 332 */
333 333 static void
334 334 zio_subblock(zio_t *zio, void *data, uint64_t size)
335 335 {
336 336 ASSERT(zio->io_size > size);
337 337
338 338 if (zio->io_type == ZIO_TYPE_READ)
339 339 bcopy(zio->io_data, data, size);
340 340 }
341 341
342 342 static void
343 343 zio_decompress(zio_t *zio, void *data, uint64_t size)
344 344 {
345 345 if (zio->io_error == 0 &&
346 346 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
347 347 zio->io_data, data, zio->io_size, size) != 0)
348 348 zio->io_error = SET_ERROR(EIO);
349 349 }
350 350
351 351 /*
352 352 * ==========================================================================
353 353 * I/O parent/child relationships and pipeline interlocks
354 354 * ==========================================================================
355 355 */
356 356 /*
 357 357  * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 358 358  * continue calling these functions until they return NULL.
 359 359  * Otherwise, the next caller will pick up the list walk in
 360 360  * some indeterminate state. (The alternative would be for every
 361 361  * caller to pass in a cookie to keep the state represented by
 362 362  * io_walk_link, which gets annoying.)
363 363 */
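
A hypothetical caller honoring the note above; the walk is always driven to NULL so io_walk_link is left reset for the next caller:

	zio_t *pio;

	/* Never break out early; that would strand io_walk_link. */
	while ((pio = zio_walk_parents(cio)) != NULL) {
		/* ... examine pio ... */
	}
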
364 364 zio_t *
365 365 zio_walk_parents(zio_t *cio)
366 366 {
367 367 zio_link_t *zl = cio->io_walk_link;
368 368 list_t *pl = &cio->io_parent_list;
369 369
370 370 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
371 371 cio->io_walk_link = zl;
372 372
373 373 if (zl == NULL)
374 374 return (NULL);
375 375
376 376 ASSERT(zl->zl_child == cio);
377 377 return (zl->zl_parent);
378 378 }
379 379
380 380 zio_t *
381 381 zio_walk_children(zio_t *pio)
382 382 {
383 383 zio_link_t *zl = pio->io_walk_link;
384 384 list_t *cl = &pio->io_child_list;
385 385
386 386 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
387 387 pio->io_walk_link = zl;
388 388
389 389 if (zl == NULL)
390 390 return (NULL);
391 391
392 392 ASSERT(zl->zl_parent == pio);
393 393 return (zl->zl_child);
394 394 }
395 395
396 396 zio_t *
397 397 zio_unique_parent(zio_t *cio)
398 398 {
399 399 zio_t *pio = zio_walk_parents(cio);
400 400
401 401 VERIFY(zio_walk_parents(cio) == NULL);
402 402 return (pio);
403 403 }
404 404
405 405 void
406 406 zio_add_child(zio_t *pio, zio_t *cio)
407 407 {
408 408 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
409 409
410 410 /*
411 411 * Logical I/Os can have logical, gang, or vdev children.
412 412 * Gang I/Os can have gang or vdev children.
413 413 * Vdev I/Os can only have vdev children.
414 414 * The following ASSERT captures all of these constraints.
415 415 */
416 416 ASSERT(cio->io_child_type <= pio->io_child_type);
417 417
418 418 zl->zl_parent = pio;
419 419 zl->zl_child = cio;
420 420
421 421 mutex_enter(&cio->io_lock);
422 422 mutex_enter(&pio->io_lock);
423 423
424 424 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
425 425
426 426 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
427 427 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
428 428
429 429 list_insert_head(&pio->io_child_list, zl);
430 430 list_insert_head(&cio->io_parent_list, zl);
431 431
432 432 pio->io_child_count++;
433 433 cio->io_parent_count++;
434 434
435 435 mutex_exit(&pio->io_lock);
436 436 mutex_exit(&cio->io_lock);
437 437 }
438 438
439 439 static void
440 440 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
441 441 {
442 442 ASSERT(zl->zl_parent == pio);
443 443 ASSERT(zl->zl_child == cio);
444 444
445 445 mutex_enter(&cio->io_lock);
446 446 mutex_enter(&pio->io_lock);
447 447
448 448 list_remove(&pio->io_child_list, zl);
449 449 list_remove(&cio->io_parent_list, zl);
450 450
451 451 pio->io_child_count--;
452 452 cio->io_parent_count--;
453 453
454 454 mutex_exit(&pio->io_lock);
455 455 mutex_exit(&cio->io_lock);
456 456
457 457 kmem_cache_free(zio_link_cache, zl);
458 458 }
459 459
460 460 static boolean_t
461 461 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
462 462 {
463 463 uint64_t *countp = &zio->io_children[child][wait];
464 464 boolean_t waiting = B_FALSE;
465 465
466 466 mutex_enter(&zio->io_lock);
467 467 ASSERT(zio->io_stall == NULL);
468 468 if (*countp != 0) {
469 469 zio->io_stage >>= 1;
470 470 zio->io_stall = countp;
471 471 waiting = B_TRUE;
472 472 }
473 473 mutex_exit(&zio->io_lock);
474 474
475 475 return (waiting);
476 476 }
477 477
478 478 static void
479 479 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
480 480 {
481 481 uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
482 482 int *errorp = &pio->io_child_error[zio->io_child_type];
483 483
484 484 mutex_enter(&pio->io_lock);
485 485 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
486 486 *errorp = zio_worst_error(*errorp, zio->io_error);
487 487 pio->io_reexecute |= zio->io_reexecute;
488 488 ASSERT3U(*countp, >, 0);
489 489 if (--*countp == 0 && pio->io_stall == countp) {
490 490 pio->io_stall = NULL;
491 491 mutex_exit(&pio->io_lock);
492 492 zio_execute(pio);
493 493 } else {
494 494 mutex_exit(&pio->io_lock);
495 495 }
496 496 }
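
Taken together, zio_wait_for_children() and zio_notify_parent() behave like a per-(child-type, wait-type) countdown latch: the parent backs io_stage up one bit and parks on the counter, and the last child to finish clears io_stall and re-runs the parent pipeline. A toy single-threaded model (locking omitted; not ZFS code):

	#include <stdbool.h>
	#include <stdint.h>

	typedef struct parent {
		uint64_t	outstanding;	/* cf. io_children[c][w] */
		uint64_t	*stall;		/* cf. io_stall */
	} parent_t;

	static bool
	wait_for_children(parent_t *p)
	{
		if (p->outstanding != 0) {
			p->stall = &p->outstanding;	/* park the parent */
			return (true);
		}
		return (false);
	}

	static void
	notify_parent(parent_t *p)
	{
		if (--p->outstanding == 0 && p->stall == &p->outstanding) {
			p->stall = NULL;
			/* last child: resume parent, cf. zio_execute(pio) */
		}
	}
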
497 497
498 498 static void
499 499 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
500 500 {
501 501 if (zio->io_child_error[c] != 0 && zio->io_error == 0)
502 502 zio->io_error = zio->io_child_error[c];
503 503 }
504 504
505 505 /*
506 506 * ==========================================================================
507 507 * Create the various types of I/O (read, write, free, etc)
508 508 * ==========================================================================
509 509 */
510 510 static zio_t *
511 511 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
512 - void *data, uint64_t size, zio_done_func_t *done, void *private,
512 + void *data, uint64_t size, zio_done_func_t *done, void *io_private,
513 513 zio_type_t type, int priority, enum zio_flag flags,
514 514 vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
515 515 enum zio_stage stage, enum zio_stage pipeline)
516 516 {
517 517 zio_t *zio;
518 518
519 519 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
520 520 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
521 521 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
522 522
523 523 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
524 524 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
525 525 ASSERT(vd || stage == ZIO_STAGE_OPEN);
526 526
527 527 zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
528 528 bzero(zio, sizeof (zio_t));
529 529
530 530 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
531 531 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
532 532
533 533 list_create(&zio->io_parent_list, sizeof (zio_link_t),
534 534 offsetof(zio_link_t, zl_parent_node));
535 535 list_create(&zio->io_child_list, sizeof (zio_link_t),
536 536 offsetof(zio_link_t, zl_child_node));
537 537
538 538 if (vd != NULL)
539 539 zio->io_child_type = ZIO_CHILD_VDEV;
540 540 else if (flags & ZIO_FLAG_GANG_CHILD)
541 541 zio->io_child_type = ZIO_CHILD_GANG;
542 542 else if (flags & ZIO_FLAG_DDT_CHILD)
543 543 zio->io_child_type = ZIO_CHILD_DDT;
544 544 else
545 545 zio->io_child_type = ZIO_CHILD_LOGICAL;
546 546
547 547 if (bp != NULL) {
548 548 zio->io_bp = (blkptr_t *)bp;
549 549 zio->io_bp_copy = *bp;
550 550 zio->io_bp_orig = *bp;
551 551 if (type != ZIO_TYPE_WRITE ||
552 552 zio->io_child_type == ZIO_CHILD_DDT)
553 553 zio->io_bp = &zio->io_bp_copy; /* so caller can free */
554 554 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
555 555 zio->io_logical = zio;
556 556 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
557 557 pipeline |= ZIO_GANG_STAGES;
558 558 }
559 559
560 560 zio->io_spa = spa;
561 561 zio->io_txg = txg;
562 562 zio->io_done = done;
563 - zio->io_private = private;
563 + zio->io_private = io_private;
564 564 zio->io_type = type;
565 565 zio->io_priority = priority;
566 566 zio->io_vd = vd;
567 567 zio->io_offset = offset;
568 568 zio->io_orig_data = zio->io_data = data;
569 569 zio->io_orig_size = zio->io_size = size;
570 570 zio->io_orig_flags = zio->io_flags = flags;
571 571 zio->io_orig_stage = zio->io_stage = stage;
572 572 zio->io_orig_pipeline = zio->io_pipeline = pipeline;
573 573
574 574 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
575 575 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
576 576
577 577 if (zb != NULL)
578 578 zio->io_bookmark = *zb;
579 579
580 580 if (pio != NULL) {
581 581 if (zio->io_logical == NULL)
582 582 zio->io_logical = pio->io_logical;
583 583 if (zio->io_child_type == ZIO_CHILD_GANG)
584 584 zio->io_gang_leader = pio->io_gang_leader;
585 585 zio_add_child(pio, zio);
586 586 }
587 587
588 588 return (zio);
589 589 }
590 590
591 591 static void
592 592 zio_destroy(zio_t *zio)
593 593 {
594 594 list_destroy(&zio->io_parent_list);
595 595 list_destroy(&zio->io_child_list);
596 596 mutex_destroy(&zio->io_lock);
597 597 cv_destroy(&zio->io_cv);
598 598 kmem_cache_free(zio_cache, zio);
599 599 }
600 600
601 601 zio_t *
602 602 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
603 - void *private, enum zio_flag flags)
603 + void *io_private, enum zio_flag flags)
604 604 {
605 605 zio_t *zio;
606 606
607 - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
607 + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, io_private,
608 608 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
609 609 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
610 610
611 611 return (zio);
612 612 }
613 613
614 614 zio_t *
615 -zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
615 +zio_root(spa_t *spa, zio_done_func_t *done, void *io_private,
616 + enum zio_flag flags)
616 617 {
617 - return (zio_null(NULL, spa, NULL, done, private, flags));
618 + return (zio_null(NULL, spa, NULL, done, io_private, flags));
618 619 }
619 620
620 621 zio_t *
621 622 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
622 - void *data, uint64_t size, zio_done_func_t *done, void *private,
623 + void *data, uint64_t size, zio_done_func_t *done, void *io_private,
623 624 int priority, enum zio_flag flags, const zbookmark_t *zb)
624 625 {
625 626 zio_t *zio;
626 627
627 628 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
628 - data, size, done, private,
629 + data, size, done, io_private,
629 630 ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
630 631 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
631 632 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
632 633
633 634 return (zio);
634 635 }
635 636
636 637 zio_t *
637 638 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
638 639 void *data, uint64_t size, const zio_prop_t *zp,
639 - zio_done_func_t *ready, zio_done_func_t *done, void *private,
640 + zio_done_func_t *ready, zio_done_func_t *done, void *io_private,
640 641 int priority, enum zio_flag flags, const zbookmark_t *zb)
641 642 {
642 643 zio_t *zio;
643 644
644 645 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
645 646 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
646 647 zp->zp_compress >= ZIO_COMPRESS_OFF &&
647 648 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
648 649 DMU_OT_IS_VALID(zp->zp_type) &&
649 650 zp->zp_level < 32 &&
650 651 zp->zp_copies > 0 &&
651 652 zp->zp_copies <= spa_max_replication(spa));
652 653
653 - zio = zio_create(pio, spa, txg, bp, data, size, done, private,
654 + zio = zio_create(pio, spa, txg, bp, data, size, done, io_private,
654 655 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
655 656 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
656 657 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
657 658
658 659 zio->io_ready = ready;
659 660 zio->io_prop = *zp;
660 661
661 662 return (zio);
662 663 }
663 664
664 665 zio_t *
665 666 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
666 - uint64_t size, zio_done_func_t *done, void *private, int priority,
667 + uint64_t size, zio_done_func_t *done, void *io_private, int priority,
667 668 enum zio_flag flags, zbookmark_t *zb)
668 669 {
669 670 zio_t *zio;
670 671
671 - zio = zio_create(pio, spa, txg, bp, data, size, done, private,
672 + zio = zio_create(pio, spa, txg, bp, data, size, done, io_private,
672 673 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
673 674 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
674 675
675 676 return (zio);
676 677 }
677 678
678 679 void
679 680 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
680 681 {
681 682 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
682 683 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
683 684 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
684 685 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
685 686
686 687 /*
687 688 * We must reset the io_prop to match the values that existed
688 689 * when the bp was first written by dmu_sync() keeping in mind
689 690 * that nopwrite and dedup are mutually exclusive.
690 691 */
691 692 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
692 693 zio->io_prop.zp_nopwrite = nopwrite;
693 694 zio->io_prop.zp_copies = copies;
694 695 zio->io_bp_override = bp;
695 696 }
696 697
697 698 void
698 699 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
699 700 {
700 701 metaslab_check_free(spa, bp);
701 702 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
702 703 }
703 704
704 705 zio_t *
705 706 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
706 707 enum zio_flag flags)
707 708 {
708 709 zio_t *zio;
709 710
710 711 dprintf_bp(bp, "freeing in txg %llu, pass %u",
711 712 (longlong_t)txg, spa->spa_sync_pass);
712 713
713 714 ASSERT(!BP_IS_HOLE(bp));
714 715 ASSERT(spa_syncing_txg(spa) == txg);
715 716 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
716 717
717 718 metaslab_check_free(spa, bp);
718 719
719 720 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
720 721 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
721 722 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
722 723
723 724 return (zio);
724 725 }
725 726
726 727 zio_t *
727 728 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
728 - zio_done_func_t *done, void *private, enum zio_flag flags)
729 + zio_done_func_t *done, void *io_private, enum zio_flag flags)
729 730 {
730 731 zio_t *zio;
731 732
732 733 /*
733 734 * A claim is an allocation of a specific block. Claims are needed
734 735 * to support immediate writes in the intent log. The issue is that
735 736 * immediate writes contain committed data, but in a txg that was
736 737 * *not* committed. Upon opening the pool after an unclean shutdown,
737 738 * the intent log claims all blocks that contain immediate write data
738 739 * so that the SPA knows they're in use.
739 740 *
740 741 * All claims *must* be resolved in the first txg -- before the SPA
741 742 * starts allocating blocks -- so that nothing is allocated twice.
742 743 * If txg == 0 we just verify that the block is claimable.
743 744 */
744 745 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
745 746 ASSERT(txg == spa_first_txg(spa) || txg == 0);
746 747 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
747 748
748 749 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
749 - done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
750 + done, io_private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
750 751 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
751 752
752 753 return (zio);
753 754 }
754 755
755 756 zio_t *
756 757 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
757 - zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
758 + zio_done_func_t *done, void *io_private, int priority, enum zio_flag flags)
758 759 {
759 760 zio_t *zio;
760 761 int c;
761 762
762 763 if (vd->vdev_children == 0) {
763 - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
764 + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, io_private,
764 765 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
765 766 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
766 767
767 768 zio->io_cmd = cmd;
768 769 } else {
769 770 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
770 771
771 772 for (c = 0; c < vd->vdev_children; c++)
772 773 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
773 - done, private, priority, flags));
774 + done, io_private, priority, flags));
774 775 }
775 776
776 777 return (zio);
777 778 }
778 779
779 780 zio_t *
780 781 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
781 - void *data, int checksum, zio_done_func_t *done, void *private,
782 + void *data, int checksum, zio_done_func_t *done, void *io_private,
782 783 int priority, enum zio_flag flags, boolean_t labels)
783 784 {
784 785 zio_t *zio;
785 786
786 787 ASSERT(vd->vdev_children == 0);
787 788 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
788 789 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
789 790 ASSERT3U(offset + size, <=, vd->vdev_psize);
790 791
791 - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
792 - ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
792 + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done,
793 + io_private, ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
793 794 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
794 795
795 796 zio->io_prop.zp_checksum = checksum;
796 797
797 798 return (zio);
798 799 }
799 800
800 801 zio_t *
801 802 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
802 - void *data, int checksum, zio_done_func_t *done, void *private,
803 + void *data, int checksum, zio_done_func_t *done, void *io_private,
803 804 int priority, enum zio_flag flags, boolean_t labels)
804 805 {
805 806 zio_t *zio;
806 807
807 808 ASSERT(vd->vdev_children == 0);
808 809 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
809 810 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
810 811 ASSERT3U(offset + size, <=, vd->vdev_psize);
811 812
812 - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
813 - ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
813 + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done,
814 + io_private, ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
814 815 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
815 816
816 817 zio->io_prop.zp_checksum = checksum;
817 818
818 819 if (zio_checksum_table[checksum].ci_eck) {
819 820 /*
820 821 * zec checksums are necessarily destructive -- they modify
821 822 * the end of the write buffer to hold the verifier/checksum.
822 823 * Therefore, we must make a local copy in case the data is
823 824 * being written to multiple places in parallel.
824 825 */
825 826 void *wbuf = zio_buf_alloc(size);
826 827 bcopy(data, wbuf, size);
827 828 zio_push_transform(zio, wbuf, size, size, NULL);
828 829 }
829 830
830 831 return (zio);
831 832 }
832 833
833 834 /*
834 835 * Create a child I/O to do some work for us.
835 836 */
836 837 zio_t *
837 838 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
838 839 void *data, uint64_t size, int type, int priority, enum zio_flag flags,
839 - zio_done_func_t *done, void *private)
840 + zio_done_func_t *done, void *io_private)
840 841 {
841 842 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
842 843 zio_t *zio;
843 844
844 845 ASSERT(vd->vdev_parent ==
845 846 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
846 847
847 848 if (type == ZIO_TYPE_READ && bp != NULL) {
848 849 /*
849 850 * If we have the bp, then the child should perform the
850 851 * checksum and the parent need not. This pushes error
851 852 * detection as close to the leaves as possible and
852 853 * eliminates redundant checksums in the interior nodes.
853 854 */
854 855 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
855 856 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
856 857 }
857 858
858 859 if (vd->vdev_children == 0)
859 860 offset += VDEV_LABEL_START_SIZE;
860 861
861 862 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
862 863
863 864 /*
864 865 * If we've decided to do a repair, the write is not speculative --
865 866 * even if the original read was.
866 867 */
867 868 if (flags & ZIO_FLAG_IO_REPAIR)
868 869 flags &= ~ZIO_FLAG_SPECULATIVE;
869 870
870 871 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
871 - done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
872 - ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
872 + done, io_private, type, priority, flags, vd, offset,
873 + &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
873 874
874 875 return (zio);
875 876 }
876 877
877 878 zio_t *
878 879 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
879 880 int type, int priority, enum zio_flag flags,
880 - zio_done_func_t *done, void *private)
881 + zio_done_func_t *done, void *io_private)
881 882 {
882 883 zio_t *zio;
883 884
884 885 ASSERT(vd->vdev_ops->vdev_op_leaf);
885 886
886 887 zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
887 - data, size, done, private, type, priority,
888 + data, size, done, io_private, type, priority,
888 889 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
889 890 vd, offset, NULL,
890 891 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
891 892
892 893 return (zio);
893 894 }
894 895
895 896 void
896 897 zio_flush(zio_t *zio, vdev_t *vd)
897 898 {
898 899 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
899 900 NULL, NULL, ZIO_PRIORITY_NOW,
900 901 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
901 902 }
902 903
903 904 void
904 905 zio_shrink(zio_t *zio, uint64_t size)
905 906 {
906 907 ASSERT(zio->io_executor == NULL);
907 908 ASSERT(zio->io_orig_size == zio->io_size);
908 909 ASSERT(size <= zio->io_size);
909 910
910 911 /*
911 912 * We don't shrink for raidz because of problems with the
912 913 * reconstruction when reading back less than the block size.
913 914 * Note, BP_IS_RAIDZ() assumes no compression.
914 915 */
915 916 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
916 917 if (!BP_IS_RAIDZ(zio->io_bp))
917 918 zio->io_orig_size = zio->io_size = size;
918 919 }
919 920
920 921 /*
921 922 * ==========================================================================
922 923 * Prepare to read and write logical blocks
923 924 * ==========================================================================
924 925 */
925 926
926 927 static int
927 928 zio_read_bp_init(zio_t *zio)
928 929 {
929 930 blkptr_t *bp = zio->io_bp;
930 931
931 932 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
932 933 zio->io_child_type == ZIO_CHILD_LOGICAL &&
933 934 !(zio->io_flags & ZIO_FLAG_RAW)) {
934 935 uint64_t psize = BP_GET_PSIZE(bp);
935 936 void *cbuf = zio_buf_alloc(psize);
936 937
937 938 zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
938 939 }
939 940
940 941 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
941 942 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
942 943
943 944 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
944 945 zio->io_flags |= ZIO_FLAG_DONT_CACHE;
945 946
946 947 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
947 948 zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
948 949
949 950 return (ZIO_PIPELINE_CONTINUE);
950 951 }
951 952
952 953 static int
953 954 zio_write_bp_init(zio_t *zio)
954 955 {
955 956 spa_t *spa = zio->io_spa;
956 957 zio_prop_t *zp = &zio->io_prop;
957 958 enum zio_compress compress = zp->zp_compress;
958 959 blkptr_t *bp = zio->io_bp;
959 960 uint64_t lsize = zio->io_size;
960 961 uint64_t psize = lsize;
961 962 int pass = 1;
962 963
963 964 /*
964 965 * If our children haven't all reached the ready stage,
965 966 * wait for them and then repeat this pipeline stage.
966 967 */
967 968 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
968 969 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
969 970 return (ZIO_PIPELINE_STOP);
970 971
971 972 if (!IO_IS_ALLOCATING(zio))
972 973 return (ZIO_PIPELINE_CONTINUE);
973 974
974 975 ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
975 976
976 977 if (zio->io_bp_override) {
977 978 ASSERT(bp->blk_birth != zio->io_txg);
978 979 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
979 980
980 981 *bp = *zio->io_bp_override;
981 982 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
982 983
983 984 /*
984 985 * If we've been overridden and nopwrite is set then
985 986 * set the flag accordingly to indicate that a nopwrite
986 987 * has already occurred.
987 988 */
988 989 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
989 990 ASSERT(!zp->zp_dedup);
990 991 zio->io_flags |= ZIO_FLAG_NOPWRITE;
991 992 return (ZIO_PIPELINE_CONTINUE);
992 993 }
993 994
994 995 ASSERT(!zp->zp_nopwrite);
995 996
996 997 if (BP_IS_HOLE(bp) || !zp->zp_dedup)
997 998 return (ZIO_PIPELINE_CONTINUE);
998 999
999 1000 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1000 1001 zp->zp_dedup_verify);
1001 1002
1002 1003 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1003 1004 BP_SET_DEDUP(bp, 1);
1004 1005 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1005 1006 return (ZIO_PIPELINE_CONTINUE);
1006 1007 }
1007 1008 zio->io_bp_override = NULL;
1008 1009 BP_ZERO(bp);
1009 1010 }
1010 1011
1011 1012 if (bp->blk_birth == zio->io_txg) {
1012 1013 /*
1013 1014 * We're rewriting an existing block, which means we're
1014 1015 * working on behalf of spa_sync(). For spa_sync() to
1015 1016 * converge, it must eventually be the case that we don't
1016 1017 * have to allocate new blocks. But compression changes
1017 1018 * the blocksize, which forces a reallocate, and makes
1018 1019 * convergence take longer. Therefore, after the first
1019 1020 * few passes, stop compressing to ensure convergence.
1020 1021 */
1021 1022 pass = spa_sync_pass(spa);
1022 1023
1023 1024 ASSERT(zio->io_txg == spa_syncing_txg(spa));
1024 1025 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1025 1026 ASSERT(!BP_GET_DEDUP(bp));
1026 1027
1027 1028 if (pass >= zfs_sync_pass_dont_compress)
1028 1029 compress = ZIO_COMPRESS_OFF;
1029 1030
1030 1031 /* Make sure someone doesn't change their mind on overwrites */
1031 1032 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
1032 1033 spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1033 1034 }
1034 1035
1035 1036 if (compress != ZIO_COMPRESS_OFF) {
1036 1037 void *cbuf = zio_buf_alloc(lsize);
1037 1038 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1038 1039 if (psize == 0 || psize == lsize) {
1039 1040 compress = ZIO_COMPRESS_OFF;
1040 1041 zio_buf_free(cbuf, lsize);
1041 1042 } else {
1042 1043 ASSERT(psize < lsize);
1043 1044 zio_push_transform(zio, cbuf, psize, lsize, NULL);
1044 1045 }
1045 1046 }
1046 1047
1047 1048 /*
1048 1049 * The final pass of spa_sync() must be all rewrites, but the first
1049 1050 * few passes offer a trade-off: allocating blocks defers convergence,
1050 1051 * but newly allocated blocks are sequential, so they can be written
1051 1052 * to disk faster. Therefore, we allow the first few passes of
1052 1053 * spa_sync() to allocate new blocks, but force rewrites after that.
1053 1054 * There should only be a handful of blocks after pass 1 in any case.
1054 1055 */
1055 1056 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
1056 1057 pass >= zfs_sync_pass_rewrite) {
1057 1058 ASSERT(psize != 0);
1058 1059 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1059 1060 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1060 1061 zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1061 1062 } else {
1062 1063 BP_ZERO(bp);
1063 1064 zio->io_pipeline = ZIO_WRITE_PIPELINE;
1064 1065 }
1065 1066
1066 1067 if (psize == 0) {
1067 1068 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1068 1069 } else {
1069 1070 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1070 1071 BP_SET_LSIZE(bp, lsize);
1071 1072 BP_SET_PSIZE(bp, psize);
1072 1073 BP_SET_COMPRESS(bp, compress);
1073 1074 BP_SET_CHECKSUM(bp, zp->zp_checksum);
1074 1075 BP_SET_TYPE(bp, zp->zp_type);
1075 1076 BP_SET_LEVEL(bp, zp->zp_level);
1076 1077 BP_SET_DEDUP(bp, zp->zp_dedup);
1077 1078 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1078 1079 if (zp->zp_dedup) {
1079 1080 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1080 1081 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1081 1082 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1082 1083 }
1083 1084 if (zp->zp_nopwrite) {
1084 1085 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1085 1086 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1086 1087 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1087 1088 }
1088 1089 }
1089 1090
1090 1091 return (ZIO_PIPELINE_CONTINUE);
1091 1092 }
1092 1093
1093 1094 static int
1094 1095 zio_free_bp_init(zio_t *zio)
1095 1096 {
1096 1097 blkptr_t *bp = zio->io_bp;
1097 1098
1098 1099 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1099 1100 if (BP_GET_DEDUP(bp))
1100 1101 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1101 1102 }
1102 1103
1103 1104 return (ZIO_PIPELINE_CONTINUE);
1104 1105 }
1105 1106
1106 1107 /*
1107 1108 * ==========================================================================
1108 1109 * Execute the I/O pipeline
1109 1110 * ==========================================================================
1110 1111 */
1111 1112
1112 1113 static void
1113 1114 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1114 1115 {
1115 1116 spa_t *spa = zio->io_spa;
1116 1117 zio_type_t t = zio->io_type;
1117 1118 int flags = (cutinline ? TQ_FRONT : 0);
1118 1119
1119 1120 /*
1120 1121 * If we're a config writer or a probe, the normal issue and
1121 1122 * interrupt threads may all be blocked waiting for the config lock.
1122 1123 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1123 1124 */
1124 1125 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1125 1126 t = ZIO_TYPE_NULL;
1126 1127
1127 1128 /*
1128 1129 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1129 1130 */
1130 1131 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1131 1132 t = ZIO_TYPE_NULL;
1132 1133
1133 1134 /*
1134 1135 * If this is a high priority I/O, then use the high priority taskq if
1135 1136 * available.
1136 1137 */
1137 1138 if (zio->io_priority == ZIO_PRIORITY_NOW &&
1138 1139 spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1139 1140 q++;
1140 1141
1141 1142 ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1142 1143
1143 1144 /*
1144 1145 * NB: We are assuming that the zio can only be dispatched
1145 1146 * to a single taskq at a time. It would be a grievous error
1146 1147 * to dispatch the zio to another taskq at the same time.
1147 1148 */
1148 1149 ASSERT(zio->io_tqent.tqent_next == NULL);
1149 1150 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1150 1151 flags, &zio->io_tqent);
1151 1152 }
1152 1153
1153 1154 static boolean_t
1154 1155 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1155 1156 {
1156 1157 kthread_t *executor = zio->io_executor;
1157 1158 spa_t *spa = zio->io_spa;
1158 1159
1159 1160 for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1160 1161 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1161 1162 uint_t i;
1162 1163 for (i = 0; i < tqs->stqs_count; i++) {
1163 1164 if (taskq_member(tqs->stqs_taskq[i], executor))
1164 1165 return (B_TRUE);
1165 1166 }
1166 1167 }
1167 1168
1168 1169 return (B_FALSE);
1169 1170 }
1170 1171
1171 1172 static int
1172 1173 zio_issue_async(zio_t *zio)
1173 1174 {
1174 1175 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1175 1176
1176 1177 return (ZIO_PIPELINE_STOP);
1177 1178 }
1178 1179
1179 1180 void
1180 1181 zio_interrupt(zio_t *zio)
1181 1182 {
1182 1183 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1183 1184 }
1184 1185
1185 1186 /*
1186 1187 * Execute the I/O pipeline until one of the following occurs:
1187 1188 * (1) the I/O completes; (2) the pipeline stalls waiting for
1188 1189 * dependent child I/Os; (3) the I/O issues, so we're waiting
1189 1190 * for an I/O completion interrupt; (4) the I/O is delegated by
1190 1191 * vdev-level caching or aggregation; (5) the I/O is deferred
1191 1192 * due to vdev-level queueing; (6) the I/O is handed off to
1192 1193 * another thread. In all cases, the pipeline stops whenever
1193 1194 * there's no CPU work; it never burns a thread in cv_wait().
1194 1195 *
1195 1196 * There's no locking on io_stage because there's no legitimate way
1196 1197 * for multiple threads to be attempting to process the same I/O.
1197 1198 */
1198 1199 static zio_pipe_stage_t *zio_pipeline[];
1199 1200
1200 1201 void
1201 1202 zio_execute(zio_t *zio)
1202 1203 {
1203 1204 zio->io_executor = curthread;
1204 1205
1205 1206 while (zio->io_stage < ZIO_STAGE_DONE) {
1206 1207 enum zio_stage pipeline = zio->io_pipeline;
1207 1208 enum zio_stage stage = zio->io_stage;
1208 1209 int rv;
1209 1210
1210 1211 ASSERT(!MUTEX_HELD(&zio->io_lock));
1211 1212 ASSERT(ISP2(stage));
1212 1213 ASSERT(zio->io_stall == NULL);
1213 1214
1214 1215 do {
1215 1216 stage <<= 1;
1216 1217 } while ((stage & pipeline) == 0);
1217 1218
1218 1219 ASSERT(stage <= ZIO_STAGE_DONE);
1219 1220
1220 1221 /*
1221 1222 * If we are in interrupt context and this pipeline stage
1222 1223 * will grab a config lock that is held across I/O,
1223 1224 * or may wait for an I/O that needs an interrupt thread
1224 1225 * to complete, issue async to avoid deadlock.
1225 1226 *
1226 1227 * For VDEV_IO_START, we cut in line so that the io will
1227 1228 * be sent to disk promptly.
1228 1229 */
1229 1230 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1230 1231 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1231 1232 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1232 1233 zio_requeue_io_start_cut_in_line : B_FALSE;
1233 1234 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1234 1235 return;
1235 1236 }
1236 1237
1237 1238 zio->io_stage = stage;
1238 1239 rv = zio_pipeline[highbit(stage) - 1](zio);
1239 1240
1240 1241 if (rv == ZIO_PIPELINE_STOP)
1241 1242 return;
1242 1243
1243 1244 ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1244 1245 }
1245 1246 }
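
Each pipeline stage is a one-hot bit, so the inner do/while above simply shifts left until it lands on a bit that io_pipeline has enabled, and highbit(stage) - 1 then indexes the stage handler table. A sketch with made-up stage bits:

	#include <stdio.h>

	#define	STAGE_OPEN	(1u << 0)	/* hypothetical stage bits */
	#define	STAGE_ISSUE	(1u << 3)
	#define	STAGE_DONE	(1u << 6)

	int
	main(void)
	{
		unsigned pipeline = STAGE_OPEN | STAGE_ISSUE | STAGE_DONE;
		unsigned stage = STAGE_OPEN;

		while (stage < STAGE_DONE) {
			/* Advance to the next enabled stage bit. */
			do {
				stage <<= 1;
			} while ((stage & pipeline) == 0);
			printf("run stage 0x%x\n", stage);
		}
		return (0);
	}
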
1246 1247
1247 1248 /*
1248 1249 * ==========================================================================
1249 1250 * Initiate I/O, either sync or async
1250 1251 * ==========================================================================
1251 1252 */
1252 1253 int
1253 1254 zio_wait(zio_t *zio)
1254 1255 {
1255 1256 int error;
1256 1257
1257 1258 ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1258 1259 ASSERT(zio->io_executor == NULL);
1259 1260
1260 1261 zio->io_waiter = curthread;
1261 1262
1262 1263 zio_execute(zio);
1263 1264
1264 1265 mutex_enter(&zio->io_lock);
1265 1266 while (zio->io_executor != NULL)
1266 1267 cv_wait(&zio->io_cv, &zio->io_lock);
1267 1268 mutex_exit(&zio->io_lock);
1268 1269
1269 1270 error = zio->io_error;
1270 1271 zio_destroy(zio);
1271 1272
1272 1273 return (error);
1273 1274 }
1274 1275
1275 1276 void
1276 1277 zio_nowait(zio_t *zio)
1277 1278 {
1278 1279 ASSERT(zio->io_executor == NULL);
1279 1280
1280 1281 if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1281 1282 zio_unique_parent(zio) == NULL) {
1282 1283 /*
1283 1284 		 * This is a logical async I/O with no parent waiting for it.
1284 1285 		 * We add it to the spa_async_zio_root "Godfather" I/O, which
1285 1286 		 * will ensure it completes prior to unloading the pool.
1286 1287 */
1287 1288 spa_t *spa = zio->io_spa;
1288 1289
1289 1290 zio_add_child(spa->spa_async_zio_root, zio);
1290 1291 }
1291 1292
1292 1293 zio_execute(zio);
1293 1294 }
1294 1295
1295 1296 /*
1296 1297 * ==========================================================================
1297 1298 * Reexecute or suspend/resume failed I/O
1298 1299 * ==========================================================================
1299 1300 */
1300 1301
1301 1302 static void
1302 1303 zio_reexecute(zio_t *pio)
1303 1304 {
1304 1305 zio_t *cio, *cio_next;
1305 1306
1306 1307 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1307 1308 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1308 1309 ASSERT(pio->io_gang_leader == NULL);
1309 1310 ASSERT(pio->io_gang_tree == NULL);
1310 1311
1311 1312 pio->io_flags = pio->io_orig_flags;
1312 1313 pio->io_stage = pio->io_orig_stage;
1313 1314 pio->io_pipeline = pio->io_orig_pipeline;
1314 1315 pio->io_reexecute = 0;
1315 1316 pio->io_flags |= ZIO_FLAG_REEXECUTED;
1316 1317 pio->io_error = 0;
1317 1318 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1318 1319 pio->io_state[w] = 0;
1319 1320 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1320 1321 pio->io_child_error[c] = 0;
1321 1322
1322 1323 if (IO_IS_ALLOCATING(pio))
1323 1324 BP_ZERO(pio->io_bp);
1324 1325
1325 1326 /*
1326 1327 * As we reexecute pio's children, new children could be created.
1327 1328 * New children go to the head of pio's io_child_list, however,
1328 1329 * so we will (correctly) not reexecute them. The key is that
1329 1330 * the remainder of pio's io_child_list, from 'cio_next' onward,
1330 1331 * cannot be affected by any side effects of reexecuting 'cio'.
1331 1332 */
1332 1333 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1333 1334 cio_next = zio_walk_children(pio);
1334 1335 mutex_enter(&pio->io_lock);
1335 1336 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1336 1337 pio->io_children[cio->io_child_type][w]++;
1337 1338 mutex_exit(&pio->io_lock);
1338 1339 zio_reexecute(cio);
1339 1340 }
1340 1341
1341 1342 /*
1342 1343 * Now that all children have been reexecuted, execute the parent.
1343 1344 * We don't reexecute "The Godfather" I/O here as it's the
1344 1345 * responsibility of the caller to wait on him.
1345 1346 */
1346 1347 if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1347 1348 zio_execute(pio);
1348 1349 }
1349 1350
1350 1351 void
1351 1352 zio_suspend(spa_t *spa, zio_t *zio)
1352 1353 {
1353 1354 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1354 1355 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1355 1356 "failure and the failure mode property for this pool "
1356 1357 "is set to panic.", spa_name(spa));
1357 1358
1358 1359 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1359 1360
1360 1361 mutex_enter(&spa->spa_suspend_lock);
1361 1362
1362 1363 if (spa->spa_suspend_zio_root == NULL)
1363 1364 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1364 1365 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1365 1366 ZIO_FLAG_GODFATHER);
1366 1367
1367 1368 spa->spa_suspended = B_TRUE;
1368 1369
1369 1370 if (zio != NULL) {
1370 1371 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1371 1372 ASSERT(zio != spa->spa_suspend_zio_root);
1372 1373 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1373 1374 ASSERT(zio_unique_parent(zio) == NULL);
1374 1375 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1375 1376 zio_add_child(spa->spa_suspend_zio_root, zio);
1376 1377 }
1377 1378
1378 1379 mutex_exit(&spa->spa_suspend_lock);
1379 1380 }
1380 1381
1381 1382 int
1382 1383 zio_resume(spa_t *spa)
1383 1384 {
1384 1385 zio_t *pio;
1385 1386
1386 1387 /*
1387 1388 	 * Reexecute all previously suspended I/O.
1388 1389 */
1389 1390 mutex_enter(&spa->spa_suspend_lock);
1390 1391 spa->spa_suspended = B_FALSE;
1391 1392 cv_broadcast(&spa->spa_suspend_cv);
1392 1393 pio = spa->spa_suspend_zio_root;
1393 1394 spa->spa_suspend_zio_root = NULL;
1394 1395 mutex_exit(&spa->spa_suspend_lock);
1395 1396
1396 1397 if (pio == NULL)
1397 1398 return (0);
1398 1399
1399 1400 zio_reexecute(pio);
1400 1401 return (zio_wait(pio));
1401 1402 }
1402 1403
1403 1404 void
1404 1405 zio_resume_wait(spa_t *spa)
1405 1406 {
1406 1407 mutex_enter(&spa->spa_suspend_lock);
1407 1408 while (spa_suspended(spa))
1408 1409 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1409 1410 mutex_exit(&spa->spa_suspend_lock);
1410 1411 }
1411 1412
1412 1413 /*
1413 1414 * ==========================================================================
1414 1415 * Gang blocks.
1415 1416 *
1416 1417 * A gang block is a collection of small blocks that looks to the DMU
1417 1418 * like one large block. When zio_dva_allocate() cannot find a block
1418 1419 * of the requested size, due to either severe fragmentation or the pool
1419 1420 * being nearly full, it calls zio_write_gang_block() to construct the
1420 1421 * block from smaller fragments.
1421 1422 *
1422 1423 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1423 1424 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
1424 1425 * an indirect block: it's an array of block pointers. It consumes
1425 1426 * only one sector and hence is allocatable regardless of fragmentation.
1426 1427 * The gang header's bps point to its gang members, which hold the data.
1427 1428 *
1428 1429 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1429 1430 * as the verifier to ensure uniqueness of the SHA256 checksum.
1430 1431 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1431 1432 * not the gang header. This ensures that data block signatures (needed for
1432 1433 * deduplication) are independent of how the block is physically stored.
1433 1434 *
1434 1435 * Gang blocks can be nested: a gang member may itself be a gang block.
1435 1436 * Thus every gang block is a tree in which root and all interior nodes are
1436 1437 * gang headers, and the leaves are normal blocks that contain user data.
1437 1438 * The root of the gang tree is called the gang leader.
1438 1439 *
1439 1440 * To perform any operation (read, rewrite, free, claim) on a gang block,
1440 1441 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1441 1442 * in the io_gang_tree field of the original logical i/o by recursively
1442 1443 * reading the gang leader and all gang headers below it. This yields
1443 1444 * an in-core tree containing the contents of every gang header and the
1444 1445 * bps for every constituent of the gang block.
1445 1446 *
1446 1447 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1447 1448 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
1448 1449 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1449 1450 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1450 1451 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1451 1452 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
1452 1453 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1453 1454 * of the gang header plus zio_checksum_compute() of the data to update the
1454 1455 * gang header's blk_cksum as described above.
1455 1456 *
1456 1457 * The two-phase assemble/issue model solves the problem of partial failure --
1457 1458 * what if you'd freed part of a gang block but then couldn't read the
1458 1459 * gang header for another part? Assembling the entire gang tree first
1459 1460 * ensures that all the necessary gang header I/O has succeeded before
1460 1461 * starting the actual work of free, claim, or write. Once the gang tree
1461 1462 * is assembled, free and claim are in-memory operations that cannot fail.
1462 1463 *
1463 1464 * In the event that a gang write fails, zio_dva_unallocate() walks the
1464 1465 * gang tree to immediately free (i.e. insert back into the space map)
1465 1466 * everything we've allocated. This ensures that we don't get ENOSPC
1466 1467 * errors during repeated suspend/resume cycles due to a flaky device.
1467 1468 *
1468 1469 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
1469 1470 * the gang tree, we won't modify the block, so we can safely defer the free
1470 1471 * (knowing that the block is still intact). If we *can* assemble the gang
1471 1472 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1472 1473 * each constituent bp and we can allocate a new block on the next sync pass.
1473 1474 *
1474 1475 * In all cases, the gang tree allows complete recovery from partial failure.
1475 1476 * ==========================================================================
1476 1477 */
1477 1478
1478 1479 static zio_t *
1479 1480 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1480 1481 {
1481 1482 if (gn != NULL)
1482 1483 return (pio);
1483 1484
1484 1485 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1485 1486 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1486 1487 &pio->io_bookmark));
1487 1488 }
1488 1489
1489 1490 zio_t *
1490 1491 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1491 1492 {
1492 1493 zio_t *zio;
1493 1494
1494 1495 if (gn != NULL) {
1495 1496 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1496 1497 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1497 1498 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1498 1499 /*
1499 1500 * As we rewrite each gang header, the pipeline will compute
1500 1501 * a new gang block header checksum for it; but no one will
1501 1502 * compute a new data checksum, so we do that here. The one
1502 1503 * exception is the gang leader: the pipeline already computed
1503 1504 * its data checksum because that stage precedes gang assembly.
1504 1505 * (Presently, nothing actually uses interior data checksums;
1505 1506 * this is just good hygiene.)
1506 1507 */
1507 1508 if (gn != pio->io_gang_leader->io_gang_tree) {
1508 1509 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1509 1510 data, BP_GET_PSIZE(bp));
1510 1511 }
1511 1512 /*
1512 1513 * If we are here to damage data for testing purposes,
1513 1514 * leave the GBH alone so that we can detect the damage.
1514 1515 */
1515 1516 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1516 1517 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1517 1518 } else {
1518 1519 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1519 1520 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1520 1521 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1521 1522 }
1522 1523
1523 1524 return (zio);
1524 1525 }
1525 1526
1526 1527 /* ARGSUSED */
1527 1528 zio_t *
1528 1529 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1529 1530 {
1530 1531 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1531 1532 ZIO_GANG_CHILD_FLAGS(pio)));
1532 1533 }
1533 1534
1534 1535 /* ARGSUSED */
1535 1536 zio_t *
1536 1537 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1537 1538 {
1538 1539 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1539 1540 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1540 1541 }
1541 1542
1542 1543 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1543 1544 NULL,
1544 1545 zio_read_gang,
1545 1546 zio_rewrite_gang,
1546 1547 zio_free_gang,
1547 1548 zio_claim_gang,
1548 1549 NULL
1549 1550 };
1550 1551
1551 1552 static void zio_gang_tree_assemble_done(zio_t *zio);
1552 1553
1553 1554 static zio_gang_node_t *
1554 1555 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1555 1556 {
1556 1557 zio_gang_node_t *gn;
1557 1558
1558 1559 ASSERT(*gnpp == NULL);
1559 1560
1560 1561 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1561 1562 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1562 1563 *gnpp = gn;
1563 1564
1564 1565 return (gn);
1565 1566 }
1566 1567
1567 1568 static void
1568 1569 zio_gang_node_free(zio_gang_node_t **gnpp)
1569 1570 {
1570 1571 zio_gang_node_t *gn = *gnpp;
1571 1572
1572 1573 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1573 1574 ASSERT(gn->gn_child[g] == NULL);
1574 1575
1575 1576 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1576 1577 kmem_free(gn, sizeof (*gn));
1577 1578 *gnpp = NULL;
1578 1579 }
1579 1580
1580 1581 static void
1581 1582 zio_gang_tree_free(zio_gang_node_t **gnpp)
1582 1583 {
1583 1584 zio_gang_node_t *gn = *gnpp;
1584 1585
1585 1586 if (gn == NULL)
1586 1587 return;
1587 1588
1588 1589 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1589 1590 zio_gang_tree_free(&gn->gn_child[g]);
1590 1591
1591 1592 zio_gang_node_free(gnpp);
1592 1593 }
1593 1594
1594 1595 static void
1595 1596 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1596 1597 {
1597 1598 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1598 1599
1599 1600 ASSERT(gio->io_gang_leader == gio);
1600 1601 ASSERT(BP_IS_GANG(bp));
1601 1602
1602 1603 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1603 1604 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1604 1605 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1605 1606 }
1606 1607
1607 1608 static void
1608 1609 zio_gang_tree_assemble_done(zio_t *zio)
1609 1610 {
1610 1611 zio_t *gio = zio->io_gang_leader;
1611 1612 zio_gang_node_t *gn = zio->io_private;
1612 1613 blkptr_t *bp = zio->io_bp;
1613 1614
1614 1615 ASSERT(gio == zio_unique_parent(zio));
1615 1616 ASSERT(zio->io_child_count == 0);
1616 1617
1617 1618 if (zio->io_error)
1618 1619 return;
1619 1620
1620 1621 if (BP_SHOULD_BYTESWAP(bp))
1621 1622 byteswap_uint64_array(zio->io_data, zio->io_size);
1622 1623
1623 1624 ASSERT(zio->io_data == gn->gn_gbh);
1624 1625 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1625 1626 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1626 1627
1627 1628 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1628 1629 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1629 1630 if (!BP_IS_GANG(gbp))
1630 1631 continue;
1631 1632 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1632 1633 }
1633 1634 }
1634 1635
1635 1636 static void
1636 1637 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1637 1638 {
1638 1639 zio_t *gio = pio->io_gang_leader;
1639 1640 zio_t *zio;
1640 1641
1641 1642 ASSERT(BP_IS_GANG(bp) == !!gn);
1642 1643 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1643 1644 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1644 1645
1645 1646 /*
1646 1647 * If you're a gang header, your data is in gn->gn_gbh.
1647 1648 * If you're a gang member, your data is in 'data' and gn == NULL.
1648 1649 */
1649 1650 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1650 1651
1651 1652 if (gn != NULL) {
1652 1653 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1653 1654
1654 1655 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1655 1656 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1656 1657 if (BP_IS_HOLE(gbp))
1657 1658 continue;
1658 1659 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1659 1660 data = (char *)data + BP_GET_PSIZE(gbp);
1660 1661 }
1661 1662 }
1662 1663
1663 1664 if (gn == gio->io_gang_tree)
1664 1665 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1665 1666
1666 1667 if (zio != pio)
1667 1668 zio_nowait(zio);
1668 1669 }
1669 1670
1670 1671 static int
1671 1672 zio_gang_assemble(zio_t *zio)
1672 1673 {
1673 1674 blkptr_t *bp = zio->io_bp;
1674 1675
1675 1676 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1676 1677 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1677 1678
1678 1679 zio->io_gang_leader = zio;
1679 1680
1680 1681 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1681 1682
1682 1683 return (ZIO_PIPELINE_CONTINUE);
1683 1684 }
1684 1685
1685 1686 static int
1686 1687 zio_gang_issue(zio_t *zio)
1687 1688 {
1688 1689 blkptr_t *bp = zio->io_bp;
1689 1690
1690 1691 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1691 1692 return (ZIO_PIPELINE_STOP);
1692 1693
1693 1694 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1694 1695 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1695 1696
1696 1697 if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1697 1698 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1698 1699 else
1699 1700 zio_gang_tree_free(&zio->io_gang_tree);
1700 1701
1701 1702 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1702 1703
1703 1704 return (ZIO_PIPELINE_CONTINUE);
1704 1705 }
1705 1706
1706 1707 static void
1707 1708 zio_write_gang_member_ready(zio_t *zio)
1708 1709 {
1709 1710 zio_t *pio = zio_unique_parent(zio);
1710 1711 zio_t *gio = zio->io_gang_leader;
1711 1712 dva_t *cdva = zio->io_bp->blk_dva;
1712 1713 dva_t *pdva = pio->io_bp->blk_dva;
1713 1714 uint64_t asize;
1714 1715
1715 1716 if (BP_IS_HOLE(zio->io_bp))
1716 1717 return;
1717 1718
1718 1719 ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1719 1720
1720 1721 ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1721 1722 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1722 1723 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1723 1724 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1724 1725 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1725 1726
1726 1727 mutex_enter(&pio->io_lock);
1727 1728 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1728 1729 ASSERT(DVA_GET_GANG(&pdva[d]));
1729 1730 asize = DVA_GET_ASIZE(&pdva[d]);
1730 1731 asize += DVA_GET_ASIZE(&cdva[d]);
1731 1732 DVA_SET_ASIZE(&pdva[d], asize);
1732 1733 }
1733 1734 mutex_exit(&pio->io_lock);
1734 1735 }
1735 1736
1736 1737 static int
1737 1738 zio_write_gang_block(zio_t *pio)
1738 1739 {
1739 1740 spa_t *spa = pio->io_spa;
1740 1741 blkptr_t *bp = pio->io_bp;
1741 1742 zio_t *gio = pio->io_gang_leader;
1742 1743 zio_t *zio;
1743 1744 zio_gang_node_t *gn, **gnpp;
1744 1745 zio_gbh_phys_t *gbh;
1745 1746 uint64_t txg = pio->io_txg;
1746 1747 uint64_t resid = pio->io_size;
1747 1748 uint64_t lsize;
1748 1749 int copies = gio->io_prop.zp_copies;
1749 1750 int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1750 1751 zio_prop_t zp;
1751 1752 int error;
1752 1753
1753 1754 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1754 1755 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1755 1756 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1756 1757 if (error) {
1757 1758 pio->io_error = error;
1758 1759 return (ZIO_PIPELINE_CONTINUE);
1759 1760 }
1760 1761
1761 1762 if (pio == gio) {
1762 1763 gnpp = &gio->io_gang_tree;
1763 1764 } else {
1764 1765 gnpp = pio->io_private;
1765 1766 ASSERT(pio->io_ready == zio_write_gang_member_ready);
1766 1767 }
1767 1768
1768 1769 gn = zio_gang_node_alloc(gnpp);
1769 1770 gbh = gn->gn_gbh;
1770 1771 bzero(gbh, SPA_GANGBLOCKSIZE);
1771 1772
1772 1773 /*
1773 1774 * Create the gang header.
1774 1775 */
1775 1776 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1776 1777 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1777 1778
1778 1779 /*
1779 1780 * Create and nowait the gang children.
1780 1781 */
1781 1782 for (int g = 0; resid != 0; resid -= lsize, g++) {
1782 1783 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1783 1784 SPA_MINBLOCKSIZE);
1784 1785 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1785 1786
1786 1787 zp.zp_checksum = gio->io_prop.zp_checksum;
1787 1788 zp.zp_compress = ZIO_COMPRESS_OFF;
1788 1789 zp.zp_type = DMU_OT_NONE;
1789 1790 zp.zp_level = 0;
1790 1791 zp.zp_copies = gio->io_prop.zp_copies;
1791 1792 zp.zp_dedup = B_FALSE;
1792 1793 zp.zp_dedup_verify = B_FALSE;
1793 1794 zp.zp_nopwrite = B_FALSE;
1794 1795
1795 1796 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1796 1797 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1797 1798 zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1798 1799 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1799 1800 &pio->io_bookmark));
1800 1801 }
1801 1802
1802 1803 /*
1803 1804 * Set pio's pipeline to just wait for zio to finish.
1804 1805 */
1805 1806 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1806 1807
1807 1808 zio_nowait(zio);
1808 1809
1809 1810 return (ZIO_PIPELINE_CONTINUE);
1810 1811 }
1811 1812
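/*
 * Worked example (illustrative, not part of this change) of the child
 * sizing loop in zio_write_gang_block() above.  This is a standalone
 * user-level program; the macro and constants are copied to match
 * their ZFS definitions, assuming SPA_GBH_NBLKPTRS == 3 and
 * SPA_MINBLOCKSIZE == 512.
 */
#include <stdio.h>
#include <stdint.h>

#define	GBH_NBLKPTRS	3			/* SPA_GBH_NBLKPTRS */
#define	MINBLOCKSIZE	512			/* SPA_MINBLOCKSIZE */
#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))	/* as in sys/sysmacros.h */

int
main(void)
{
	uint64_t resid = 102400;	/* a 100KB write that gangs */

	for (int g = 0; resid != 0; g++) {
		uint64_t lsize = P2ROUNDUP(resid / (GBH_NBLKPTRS - g),
		    MINBLOCKSIZE);
		printf("child %d: %llu bytes\n", g,
		    (unsigned long long)lsize);
		resid -= lsize;
	}
	/* prints 34304, 34304, 33792: three children summing to 102400 */
	return (0);
}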
1812 1813 /*
1813 1814 * The zio_nop_write stage in the pipeline determines if allocating
1814 1815 * a new bp is necessary. By leveraging a cryptographically secure checksum,
1815 1816 * such as SHA256, we can compare the checksums of the new data and the old
1816 1817 * to determine if allocating a new block is required. The nopwrite
1817 1818 * feature can handle writes in either syncing or open context (i.e. zil
1818 1819 * writes) and as a result is mutually exclusive with dedup.
1819 1820 */
1820 1821 static int
1821 1822 zio_nop_write(zio_t *zio)
1822 1823 {
1823 1824 blkptr_t *bp = zio->io_bp;
1824 1825 blkptr_t *bp_orig = &zio->io_bp_orig;
1825 1826 zio_prop_t *zp = &zio->io_prop;
1826 1827
1827 1828 ASSERT(BP_GET_LEVEL(bp) == 0);
1828 1829 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1829 1830 ASSERT(zp->zp_nopwrite);
1830 1831 ASSERT(!zp->zp_dedup);
1831 1832 ASSERT(zio->io_bp_override == NULL);
1832 1833 ASSERT(IO_IS_ALLOCATING(zio));
1833 1834
1834 1835 /*
1835 1836 * Check to see if the original bp and the new bp have matching
1836 1837 * characteristics (i.e. same checksum, compression algorithms, etc).
1837 1838 * If they don't then just continue with the pipeline which will
1838 1839 * allocate a new bp.
1839 1840 */
1840 1841 if (BP_IS_HOLE(bp_orig) ||
1841 1842 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1842 1843 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1843 1844 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1844 1845 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1845 1846 zp->zp_copies != BP_GET_NDVAS(bp_orig))
1846 1847 return (ZIO_PIPELINE_CONTINUE);
1847 1848
1848 1849 /*
1849 1850 * If the checksums match then reset the pipeline so that we
1850 1851 * avoid allocating a new bp and issuing any I/O.
1851 1852 */
1852 1853 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
1853 1854 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
1854 1855 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
1855 1856 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
1856 1857 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
1857 1858 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
1858 1859 sizeof (uint64_t)) == 0);
1859 1860
1860 1861 *bp = *bp_orig;
1861 1862 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1862 1863 zio->io_flags |= ZIO_FLAG_NOPWRITE;
1863 1864 }
1864 1865
1865 1866 return (ZIO_PIPELINE_CONTINUE);
1866 1867 }
1867 1868
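/*
 * Illustrative note (not part of this change): nopwrite only engages
 * when the dataset pairs a dedup-capable (cryptographically strong)
 * checksum with compression enabled and dedup disabled, consistent
 * with the asserts in zio_nop_write() above.  For example
 * (hypothetical dataset name):
 *
 *	# zfs set checksum=sha256 tank/fs
 *	# zfs set compression=on tank/fs
 *
 * Rewriting a block with byte-identical contents then keeps the
 * existing bp rather than allocating and writing a new one.
 */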
1868 1869 /*
1869 1870 * ==========================================================================
1870 1871 * Dedup
1871 1872 * ==========================================================================
1872 1873 */
1873 1874 static void
1874 1875 zio_ddt_child_read_done(zio_t *zio)
1875 1876 {
1876 1877 blkptr_t *bp = zio->io_bp;
1877 1878 ddt_entry_t *dde = zio->io_private;
1878 1879 ddt_phys_t *ddp;
1879 1880 zio_t *pio = zio_unique_parent(zio);
1880 1881
1881 1882 mutex_enter(&pio->io_lock);
1882 1883 ddp = ddt_phys_select(dde, bp);
1883 1884 if (zio->io_error == 0)
1884 1885 ddt_phys_clear(ddp); /* this ddp doesn't need repair */
1885 1886 if (zio->io_error == 0 && dde->dde_repair_data == NULL)
1886 1887 dde->dde_repair_data = zio->io_data;
1887 1888 else
1888 1889 zio_buf_free(zio->io_data, zio->io_size);
1889 1890 mutex_exit(&pio->io_lock);
1890 1891 }
1891 1892
1892 1893 static int
1893 1894 zio_ddt_read_start(zio_t *zio)
1894 1895 {
1895 1896 blkptr_t *bp = zio->io_bp;
1896 1897
1897 1898 ASSERT(BP_GET_DEDUP(bp));
1898 1899 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1899 1900 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1900 1901
1901 1902 if (zio->io_child_error[ZIO_CHILD_DDT]) {
1902 1903 ddt_t *ddt = ddt_select(zio->io_spa, bp);
1903 1904 ddt_entry_t *dde = ddt_repair_start(ddt, bp);
1904 1905 ddt_phys_t *ddp = dde->dde_phys;
1905 1906 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
1906 1907 blkptr_t blk;
1907 1908
1908 1909 ASSERT(zio->io_vsd == NULL);
1909 1910 zio->io_vsd = dde;
1910 1911
1911 1912 if (ddp_self == NULL)
1912 1913 return (ZIO_PIPELINE_CONTINUE);
1913 1914
1914 1915 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1915 1916 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
1916 1917 continue;
1917 1918 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
1918 1919 &blk);
1919 1920 zio_nowait(zio_read(zio, zio->io_spa, &blk,
1920 1921 zio_buf_alloc(zio->io_size), zio->io_size,
1921 1922 zio_ddt_child_read_done, dde, zio->io_priority,
1922 1923 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
1923 1924 &zio->io_bookmark));
1924 1925 }
1925 1926 return (ZIO_PIPELINE_CONTINUE);
1926 1927 }
1927 1928
1928 1929 zio_nowait(zio_read(zio, zio->io_spa, bp,
1929 1930 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
1930 1931 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
1931 1932
1932 1933 return (ZIO_PIPELINE_CONTINUE);
1933 1934 }
1934 1935
1935 1936 static int
1936 1937 zio_ddt_read_done(zio_t *zio)
1937 1938 {
1938 1939 blkptr_t *bp = zio->io_bp;
1939 1940
1940 1941 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
1941 1942 return (ZIO_PIPELINE_STOP);
1942 1943
1943 1944 ASSERT(BP_GET_DEDUP(bp));
1944 1945 ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1945 1946 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1946 1947
1947 1948 if (zio->io_child_error[ZIO_CHILD_DDT]) {
1948 1949 ddt_t *ddt = ddt_select(zio->io_spa, bp);
1949 1950 ddt_entry_t *dde = zio->io_vsd;
1950 1951 if (ddt == NULL) {
1951 1952 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
1952 1953 return (ZIO_PIPELINE_CONTINUE);
1953 1954 }
1954 1955 if (dde == NULL) {
1955 1956 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
1956 1957 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1957 1958 return (ZIO_PIPELINE_STOP);
1958 1959 }
1959 1960 if (dde->dde_repair_data != NULL) {
1960 1961 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
1961 1962 zio->io_child_error[ZIO_CHILD_DDT] = 0;
1962 1963 }
1963 1964 ddt_repair_done(ddt, dde);
1964 1965 zio->io_vsd = NULL;
1965 1966 }
1966 1967
1967 1968 ASSERT(zio->io_vsd == NULL);
1968 1969
1969 1970 return (ZIO_PIPELINE_CONTINUE);
1970 1971 }
1971 1972
1972 1973 static boolean_t
1973 1974 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
1974 1975 {
1975 1976 spa_t *spa = zio->io_spa;
1976 1977
1977 1978 /*
1978 1979 * Note: we compare the original data, not the transformed data,
1979 1980 * because when zio->io_bp is an override bp, we will not have
1980 1981 * pushed the I/O transforms. That's an important optimization
1981 1982 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
1982 1983 */
1983 1984 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
1984 1985 zio_t *lio = dde->dde_lead_zio[p];
1985 1986
1986 1987 if (lio != NULL) {
1987 1988 return (lio->io_orig_size != zio->io_orig_size ||
1988 1989 bcmp(zio->io_orig_data, lio->io_orig_data,
1989 1990 zio->io_orig_size) != 0);
1990 1991 }
1991 1992 }
1992 1993
1993 1994 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
1994 1995 ddt_phys_t *ddp = &dde->dde_phys[p];
1995 1996
1996 1997 if (ddp->ddp_phys_birth != 0) {
1997 1998 arc_buf_t *abuf = NULL;
1998 1999 uint32_t aflags = ARC_WAIT;
1999 2000 blkptr_t blk = *zio->io_bp;
2000 2001 int error;
2001 2002
2002 2003 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2003 2004
2004 2005 ddt_exit(ddt);
2005 2006
2006 2007 error = arc_read(NULL, spa, &blk,
2007 2008 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2008 2009 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2009 2010 &aflags, &zio->io_bookmark);
2010 2011
2011 2012 if (error == 0) {
2012 2013 if (arc_buf_size(abuf) != zio->io_orig_size ||
2013 2014 bcmp(abuf->b_data, zio->io_orig_data,
2014 2015 zio->io_orig_size) != 0)
2015 2016 error = SET_ERROR(EEXIST);
2016 2017 VERIFY(arc_buf_remove_ref(abuf, &abuf));
2017 2018 }
2018 2019
2019 2020 ddt_enter(ddt);
2020 2021 return (error != 0);
2021 2022 }
2022 2023 }
2023 2024
2024 2025 return (B_FALSE);
2025 2026 }
2026 2027
2027 2028 static void
2028 2029 zio_ddt_child_write_ready(zio_t *zio)
2029 2030 {
2030 2031 int p = zio->io_prop.zp_copies;
2031 2032 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2032 2033 ddt_entry_t *dde = zio->io_private;
2033 2034 ddt_phys_t *ddp = &dde->dde_phys[p];
2034 2035 zio_t *pio;
2035 2036
2036 2037 if (zio->io_error)
2037 2038 return;
2038 2039
2039 2040 ddt_enter(ddt);
2040 2041
2041 2042 ASSERT(dde->dde_lead_zio[p] == zio);
2042 2043
2043 2044 ddt_phys_fill(ddp, zio->io_bp);
2044 2045
2045 2046 while ((pio = zio_walk_parents(zio)) != NULL)
2046 2047 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2047 2048
2048 2049 ddt_exit(ddt);
2049 2050 }
2050 2051
2051 2052 static void
2052 2053 zio_ddt_child_write_done(zio_t *zio)
2053 2054 {
2054 2055 int p = zio->io_prop.zp_copies;
2055 2056 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2056 2057 ddt_entry_t *dde = zio->io_private;
2057 2058 ddt_phys_t *ddp = &dde->dde_phys[p];
2058 2059
2059 2060 ddt_enter(ddt);
2060 2061
2061 2062 ASSERT(ddp->ddp_refcnt == 0);
2062 2063 ASSERT(dde->dde_lead_zio[p] == zio);
2063 2064 dde->dde_lead_zio[p] = NULL;
2064 2065
2065 2066 if (zio->io_error == 0) {
2066 2067 while (zio_walk_parents(zio) != NULL)
2067 2068 ddt_phys_addref(ddp);
2068 2069 } else {
2069 2070 ddt_phys_clear(ddp);
2070 2071 }
2071 2072
2072 2073 ddt_exit(ddt);
2073 2074 }
2074 2075
2075 2076 static void
2076 2077 zio_ddt_ditto_write_done(zio_t *zio)
2077 2078 {
2078 2079 int p = DDT_PHYS_DITTO;
2079 2080 zio_prop_t *zp = &zio->io_prop;
2080 2081 blkptr_t *bp = zio->io_bp;
2081 2082 ddt_t *ddt = ddt_select(zio->io_spa, bp);
2082 2083 ddt_entry_t *dde = zio->io_private;
2083 2084 ddt_phys_t *ddp = &dde->dde_phys[p];
2084 2085 ddt_key_t *ddk = &dde->dde_key;
2085 2086
2086 2087 ddt_enter(ddt);
2087 2088
2088 2089 ASSERT(ddp->ddp_refcnt == 0);
2089 2090 ASSERT(dde->dde_lead_zio[p] == zio);
2090 2091 dde->dde_lead_zio[p] = NULL;
2091 2092
2092 2093 if (zio->io_error == 0) {
2093 2094 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2094 2095 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2095 2096 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2096 2097 if (ddp->ddp_phys_birth != 0)
2097 2098 ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2098 2099 ddt_phys_fill(ddp, bp);
2099 2100 }
2100 2101
2101 2102 ddt_exit(ddt);
2102 2103 }
2103 2104
2104 2105 static int
2105 2106 zio_ddt_write(zio_t *zio)
2106 2107 {
2107 2108 spa_t *spa = zio->io_spa;
2108 2109 blkptr_t *bp = zio->io_bp;
2109 2110 uint64_t txg = zio->io_txg;
2110 2111 zio_prop_t *zp = &zio->io_prop;
2111 2112 int p = zp->zp_copies;
2112 2113 int ditto_copies;
2113 2114 zio_t *cio = NULL;
2114 2115 zio_t *dio = NULL;
2115 2116 ddt_t *ddt = ddt_select(spa, bp);
2116 2117 ddt_entry_t *dde;
2117 2118 ddt_phys_t *ddp;
2118 2119
2119 2120 ASSERT(BP_GET_DEDUP(bp));
2120 2121 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2121 2122 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2122 2123
2123 2124 ddt_enter(ddt);
2124 2125 dde = ddt_lookup(ddt, bp, B_TRUE);
2125 2126 ddp = &dde->dde_phys[p];
2126 2127
2127 2128 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2128 2129 /*
2129 2130 * If we're using a weak checksum, upgrade to a strong checksum
2130 2131 * and try again. If we're already using a strong checksum,
2131 2132 * we can't resolve it, so just convert to an ordinary write.
2132 2133 * (And automatically e-mail a paper to Nature?)
2133 2134 */
2134 2135 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2135 2136 zp->zp_checksum = spa_dedup_checksum(spa);
2136 2137 zio_pop_transforms(zio);
2137 2138 zio->io_stage = ZIO_STAGE_OPEN;
2138 2139 BP_ZERO(bp);
2139 2140 } else {
2140 2141 zp->zp_dedup = B_FALSE;
2141 2142 }
2142 2143 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2143 2144 ddt_exit(ddt);
2144 2145 return (ZIO_PIPELINE_CONTINUE);
2145 2146 }
2146 2147
2147 2148 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2148 2149 ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2149 2150
2150 2151 if (ditto_copies > ddt_ditto_copies_present(dde) &&
2151 2152 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2152 2153 zio_prop_t czp = *zp;
2153 2154
2154 2155 czp.zp_copies = ditto_copies;
2155 2156
2156 2157 /*
2157 2158 * If we arrived here with an override bp, we won't have run
2158 2159 * the transform stack, so we won't have the data we need to
2159 2160 * generate a child i/o. So, toss the override bp and restart.
2160 2161 * This is safe, because using the override bp is just an
2161 2162 * optimization; and it's rare, so the cost doesn't matter.
2162 2163 */
2163 2164 if (zio->io_bp_override) {
2164 2165 zio_pop_transforms(zio);
2165 2166 zio->io_stage = ZIO_STAGE_OPEN;
2166 2167 zio->io_pipeline = ZIO_WRITE_PIPELINE;
2167 2168 zio->io_bp_override = NULL;
2168 2169 BP_ZERO(bp);
2169 2170 ddt_exit(ddt);
2170 2171 return (ZIO_PIPELINE_CONTINUE);
2171 2172 }
2172 2173
2173 2174 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2174 2175 zio->io_orig_size, &czp, NULL,
2175 2176 zio_ddt_ditto_write_done, dde, zio->io_priority,
2176 2177 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2177 2178
2178 2179 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2179 2180 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2180 2181 }
2181 2182
2182 2183 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2183 2184 if (ddp->ddp_phys_birth != 0)
2184 2185 ddt_bp_fill(ddp, bp, txg);
2185 2186 if (dde->dde_lead_zio[p] != NULL)
2186 2187 zio_add_child(zio, dde->dde_lead_zio[p]);
2187 2188 else
2188 2189 ddt_phys_addref(ddp);
2189 2190 } else if (zio->io_bp_override) {
2190 2191 ASSERT(bp->blk_birth == txg);
2191 2192 ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2192 2193 ddt_phys_fill(ddp, bp);
2193 2194 ddt_phys_addref(ddp);
2194 2195 } else {
2195 2196 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2196 2197 zio->io_orig_size, zp, zio_ddt_child_write_ready,
2197 2198 zio_ddt_child_write_done, dde, zio->io_priority,
2198 2199 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2199 2200
2200 2201 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2201 2202 dde->dde_lead_zio[p] = cio;
2202 2203 }
2203 2204
2204 2205 ddt_exit(ddt);
2205 2206
2206 2207 if (cio)
2207 2208 zio_nowait(cio);
2208 2209 if (dio)
2209 2210 zio_nowait(dio);
2210 2211
2211 2212 return (ZIO_PIPELINE_CONTINUE);
2212 2213 }
2213 2214
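/*
 * Illustrative summary (not part of this change) of the cases handled
 * at the end of zio_ddt_write() above, for the requested copies p:
 *
 *	entry already on disk		fill bp from the ddt; add a ref
 *					(or wait on the in-flight lead)
 *	identical write in flight	become a child of the lead zio
 *	override bp supplied		record it in the entry; add a ref
 *	otherwise			issue a child write and become
 *					the lead zio for this entry
 */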
2214 2215 ddt_entry_t *freedde; /* for debugging */
2215 2216
2216 2217 static int
2217 2218 zio_ddt_free(zio_t *zio)
2218 2219 {
2219 2220 spa_t *spa = zio->io_spa;
2220 2221 blkptr_t *bp = zio->io_bp;
2221 2222 ddt_t *ddt = ddt_select(spa, bp);
2222 2223 ddt_entry_t *dde;
2223 2224 ddt_phys_t *ddp;
2224 2225
2225 2226 ASSERT(BP_GET_DEDUP(bp));
2226 2227 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2227 2228
2228 2229 ddt_enter(ddt);
2229 2230 freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2230 2231 ddp = ddt_phys_select(dde, bp);
2231 2232 ddt_phys_decref(ddp);
2232 2233 ddt_exit(ddt);
2233 2234
2234 2235 return (ZIO_PIPELINE_CONTINUE);
2235 2236 }
2236 2237
2237 2238 /*
2238 2239 * ==========================================================================
2239 2240 * Allocate and free blocks
2240 2241 * ==========================================================================
2241 2242 */
2242 2243 static int
2243 2244 zio_dva_allocate(zio_t *zio)
2244 2245 {
2245 2246 spa_t *spa = zio->io_spa;
2246 2247 metaslab_class_t *mc = spa_normal_class(spa);
2247 2248 blkptr_t *bp = zio->io_bp;
2248 2249 int error;
2249 2250 int flags = 0;
2250 2251
2251 2252 if (zio->io_gang_leader == NULL) {
2252 2253 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2253 2254 zio->io_gang_leader = zio;
2254 2255 }
2255 2256
2256 2257 ASSERT(BP_IS_HOLE(bp));
2257 2258 ASSERT0(BP_GET_NDVAS(bp));
2258 2259 ASSERT3U(zio->io_prop.zp_copies, >, 0);
2259 2260 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2260 2261 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2261 2262
2262 2263 /*
2263 2264 * The dump device does not support gang blocks so allocation on
2264 2265 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2265 2266 * the "fast" gang feature.
2266 2267 */
2267 2268 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2268 2269 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2269 2270 METASLAB_GANG_CHILD : 0;
2270 2271 error = metaslab_alloc(spa, mc, zio->io_size, bp,
2271 2272 zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2272 2273
2273 2274 if (error) {
2274 2275 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2275 2276 "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2276 2277 error);
2277 2278 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2278 2279 return (zio_write_gang_block(zio));
2279 2280 zio->io_error = error;
2280 2281 }
2281 2282
2282 2283 return (ZIO_PIPELINE_CONTINUE);
2283 2284 }
2284 2285
2285 2286 static int
2286 2287 zio_dva_free(zio_t *zio)
2287 2288 {
2288 2289 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2289 2290
2290 2291 return (ZIO_PIPELINE_CONTINUE);
2291 2292 }
2292 2293
2293 2294 static int
2294 2295 zio_dva_claim(zio_t *zio)
2295 2296 {
2296 2297 int error;
2297 2298
2298 2299 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2299 2300 if (error)
2300 2301 zio->io_error = error;
2301 2302
2302 2303 return (ZIO_PIPELINE_CONTINUE);
2303 2304 }
2304 2305
2305 2306 /*
2306 2307 * Undo an allocation. This is used by zio_done() when an I/O fails
2307 2308 * and we want to give back the block we just allocated.
2308 2309 * This handles both normal blocks and gang blocks.
2309 2310 */
2310 2311 static void
2311 2312 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2312 2313 {
2313 2314 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2314 2315 ASSERT(zio->io_bp_override == NULL);
2315 2316
2316 2317 if (!BP_IS_HOLE(bp))
2317 2318 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2318 2319
2319 2320 if (gn != NULL) {
2320 2321 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2321 2322 zio_dva_unallocate(zio, gn->gn_child[g],
2322 2323 &gn->gn_gbh->zg_blkptr[g]);
2323 2324 }
2324 2325 }
2325 2326 }
2326 2327
2327 2328 /*
2328 2329 * Try to allocate an intent log block. Return 0 on success, errno on failure.
2329 2330 */
2330 2331 int
2331 2332 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2332 2333 uint64_t size, boolean_t use_slog)
2333 2334 {
2334 2335 int error = 1;
2335 2336
2336 2337 ASSERT(txg > spa_syncing_txg(spa));
2337 2338
2338 2339 /*
2339 2340 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2340 2341 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2341 2342 * when allocating them.
2342 2343 */
2343 2344 if (use_slog) {
2344 2345 error = metaslab_alloc(spa, spa_log_class(spa), size,
2345 2346 new_bp, 1, txg, old_bp,
2346 2347 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2347 2348 }
2348 2349
2349 2350 if (error) {
2350 2351 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2351 2352 new_bp, 1, txg, old_bp,
2352 2353 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2353 2354 }
2354 2355
2355 2356 if (error == 0) {
2356 2357 BP_SET_LSIZE(new_bp, size);
2357 2358 BP_SET_PSIZE(new_bp, size);
2358 2359 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2359 2360 BP_SET_CHECKSUM(new_bp,
2360 2361 spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2361 2362 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2362 2363 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2363 2364 BP_SET_LEVEL(new_bp, 0);
2364 2365 BP_SET_DEDUP(new_bp, 0);
2365 2366 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2366 2367 }
2367 2368
2368 2369 return (error);
2369 2370 }
2370 2371
2371 2372 /*
2372 2373 * Free an intent log block.
2373 2374 */
2374 2375 void
2375 2376 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2376 2377 {
2377 2378 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2378 2379 ASSERT(!BP_IS_GANG(bp));
2379 2380
2380 2381 zio_free(spa, txg, bp);
2381 2382 }
2382 2383
2383 2384 /*
2384 2385 * ==========================================================================
2385 2386 * Read and write to physical devices
2386 2387 * ==========================================================================
2387 2388 */
2388 2389 static int
2389 2390 zio_vdev_io_start(zio_t *zio)
2390 2391 {
2391 2392 vdev_t *vd = zio->io_vd;
2392 2393 uint64_t align;
2393 2394 spa_t *spa = zio->io_spa;
2394 2395
2395 2396 ASSERT(zio->io_error == 0);
2396 2397 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2397 2398
2398 2399 if (vd == NULL) {
2399 2400 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2400 2401 spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2401 2402
2402 2403 /*
2403 2404 * The mirror_ops handle multiple DVAs in a single BP.
2404 2405 */
2405 2406 return (vdev_mirror_ops.vdev_op_io_start(zio));
2406 2407 }
2407 2408
2408 2409 /*
2409 2410 * We keep track of time-sensitive I/Os so that the scan thread
2410 2411 * can quickly react to certain workloads. In particular, we care
2411 2412 * about non-scrubbing, top-level reads and writes with the following
2412 2413 * characteristics:
2413 2414 * - synchronous writes of user data to non-slog devices
2414 2415 * - any reads of user data
2415 2416 * When these conditions are met, adjust the timestamp of spa_last_io
2416 2417 * which allows the scan thread to adjust its workload accordingly.
2417 2418 */
2418 2419 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2419 2420 vd == vd->vdev_top && !vd->vdev_islog &&
2420 2421 zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2421 2422 zio->io_txg != spa_syncing_txg(spa)) {
2422 2423 uint64_t old = spa->spa_last_io;
2423 2424 uint64_t new = ddi_get_lbolt64();
2424 2425 if (old != new)
2425 2426 (void) atomic_cas_64(&spa->spa_last_io, old, new);
2426 2427 }
2427 2428
2428 2429 align = 1ULL << vd->vdev_top->vdev_ashift;
2429 2430
2430 2431 if (P2PHASE(zio->io_size, align) != 0) {
2431 2432 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2432 2433 char *abuf = zio_buf_alloc(asize);
2433 2434 ASSERT(vd == vd->vdev_top);
2434 2435 if (zio->io_type == ZIO_TYPE_WRITE) {
2435 2436 bcopy(zio->io_data, abuf, zio->io_size);
2436 2437 bzero(abuf + zio->io_size, asize - zio->io_size);
2437 2438 }
2438 2439 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2439 2440 }
2440 2441
2441 2442 ASSERT(P2PHASE(zio->io_offset, align) == 0);
2442 2443 ASSERT(P2PHASE(zio->io_size, align) == 0);
2443 2444 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2444 2445
2445 2446 /*
2446 2447 * If this is a repair I/O, and there's no self-healing involved --
2447 2448 * that is, we're just resilvering what we expect to resilver --
2448 2449 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2449 2450 * This prevents spurious resilvering with nested replication.
2450 2451 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2451 2452 * A is out of date, we'll read from C+D, then use the data to
2452 2453 * resilver A+B -- but we don't actually want to resilver B, just A.
2453 2454 * The top-level mirror has no way to know this, so instead we just
2454 2455 * discard unnecessary repairs as we work our way down the vdev tree.
2455 2456 * The same logic applies to any form of nested replication:
2456 2457 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
2457 2458 */
2458 2459 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2459 2460 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2460 2461 zio->io_txg != 0 && /* not a delegated i/o */
2461 2462 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2462 2463 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2463 2464 zio_vdev_io_bypass(zio);
2464 2465 return (ZIO_PIPELINE_CONTINUE);
2465 2466 }
2466 2467
2467 2468 if (vd->vdev_ops->vdev_op_leaf &&
2468 2469 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2469 2470
2470 2471 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
2471 2472 return (ZIO_PIPELINE_CONTINUE);
2472 2473
2473 2474 if ((zio = vdev_queue_io(zio)) == NULL)
2474 2475 return (ZIO_PIPELINE_STOP);
2475 2476
2476 2477 if (!vdev_accessible(vd, zio)) {
2477 2478 zio->io_error = SET_ERROR(ENXIO);
2478 2479 zio_interrupt(zio);
2479 2480 return (ZIO_PIPELINE_STOP);
2480 2481 }
2481 2482 }
2482 2483
2483 2484 return (vd->vdev_ops->vdev_op_io_start(zio));
2484 2485 }
2485 2486
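/*
 * Worked example (illustrative, not part of this change) of the
 * alignment padding in zio_vdev_io_start() above.  On a top-level
 * vdev with ashift == 12 (4KB sectors), a 6KB zio gives:
 *
 *	P2PHASE(6144, 4096) == 2048	(misaligned, so pad)
 *	asize = P2ROUNDUP(6144, 4096) == 8192
 *
 * For a write, the 6KB of data is copied into an 8KB buffer and the
 * final 2KB is zero-filled before the I/O goes down to the device.
 */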
2486 2487 static int
2487 2488 zio_vdev_io_done(zio_t *zio)
2488 2489 {
2489 2490 vdev_t *vd = zio->io_vd;
2490 2491 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2491 2492 boolean_t unexpected_error = B_FALSE;
2492 2493
2493 2494 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2494 2495 return (ZIO_PIPELINE_STOP);
2495 2496
2496 2497 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2497 2498
2498 2499 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2499 2500
2500 2501 vdev_queue_io_done(zio);
2501 2502
2502 2503 if (zio->io_type == ZIO_TYPE_WRITE)
2503 2504 vdev_cache_write(zio);
2504 2505
2505 2506 if (zio_injection_enabled && zio->io_error == 0)
2506 2507 zio->io_error = zio_handle_device_injection(vd,
2507 2508 zio, EIO);
2508 2509
2509 2510 if (zio_injection_enabled && zio->io_error == 0)
2510 2511 zio->io_error = zio_handle_label_injection(zio, EIO);
2511 2512
2512 2513 if (zio->io_error) {
2513 2514 if (!vdev_accessible(vd, zio)) {
2514 2515 zio->io_error = SET_ERROR(ENXIO);
2515 2516 } else {
2516 2517 unexpected_error = B_TRUE;
2517 2518 }
2518 2519 }
2519 2520 }
2520 2521
2521 2522 ops->vdev_op_io_done(zio);
2522 2523
2523 2524 if (unexpected_error)
2524 2525 VERIFY(vdev_probe(vd, zio) == NULL);
2525 2526
2526 2527 return (ZIO_PIPELINE_CONTINUE);
2527 2528 }
2528 2529
2529 2530 /*
2530 2531 * For non-raidz ZIOs, we can just copy aside the bad data read from the
2531 2532 * disk, and use that to finish the checksum ereport later.
2532 2533 */
2533 2534 static void
2534 2535 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2535 2536 const void *good_buf)
2536 2537 {
2537 2538 /* no processing needed */
2538 2539 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2539 2540 }
2540 2541
2541 2542 /*ARGSUSED*/
2542 2543 void
2543 2544 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2544 2545 {
2545 2546 void *buf = zio_buf_alloc(zio->io_size);
2546 2547
2547 2548 bcopy(zio->io_data, buf, zio->io_size);
2548 2549
2549 2550 zcr->zcr_cbinfo = zio->io_size;
2550 2551 zcr->zcr_cbdata = buf;
2551 2552 zcr->zcr_finish = zio_vsd_default_cksum_finish;
2552 2553 zcr->zcr_free = zio_buf_free;
2553 2554 }
2554 2555
2555 2556 static int
2556 2557 zio_vdev_io_assess(zio_t *zio)
2557 2558 {
2558 2559 vdev_t *vd = zio->io_vd;
2559 2560
2560 2561 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2561 2562 return (ZIO_PIPELINE_STOP);
2562 2563
2563 2564 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2564 2565 spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2565 2566
2566 2567 if (zio->io_vsd != NULL) {
2567 2568 zio->io_vsd_ops->vsd_free(zio);
2568 2569 zio->io_vsd = NULL;
2569 2570 }
2570 2571
2571 2572 if (zio_injection_enabled && zio->io_error == 0)
2572 2573 zio->io_error = zio_handle_fault_injection(zio, EIO);
2573 2574
2574 2575 /*
2575 2576 * If the I/O failed, determine whether we should attempt to retry it.
2576 2577 *
2577 2578 * On retry, we cut in line in the issue queue, since we don't want
2578 2579 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2579 2580 */
2580 2581 if (zio->io_error && vd == NULL &&
2581 2582 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2582 2583 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2583 2584 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
2584 2585 zio->io_error = 0;
2585 2586 zio->io_flags |= ZIO_FLAG_IO_RETRY |
2586 2587 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2587 2588 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2588 2589 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2589 2590 zio_requeue_io_start_cut_in_line);
2590 2591 return (ZIO_PIPELINE_STOP);
2591 2592 }
2592 2593
2593 2594 /*
2594 2595 * If we got an error on a leaf device, convert it to ENXIO
2595 2596 * if the device is not accessible at all.
2596 2597 */
2597 2598 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2598 2599 !vdev_accessible(vd, zio))
2599 2600 zio->io_error = SET_ERROR(ENXIO);
2600 2601
2601 2602 /*
2602 2603 * If we can't write to an interior vdev (mirror or RAID-Z),
2603 2604 * set vdev_cant_write so that we stop trying to allocate from it.
2604 2605 */
2605 2606 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2606 2607 vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2607 2608 vd->vdev_cant_write = B_TRUE;
2608 2609 }
2609 2610
2610 2611 if (zio->io_error)
2611 2612 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2612 2613
2613 2614 return (ZIO_PIPELINE_CONTINUE);
2614 2615 }
2615 2616
2616 2617 void
2617 2618 zio_vdev_io_reissue(zio_t *zio)
2618 2619 {
2619 2620 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2620 2621 ASSERT(zio->io_error == 0);
2621 2622
2622 2623 zio->io_stage >>= 1;
2623 2624 }
2624 2625
2625 2626 void
2626 2627 zio_vdev_io_redone(zio_t *zio)
2627 2628 {
2628 2629 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2629 2630
2630 2631 zio->io_stage >>= 1;
2631 2632 }
2632 2633
2633 2634 void
2634 2635 zio_vdev_io_bypass(zio_t *zio)
2635 2636 {
2636 2637 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2637 2638 ASSERT(zio->io_error == 0);
2638 2639
2639 2640 zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2640 2641 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2641 2642 }
2642 2643
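/*
 * Illustrative note (not part of this change): pipeline stages are
 * encoded as one-hot bits, which is why the reissue/redone/bypass
 * helpers above can rewind exactly one stage with "io_stage >>= 1",
 * and why setting io_stage to "ZIO_STAGE_VDEV_IO_ASSESS >> 1" resumes
 * execution *at* the assess stage: zio_execute() advances io_stage to
 * the next bit that is still set in io_pipeline before dispatching.
 */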
2643 2644 /*
2644 2645 * ==========================================================================
2645 2646 * Generate and verify checksums
2646 2647 * ==========================================================================
2647 2648 */
2648 2649 static int
2649 2650 zio_checksum_generate(zio_t *zio)
2650 2651 {
2651 2652 blkptr_t *bp = zio->io_bp;
2652 2653 enum zio_checksum checksum;
2653 2654
2654 2655 if (bp == NULL) {
2655 2656 /*
2656 2657 * This is zio_write_phys().
2657 2658 * We're either generating a label checksum, or none at all.
2658 2659 */
2659 2660 checksum = zio->io_prop.zp_checksum;
2660 2661
2661 2662 if (checksum == ZIO_CHECKSUM_OFF)
2662 2663 return (ZIO_PIPELINE_CONTINUE);
2663 2664
2664 2665 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2665 2666 } else {
2666 2667 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2667 2668 ASSERT(!IO_IS_ALLOCATING(zio));
2668 2669 checksum = ZIO_CHECKSUM_GANG_HEADER;
2669 2670 } else {
2670 2671 checksum = BP_GET_CHECKSUM(bp);
2671 2672 }
2672 2673 }
2673 2674
2674 2675 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2675 2676
2676 2677 return (ZIO_PIPELINE_CONTINUE);
2677 2678 }
2678 2679
2679 2680 static int
2680 2681 zio_checksum_verify(zio_t *zio)
2681 2682 {
2682 2683 zio_bad_cksum_t info;
2683 2684 blkptr_t *bp = zio->io_bp;
2684 2685 int error;
2685 2686
2686 2687 ASSERT(zio->io_vd != NULL);
2687 2688
2688 2689 if (bp == NULL) {
2689 2690 /*
2690 2691 * This is zio_read_phys().
2691 2692 * We're either verifying a label checksum, or nothing at all.
2692 2693 */
2693 2694 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2694 2695 return (ZIO_PIPELINE_CONTINUE);
2695 2696
2696 2697 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2697 2698 }
2698 2699
2699 2700 if ((error = zio_checksum_error(zio, &info)) != 0) {
2700 2701 zio->io_error = error;
2701 2702 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2702 2703 zfs_ereport_start_checksum(zio->io_spa,
2703 2704 zio->io_vd, zio, zio->io_offset,
2704 2705 zio->io_size, NULL, &info);
2705 2706 }
2706 2707 }
2707 2708
2708 2709 return (ZIO_PIPELINE_CONTINUE);
2709 2710 }
2710 2711
2711 2712 /*
2712 2713 * Called by RAID-Z to ensure we don't compute the checksum twice.
2713 2714 */
2714 2715 void
2715 2716 zio_checksum_verified(zio_t *zio)
2716 2717 {
2717 2718 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2718 2719 }
2719 2720
2720 2721 /*
2721 2722 * ==========================================================================
2722 2723 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2723 2724 * An error of 0 indicates success. ENXIO indicates whole-device failure,
2724 2725 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
2725 2726 * indicate errors that are specific to one I/O, and most likely permanent.
2726 2727 * Any other error is presumed to be worse because we weren't expecting it.
2727 2728 * ==========================================================================
2728 2729 */
2729 2730 int
2730 2731 zio_worst_error(int e1, int e2)
2731 2732 {
2732 2733 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2733 2734 int r1, r2;
2734 2735
2735 2736 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2736 2737 if (e1 == zio_error_rank[r1])
2737 2738 break;
2738 2739
2739 2740 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2740 2741 if (e2 == zio_error_rank[r2])
2741 2742 break;
2742 2743
2743 2744 return (r1 > r2 ? e1 : e2);
2744 2745 }
2745 2746
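/*
 * Illustrative examples (not part of this change) of the ranking
 * implemented above; an errno not in zio_error_rank[] falls off the
 * end of the table and therefore ranks worst of all:
 *
 *	zio_worst_error(0, ENXIO) == ENXIO	  any error beats success
 *	zio_worst_error(ENXIO, ECKSUM) == ECKSUM  per-I/O beats device-wide
 *	zio_worst_error(EIO, EINVAL) == EINVAL	  unexpected ranks worst
 */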
2746 2747 /*
2747 2748 * ==========================================================================
2748 2749 * I/O completion
2749 2750 * ==========================================================================
2750 2751 */
2751 2752 static int
2752 2753 zio_ready(zio_t *zio)
2753 2754 {
2754 2755 blkptr_t *bp = zio->io_bp;
2755 2756 zio_t *pio, *pio_next;
2756 2757
2757 2758 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2758 2759 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2759 2760 return (ZIO_PIPELINE_STOP);
2760 2761
2761 2762 if (zio->io_ready) {
2762 2763 ASSERT(IO_IS_ALLOCATING(zio));
2763 2764 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2764 2765 (zio->io_flags & ZIO_FLAG_NOPWRITE));
2765 2766 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2766 2767
2767 2768 zio->io_ready(zio);
2768 2769 }
2769 2770
2770 2771 if (bp != NULL && bp != &zio->io_bp_copy)
2771 2772 zio->io_bp_copy = *bp;
2772 2773
2773 2774 if (zio->io_error)
2774 2775 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2775 2776
2776 2777 mutex_enter(&zio->io_lock);
2777 2778 zio->io_state[ZIO_WAIT_READY] = 1;
2778 2779 pio = zio_walk_parents(zio);
2779 2780 mutex_exit(&zio->io_lock);
2780 2781
2781 2782 /*
2782 2783 * As we notify zio's parents, new parents could be added.
2783 2784 * New parents go to the head of zio's io_parent_list, however,
2784 2785 * so we will (correctly) not notify them. The remainder of zio's
2785 2786 * io_parent_list, from 'pio_next' onward, cannot change because
2786 2787 * all parents must wait for us to be done before they can be done.
2787 2788 */
2788 2789 for (; pio != NULL; pio = pio_next) {
2789 2790 pio_next = zio_walk_parents(zio);
2790 2791 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2791 2792 }
2792 2793
2793 2794 if (zio->io_flags & ZIO_FLAG_NODATA) {
2794 2795 if (BP_IS_GANG(bp)) {
2795 2796 zio->io_flags &= ~ZIO_FLAG_NODATA;
2796 2797 } else {
2797 2798 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2798 2799 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2799 2800 }
2800 2801 }
2801 2802
2802 2803 if (zio_injection_enabled &&
2803 2804 zio->io_spa->spa_syncing_txg == zio->io_txg)
2804 2805 zio_handle_ignored_writes(zio);
2805 2806
2806 2807 return (ZIO_PIPELINE_CONTINUE);
2807 2808 }
2808 2809
2809 2810 static int
2810 2811 zio_done(zio_t *zio)
2811 2812 {
2812 2813 spa_t *spa = zio->io_spa;
2813 2814 zio_t *lio = zio->io_logical;
2814 2815 blkptr_t *bp = zio->io_bp;
2815 2816 vdev_t *vd = zio->io_vd;
2816 2817 uint64_t psize = zio->io_size;
2817 2818 zio_t *pio, *pio_next;
2818 2819
2819 2820 /*
2820 2821 * If our children haven't all completed,
2821 2822 * wait for them and then repeat this pipeline stage.
2822 2823 */
2823 2824 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2824 2825 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2825 2826 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
2826 2827 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2827 2828 return (ZIO_PIPELINE_STOP);
2828 2829
2829 2830 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2830 2831 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2831 2832 ASSERT(zio->io_children[c][w] == 0);
2832 2833
2833 2834 if (bp != NULL) {
2834 2835 ASSERT(bp->blk_pad[0] == 0);
2835 2836 ASSERT(bp->blk_pad[1] == 0);
2836 2837 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2837 2838 (bp == zio_unique_parent(zio)->io_bp));
2838 2839 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2839 2840 zio->io_bp_override == NULL &&
2840 2841 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2841 2842 ASSERT(!BP_SHOULD_BYTESWAP(bp));
2842 2843 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
2843 2844 ASSERT(BP_COUNT_GANG(bp) == 0 ||
2844 2845 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2845 2846 }
2846 2847 if (zio->io_flags & ZIO_FLAG_NOPWRITE)
2847 2848 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
2848 2849 }
2849 2850
2850 2851 /*
2851 2852 * If there were child vdev/gang/ddt errors, they apply to us now.
2852 2853 */
2853 2854 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
2854 2855 zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
2855 2856 zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
2856 2857
2857 2858 /*
2858 2859 * If the I/O on the transformed data was successful, generate any
2859 2860 * checksum reports now while we still have the transformed data.
2860 2861 */
2861 2862 if (zio->io_error == 0) {
2862 2863 while (zio->io_cksum_report != NULL) {
2863 2864 zio_cksum_report_t *zcr = zio->io_cksum_report;
2864 2865 uint64_t align = zcr->zcr_align;
2865 2866 uint64_t asize = P2ROUNDUP(psize, align);
2866 2867 char *abuf = zio->io_data;
2867 2868
2868 2869 if (asize != psize) {
2869 2870 abuf = zio_buf_alloc(asize);
2870 2871 bcopy(zio->io_data, abuf, psize);
2871 2872 bzero(abuf + psize, asize - psize);
2872 2873 }
2873 2874
2874 2875 zio->io_cksum_report = zcr->zcr_next;
2875 2876 zcr->zcr_next = NULL;
2876 2877 zcr->zcr_finish(zcr, abuf);
2877 2878 zfs_ereport_free_checksum(zcr);
2878 2879
2879 2880 if (asize != psize)
2880 2881 zio_buf_free(abuf, asize);
2881 2882 }
2882 2883 }
2883 2884
2884 2885 zio_pop_transforms(zio); /* note: may set zio->io_error */
2885 2886
2886 2887 vdev_stat_update(zio, psize);
2887 2888
2888 2889 if (zio->io_error) {
2889 2890 /*
2890 2891 * If this I/O is attached to a particular vdev,
2891 2892 * generate an error message describing the I/O failure
2892 2893 * at the block level. We ignore these errors if the
2893 2894 * device is currently unavailable.
2894 2895 */
2895 2896 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
2896 2897 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
2897 2898
2898 2899 if ((zio->io_error == EIO || !(zio->io_flags &
2899 2900 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
2900 2901 zio == lio) {
2901 2902 /*
2902 2903 * For logical I/O requests, tell the SPA to log the
2903 2904 * error and generate a logical data ereport.
2904 2905 */
2905 2906 spa_log_error(spa, zio);
2906 2907 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
2907 2908 0, 0);
2908 2909 }
2909 2910 }
2910 2911
2911 2912 if (zio->io_error && zio == lio) {
2912 2913 /*
2913 2914 * Determine whether zio should be reexecuted. This will
2914 2915 * propagate all the way to the root via zio_notify_parent().
2915 2916 */
2916 2917 ASSERT(vd == NULL && bp != NULL);
2917 2918 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2918 2919
2919 2920 if (IO_IS_ALLOCATING(zio) &&
2920 2921 !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
2921 2922 if (zio->io_error != ENOSPC)
2922 2923 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2923 2924 else
2924 2925 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2925 2926 }
2926 2927
2927 2928 if ((zio->io_type == ZIO_TYPE_READ ||
2928 2929 zio->io_type == ZIO_TYPE_FREE) &&
2929 2930 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
2930 2931 zio->io_error == ENXIO &&
2931 2932 spa_load_state(spa) == SPA_LOAD_NONE &&
2932 2933 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2933 2934 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2934 2935
2935 2936 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2936 2937 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2937 2938
2938 2939 /*
2939 2940 * Here is a possibly good place to attempt to do
2940 2941 * either combinatorial reconstruction or error correction
2941 2942 * based on checksums. It also might be a good place
2942 2943 * to send out preliminary ereports before we suspend
2943 2944 * processing.
2944 2945 */
2945 2946 }
2946 2947
2947 2948 /*
2948 2949 * If there were logical child errors, they apply to us now.
2949 2950 * We defer this until now to avoid conflating logical child
2950 2951 * errors with errors that happened to the zio itself when
2951 2952 * updating vdev stats and reporting FMA events above.
2952 2953 */
2953 2954 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
2954 2955
2955 2956 if ((zio->io_error || zio->io_reexecute) &&
2956 2957 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
2957 2958 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
2958 2959 zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2959 2960
2960 2961 zio_gang_tree_free(&zio->io_gang_tree);
2961 2962
2962 2963 /*
2963 2964 * Godfather I/Os should never suspend.
2964 2965 */
2965 2966 if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
2966 2967 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
2967 2968 zio->io_reexecute = 0;
2968 2969
2969 2970 if (zio->io_reexecute) {
2970 2971 /*
2971 2972 * This is a logical I/O that wants to reexecute.
2972 2973 *
2973 2974 * Reexecute is top-down. When an i/o fails, if it's not
2974 2975 * the root, it simply notifies its parent and sticks around.
2975 2976 * The parent, seeing that it still has children in zio_done(),
2976 2977 * does the same. This percolates all the way up to the root.
2977 2978 * The root i/o will reexecute or suspend the entire tree.
2978 2979 *
2979 2980 * This approach ensures that zio_reexecute() honors
2980 2981 * all the original i/o dependency relationships, e.g.
2981 2982 * parents not executing until children are ready.
2982 2983 */
2983 2984 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2984 2985
2985 2986 zio->io_gang_leader = NULL;
2986 2987
2987 2988 mutex_enter(&zio->io_lock);
2988 2989 zio->io_state[ZIO_WAIT_DONE] = 1;
2989 2990 mutex_exit(&zio->io_lock);
2990 2991
2991 2992 /*
2992 2993 * "The Godfather" I/O monitors its children but is
2993 2994 * not a true parent to them. It will track them through
2994 2995 * the pipeline but severs its ties whenever they get into
2995 2996 * trouble (e.g. suspended). This allows "The Godfather"
2996 2997 * I/O to return status without blocking.
2997 2998 */
2998 2999 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2999 3000 zio_link_t *zl = zio->io_walk_link;
3000 3001 pio_next = zio_walk_parents(zio);
3001 3002
3002 3003 if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3003 3004 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3004 3005 zio_remove_child(pio, zio, zl);
3005 3006 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3006 3007 }
3007 3008 }
3008 3009
3009 3010 if ((pio = zio_unique_parent(zio)) != NULL) {
3010 3011 /*
3011 3012 * We're not a root i/o, so there's nothing to do
3012 3013 * but notify our parent. Don't propagate errors
3013 3014 * upward since we haven't permanently failed yet.
3014 3015 */
3015 3016 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3016 3017 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3017 3018 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3018 3019 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3019 3020 /*
3020 3021 * We'd fail again if we reexecuted now, so suspend
3021 3022 * until conditions improve (e.g. device comes online).
3022 3023 */
3023 3024 zio_suspend(spa, zio);
3024 3025 } else {
3025 3026 /*
3026 3027 * Reexecution is potentially a huge amount of work.
3027 3028 * Hand it off to the otherwise-unused claim taskq.
3028 3029 */
3029 3030 ASSERT(zio->io_tqent.tqent_next == NULL);
3030 3031 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
3031 3032 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
3032 3033 0, &zio->io_tqent);
3033 3034 }
3034 3035 return (ZIO_PIPELINE_STOP);
3035 3036 }
3036 3037
3037 3038 ASSERT(zio->io_child_count == 0);
3038 3039 ASSERT(zio->io_reexecute == 0);
3039 3040 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3040 3041
3041 3042 /*
3042 3043 * Report any checksum errors, since the I/O is complete.
3043 3044 */
3044 3045 while (zio->io_cksum_report != NULL) {
3045 3046 zio_cksum_report_t *zcr = zio->io_cksum_report;
3046 3047 zio->io_cksum_report = zcr->zcr_next;
3047 3048 zcr->zcr_next = NULL;
3048 3049 zcr->zcr_finish(zcr, NULL);
3049 3050 zfs_ereport_free_checksum(zcr);
3050 3051 }
3051 3052
3052 3053 /*
3053 3054 * It is the responsibility of the done callback to ensure that this
3054 3055 * particular zio is no longer discoverable for adoption, and as
3055 3056 * such, cannot acquire any new parents.
3056 3057 */
3057 3058 if (zio->io_done)
3058 3059 zio->io_done(zio);
3059 3060
3060 3061 mutex_enter(&zio->io_lock);
3061 3062 zio->io_state[ZIO_WAIT_DONE] = 1;
3062 3063 mutex_exit(&zio->io_lock);
3063 3064
3064 3065 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3065 3066 zio_link_t *zl = zio->io_walk_link;
3066 3067 pio_next = zio_walk_parents(zio);
3067 3068 zio_remove_child(pio, zio, zl);
3068 3069 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3069 3070 }
3070 3071
3071 3072 if (zio->io_waiter != NULL) {
3072 3073 mutex_enter(&zio->io_lock);
3073 3074 zio->io_executor = NULL;
3074 3075 cv_broadcast(&zio->io_cv);
3075 3076 mutex_exit(&zio->io_lock);
3076 3077 } else {
3077 3078 zio_destroy(zio);
3078 3079 }
3079 3080
3080 3081 return (ZIO_PIPELINE_STOP);
3081 3082 }
3082 3083
3083 3084 /*
3084 3085 * ==========================================================================
3085 3086 * I/O pipeline definition
3086 3087 * ==========================================================================
3087 3088 */
3088 3089 static zio_pipe_stage_t *zio_pipeline[] = {
3089 3090 NULL,
3090 3091 zio_read_bp_init,
3091 3092 zio_free_bp_init,
3092 3093 zio_issue_async,
3093 3094 zio_write_bp_init,
3094 3095 zio_checksum_generate,
3095 3096 zio_nop_write,
3096 3097 zio_ddt_read_start,
3097 3098 zio_ddt_read_done,
3098 3099 zio_ddt_write,
3099 3100 zio_ddt_free,
3100 3101 zio_gang_assemble,
3101 3102 zio_gang_issue,
3102 3103 zio_dva_allocate,
3103 3104 zio_dva_free,
3104 3105 zio_dva_claim,
3105 3106 zio_ready,
3106 3107 zio_vdev_io_start,
3107 3108 zio_vdev_io_done,
3108 3109 zio_vdev_io_assess,
3109 3110 zio_checksum_verify,
3110 3111 zio_done
3111 3112 };
3112 3113
3113 3114 /* dnp is the dnode for zb1->zb_object */
3114 3115 boolean_t
3115 3116 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
3116 3117 const zbookmark_t *zb2)
3117 3118 {
3118 3119 uint64_t zb1nextL0, zb2thisobj;
3119 3120
3120 3121 ASSERT(zb1->zb_objset == zb2->zb_objset);
3121 3122 ASSERT(zb2->zb_level == 0);
3122 3123
3123 3124 /*
3124 3125 * A bookmark in the deadlist is considered to be after
3125 3126 * everything else.
3126 3127 */
3127 3128 if (zb2->zb_object == DMU_DEADLIST_OBJECT)
3128 3129 return (B_TRUE);
3129 3130
3130 3131 /* The objset_phys_t isn't before anything. */
3131 3132 if (dnp == NULL)
3132 3133 return (B_FALSE);
3133 3134
3134 3135 zb1nextL0 = (zb1->zb_blkid + 1) <<
3135 3136 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3136 3137
3137 3138 zb2thisobj = zb2->zb_object ? zb2->zb_object :
3138 3139 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3139 3140
3140 3141 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3141 3142 uint64_t nextobj = zb1nextL0 *
3142 3143 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3143 3144 return (nextobj <= zb2thisobj);
3144 3145 }
3145 3146
3146 3147 if (zb1->zb_object < zb2thisobj)
3147 3148 return (B_TRUE);
3148 3149 if (zb1->zb_object > zb2thisobj)
3149 3150 return (B_FALSE);
3150 3151 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3151 3152 return (B_FALSE);
3152 3153 return (zb1nextL0 <= zb2->zb_blkid);
3153 3154 }
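/*
 * Worked example (illustrative, not part of this change), assuming
 * 16KB indirect blocks (dn_indblkshift == 14) and SPA_BLKPTRSHIFT == 7,
 * i.e. 128 block pointers per indirect block.  For
 * zb1 = { level = 1, blkid = 3 }:
 *
 *	zb1nextL0 = (3 + 1) << (1 * (14 - 7)) = 4 << 7 = 512
 *
 * That L1 block covers L0 blkids [384, 512), so within the same object
 * zb1 is before { level = 0, blkid = 512 } but not before
 * { level = 0, blkid = 511 }.
 */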
2256 lines elided