illumos-3742.1 Wdiff usr/src/uts/common/fs/zfs/zio.c

Print this page

3742 zfs comments need cleaner, more consistent style
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>
Reviewed by:    George Wilson <george.wilson@delphix.com>
Reviewed by:    Eric Schrock <eric.schrock@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/zio.c
          +++ new/usr/src/uts/common/fs/zfs/zio.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/fm/fs/zfs.h>
  29   29  #include <sys/spa.h>
  30   30  #include <sys/txg.h>
  31   31  #include <sys/spa_impl.h>
  32   32  #include <sys/vdev_impl.h>
  33   33  #include <sys/zio_impl.h>
  34   34  #include <sys/zio_compress.h>
  35   35  #include <sys/zio_checksum.h>
  36   36  #include <sys/dmu_objset.h>
  37   37  #include <sys/arc.h>
  38   38  #include <sys/ddt.h>
  39   39  
  40   40  /*
  41   41   * ==========================================================================
  42   42   * I/O priority table
  43   43   * ==========================================================================
  44   44   */
  45   45  uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  46   46          0,      /* ZIO_PRIORITY_NOW             */
  47   47          0,      /* ZIO_PRIORITY_SYNC_READ       */
  48   48          0,      /* ZIO_PRIORITY_SYNC_WRITE      */
  49   49          0,      /* ZIO_PRIORITY_LOG_WRITE       */
  50   50          1,      /* ZIO_PRIORITY_CACHE_FILL      */
  51   51          1,      /* ZIO_PRIORITY_AGG             */
  52   52          4,      /* ZIO_PRIORITY_FREE            */
  53   53          4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
  54   54          6,      /* ZIO_PRIORITY_ASYNC_READ      */
  55   55          10,     /* ZIO_PRIORITY_RESILVER        */
  56   56          20,     /* ZIO_PRIORITY_SCRUB           */
  57   57          2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
  58   58  };
  59   59  
  60   60  /*
  61   61   * ==========================================================================
  62   62   * I/O type descriptions
  63   63   * ==========================================================================
  64   64   */
  65   65  char *zio_type_name[ZIO_TYPES] = {
  66   66          "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
  67   67          "zio_ioctl"
  68   68  };
  69   69  
  70   70  /*
  71   71   * ==========================================================================
  72   72   * I/O kmem caches
  73   73   * ==========================================================================
  74   74   */
  75   75  kmem_cache_t *zio_cache;
  76   76  kmem_cache_t *zio_link_cache;
  77   77  kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  78   78  kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  79   79  
  80   80  #ifdef _KERNEL
  81   81  extern vmem_t *zio_alloc_arena;
  82   82  #endif
  83   83  extern int zfs_mg_alloc_failures;
  84   84  
  85   85  /*
  86   86   * The following actions directly affect the spa's sync-to-convergence logic.
  87   87   * The values below define the sync pass when we start performing the action.
  88   88   * Care should be taken when changing these values as they directly impact
  89   89   * spa_sync() performance. Tuning these values may introduce subtle performance
  90   90   * pathologies and should only be done in the context of performance analysis.
  91   91   * These tunables will eventually be removed and replaced with #defines once
  92   92   * enough analysis has been done to determine optimal values.
  93   93   *
  94   94   * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  95   95   * regular blocks are not deferred.
  96   96   */
  97   97  int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
  98   98  int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
  99   99  int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
 100  100  
 101  101  /*
 102  102   * An allocating zio is one that either currently has the DVA allocate
 103  103   * stage set or will have it later in its lifetime.
 104  104   */
 105  105  #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 106  106  
 107  107  boolean_t       zio_requeue_io_start_cut_in_line = B_TRUE;
 108  108  
 109  109  #ifdef ZFS_DEBUG
 110  110  int zio_buf_debug_limit = 16384;
 111  111  #else
 112  112  int zio_buf_debug_limit = 0;
 113  113  #endif
 114  114  
 115  115  void
 116  116  zio_init(void)
 117  117  {
 118  118          size_t c;
 119  119          vmem_t *data_alloc_arena = NULL;
 120  120  
 121  121  #ifdef _KERNEL
 122  122          data_alloc_arena = zio_alloc_arena;
 123  123  #endif
 124  124          zio_cache = kmem_cache_create("zio_cache",
 125  125              sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 126  126          zio_link_cache = kmem_cache_create("zio_link_cache",
 127  127              sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 128  128  
 129  129          /*
 130  130           * For small buffers, we want a cache for each multiple of
 131  131           * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
 132  132           * for each quarter-power of 2.  For large buffers, we want
 133  133           * a cache for each multiple of PAGESIZE.
 134  134           */
 135  135          for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 136  136                  size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 137  137                  size_t p2 = size;
 138  138                  size_t align = 0;
 139  139                  size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 140  140  
 141  141                  while (p2 & (p2 - 1))
 142  142                          p2 &= p2 - 1;
 143  143  
 144  144  #ifndef _KERNEL
 145  145                  /*
 146  146                   * If we are using watchpoints, put each buffer on its own page,
 147  147                   * to eliminate the performance overhead of trapping to the
 148  148                   * kernel when modifying a non-watched buffer that shares the
 149  149                   * page with a watched buffer.
 150  150                   */
 151  151                  if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 152  152                          continue;
 153  153  #endif
 154  154                  if (size <= 4 * SPA_MINBLOCKSIZE) {
 155  155                          align = SPA_MINBLOCKSIZE;
 156  156                  } else if (IS_P2ALIGNED(size, PAGESIZE)) {
 157  157                          align = PAGESIZE;
 158  158                  } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 159  159                          align = p2 >> 2;
 160  160                  }
 161  161  
 162  162                  if (align != 0) {
 163  163                          char name[36];
 164  164                          (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 165  165                          zio_buf_cache[c] = kmem_cache_create(name, size,
 166  166                              align, NULL, NULL, NULL, NULL, NULL, cflags);
 167  167  
 168  168                          /*
 169  169                           * Since zio_data bufs do not appear in crash dumps, we
 170  170                           * pass KMC_NOTOUCH so that no allocator metadata is
 171  171                           * stored with the buffers.
 172  172                           */
 173  173                          (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 174  174                          zio_data_buf_cache[c] = kmem_cache_create(name, size,
 175  175                              align, NULL, NULL, NULL, NULL, data_alloc_arena,
 176  176                              cflags | KMC_NOTOUCH);
 177  177                  }
 178  178          }
 179  179  
 180  180          while (--c != 0) {
 181  181                  ASSERT(zio_buf_cache[c] != NULL);
 182  182                  if (zio_buf_cache[c - 1] == NULL)
 183  183                          zio_buf_cache[c - 1] = zio_buf_cache[c];
 184  184  
 185  185                  ASSERT(zio_data_buf_cache[c] != NULL);
 186  186                  if (zio_data_buf_cache[c - 1] == NULL)
 187  187                          zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 188  188          }
 189  189  
 190  190          /*
 191  191           * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
 192  192           * to fail 3 times per txg or 8 failures, whichever is greater.
 193  193           */
 194  194          zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
 195  195  
 196  196          zio_inject_init();
 197  197  }
 198  198  
 199  199  void
 200  200  zio_fini(void)
 201  201  {
 202  202          size_t c;
 203  203          kmem_cache_t *last_cache = NULL;
 204  204          kmem_cache_t *last_data_cache = NULL;
 205  205  
 206  206          for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 207  207                  if (zio_buf_cache[c] != last_cache) {
 208  208                          last_cache = zio_buf_cache[c];
 209  209                          kmem_cache_destroy(zio_buf_cache[c]);
 210  210                  }
 211  211                  zio_buf_cache[c] = NULL;
 212  212  
 213  213                  if (zio_data_buf_cache[c] != last_data_cache) {
 214  214                          last_data_cache = zio_data_buf_cache[c];
 215  215                          kmem_cache_destroy(zio_data_buf_cache[c]);
 216  216                  }
 217  217                  zio_data_buf_cache[c] = NULL;
 218  218          }
 219  219  
 220  220          kmem_cache_destroy(zio_link_cache);
 221  221          kmem_cache_destroy(zio_cache);
 222  222  
 223  223          zio_inject_fini();
 224  224  }
 225  225  
 226  226  /*
 227  227   * ==========================================================================
 228  228   * Allocate and free I/O buffers
 229  229   * ==========================================================================
 230  230   */
 231  231  
 232  232  /*
 233  233   * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 234  234   * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 235  235   * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 236  236   * excess / transient data in-core during a crashdump.
 237  237   */
 238  238  void *
 239  239  zio_buf_alloc(size_t size)
 240  240  {
 241  241          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 242  242  
 243  243          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 244  244  
 245  245          return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 246  246  }
 247  247  
 248  248  /*
 249  249   * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 250  250   * crashdump if the kernel panics.  This exists so that we will limit the amount
 251  251   * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 252  252   * of kernel heap dumped to disk when the kernel panics)
 253  253   */
 254  254  void *
 255  255  zio_data_buf_alloc(size_t size)
 256  256  {
 257  257          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 258  258  
 259  259          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 260  260  
 261  261          return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 262  262  }
 263  263  
 264  264  void
 265  265  zio_buf_free(void *buf, size_t size)
 266  266  {
 267  267          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 268  268  
 269  269          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 270  270  
 271  271          kmem_cache_free(zio_buf_cache[c], buf);
 272  272  }
 273  273  
 274  274  void
 275  275  zio_data_buf_free(void *buf, size_t size)
 276  276  {
 277  277          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 278  278  
 279  279          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 280  280  
 281  281          kmem_cache_free(zio_data_buf_cache[c], buf);
 282  282  }
 283  283  
 284  284  /*
 285  285   * ==========================================================================
 286  286   * Push and pop I/O transform buffers
 287  287   * ==========================================================================
 288  288   */
 289  289  static void
 290  290  zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 291  291          zio_transform_func_t *transform)
 292  292  {
 293  293          zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 294  294  
 295  295          zt->zt_orig_data = zio->io_data;
 296  296          zt->zt_orig_size = zio->io_size;
 297  297          zt->zt_bufsize = bufsize;
 298  298          zt->zt_transform = transform;
 299  299  
 300  300          zt->zt_next = zio->io_transform_stack;
 301  301          zio->io_transform_stack = zt;
 302  302  
 303  303          zio->io_data = data;
 304  304          zio->io_size = size;
 305  305  }
 306  306  
 307  307  static void
 308  308  zio_pop_transforms(zio_t *zio)
 309  309  {
 310  310          zio_transform_t *zt;
 311  311  
 312  312          while ((zt = zio->io_transform_stack) != NULL) {
 313  313                  if (zt->zt_transform != NULL)
 314  314                          zt->zt_transform(zio,
 315  315                              zt->zt_orig_data, zt->zt_orig_size);
 316  316  
 317  317                  if (zt->zt_bufsize != 0)
 318  318                          zio_buf_free(zio->io_data, zt->zt_bufsize);
 319  319  
 320  320                  zio->io_data = zt->zt_orig_data;
 321  321                  zio->io_size = zt->zt_orig_size;
 322  322                  zio->io_transform_stack = zt->zt_next;
 323  323  
 324  324                  kmem_free(zt, sizeof (zio_transform_t));
 325  325          }
 326  326  }
 327  327  
 328  328  /*
 329  329   * ==========================================================================
 330  330   * I/O transform callbacks for subblocks and decompression
 331  331   * ==========================================================================
 332  332   */
 333  333  static void
 334  334  zio_subblock(zio_t *zio, void *data, uint64_t size)
 335  335  {
 336  336          ASSERT(zio->io_size > size);
 337  337  
 338  338          if (zio->io_type == ZIO_TYPE_READ)
 339  339                  bcopy(zio->io_data, data, size);
 340  340  }
 341  341  
 342  342  static void
 343  343  zio_decompress(zio_t *zio, void *data, uint64_t size)
 344  344  {
 345  345          if (zio->io_error == 0 &&
 346  346              zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 347  347              zio->io_data, data, zio->io_size, size) != 0)
 348  348                  zio->io_error = SET_ERROR(EIO);
 349  349  }
 350  350  
 351  351  /*
 352  352   * ==========================================================================
 353  353   * I/O parent/child relationships and pipeline interlocks
 354  354   * ==========================================================================
 355  355   */
 356  356  /*
  357  357   * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 358  358   *        continue calling these functions until they return NULL.
 359  359   *        Otherwise, the next caller will pick up the list walk in
 360  360   *        some indeterminate state.  (Otherwise every caller would
 361  361   *        have to pass in a cookie to keep the state represented by
 362  362   *        io_walk_link, which gets annoying.)
 363  363   */
 364  364  zio_t *
 365  365  zio_walk_parents(zio_t *cio)
 366  366  {
 367  367          zio_link_t *zl = cio->io_walk_link;
 368  368          list_t *pl = &cio->io_parent_list;
 369  369  
 370  370          zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 371  371          cio->io_walk_link = zl;
 372  372  
 373  373          if (zl == NULL)
 374  374                  return (NULL);
 375  375  
 376  376          ASSERT(zl->zl_child == cio);
 377  377          return (zl->zl_parent);
 378  378  }
 379  379  
 380  380  zio_t *
 381  381  zio_walk_children(zio_t *pio)
 382  382  {
 383  383          zio_link_t *zl = pio->io_walk_link;
 384  384          list_t *cl = &pio->io_child_list;
 385  385  
 386  386          zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 387  387          pio->io_walk_link = zl;
 388  388  
 389  389          if (zl == NULL)
 390  390                  return (NULL);
 391  391  
 392  392          ASSERT(zl->zl_parent == pio);
 393  393          return (zl->zl_child);
 394  394  }
 395  395  
 396  396  zio_t *
 397  397  zio_unique_parent(zio_t *cio)
 398  398  {
 399  399          zio_t *pio = zio_walk_parents(cio);
 400  400  
 401  401          VERIFY(zio_walk_parents(cio) == NULL);
 402  402          return (pio);
 403  403  }
 404  404  
 405  405  void
 406  406  zio_add_child(zio_t *pio, zio_t *cio)
 407  407  {
 408  408          zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 409  409  
 410  410          /*
 411  411           * Logical I/Os can have logical, gang, or vdev children.
 412  412           * Gang I/Os can have gang or vdev children.
 413  413           * Vdev I/Os can only have vdev children.
 414  414           * The following ASSERT captures all of these constraints.
 415  415           */
 416  416          ASSERT(cio->io_child_type <= pio->io_child_type);
 417  417  
 418  418          zl->zl_parent = pio;
 419  419          zl->zl_child = cio;
 420  420  
 421  421          mutex_enter(&cio->io_lock);
 422  422          mutex_enter(&pio->io_lock);
 423  423  
 424  424          ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 425  425  
 426  426          for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 427  427                  pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 428  428  
 429  429          list_insert_head(&pio->io_child_list, zl);
 430  430          list_insert_head(&cio->io_parent_list, zl);
 431  431  
 432  432          pio->io_child_count++;
 433  433          cio->io_parent_count++;
 434  434  
 435  435          mutex_exit(&pio->io_lock);
 436  436          mutex_exit(&cio->io_lock);
 437  437  }
 438  438  
 439  439  static void
 440  440  zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 441  441  {
 442  442          ASSERT(zl->zl_parent == pio);
 443  443          ASSERT(zl->zl_child == cio);
 444  444  
 445  445          mutex_enter(&cio->io_lock);
 446  446          mutex_enter(&pio->io_lock);
 447  447  
 448  448          list_remove(&pio->io_child_list, zl);
 449  449          list_remove(&cio->io_parent_list, zl);
 450  450  
 451  451          pio->io_child_count--;
 452  452          cio->io_parent_count--;
 453  453  
 454  454          mutex_exit(&pio->io_lock);
 455  455          mutex_exit(&cio->io_lock);
 456  456  
 457  457          kmem_cache_free(zio_link_cache, zl);
 458  458  }
 459  459  
 460  460  static boolean_t
 461  461  zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 462  462  {
 463  463          uint64_t *countp = &zio->io_children[child][wait];
 464  464          boolean_t waiting = B_FALSE;
 465  465  
 466  466          mutex_enter(&zio->io_lock);
 467  467          ASSERT(zio->io_stall == NULL);
 468  468          if (*countp != 0) {
 469  469                  zio->io_stage >>= 1;
 470  470                  zio->io_stall = countp;
 471  471                  waiting = B_TRUE;
 472  472          }
 473  473          mutex_exit(&zio->io_lock);
 474  474  
 475  475          return (waiting);
 476  476  }
 477  477  
 478  478  static void
 479  479  zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 480  480  {
 481  481          uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 482  482          int *errorp = &pio->io_child_error[zio->io_child_type];
 483  483  
 484  484          mutex_enter(&pio->io_lock);
 485  485          if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 486  486                  *errorp = zio_worst_error(*errorp, zio->io_error);
 487  487          pio->io_reexecute |= zio->io_reexecute;
 488  488          ASSERT3U(*countp, >, 0);
 489  489          if (--*countp == 0 && pio->io_stall == countp) {
 490  490                  pio->io_stall = NULL;
 491  491                  mutex_exit(&pio->io_lock);
 492  492                  zio_execute(pio);
 493  493          } else {
 494  494                  mutex_exit(&pio->io_lock);
 495  495          }
 496  496  }
 497  497  
 498  498  static void
 499  499  zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 500  500  {
 501  501          if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 502  502                  zio->io_error = zio->io_child_error[c];
 503  503  }
 504  504  
 505  505  /*
 506  506   * ==========================================================================
 507  507   * Create the various types of I/O (read, write, free, etc)
 508  508   * ==========================================================================
 509  509   */
 510  510  static zio_t *
 511  511  zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 512  512      void *data, uint64_t size, zio_done_func_t *done, void *private,
 513  513      zio_type_t type, int priority, enum zio_flag flags,
 514  514      vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
 515  515      enum zio_stage stage, enum zio_stage pipeline)
 516  516  {
 517  517          zio_t *zio;
 518  518  
 519  519          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 520  520          ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 521  521          ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 522  522  
 523  523          ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 524  524          ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 525  525          ASSERT(vd || stage == ZIO_STAGE_OPEN);
 526  526  
 527  527          zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 528  528          bzero(zio, sizeof (zio_t));
 529  529  
 530  530          mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 531  531          cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 532  532  
 533  533          list_create(&zio->io_parent_list, sizeof (zio_link_t),
 534  534              offsetof(zio_link_t, zl_parent_node));
 535  535          list_create(&zio->io_child_list, sizeof (zio_link_t),
 536  536              offsetof(zio_link_t, zl_child_node));
 537  537  
 538  538          if (vd != NULL)
 539  539                  zio->io_child_type = ZIO_CHILD_VDEV;
 540  540          else if (flags & ZIO_FLAG_GANG_CHILD)
 541  541                  zio->io_child_type = ZIO_CHILD_GANG;
 542  542          else if (flags & ZIO_FLAG_DDT_CHILD)
 543  543                  zio->io_child_type = ZIO_CHILD_DDT;
 544  544          else
 545  545                  zio->io_child_type = ZIO_CHILD_LOGICAL;
 546  546  
 547  547          if (bp != NULL) {
 548  548                  zio->io_bp = (blkptr_t *)bp;
 549  549                  zio->io_bp_copy = *bp;
 550  550                  zio->io_bp_orig = *bp;
 551  551                  if (type != ZIO_TYPE_WRITE ||
 552  552                      zio->io_child_type == ZIO_CHILD_DDT)
 553  553                          zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 554  554                  if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 555  555                          zio->io_logical = zio;
 556  556                  if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 557  557                          pipeline |= ZIO_GANG_STAGES;
 558  558          }
 559  559  
 560  560          zio->io_spa = spa;
 561  561          zio->io_txg = txg;
 562  562          zio->io_done = done;
 563  563          zio->io_private = private;
 564  564          zio->io_type = type;
 565  565          zio->io_priority = priority;
 566  566          zio->io_vd = vd;
 567  567          zio->io_offset = offset;
 568  568          zio->io_orig_data = zio->io_data = data;
 569  569          zio->io_orig_size = zio->io_size = size;
 570  570          zio->io_orig_flags = zio->io_flags = flags;
 571  571          zio->io_orig_stage = zio->io_stage = stage;
 572  572          zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 573  573  
 574  574          zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 575  575          zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 576  576  
 577  577          if (zb != NULL)
 578  578                  zio->io_bookmark = *zb;
 579  579  
 580  580          if (pio != NULL) {
 581  581                  if (zio->io_logical == NULL)
 582  582                          zio->io_logical = pio->io_logical;
 583  583                  if (zio->io_child_type == ZIO_CHILD_GANG)
 584  584                          zio->io_gang_leader = pio->io_gang_leader;
 585  585                  zio_add_child(pio, zio);
 586  586          }
 587  587  
 588  588          return (zio);
 589  589  }
 590  590  
 591  591  static void
 592  592  zio_destroy(zio_t *zio)
 593  593  {
 594  594          list_destroy(&zio->io_parent_list);
 595  595          list_destroy(&zio->io_child_list);
 596  596          mutex_destroy(&zio->io_lock);
 597  597          cv_destroy(&zio->io_cv);
 598  598          kmem_cache_free(zio_cache, zio);
 599  599  }
 600  600  
 601  601  zio_t *
 602  602  zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 603  603      void *private, enum zio_flag flags)
 604  604  {
 605  605          zio_t *zio;
 606  606  
 607  607          zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 608  608              ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 609  609              ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 610  610  
 611  611          return (zio);
 612  612  }
 613  613  
 614  614  zio_t *
 615  615  zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 616  616  {
 617  617          return (zio_null(NULL, spa, NULL, done, private, flags));
 618  618  }
 619  619  
 620  620  zio_t *
 621  621  zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 622  622      void *data, uint64_t size, zio_done_func_t *done, void *private,
 623  623      int priority, enum zio_flag flags, const zbookmark_t *zb)
 624  624  {
 625  625          zio_t *zio;
 626  626  
 627  627          zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 628  628              data, size, done, private,
 629  629              ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 630  630              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 631  631              ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 632  632  
 633  633          return (zio);
 634  634  }
 635  635  
 636  636  zio_t *
 637  637  zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 638  638      void *data, uint64_t size, const zio_prop_t *zp,
 639  639      zio_done_func_t *ready, zio_done_func_t *done, void *private,
 640  640      int priority, enum zio_flag flags, const zbookmark_t *zb)
 641  641  {
 642  642          zio_t *zio;
 643  643  
 644  644          ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 645  645              zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 646  646              zp->zp_compress >= ZIO_COMPRESS_OFF &&
 647  647              zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 648  648              DMU_OT_IS_VALID(zp->zp_type) &&
 649  649              zp->zp_level < 32 &&
 650  650              zp->zp_copies > 0 &&
 651  651              zp->zp_copies <= spa_max_replication(spa));
 652  652  
 653  653          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 654  654              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 655  655              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 656  656              ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 657  657  
 658  658          zio->io_ready = ready;
 659  659          zio->io_prop = *zp;
 660  660  
 661  661          return (zio);
 662  662  }
 663  663  
 664  664  zio_t *
 665  665  zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 666  666      uint64_t size, zio_done_func_t *done, void *private, int priority,
 667  667      enum zio_flag flags, zbookmark_t *zb)
 668  668  {
 669  669          zio_t *zio;
 670  670  
 671  671          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 672  672              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 673  673              ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 674  674  
 675  675          return (zio);
 676  676  }
 677  677  
 678  678  void
 679  679  zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 680  680  {
 681  681          ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 682  682          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 683  683          ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 684  684          ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 685  685  
 686  686          /*
 687  687           * We must reset the io_prop to match the values that existed
 688  688           * when the bp was first written by dmu_sync() keeping in mind
 689  689           * that nopwrite and dedup are mutually exclusive.
 690  690           */
 691  691          zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 692  692          zio->io_prop.zp_nopwrite = nopwrite;
 693  693          zio->io_prop.zp_copies = copies;
 694  694          zio->io_bp_override = bp;
 695  695  }
 696  696  
 697  697  void
 698  698  zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 699  699  {
 700  700          metaslab_check_free(spa, bp);
 701  701          bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 702  702  }
 703  703  
 704  704  zio_t *
 705  705  zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 706  706      enum zio_flag flags)
 707  707  {
                   /*
                    * Build a ZIO_TYPE_FREE zio for bp using ZIO_FREE_PIPELINE and
                    * return it; nothing is issued here.  The asserts require a
                    * non-hole bp, the currently syncing txg, and a sync pass
                    * earlier than zfs_sync_pass_deferred_free.
                    */
 708  708          zio_t *zio;
 709  709  
 710  710          dprintf_bp(bp, "freeing in txg %llu, pass %u",
 711  711              (longlong_t)txg, spa->spa_sync_pass);
 712  712  
 713  713          ASSERT(!BP_IS_HOLE(bp));
 714  714          ASSERT(spa_syncing_txg(spa) == txg);
 715  715          ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 716  716  
 717  717          metaslab_check_free(spa, bp);
 718  718  
                   /* No data buffer, done callback or bookmark is supplied. */
 719  719          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 720  720              NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
 721  721              NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
 722  722  
 723  723          return (zio);
 724  724  }
 725  725  
 726  726  zio_t *
 727  727  zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 728  728      zio_done_func_t *done, void *private, enum zio_flag flags)
 729  729  {
 730  730          zio_t *zio;
 731  731  
 732  732          /*
 733  733           * A claim is an allocation of a specific block.  Claims are needed
 734  734           * to support immediate writes in the intent log.  The issue is that
 735  735           * immediate writes contain committed data, but in a txg that was
 736  736           * *not* committed.  Upon opening the pool after an unclean shutdown,
 737  737           * the intent log claims all blocks that contain immediate write data
 738  738           * so that the SPA knows they're in use.
 739  739           *
 740  740           * All claims *must* be resolved in the first txg -- before the SPA
 741  741           * starts allocating blocks -- so that nothing is allocated twice.
 742  742           * If txg == 0 we just verify that the block is claimable.
 743  743           */
 744  744          ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 745  745          ASSERT(txg == spa_first_txg(spa) || txg == 0);
 746  746          ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 747  747  
                   /*
                    * Build the ZIO_TYPE_CLAIM zio with ZIO_CLAIM_PIPELINE; it is
                    * returned to the caller without being issued here.
                    */
 748  748          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 749  749              done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 750  750              NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 751  751  
 752  752          return (zio);
 753  753  }
 754  754  
 755  755  zio_t *
 756  756  zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 757  757      zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
 758  758  {
                   /*
                    * Build an ioctl zio carrying cmd for vd.  A vdev with no
                    * children gets a single ZIO_TYPE_IOCTL zio.  Otherwise a null
                    * zio is created, and an ioctl zio for every child vdev is
                    * built recursively and started with zio_nowait(), with the
                    * null zio passed as its parent argument.  done and private
                    * are forwarded to each recursive call, so they end up on
                    * every childless-vdev ioctl zio.
                    */
 759  759          zio_t *zio;
 760  760          int c;
 761  761  
 762  762          if (vd->vdev_children == 0) {
 763  763                  zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 764  764                      ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
 765  765                      ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 766  766  
 767  767                  zio->io_cmd = cmd;
 768  768          } else {
 769  769                  zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 770  770  
 771  771                  for (c = 0; c < vd->vdev_children; c++)
 772  772                          zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 773  773                              done, private, priority, flags));
 774  774          }
 775  775  
                   /* The returned zio itself has not been started here. */
 776  776          return (zio);
 777  777  }
 778  778  
 779  779  zio_t *
 780  780  zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 781  781      void *data, int checksum, zio_done_func_t *done, void *private,
 782  782      int priority, enum zio_flag flags, boolean_t labels)
 783  783  {
                   /*
                    * Build a read of size bytes at offset on vd (asserted to have
                    * no children) with no bp, using ZIO_READ_PHYS_PIPELINE.  When
                    * labels is set, the range must end within the first
                    * VDEV_LABEL_START_SIZE bytes or start at or after
                    * vdev_psize - VDEV_LABEL_END_SIZE.  The zio is returned
                    * without being issued.
                    */
 784  784          zio_t *zio;
 785  785  
 786  786          ASSERT(vd->vdev_children == 0);
 787  787          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 788  788              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 789  789          ASSERT3U(offset + size, <=, vd->vdev_psize);
 790  790  
 791  791          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 792  792              ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
 793  793              ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 794  794  
                   /* The checksum type comes from the caller rather than a bp. */
 795  795          zio->io_prop.zp_checksum = checksum;
 796  796  
 797  797          return (zio);
 798  798  }
 799  799  
 800  800  zio_t *
 801  801  zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 802  802      void *data, int checksum, zio_done_func_t *done, void *private,
 803  803      int priority, enum zio_flag flags, boolean_t labels)
 804  804  {
                   /*
                    * Build a write of size bytes at offset on vd (asserted to have
                    * no children) with no bp, using ZIO_WRITE_PHYS_PIPELINE.  The
                    * range checks are the same as in zio_read_phys().  The zio is
                    * returned without being issued.
                    */
 805  805          zio_t *zio;
 806  806  
 807  807          ASSERT(vd->vdev_children == 0);
 808  808          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 809  809              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 810  810          ASSERT3U(offset + size, <=, vd->vdev_psize);
 811  811  
 812  812          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 813  813              ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
 814  814              ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 815  815  
                   /* The checksum type comes from the caller rather than a bp. */
 816  816          zio->io_prop.zp_checksum = checksum;
 817  817  
 818  818          if (zio_checksum_table[checksum].ci_eck) {
 819  819                  /*
 820  820                   * zec checksums are necessarily destructive -- they modify
 821  821                   * the end of the write buffer to hold the verifier/checksum.
 822  822                   * Therefore, we must make a local copy in case the data is
 823  823                   * being written to multiple places in parallel.
 824  824                   */
 825  825                  void *wbuf = zio_buf_alloc(size);
 826  826                  bcopy(data, wbuf, size);
                           /*
                            * The copy is handed to the transform stack and is not
                            * freed in this function.
                            */
 827  827                  zio_push_transform(zio, wbuf, size, size, NULL);
 828  828          }
 829  829  
 830  830          return (zio);
 831  831  }
 832  832  
 833  833  /*
 834  834   * Create a child I/O to do some work for us.
 835  835   */
 836  836  zio_t *
 837  837  zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 838  838          void *data, uint64_t size, int type, int priority, enum zio_flag flags,
 839  839          zio_done_func_t *done, void *private)
 840  840  {
                   /*
                    * The new zio is created with pio's spa, txg and bookmark.
                    * Note that this function may also modify pio itself: see the
                    * checksum-verify handling below.
                    */
 841  841          enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 842  842          zio_t *zio;
 843  843  
                   /* vd must be a direct child of pio's vdev (or of the root vdev). */
 844  844          ASSERT(vd->vdev_parent ==
 845  845              (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 846  846  
 847  847          if (type == ZIO_TYPE_READ && bp != NULL) {
 848  848                  /*
 849  849                   * If we have the bp, then the child should perform the
 850  850                   * checksum and the parent need not.  This pushes error
 851  851                   * detection as close to the leaves as possible and
 852  852                   * eliminates redundant checksums in the interior nodes.
 853  853                   */
 854  854                  pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 855  855                  pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 856  856          }
 857  857  
                   /*
                    * For a vdev with no children, bias the offset by
                    * VDEV_LABEL_START_SIZE (presumably to skip the label region at
                    * the front of the device -- confirm).
                    */
 858  858          if (vd->vdev_children == 0)
 859  859                  offset += VDEV_LABEL_START_SIZE;
 860  860  
 861  861          flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 862  862  
 863  863          /*
 864  864           * If we've decided to do a repair, the write is not speculative --
 865  865           * even if the original read was.
 866  866           */
 867  867          if (flags & ZIO_FLAG_IO_REPAIR)
 868  868                  flags &= ~ZIO_FLAG_SPECULATIVE;
 869  869  
                   /*
                    * The initial stage is one bit below VDEV_IO_START:
                    * zio_execute() shifts io_stage left before running anything,
                    * so VDEV_IO_START is the first stage it considers.
                    */
 870  870          zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 871  871              done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 872  872              ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 873  873  
 874  874          return (zio);
 875  875  }
 876  876  
 877  877  zio_t *
 878  878  zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 879  879          int type, int priority, enum zio_flag flags,
 880  880          zio_done_func_t *done, void *private)
 881  881  {
                   /*
                    * Build a zio on leaf vdev vd with a NULL parent argument, no
                    * bp and txg 0.  ZIO_FLAG_CANFAIL and ZIO_FLAG_DONT_RETRY are
                    * always added to the caller's flags.  As in
                    * zio_vdev_child_io(), the initial stage is one bit below
                    * VDEV_IO_START.  Unlike zio_vdev_child_io(), offset is passed
                    * through unmodified.
                    */
 882  882          zio_t *zio;
 883  883  
 884  884          ASSERT(vd->vdev_ops->vdev_op_leaf);
 885  885  
 886  886          zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 887  887              data, size, done, private, type, priority,
 888  888              flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
 889  889              vd, offset, NULL,
 890  890              ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 891  891  
 892  892          return (zio);
 893  893  }
 894  894  
 895  895  void
 896  896  zio_flush(zio_t *zio, vdev_t *vd)
 897  897  {
                   /*
                    * Start, without waiting for it, a DKIOCFLUSHWRITECACHE ioctl on
                    * vd with zio passed as the parent argument.  No done callback
                    * is registered, and the ioctl is flagged CANFAIL,
                    * DONT_PROPAGATE and DONT_RETRY.
                    */
 898  898          zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 899  899              NULL, NULL, ZIO_PRIORITY_NOW,
 900  900              ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 901  901  }
 902  902  
 903  903  void
 904  904  zio_shrink(zio_t *zio, uint64_t size)
 905  905  {
                   /*
                    * Reduce both io_size and io_orig_size to size.  The asserts
                    * require a zio that has no executor yet, whose size has not
                    * already been changed, and a new size no larger than the
                    * current one.
                    */
 906  906          ASSERT(zio->io_executor == NULL);
 907  907          ASSERT(zio->io_orig_size == zio->io_size);
 908  908          ASSERT(size <= zio->io_size);
 909  909  
 910  910          /*
 911  911           * We don't shrink for raidz because of problems with the
 912  912           * reconstruction when reading back less than the block size.
 913  913           * Note, BP_IS_RAIDZ() assumes no compression.
 914  914           */
 915  915          ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 916  916          if (!BP_IS_RAIDZ(zio->io_bp))
 917  917                  zio->io_orig_size = zio->io_size = size;
 918  918  }
 919  919  
 920  920  /*
 921  921   * ==========================================================================
 922  922   * Prepare to read and write logical blocks
 923  923   * ==========================================================================
 924  924   */
 925  925  
 926  926  static int
 927  927  zio_read_bp_init(zio_t *zio)
 928  928  {
                   /*
                    * Pipeline stage that prepares a read based on its bp: sets up
                    * decompression, marks some reads ZIO_FLAG_DONT_CACHE, and
                    * switches dedup reads to the DDT read pipeline.  Always
                    * returns ZIO_PIPELINE_CONTINUE.
                    */
 929  929          blkptr_t *bp = zio->io_bp;
 930  930  
                   /*
                    * Compressed bp on a logical, non-raw read: push a psize-byte
                    * buffer as a transform with zio_decompress as its callback.
                    */
 931  931          if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 932  932              zio->io_child_type == ZIO_CHILD_LOGICAL &&
 933  933              !(zio->io_flags & ZIO_FLAG_RAW)) {
 934  934                  uint64_t psize = BP_GET_PSIZE(bp);
 935  935                  void *cbuf = zio_buf_alloc(psize);
 936  936  
 937  937                  zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 938  938          }
 939  939  
                   /* Level-0 blocks of non-metadata object types are not cached. */
 940  940          if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 941  941                  zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 942  942  
                   /* Neither are DDT ZAP blocks, at any level. */
 943  943          if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
 944  944                  zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 945  945  
                   /* This replaces the whole pipeline, not just one stage. */
 946  946          if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 947  947                  zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 948  948  
 949  949          return (ZIO_PIPELINE_CONTINUE);
 950  950  }
 951  951  
 952  952  static int
 953  953  zio_write_bp_init(zio_t *zio)
 954  954  {
                   /*
                    * Pipeline stage that prepares the bp for a write.  It waits
                    * for gang and logical children to become ready, handles an
                    * overridden bp (see zio_write_override()), compresses the
                    * data, chooses between the rewrite and allocating pipelines,
                    * and fills in the bp's properties.  Returns
                    * ZIO_PIPELINE_STOP only while waiting for children.
                    */
 955  955          spa_t *spa = zio->io_spa;
 956  956          zio_prop_t *zp = &zio->io_prop;
 957  957          enum zio_compress compress = zp->zp_compress;
 958  958          blkptr_t *bp = zio->io_bp;
 959  959          uint64_t lsize = zio->io_size;
 960  960          uint64_t psize = lsize;
 961  961          int pass = 1;
 962  962  
 963  963          /*
 964  964           * If our children haven't all reached the ready stage,
 965  965           * wait for them and then repeat this pipeline stage.
 966  966           */
 967  967          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 968  968              zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
 969  969                  return (ZIO_PIPELINE_STOP);
 970  970  
                   /* Only allocating writes need the setup below. */
 971  971          if (!IO_IS_ALLOCATING(zio))
 972  972                  return (ZIO_PIPELINE_CONTINUE);
 973  973  
 974  974          ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 975  975  
 976  976          if (zio->io_bp_override) {
 977  977                  ASSERT(bp->blk_birth != zio->io_txg);
 978  978                  ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 979  979  
                           /*
                            * Adopt the override bp and reduce the pipeline to the
                            * interlock stages; the dedup case below adds the DDT
                            * write stage back.
                            */
 980  980                  *bp = *zio->io_bp_override;
 981  981                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 982  982  
 983  983                  /*
 984  984                   * If we've been overridden and nopwrite is set then
 985  985                   * set the flag accordingly to indicate that a nopwrite
 986  986                   * has already occurred.
 987  987                   */
 988  988                  if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 989  989                          ASSERT(!zp->zp_dedup);
 990  990                          zio->io_flags |= ZIO_FLAG_NOPWRITE;
 991  991                          return (ZIO_PIPELINE_CONTINUE);
 992  992                  }
 993  993  
 994  994                  ASSERT(!zp->zp_nopwrite);
 995  995  
 996  996                  if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 997  997                          return (ZIO_PIPELINE_CONTINUE);
 998  998  
 999  999                  ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1000 1000                      zp->zp_dedup_verify);
1001 1001  
                           /*
                            * The override bp is passed on to the DDT write stage
                            * only if its checksum type matches zp_checksum.
                            * Otherwise the override is dropped, bp is zeroed, and
                            * we fall through to the normal write path.
                            */
1002 1002                  if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1003 1003                          BP_SET_DEDUP(bp, 1);
1004 1004                          zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1005 1005                          return (ZIO_PIPELINE_CONTINUE);
1006 1006                  }
1007 1007                  zio->io_bp_override = NULL;
1008 1008                  BP_ZERO(bp);
1009 1009          }
1010 1010  
1011 1011          if (bp->blk_birth == zio->io_txg) {
1012 1012                  /*
1013 1013                   * We're rewriting an existing block, which means we're
1014 1014                   * working on behalf of spa_sync().  For spa_sync() to
1015 1015                   * converge, it must eventually be the case that we don't
1016 1016                   * have to allocate new blocks.  But compression changes
1017 1017                   * the blocksize, which forces a reallocate, and makes
1018 1018                   * convergence take longer.  Therefore, after the first
1019 1019                   * few passes, stop compressing to ensure convergence.
1020 1020                   */
1021 1021                  pass = spa_sync_pass(spa);
1022 1022  
1023 1023                  ASSERT(zio->io_txg == spa_syncing_txg(spa));
1024 1024                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1025 1025                  ASSERT(!BP_GET_DEDUP(bp));
1026 1026  
1027 1027                  if (pass >= zfs_sync_pass_dont_compress)
1028 1028                          compress = ZIO_COMPRESS_OFF;
1029 1029  
1030 1030                  /* Make sure someone doesn't change their mind on overwrites */
1031 1031                  ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
1032 1032                      spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1033 1033          }
1034 1034  
                   /*
                    * Compress into a scratch buffer of lsize bytes.  A result of 0
                    * or lsize is treated as "not compressed": the scratch buffer
                    * is released and compress is switched off.  Otherwise the
                    * buffer is pushed as a transform.
                    */
1035 1035          if (compress != ZIO_COMPRESS_OFF) {
1036 1036                  void *cbuf = zio_buf_alloc(lsize);
1037 1037                  psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1038 1038                  if (psize == 0 || psize == lsize) {
1039 1039                          compress = ZIO_COMPRESS_OFF;
1040 1040                          zio_buf_free(cbuf, lsize);
1041 1041                  } else {
1042 1042                          ASSERT(psize < lsize);
1043 1043                          zio_push_transform(zio, cbuf, psize, lsize, NULL);
1044 1044                  }
1045 1045          }
1046 1046  
1047 1047          /*
1048 1048           * The final pass of spa_sync() must be all rewrites, but the first
1049 1049           * few passes offer a trade-off: allocating blocks defers convergence,
1050 1050           * but newly allocated blocks are sequential, so they can be written
1051 1051           * to disk faster.  Therefore, we allow the first few passes of
1052 1052           * spa_sync() to allocate new blocks, but force rewrites after that.
1053 1053           * There should only be a handful of blocks after pass 1 in any case.
1054 1054           */
1055 1055          if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
1056 1056              pass >= zfs_sync_pass_rewrite) {
1057 1057                  ASSERT(psize != 0);
                           /* Keep whichever gang stages the pipeline already had. */
1058 1058                  enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1059 1059                  zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1060 1060                  zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1061 1061          } else {
1062 1062                  BP_ZERO(bp);
1063 1063                  zio->io_pipeline = ZIO_WRITE_PIPELINE;
1064 1064          }
1065 1065  
                   /*
                    * psize == 0 can only reach here via the else branch above, so
                    * bp has been zeroed; only the interlock stages are run.
                    * NOTE(review): presumably zio_compress_data() returns 0 for an
                    * all-zero block -- confirm.
                    */
1066 1066          if (psize == 0) {
1067 1067                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1068 1068          } else {
1069 1069                  ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1070 1070                  BP_SET_LSIZE(bp, lsize);
1071 1071                  BP_SET_PSIZE(bp, psize);
1072 1072                  BP_SET_COMPRESS(bp, compress);
1073 1073                  BP_SET_CHECKSUM(bp, zp->zp_checksum);
1074 1074                  BP_SET_TYPE(bp, zp->zp_type);
1075 1075                  BP_SET_LEVEL(bp, zp->zp_level);
1076 1076                  BP_SET_DEDUP(bp, zp->zp_dedup);
1077 1077                  BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
                           /*
                            * Dedup replaces the whole pipeline, whereas nopwrite
                            * only ORs one extra stage into it.
                            */
1078 1078                  if (zp->zp_dedup) {
1079 1079                          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1080 1080                          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1081 1081                          zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1082 1082                  }
1083 1083                  if (zp->zp_nopwrite) {
1084 1084                          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1085 1085                          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1086 1086                          zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1087 1087                  }
1088 1088          }
1089 1089  
1090 1090          return (ZIO_PIPELINE_CONTINUE);
1091 1091  }
1092 1092  
1093 1093  static int
1094 1094  zio_free_bp_init(zio_t *zio)
1095 1095  {
                   /*
                    * Pipeline stage for frees: a logical free of a bp with the
                    * dedup bit set is switched to ZIO_DDT_FREE_PIPELINE; every
                    * other free keeps its existing pipeline.
                    */
1096 1096          blkptr_t *bp = zio->io_bp;
1097 1097  
1098 1098          if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1099 1099                  if (BP_GET_DEDUP(bp))
1100 1100                          zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1101 1101          }
1102 1102  
1103 1103          return (ZIO_PIPELINE_CONTINUE);
1104 1104  }
1105 1105  
1106 1106  /*
1107 1107   * ==========================================================================
1108 1108   * Execute the I/O pipeline
1109 1109   * ==========================================================================
1110 1110   */
1111 1111  
1112 1112  static void
1113 1113  zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1114 1114  {
                   /*
                    * Hand zio to one of the spa's taskqs, which will call
                    * zio_execute() on it.  The taskq is selected by the I/O type
                    * (possibly overridden below) and by q; cutinline requests
                    * TQ_FRONT dispatch.
                    */
1115 1115          spa_t *spa = zio->io_spa;
1116 1116          zio_type_t t = zio->io_type;
1117 1117          int flags = (cutinline ? TQ_FRONT : 0);
1118 1118  
1119 1119          /*
1120 1120           * If we're a config writer or a probe, the normal issue and
1121 1121           * interrupt threads may all be blocked waiting for the config lock.
1122 1122           * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1123 1123           */
1124 1124          if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1125 1125                  t = ZIO_TYPE_NULL;
1126 1126  
1127 1127          /*
1128 1128           * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1129 1129           */
1130 1130          if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1131 1131                  t = ZIO_TYPE_NULL;
1132 1132  
1133 1133          /*
1134 1134           * If this is a high priority I/O, then use the high priority taskq if
1135 1135           * available.
1136 1136           */
                   /*
                    * NOTE(review): q + 1 is used as an index before the ASSERT3U
                    * bound check below; this presumably relies on callers passing
                    * only queue types that have a high-priority neighbour at
                    * q + 1 -- confirm against zio_taskq_type_t.
                    */
1137 1137          if (zio->io_priority == ZIO_PRIORITY_NOW &&
1138 1138              spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1139 1139                  q++;
1140 1140  
1141 1141          ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1142 1142  
1143 1143          /*
1144 1144           * NB: We are assuming that the zio can only be dispatched
1145 1145           * to a single taskq at a time.  It would be a grievous error
1146 1146           * to dispatch the zio to another taskq at the same time.
1147 1147           */
1148 1148          ASSERT(zio->io_tqent.tqent_next == NULL);
1149 1149          spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1150 1150              flags, &zio->io_tqent);
1151 1151  }
1152 1152  
1153 1153  static boolean_t
1154 1154  zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1155 1155  {
                   /*
                    * Return B_TRUE if the thread recorded in zio->io_executor is a
                    * member of any of this spa's taskqs of queue type q, checking
                    * every I/O type; B_FALSE otherwise.
                    */
1156 1156          kthread_t *executor = zio->io_executor;
1157 1157          spa_t *spa = zio->io_spa;
1158 1158  
1159 1159          for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1160 1160                  spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1161 1161                  uint_t i;
1162 1162                  for (i = 0; i < tqs->stqs_count; i++) {
1163 1163                          if (taskq_member(tqs->stqs_taskq[i], executor))
1164 1164                                  return (B_TRUE);
1165 1165                  }
1166 1166          }
1167 1167  
1168 1168          return (B_FALSE);
1169 1169  }
1170 1170  
1171 1171  static int
1172 1172  zio_issue_async(zio_t *zio)
1173 1173  {
                   /*
                    * Pipeline stage: re-dispatch zio to an issue taskq (without
                    * cutting in line) and return ZIO_PIPELINE_STOP, which makes
                    * the calling zio_execute() loop return.
                    */
1174 1174          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1175 1175  
1176 1176          return (ZIO_PIPELINE_STOP);

↓ open down ↓

1176 lines elided

↑ open up ↑

1177 1177  }
1178 1178  
1179 1179  void
1180 1180  zio_interrupt(zio_t *zio)
1181 1181  {
                   /*
                    * Dispatch zio to an interrupt taskq (without cutting in line);
                    * zio_execute() is run on it from there.
                    */
1182 1182          zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1183 1183  }
1184 1184  
1185 1185  /*
1186 1186   * Execute the I/O pipeline until one of the following occurs:
1187      - * (1) the I/O completes; (2) the pipeline stalls waiting for
1188      - * dependent child I/Os; (3) the I/O issues, so we're waiting
1189      - * for an I/O completion interrupt; (4) the I/O is delegated by
1190      - * vdev-level caching or aggregation; (5) the I/O is deferred
1191      - * due to vdev-level queueing; (6) the I/O is handed off to
1192      - * another thread.  In all cases, the pipeline stops whenever
1193      - * there's no CPU work; it never burns a thread in cv_wait().
     1187 + *
     1188 + *      (1) the I/O completes
     1189 + *      (2) the pipeline stalls waiting for dependent child I/Os
     1190 + *      (3) the I/O issues, so we're waiting for an I/O completion interrupt
     1191 + *      (4) the I/O is delegated by vdev-level caching or aggregation
     1192 + *      (5) the I/O is deferred due to vdev-level queueing
     1193 + *      (6) the I/O is handed off to another thread.
     1194 + *
     1195 + * In all cases, the pipeline stops whenever there's no CPU work; it never
     1196 + * burns a thread in cv_wait().
1194 1197   *
1195 1198   * There's no locking on io_stage because there's no legitimate way
1196 1199   * for multiple threads to be attempting to process the same I/O.
1197 1200   */
1198 1201  static zio_pipe_stage_t *zio_pipeline[];
1199 1202  
1200 1203  void
1201 1204  zio_execute(zio_t *zio)
1202 1205  {
                   /* Record the running thread; zio_taskq_member() checks it. */
1203 1206          zio->io_executor = curthread;

1204 1207  
1205 1208          while (zio->io_stage < ZIO_STAGE_DONE) {
1206 1209                  enum zio_stage pipeline = zio->io_pipeline;
1207 1210                  enum zio_stage stage = zio->io_stage;
1208 1211                  int rv;
1209 1212  
1210 1213                  ASSERT(!MUTEX_HELD(&zio->io_lock));
1211 1214                  ASSERT(ISP2(stage));
1212 1215                  ASSERT(zio->io_stall == NULL);
1213 1216  
                           /*
                            * Advance to the next stage bit that is set in the
                            * pipeline mask.  The current stage is always skipped,
                            * because the shift happens before the test.
                            */
1214 1217                  do {
1215 1218                          stage <<= 1;
1216 1219                  } while ((stage & pipeline) == 0);
1217 1220  
1218 1221                  ASSERT(stage <= ZIO_STAGE_DONE);
1219 1222  
1220 1223                  /*
1221 1224                   * If we are in interrupt context and this pipeline stage
1222 1225                   * will grab a config lock that is held across I/O,
1223 1226                   * or may wait for an I/O that needs an interrupt thread
1224 1227                   * to complete, issue async to avoid deadlock.
1225 1228                   *
1226 1229                   * For VDEV_IO_START, we cut in line so that the io will
1227 1230                   * be sent to disk promptly.
1228 1231                   */
1229 1232                  if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1230 1233                      zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1231 1234                          boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1232 1235                              zio_requeue_io_start_cut_in_line : B_FALSE;
                                   /*
                                    * io_stage has not been advanced yet, so the
                                    * issue thread recomputes this same stage.
                                    */
1233 1236                          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1234 1237                          return;
1235 1238                  }
1236 1239  
                           /*
                            * Commit the new stage, then call its handler from
                            * zio_pipeline[], indexed by highbit(stage) - 1.
                            */
1237 1240                  zio->io_stage = stage;
1238 1241                  rv = zio_pipeline[highbit(stage) - 1](zio);
1239 1242  
                           /* STOP ends this thread's processing of the zio. */
1240 1243                  if (rv == ZIO_PIPELINE_STOP)
1241 1244                          return;
1242 1245  
1243 1246                  ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1244 1247          }
1245 1248  }
1246 1249  
1247 1250  /*
1248 1251   * ==========================================================================
1249 1252   * Initiate I/O, either sync or async
1250 1253   * ==========================================================================
1251 1254   */
1252 1255  int
1253 1256  zio_wait(zio_t *zio)
1254 1257  {
                   /*
                    * Execute zio, block until its io_executor has been cleared,
                    * and return its io_error.  The zio is destroyed before
                    * returning, so the caller must not reference it afterwards.
                    * The asserts require a zio in ZIO_STAGE_OPEN that has no
                    * executor yet.
                    */
1255 1258          int error;
1256 1259  
1257 1260          ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1258 1261          ASSERT(zio->io_executor == NULL);
1259 1262  
1260 1263          zio->io_waiter = curthread;
1261 1264  
1262 1265          zio_execute(zio);
1263 1266  
                   /*
                    * zio_execute() may return before the zio is finished (for
                    * example when a stage returns ZIO_PIPELINE_STOP), so wait on
                    * io_cv until io_executor is NULL.
                    */
1264 1267          mutex_enter(&zio->io_lock);
1265 1268          while (zio->io_executor != NULL)
1266 1269                  cv_wait(&zio->io_cv, &zio->io_lock);
1267 1270          mutex_exit(&zio->io_lock);
1268 1271  
                   /* Read the error before the zio is destroyed. */
1269 1272          error = zio->io_error;
1270 1273          zio_destroy(zio);
1271 1274  
1272 1275          return (error);
1273 1276  }
1274 1277  
1275 1278  void
1276 1279  zio_nowait(zio_t *zio)
1277 1280  {
                   /*
                    * Start executing zio without waiting for it to complete.
                    * The assert requires a zio that has no executor yet.
                    */
1278 1281          ASSERT(zio->io_executor == NULL);
1279 1282  
1280 1283          if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1281 1284              zio_unique_parent(zio) == NULL) {
1282 1285                  /*
1283 1286                   * This is a logical async I/O with no parent to wait for it.
1284 1287                   * We add it to the spa_async_root_zio "Godfather" I/O which
1285 1288                   * will ensure they complete prior to unloading the pool.
1286 1289                   */
1287 1290                  spa_t *spa = zio->io_spa;
1288 1291  
1289 1292                  zio_add_child(spa->spa_async_zio_root, zio);
1290 1293          }
1291 1294  
1292 1295          zio_execute(zio);
1293 1296  }
1294 1297  
1295 1298  /*
1296 1299   * ==========================================================================
1297 1300   * Reexecute or suspend/resume failed I/O
1298 1301   * ==========================================================================
1299 1302   */
1300 1303  
1301 1304  static void
1302 1305  zio_reexecute(zio_t *pio)
1303 1306  {
                   /*
                    * Reset pio to its original flags, stage and pipeline (marking
                    * it ZIO_FLAG_REEXECUTED) and clear its error and wait state.
                    * Then do the same recursively for each of its children, and
                    * finally execute pio itself unless it is a godfather zio.
                    */
1304 1307          zio_t *cio, *cio_next;
1305 1308  
1306 1309          ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1307 1310          ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1308 1311          ASSERT(pio->io_gang_leader == NULL);
1309 1312          ASSERT(pio->io_gang_tree == NULL);
1310 1313  
1311 1314          pio->io_flags = pio->io_orig_flags;
1312 1315          pio->io_stage = pio->io_orig_stage;
1313 1316          pio->io_pipeline = pio->io_orig_pipeline;
1314 1317          pio->io_reexecute = 0;
1315 1318          pio->io_flags |= ZIO_FLAG_REEXECUTED;
1316 1319          pio->io_error = 0;
1317 1320          for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1318 1321                  pio->io_state[w] = 0;
1319 1322          for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1320 1323                  pio->io_child_error[c] = 0;
1321 1324  
                   /* An allocating zio starts over with an empty bp. */
1322 1325          if (IO_IS_ALLOCATING(pio))
1323 1326                  BP_ZERO(pio->io_bp);
1324 1327  
1325 1328          /*
1326 1329           * As we reexecute pio's children, new children could be created.
1327 1330           * New children go to the head of pio's io_child_list, however,
1328 1331           * so we will (correctly) not reexecute them.  The key is that
1329 1332           * the remainder of pio's io_child_list, from 'cio_next' onward,
1330 1333           * cannot be affected by any side effects of reexecuting 'cio'.
1331 1334           */
1332 1335          for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
                           /* Fetch the successor before cio is reexecuted. */
1333 1336                  cio_next = zio_walk_children(pio);
                           /*
                            * Count cio in pio's outstanding-children counters again,
                            * once per wait type, before restarting it.
                            */
1334 1337                  mutex_enter(&pio->io_lock);
1335 1338                  for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1336 1339                          pio->io_children[cio->io_child_type][w]++;
1337 1340                  mutex_exit(&pio->io_lock);
1338 1341                  zio_reexecute(cio);
1339 1342          }
1340 1343  
1341 1344          /*
1342 1345           * Now that all children have been reexecuted, execute the parent.
1343 1346           * We don't reexecute "The Godfather" I/O here as it's the
1344 1347           * responsibility of the caller to wait on him.
1345 1348           */
1346 1349          if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1347 1350                  zio_execute(pio);
1348 1351  }
1349 1352  
1350 1353  void
1351 1354  zio_suspend(spa_t *spa, zio_t *zio)
1352 1355  {
                   /*
                    * Mark the pool suspended and, if zio is non-NULL, park it under
                    * the pool's suspend root zio.  Panics instead when the pool's
                    * failure mode is ZIO_FAILURE_MODE_PANIC.
                    */
1353 1356          if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1354 1357                  fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1355 1358                      "failure and the failure mode property for this pool "
1356 1359                      "is set to panic.", spa_name(spa));
1357 1360  
1358 1361          zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1359 1362  
1360 1363          mutex_enter(&spa->spa_suspend_lock);
1361 1364  
                   /* The suspend root is created when first needed. */
1362 1365          if (spa->spa_suspend_zio_root == NULL)
1363 1366                  spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1364 1367                      ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1365 1368                      ZIO_FLAG_GODFATHER);
1366 1369  
1367 1370          spa->spa_suspended = B_TRUE;
1368 1371  
1369 1372          if (zio != NULL) {
                           /*
                            * Only a parentless logical zio that has reached
                            * ZIO_STAGE_DONE may be parked.
                            */
1370 1373                  ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1371 1374                  ASSERT(zio != spa->spa_suspend_zio_root);
1372 1375                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1373 1376                  ASSERT(zio_unique_parent(zio) == NULL);
1374 1377                  ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1375 1378                  zio_add_child(spa->spa_suspend_zio_root, zio);
1376 1379          }
1377 1380  
1378 1381          mutex_exit(&spa->spa_suspend_lock);
1379 1382  }
1380 1383  
1381 1384  int
1382 1385  zio_resume(spa_t *spa)
1383 1386  {
                   /*
                    * Clear the pool's suspended state, wake any threads waiting on
                    * spa_suspend_cv, and reexecute whatever was parked under the
                    * suspend root.  Returns 0 if nothing was parked, otherwise the
                    * result of zio_wait() on the suspend root.
                    */
1384 1387          zio_t *pio;
1385 1388  
1386 1389          /*
1387 1390           * Reexecute all previously suspended I/O.
1388 1391           */
1389 1392          mutex_enter(&spa->spa_suspend_lock);
1390 1393          spa->spa_suspended = B_FALSE;
1391 1394          cv_broadcast(&spa->spa_suspend_cv);
                   /* Detach the root under the lock; a later suspend makes a new one. */
1392 1395          pio = spa->spa_suspend_zio_root;
1393 1396          spa->spa_suspend_zio_root = NULL;
1394 1397          mutex_exit(&spa->spa_suspend_lock);
1395 1398  
1396 1399          if (pio == NULL)
1397 1400                  return (0);
1398 1401  
                   /*
                    * The suspend root is created with ZIO_FLAG_GODFATHER, so
                    * zio_reexecute() restarts its children but does not execute
                    * the root itself; zio_wait() executes it and waits.
                    */
1399 1402          zio_reexecute(pio);
1400 1403          return (zio_wait(pio));
1401 1404  }
1402 1405  
           /*
            * Block the caller until the pool is no longer suspended.  The wakeup
            * comes from the cv_broadcast() on spa_suspend_cv in zio_resume().
            */
1403 1406  void
1404 1407  zio_resume_wait(spa_t *spa)
1405 1408  {
1406 1409          mutex_enter(&spa->spa_suspend_lock);
1407 1410          while (spa_suspended(spa))
1408 1411                  cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1409 1412          mutex_exit(&spa->spa_suspend_lock);
1410 1413  }
1411 1414  
1412 1415  /*
1413 1416   * ==========================================================================
1414 1417   * Gang blocks.
1415 1418   *
1416 1419   * A gang block is a collection of small blocks that looks to the DMU
1417 1420   * like one large block.  When zio_dva_allocate() cannot find a block
1418 1421   * of the requested size, due to either severe fragmentation or the pool
1419 1422   * being nearly full, it calls zio_write_gang_block() to construct the
1420 1423   * block from smaller fragments.
1421 1424   *
1422 1425   * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1423 1426   * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1424 1427   * an indirect block: it's an array of block pointers.  It consumes
1425 1428   * only one sector and hence is allocatable regardless of fragmentation.
1426 1429   * The gang header's bps point to its gang members, which hold the data.
1427 1430   *
1428 1431   * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1429 1432   * as the verifier to ensure uniqueness of the SHA256 checksum.
1430 1433   * Critically, the gang block bp's blk_cksum is the checksum of the data,
1431 1434   * not the gang header.  This ensures that data block signatures (needed for
1432 1435   * deduplication) are independent of how the block is physically stored.
1433 1436   *
1434 1437   * Gang blocks can be nested: a gang member may itself be a gang block.
1435 1438   * Thus every gang block is a tree in which the root and all interior nodes are
1436 1439   * gang headers, and the leaves are normal blocks that contain user data.
1437 1440   * The root of the gang tree is called the gang leader.
1438 1441   *
1439 1442   * To perform any operation (read, rewrite, free, claim) on a gang block,
1440 1443   * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1441 1444   * in the io_gang_tree field of the original logical i/o by recursively
1442 1445   * reading the gang leader and all gang headers below it.  This yields
1443 1446   * an in-core tree containing the contents of every gang header and the
1444 1447   * bps for every constituent of the gang block.
1445 1448   *
1446 1449   * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1447 1450   * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1448 1451   * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1449 1452   * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1450 1453   * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1451 1454   * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1452 1455   * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1453 1456   * of the gang header plus zio_checksum_compute() of the data to update the
1454 1457   * gang header's blk_cksum as described above.
1455 1458   *
1456 1459   * The two-phase assemble/issue model solves the problem of partial failure --
1457 1460   * what if you'd freed part of a gang block but then couldn't read the
1458 1461   * gang header for another part?  Assembling the entire gang tree first
1459 1462   * ensures that all the necessary gang header I/O has succeeded before
1460 1463   * starting the actual work of free, claim, or write.  Once the gang tree
1461 1464   * is assembled, free and claim are in-memory operations that cannot fail.
1462 1465   *
1463 1466   * In the event that a gang write fails, zio_dva_unallocate() walks the
1464 1467   * gang tree to immediately free (i.e. insert back into the space map)
1465 1468   * everything we've allocated.  This ensures that we don't get ENOSPC
1466 1469   * errors during repeated suspend/resume cycles due to a flaky device.
1467 1470   *
1468 1471   * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1469 1472   * the gang tree, we won't modify the block, so we can safely defer the free
1470 1473   * (knowing that the block is still intact).  If we *can* assemble the gang
1471 1474   * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1472 1475   * each constituent bp and we can allocate a new block on the next sync pass.
1473 1476   *
1474 1477   * In all cases, the gang tree allows complete recovery from partial failure.
1475 1478   * ==========================================================================
1476 1479   */
1477 1480  
           /*
            * Gang issue callback for reads.  A non-NULL gn means bp is a gang
            * header, whose contents are already held in the gang tree, so no i/o
            * is created and pio itself is returned.  Otherwise a child read of the
            * gang member's BP_GET_PSIZE(bp) bytes into 'data' is created and
            * returned (not yet issued).
            */
1478 1481  static zio_t *
1479 1482  zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1480 1483  {
1481 1484          if (gn != NULL)
1482 1485                  return (pio);
1483 1486  
1484 1487          return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1485 1488              NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1486 1489              &pio->io_bookmark));
1487 1490  }
1488 1491  
           /*
            * Gang issue callback for rewrites.  For a gang header (gn != NULL) the
            * in-core header gn->gn_gbh is rewritten; for a gang member (gn == NULL)
            * the member's slice of 'data' is rewritten.  Returns the newly created
            * child zio, which the caller is responsible for issuing.
            */
1489 1492  zio_t *
1490 1493  zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1491 1494  {
1492 1495          zio_t *zio;
1493 1496  
1494 1497          if (gn != NULL) {
1495 1498                  zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1496 1499                      gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1497 1500                      ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1498 1501                  /*
1499 1502                   * As we rewrite each gang header, the pipeline will compute
1500 1503                   * a new gang block header checksum for it; but no one will
1501 1504                   * compute a new data checksum, so we do that here.  The one
1502 1505                   * exception is the gang leader: the pipeline already computed
1503 1506                   * its data checksum because that stage precedes gang assembly.
1504 1507                   * (Presently, nothing actually uses interior data checksums;
1505 1508                   * this is just good hygiene.)
1506 1509                   */
1507 1510                  if (gn != pio->io_gang_leader->io_gang_tree) {
1508 1511                          zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1509 1512                              data, BP_GET_PSIZE(bp));
1510 1513                  }
1511 1514                  /*
1512 1515                   * If we are here to damage data for testing purposes,
1513 1516                   * leave the GBH alone so that we can detect the damage.
1514 1517                   */
1515 1518                  if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1516 1519                          zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1517 1520          } else {
1518 1521                  zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1519 1522                      data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1520 1523                      ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1521 1524          }
1522 1525  
1523 1526          return (zio);
1524 1527  }
1525 1528  
           /*
            * Gang issue callback for frees: a thin wrapper around zio_free_sync()
            * for bp in pio's txg.  The gn and data arguments are unused.
            */
1526 1529  /* ARGSUSED */
1527 1530  zio_t *
1528 1531  zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1529 1532  {
1530 1533          return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1531 1534              ZIO_GANG_CHILD_FLAGS(pio)));
1532 1535  }
1533 1536  
           /*
            * Gang issue callback for claims: a thin wrapper around zio_claim()
            * for bp in pio's txg.  The gn and data arguments are unused.
            */
1534 1537  /* ARGSUSED */
1535 1538  zio_t *
1536 1539  zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1537 1540  {
1538 1541          return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1539 1542              NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1540 1543  }
1541 1544  
           /*
            * Per-type gang issue callbacks, indexed by the gang leader's io_type
            * (see zio_gang_tree_issue()).  NULL slots have no callback.
            * NOTE(review): the slot order presumably follows the zio type enum
            * (null, read, write, free, claim, ioctl) -- confirm against zio.h.
            */
1542 1545  static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1543 1546          NULL,
1544 1547          zio_read_gang,
1545 1548          zio_rewrite_gang,
1546 1549          zio_free_gang,
1547 1550          zio_claim_gang,
1548 1551          NULL
1549 1552  };
1550 1553  
           /*
            * Forward declaration: zio_gang_tree_assemble() passes this callback
            * to zio_read() before its definition appears.
            */
1551 1554  static void zio_gang_tree_assemble_done(zio_t *zio);
1552 1555  
1552 1555  
1553 1556  static zio_gang_node_t *
1554 1557  zio_gang_node_alloc(zio_gang_node_t **gnpp)
1555 1558  {
1556 1559          zio_gang_node_t *gn;
1557 1560  
1558 1561          ASSERT(*gnpp == NULL);
1559 1562  
1560 1563          gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1561 1564          gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1562 1565          *gnpp = gn;
1563 1566  
1564 1567          return (gn);
1565 1568  }
1566 1569  
           /*
            * Free a single gang node and its header buffer, and reset *gnpp to
            * NULL.  All of the node's children must already have been freed.
            */
1567 1570  static void
1568 1571  zio_gang_node_free(zio_gang_node_t **gnpp)
1569 1572  {
1570 1573          zio_gang_node_t *gn = *gnpp;
1571 1574  
1572 1575          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1573 1576                  ASSERT(gn->gn_child[g] == NULL);
1574 1577  
1575 1578          zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1576 1579          kmem_free(gn, sizeof (*gn));
1577 1580          *gnpp = NULL;
1578 1581  }
1579 1582  
           /*
            * Recursively free the gang tree rooted at *gnpp, children first, and
            * leave *gnpp set to NULL.  An empty (NULL) tree is a no-op.
            */
1580 1583  static void
1581 1584  zio_gang_tree_free(zio_gang_node_t **gnpp)
1582 1585  {
1583 1586          zio_gang_node_t *gn = *gnpp;
1584 1587  
1585 1588          if (gn == NULL)
1586 1589                  return;
1587 1590  
1588 1591          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1589 1592                  zio_gang_tree_free(&gn->gn_child[g]);
1590 1593  
1591 1594          zio_gang_node_free(gnpp);
1592 1595  }
1593 1596  
           /*
            * Allocate a gang node at *gnpp and issue an asynchronous read of the
            * gang header that bp points to into the node's gn_gbh buffer, as a
            * child of the gang leader gio.  zio_gang_tree_assemble_done() runs
            * when the read completes and continues the assembly for any nested
            * gang blocks.
            */
1594 1597  static void
1595 1598  zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1596 1599  {
1597 1600          zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1598 1601  
1599 1602          ASSERT(gio->io_gang_leader == gio);
1600 1603          ASSERT(BP_IS_GANG(bp));
1601 1604  
1602 1605          zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1603 1606              SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1604 1607              gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1605 1608  }
1606 1609  
           /*
            * Done callback for a gang header read issued by
            * zio_gang_tree_assemble().  On success, byteswap the header if needed
            * and start assembling a subtree for every block pointer in the header
            * that is itself a gang block.
            */
1607 1610  static void
1608 1611  zio_gang_tree_assemble_done(zio_t *zio)
1609 1612  {
1610 1613          zio_t *gio = zio->io_gang_leader;
1611 1614          zio_gang_node_t *gn = zio->io_private;
1612 1615          blkptr_t *bp = zio->io_bp;
1613 1616  
1614 1617          ASSERT(gio == zio_unique_parent(zio));
1615 1618          ASSERT(zio->io_child_count == 0);
1616 1619  
                   /*
                    * On a failed read, leave the node as it is and do not descend;
                    * zio_gang_issue() frees the whole tree when it sees a gang
                    * child error.
                    */
1617 1620          if (zio->io_error)
1618 1621                  return;
1619 1622  
1620 1623          if (BP_SHOULD_BYTESWAP(bp))
1621 1624                  byteswap_uint64_array(zio->io_data, zio->io_size);
1622 1625  
1623 1626          ASSERT(zio->io_data == gn->gn_gbh);
1624 1627          ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1625 1628          ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1626 1629  
                   /* Only nested gang blocks get a child node; leaves stay NULL. */
1627 1630          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1628 1631                  blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1629 1632                  if (!BP_IS_GANG(gbp))
1630 1633                          continue;
1631 1634                  zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1632 1635          }
1633 1636  }
1634 1637  
           /*
            * Recursively walk the assembled gang tree, invoking the callback for
            * the gang leader's i/o type on bp and then on every non-hole block
            * pointer beneath it.  'data' is advanced by each constituent's psize,
            * so successive constituents map onto consecutive pieces of the buffer.
            */
1635 1638  static void
1636 1639  zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1637 1640  {
1638 1641          zio_t *gio = pio->io_gang_leader;
1639 1642          zio_t *zio;
1640 1643  
1641 1644          ASSERT(BP_IS_GANG(bp) == !!gn);
1642 1645          ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1643 1646          ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1644 1647  
1645 1648          /*
1646 1649           * If you're a gang header, your data is in gn->gn_gbh.
1647 1650           * If you're a gang member, your data is in 'data' and gn == NULL.
1648 1651           */
1649 1652          zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1650 1653  
1651 1654          if (gn != NULL) {
1652 1655                  ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1653 1656  
                           /*
                            * Issue the constituents beneath the zio the callback
                            * returned (which may be pio itself).
                            */
1654 1657                  for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1655 1658                          blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1656 1659                          if (BP_IS_HOLE(gbp))
1657 1660                                  continue;
1658 1661                          zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1659 1662                          data = (char *)data + BP_GET_PSIZE(gbp);
1660 1663                  }
1661 1664          }
1662 1665  
                   /* At the root, the walk must have consumed the whole buffer. */
1663 1666          if (gn == gio->io_gang_tree)
1664 1667                  ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1665 1668  
                   /*
                    * If the callback handed back pio, it created no new zio, so
                    * there is nothing to issue.
                    */
1666 1669          if (zio != pio)
1667 1670                  zio_nowait(zio);
1668 1671  }
1669 1672  
           /*
            * Pipeline stage: make this zio the gang leader and start reading the
            * gang tree for its block pointer into zio->io_gang_tree.  The header
            * reads are issued asynchronously; zio_gang_issue() waits for them.
            */
1670 1673  static int
1671 1674  zio_gang_assemble(zio_t *zio)
1672 1675  {
1673 1676          blkptr_t *bp = zio->io_bp;
1674 1677  
1675 1678          ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1676 1679          ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1677 1680  
1678 1681          zio->io_gang_leader = zio;
1679 1682  
1680 1683          zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1681 1684  
1682 1685          return (ZIO_PIPELINE_CONTINUE);
1683 1686  }
1684 1687  
           /*
            * Pipeline stage: once the gang children from the assembly phase are
            * done, either walk the gang tree to issue the real operation (no
            * gang child error) or free the partially built tree (some gang child
            * failed).  In both cases the rest of this zio's pipeline is replaced
            * with ZIO_INTERLOCK_PIPELINE.
            */
1685 1688  static int
1686 1689  zio_gang_issue(zio_t *zio)
1687 1690  {
1688 1691          blkptr_t *bp = zio->io_bp;
1689 1692  
1690 1693          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1691 1694                  return (ZIO_PIPELINE_STOP);
1692 1695  
1693 1696          ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1694 1697          ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1695 1698  
1696 1699          if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1697 1700                  zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1698 1701          else
1699 1702                  zio_gang_tree_free(&zio->io_gang_tree);
1700 1703  
1701 1704          zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1702 1705  
1703 1706          return (ZIO_PIPELINE_CONTINUE);
1704 1707  }
1705 1708  
           /*
            * Ready callback for gang member writes.  Adds the allocated size of
            * each of the member's DVAs to the corresponding DVA of its parent's
            * block pointer.  Nothing is done if the member's bp is a hole.
            */
1706 1709  static void
1707 1710  zio_write_gang_member_ready(zio_t *zio)
1708 1711  {
1709 1712          zio_t *pio = zio_unique_parent(zio);
1710 1713          zio_t *gio = zio->io_gang_leader;
1711 1714          dva_t *cdva = zio->io_bp->blk_dva;
1712 1715          dva_t *pdva = pio->io_bp->blk_dva;
1713 1716          uint64_t asize;
1714 1717  
1715 1718          if (BP_IS_HOLE(zio->io_bp))
1716 1719                  return;
1717 1720  
1718 1721          ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1719 1722  
1720 1723          ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1721 1724          ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1722 1725          ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1723 1726          ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1724 1727          ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1725 1728  
                   /*
                    * The parent's DVAs are updated under the parent's io_lock;
                    * each gang member write runs this callback against the same
                    * parent bp.
                    */
1726 1729          mutex_enter(&pio->io_lock);
1727 1730          for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1728 1731                  ASSERT(DVA_GET_GANG(&pdva[d]));
1729 1732                  asize = DVA_GET_ASIZE(&pdva[d]);
1730 1733                  asize += DVA_GET_ASIZE(&cdva[d]);
1731 1734                  DVA_SET_ASIZE(&pdva[d], asize);
1732 1735          }
1733 1736          mutex_exit(&pio->io_lock);
1734 1737  }
1735 1738  
           /*
            * Write pio's data as a gang block: allocate a gang header, create a
            * rewrite zio for the header, and issue child writes that split the
            * data across the header's block pointers.  If the header allocation
            * fails, the error is recorded in pio->io_error.  Always returns
            * ZIO_PIPELINE_CONTINUE.
            */
1736 1739  static int
1737 1740  zio_write_gang_block(zio_t *pio)
1738 1741  {
1739 1742          spa_t *spa = pio->io_spa;
1740 1743          blkptr_t *bp = pio->io_bp;
1741 1744          zio_t *gio = pio->io_gang_leader;
1742 1745          zio_t *zio;
1743 1746          zio_gang_node_t *gn, **gnpp;
1744 1747          zio_gbh_phys_t *gbh;
1745 1748          uint64_t txg = pio->io_txg;
1746 1749          uint64_t resid = pio->io_size;
1747 1750          uint64_t lsize;
1748 1751          int copies = gio->io_prop.zp_copies;
                   /* The header gets one more copy than the data, up to the max. */
1749 1752          int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1750 1753          zio_prop_t zp;
1751 1754          int error;
1752 1755  
1753 1756          error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1754 1757              bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1755 1758              METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1756 1759          if (error) {
1757 1760                  pio->io_error = error;
1758 1761                  return (ZIO_PIPELINE_CONTINUE);
1759 1762          }
1760 1763  
                   /*
                    * The gang leader hangs its node off io_gang_tree.  A nested
                    * gang block is itself a gang member write, whose io_private is
                    * the parent's gn_child slot passed to zio_write() below.
                    */
1761 1764          if (pio == gio) {
1762 1765                  gnpp = &gio->io_gang_tree;
1763 1766          } else {
1764 1767                  gnpp = pio->io_private;
1765 1768                  ASSERT(pio->io_ready == zio_write_gang_member_ready);
1766 1769          }
1767 1770  
1768 1771          gn = zio_gang_node_alloc(gnpp);
1769 1772          gbh = gn->gn_gbh;
1770 1773          bzero(gbh, SPA_GANGBLOCKSIZE);
1771 1774  
1772 1775          /*
1773 1776           * Create the gang header.
1774 1777           */
1775 1778          zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1776 1779              pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1777 1780  
1778 1781          /*
1779 1782           * Create and nowait the gang children.
1780 1783           */
1781 1784          for (int g = 0; resid != 0; resid -= lsize, g++) {
                           /*
                            * Split the remaining bytes evenly over the remaining
                            * header slots, rounded up to SPA_MINBLOCKSIZE.
                            */
1782 1785                  lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1783 1786                      SPA_MINBLOCKSIZE);
1784 1787                  ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1785 1788  
                           /*
                            * Gang members take their checksum and copies from the
                            * gang leader; compression, dedup and nopwrite are off.
                            */
1786 1789                  zp.zp_checksum = gio->io_prop.zp_checksum;
1787 1790                  zp.zp_compress = ZIO_COMPRESS_OFF;
1788 1791                  zp.zp_type = DMU_OT_NONE;
1789 1792                  zp.zp_level = 0;
1790 1793                  zp.zp_copies = gio->io_prop.zp_copies;
1791 1794                  zp.zp_dedup = B_FALSE;
1792 1795                  zp.zp_dedup_verify = B_FALSE;
1793 1796                  zp.zp_nopwrite = B_FALSE;
1794 1797  
1795 1798                  zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1796 1799                      (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1797 1800                      zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1798 1801                      pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1799 1802                      &pio->io_bookmark));
1800 1803          }
1801 1804  
1802 1805          /*
1803 1806           * Set pio's pipeline to just wait for zio to finish.
1804 1807           */
1805 1808          pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1806 1809  
1807 1810          zio_nowait(zio);
1808 1811  
1809 1812          return (ZIO_PIPELINE_CONTINUE);
1810 1813  }
1811 1814  
1812 1815  /*
1813 1816   * The zio_nop_write stage in the pipeline determines if allocating
1814 1817   * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1815 1818   * such as SHA256, we can compare the checksums of the new data and the old
1816 1819   * to determine if allocating a new block is required.  The nopwrite
1817 1820   * feature can handle writes in either syncing or open context (i.e. zil
1818 1821   * writes) and as a result is mutually exclusive with dedup.
            *
            * Always returns ZIO_PIPELINE_CONTINUE; when the write is elided, the
            * zio's pipeline is replaced with ZIO_INTERLOCK_PIPELINE and
            * ZIO_FLAG_NOPWRITE is set.
1819 1822   */
1820 1823  static int
1821 1824  zio_nop_write(zio_t *zio)
1822 1825  {
1823 1826          blkptr_t *bp = zio->io_bp;
1824 1827          blkptr_t *bp_orig = &zio->io_bp_orig;
1825 1828          zio_prop_t *zp = &zio->io_prop;
1826 1829  
1827 1830          ASSERT(BP_GET_LEVEL(bp) == 0);
1828 1831          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1829 1832          ASSERT(zp->zp_nopwrite);
1830 1833          ASSERT(!zp->zp_dedup);
1831 1834          ASSERT(zio->io_bp_override == NULL);
1832 1835          ASSERT(IO_IS_ALLOCATING(zio));
1833 1836  
1834 1837          /*
1835 1838           * Check to see if the original bp and the new bp have matching
1836 1839           * characteristics (i.e. same checksum, compression algorithms, etc).
1837 1840           * If they don't then just continue with the pipeline which will
1838 1841           * allocate a new bp.
1839 1842           */
1840 1843          if (BP_IS_HOLE(bp_orig) ||
1841 1844              !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1842 1845              BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1843 1846              BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1844 1847              BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1845 1848              zp->zp_copies != BP_GET_NDVAS(bp_orig))
1846 1849                  return (ZIO_PIPELINE_CONTINUE);
1847 1850  
1848 1851          /*
1849 1852           * If the checksums match then reset the pipeline so that we
1850 1853           * avoid allocating a new bp and issuing any I/O.
1851 1854           */
1852 1855          if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
1853 1856                  ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
1854 1857                  ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
1855 1858                  ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
1856 1859                  ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
1857 1860                  ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
1858 1861                      sizeof (uint64_t)) == 0);
1859 1862  
                           /* Keep the existing block: reuse the original bp as is. */
1860 1863                  *bp = *bp_orig;
1861 1864                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1862 1865                  zio->io_flags |= ZIO_FLAG_NOPWRITE;
1863 1866          }
1864 1867  
1865 1868          return (ZIO_PIPELINE_CONTINUE);
1866 1869  }
1867 1870  
1868 1871  /*
1869 1872   * ==========================================================================
1870 1873   * Dedup
1871 1874   * ==========================================================================
1872 1875   */
           /*
            * Done callback for the alternate-copy reads issued by
            * zio_ddt_read_start().  The first successful read donates its buffer
            * to the entry as dde_repair_data; every other read (failed, or
            * successful but too late) has its buffer freed.
            */
1873 1876  static void
1874 1877  zio_ddt_child_read_done(zio_t *zio)
1875 1878  {
1876 1879          blkptr_t *bp = zio->io_bp;
1877 1880          ddt_entry_t *dde = zio->io_private;
1878 1881          ddt_phys_t *ddp;
1879 1882          zio_t *pio = zio_unique_parent(zio);
1880 1883  
                   /* The shared entry is updated under the parent's io_lock. */
1881 1884          mutex_enter(&pio->io_lock);
1882 1885          ddp = ddt_phys_select(dde, bp);
1883 1886          if (zio->io_error == 0)
1884 1887                  ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
1885 1888          if (zio->io_error == 0 && dde->dde_repair_data == NULL)
1886 1889                  dde->dde_repair_data = zio->io_data;
1887 1890          else
1888 1891                  zio_buf_free(zio->io_data, zio->io_size);
1889 1892          mutex_exit(&pio->io_lock);
1890 1893  }
1891 1894  
           /*
            * Pipeline stage: start the read of a dedup block.  Normally this
            * issues one child read of bp into the zio's own buffer.  If a DDT
            * child error is already recorded, it instead starts a repair: the
            * entry is obtained from ddt_repair_start() and stashed in io_vsd, and
            * every other physical copy with a non-zero birth is read into a
            * freshly allocated buffer.
            */
1892 1895  static int
1893 1896  zio_ddt_read_start(zio_t *zio)
1894 1897  {
1895 1898          blkptr_t *bp = zio->io_bp;
1896 1899  
1897 1900          ASSERT(BP_GET_DEDUP(bp));
1898 1901          ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1899 1902          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1900 1903  
1901 1904          if (zio->io_child_error[ZIO_CHILD_DDT]) {
1902 1905                  ddt_t *ddt = ddt_select(zio->io_spa, bp);
1903 1906                  ddt_entry_t *dde = ddt_repair_start(ddt, bp);
1904 1907                  ddt_phys_t *ddp = dde->dde_phys;
1905 1908                  ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
1906 1909                  blkptr_t blk;
1907 1910  
                           /* zio_ddt_read_done() picks the entry up from io_vsd. */
1908 1911                  ASSERT(zio->io_vsd == NULL);
1909 1912                  zio->io_vsd = dde;
1910 1913  
                           /* No phys in the entry matches bp: issue no reads. */
1911 1914                  if (ddp_self == NULL)
1912 1915                          return (ZIO_PIPELINE_CONTINUE);
1913 1916  
                           /*
                            * Read each populated copy other than our own.  These
                            * children carry ZIO_FLAG_DONT_PROPAGATE.
                            */
1914 1917                  for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1915 1918                          if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
1916 1919                                  continue;
1917 1920                          ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
1918 1921                              &blk);
1919 1922                          zio_nowait(zio_read(zio, zio->io_spa, &blk,
1920 1923                              zio_buf_alloc(zio->io_size), zio->io_size,
1921 1924                              zio_ddt_child_read_done, dde, zio->io_priority,
1922 1925                              ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
1923 1926                              &zio->io_bookmark));
1924 1927                  }
1925 1928                  return (ZIO_PIPELINE_CONTINUE);
1926 1929          }
1927 1930  
1928 1931          zio_nowait(zio_read(zio, zio->io_spa, bp,
1929 1932              zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
1930 1933              ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
1931 1934  
1932 1935          return (ZIO_PIPELINE_CONTINUE);
1933 1936  }
1934 1937  
           /*
            * Pipeline stage: finish the read of a dedup block once its DDT
            * children are done.  If a DDT child failed, this either kicks off the
            * repair path (no entry stashed in io_vsd yet) or completes it by
            * copying any repair data found into the zio's buffer and clearing
            * the child error.
            */
1935 1938  static int
1936 1939  zio_ddt_read_done(zio_t *zio)
1937 1940  {
1938 1941          blkptr_t *bp = zio->io_bp;
1939 1942  
1940 1943          if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
1941 1944                  return (ZIO_PIPELINE_STOP);
1942 1945  
1943 1946          ASSERT(BP_GET_DEDUP(bp));
1944 1947          ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1945 1948          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1946 1949  
1947 1950          if (zio->io_child_error[ZIO_CHILD_DDT]) {
1948 1951                  ddt_t *ddt = ddt_select(zio->io_spa, bp);
1949 1952                  ddt_entry_t *dde = zio->io_vsd;
                           /*
                            * A missing DDT is only expected while the pool is
                            * being loaded; no repair is attempted in that case.
                            */
1950 1953                  if (ddt == NULL) {
1951 1954                          ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
1952 1955                          return (ZIO_PIPELINE_CONTINUE);
1953 1956                  }
                           /*
                            * No repair has been started yet.  Back the stage up and
                            * redispatch on the issue taskq.
                            * NOTE(review): presumably this makes the pipeline run
                            * zio_ddt_read_start() again, which takes its repair
                            * path -- confirm against the pipeline executor.
                            */
1954 1957                  if (dde == NULL) {
1955 1958                          zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
1956 1959                          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1957 1960                          return (ZIO_PIPELINE_STOP);
1958 1961                  }
                           /* A good copy was found: use it and clear the error. */
1959 1962                  if (dde->dde_repair_data != NULL) {
1960 1963                          bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
1961 1964                          zio->io_child_error[ZIO_CHILD_DDT] = 0;
1962 1965                  }
1963 1966                  ddt_repair_done(ddt, dde);
1964 1967                  zio->io_vsd = NULL;
1965 1968          }
1966 1969  
1967 1970          ASSERT(zio->io_vsd == NULL);
1968 1971  
1969 1972          return (ZIO_PIPELINE_CONTINUE);
1970 1973  }
1971 1974  
           /*
            * Determine whether the data being written by zio differs from the
            * data already associated with dedup entry dde.  Returns B_TRUE on a
            * mismatch (or if the existing copy could not be read), B_FALSE if the
            * data matches or the entry has nothing to compare against.
            *
            * The DDT lock is dropped around the arc_read() of the on-disk copy and
            * is held again on return.
            */
1972 1975  static boolean_t
1973 1976  zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
1974 1977  {
1975 1978          spa_t *spa = zio->io_spa;
1976 1979  
1977 1980          /*
1978 1981           * Note: we compare the original data, not the transformed data,
1979 1982           * because when zio->io_bp is an override bp, we will not have
1980 1983           * pushed the I/O transforms.  That's an important optimization
1981 1984           * because otherwise we'd compress/encrypt all dmu_sync() data twice.
1982 1985           */
                   /*
                    * First choice: compare against the in-memory data of the first
                    * lead zio found for this entry.
                    */
1983 1986          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
1984 1987                  zio_t *lio = dde->dde_lead_zio[p];
1985 1988  
1986 1989                  if (lio != NULL) {
1987 1990                          return (lio->io_orig_size != zio->io_orig_size ||
1988 1991                              bcmp(zio->io_orig_data, lio->io_orig_data,
1989 1992                              zio->io_orig_size) != 0);
1990 1993                  }
1991 1994          }
1992 1995  
                   /*
                    * Otherwise read the first populated physical copy and compare
                    * against that.
                    */
1993 1996          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
1994 1997                  ddt_phys_t *ddp = &dde->dde_phys[p];
1995 1998  
1996 1999                  if (ddp->ddp_phys_birth != 0) {
1997 2000                          arc_buf_t *abuf = NULL;
1998 2001                          uint32_t aflags = ARC_WAIT;
1999 2002                          blkptr_t blk = *zio->io_bp;
2000 2003                          int error;
2001 2004  
2002 2005                          ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2003 2006  
2004 2007                          ddt_exit(ddt);
2005 2008  
2006 2009                          error = arc_read(NULL, spa, &blk,
2007 2010                              arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2008 2011                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2009 2012                              &aflags, &zio->io_bookmark);
2010 2013  
2011 2014                          if (error == 0) {
2012 2015                                  if (arc_buf_size(abuf) != zio->io_orig_size ||
2013 2016                                      bcmp(abuf->b_data, zio->io_orig_data,
2014 2017                                      zio->io_orig_size) != 0)
2015 2018                                          error = SET_ERROR(EEXIST);
2016 2019                                  VERIFY(arc_buf_remove_ref(abuf, &abuf));
2017 2020                          }
2018 2021  
2019 2022                          ddt_enter(ddt);
                                   /* A read error is reported as a collision too. */
2020 2023                          return (error != 0);
2021 2024                  }
2022 2025          }
2023 2026  
2024 2027          return (B_FALSE);
2025 2028  }
2026 2029  
           /*
            * Ready callback for a dedup child write.  Unless the write has already
            * failed, record the child's bp in the entry's phys slot for this
            * number of copies and fill in the bp of every parent waiting on the
            * write, all under the DDT lock.
            */
2027 2030  static void
2028 2031  zio_ddt_child_write_ready(zio_t *zio)
2029 2032  {
                   /* The phys slot is indexed by the number of copies written. */
2030 2033          int p = zio->io_prop.zp_copies;
2031 2034          ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2032 2035          ddt_entry_t *dde = zio->io_private;
2033 2036          ddt_phys_t *ddp = &dde->dde_phys[p];
2034 2037          zio_t *pio;
2035 2038  
2036 2039          if (zio->io_error)
2037 2040                  return;
2038 2041  
2039 2042          ddt_enter(ddt);
2040 2043  
2041 2044          ASSERT(dde->dde_lead_zio[p] == zio);
2042 2045  
2043 2046          ddt_phys_fill(ddp, zio->io_bp);
2044 2047  
2045 2048          while ((pio = zio_walk_parents(zio)) != NULL)
2046 2049                  ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2047 2050  
2048 2051          ddt_exit(ddt);
2049 2052  }
2050 2053  
           /*
            * Done callback for a dedup child write.  Under the DDT lock, retire
            * this zio as the entry's lead zio for its copies slot; on success take
            * one phys reference per parent, on failure clear the phys.
            */
2051 2054  static void
2052 2055  zio_ddt_child_write_done(zio_t *zio)
2053 2056  {
2054 2057          int p = zio->io_prop.zp_copies;
2055 2058          ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2056 2059          ddt_entry_t *dde = zio->io_private;
2057 2060          ddt_phys_t *ddp = &dde->dde_phys[p];
2058 2061  
2059 2062          ddt_enter(ddt);
2060 2063  
2061 2064          ASSERT(ddp->ddp_refcnt == 0);
2062 2065          ASSERT(dde->dde_lead_zio[p] == zio);
2063 2066          dde->dde_lead_zio[p] = NULL;
2064 2067  
2065 2068          if (zio->io_error == 0) {
                           /* One reference for every parent of this write. */
2066 2069                  while (zio_walk_parents(zio) != NULL)
2067 2070                          ddt_phys_addref(ddp);
2068 2071          } else {
2069 2072                  ddt_phys_clear(ddp);
2070 2073          }
2071 2074  
2072 2075          ddt_exit(ddt);
2073 2076  }
2074 2077  
           /*
            * Done callback for a ditto write.  Under the DDT lock, retire this zio
            * as the lead zio for the DDT_PHYS_DITTO slot; on success, free any
            * previously recorded ditto phys and record the new bp in its place.
            * On failure the ditto phys is left untouched.
            */
2075 2078  static void
2076 2079  zio_ddt_ditto_write_done(zio_t *zio)
2077 2080  {
2078 2081          int p = DDT_PHYS_DITTO;
2079 2082          zio_prop_t *zp = &zio->io_prop;
2080 2083          blkptr_t *bp = zio->io_bp;
2081 2084          ddt_t *ddt = ddt_select(zio->io_spa, bp);
2082 2085          ddt_entry_t *dde = zio->io_private;
2083 2086          ddt_phys_t *ddp = &dde->dde_phys[p];
2084 2087          ddt_key_t *ddk = &dde->dde_key;
2085 2088  
2086 2089          ddt_enter(ddt);
2087 2090  
2088 2091          ASSERT(ddp->ddp_refcnt == 0);
2089 2092          ASSERT(dde->dde_lead_zio[p] == zio);
2090 2093          dde->dde_lead_zio[p] = NULL;
2091 2094  
2092 2095          if (zio->io_error == 0) {
2093 2096                  ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2094 2097                  ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2095 2098                  ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
                           /* A non-zero birth means an older ditto block exists. */
2096 2099                  if (ddp->ddp_phys_birth != 0)
2097 2100                          ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2098 2101                  ddt_phys_fill(ddp, bp);
2099 2102          }
2100 2103  
2101 2104          ddt_exit(ddt);
2102 2105  }
2103 2106  
2104 2107  static int
2105 2108  zio_ddt_write(zio_t *zio)
2106 2109  {
                   /*
                    * Pipeline stage for a dedup write.  Under the ddt lock, look up
                    * the DDT entry for the bp and then either restart as an ordinary
                    * write (on a verify collision), share an existing or in-flight
                    * copy, adopt the override bp, or create a child write (cio).
                    * A ditto child write (dio) may also be created.  Always returns
                    * ZIO_PIPELINE_CONTINUE.
                    */
2107 2110          spa_t *spa = zio->io_spa;
2108 2111          blkptr_t *bp = zio->io_bp;
2109 2112          uint64_t txg = zio->io_txg;
2110 2113          zio_prop_t *zp = &zio->io_prop;
2111 2114          int p = zp->zp_copies;
2112 2115          int ditto_copies;
2113 2116          zio_t *cio = NULL;
2114 2117          zio_t *dio = NULL;
2115 2118          ddt_t *ddt = ddt_select(spa, bp);
2116 2119          ddt_entry_t *dde;
2117 2120          ddt_phys_t *ddp;
2118 2121  
2119 2122          ASSERT(BP_GET_DEDUP(bp));
2120 2123          ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2121 2124          ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2122 2125  
2123 2126          ddt_enter(ddt);
2124 2127          dde = ddt_lookup(ddt, bp, B_TRUE);
2125 2128          ddp = &dde->dde_phys[p];
2126 2129  
2127 2130          if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2128 2131                  /*
2129 2132                   * If we're using a weak checksum, upgrade to a strong checksum
2130 2133                   * and try again.  If we're already using a strong checksum,
2131 2134                   * we can't resolve it, so just convert to an ordinary write.
2132 2135                   * (And automatically e-mail a paper to Nature?)
2133 2136                   */
2134 2137                  if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2135 2138                          zp->zp_checksum = spa_dedup_checksum(spa);
2136 2139                          zio_pop_transforms(zio);
2137 2140                          zio->io_stage = ZIO_STAGE_OPEN;
2138 2141                          BP_ZERO(bp);
2139 2142                  } else {
2140 2143                          zp->zp_dedup = B_FALSE;
2141 2144                  }
2142 2145                  zio->io_pipeline = ZIO_WRITE_PIPELINE;
2143 2146                  ddt_exit(ddt);
2144 2147                  return (ZIO_PIPELINE_CONTINUE);
2145 2148          }
2146 2149  
2147 2150          ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2148 2151          ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2149 2152  
                   /*
                    * Start a ditto write only if more ditto copies are needed than
                    * are present and no ditto write is already the lead for this
                    * entry.
                    */
2150 2153          if (ditto_copies > ddt_ditto_copies_present(dde) &&
2151 2154              dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2152 2155                  zio_prop_t czp = *zp;
2153 2156  
2154 2157                  czp.zp_copies = ditto_copies;
2155 2158  
2156 2159                  /*
2157 2160                   * If we arrived here with an override bp, we won't have run
2158 2161                   * the transform stack, so we won't have the data we need to
2159 2162                   * generate a child i/o.  So, toss the override bp and restart.
2160 2163                   * This is safe, because using the override bp is just an
2161 2164                   * optimization; and it's rare, so the cost doesn't matter.
2162 2165                   */
2163 2166                  if (zio->io_bp_override) {
2164 2167                          zio_pop_transforms(zio);
2165 2168                          zio->io_stage = ZIO_STAGE_OPEN;
2166 2169                          zio->io_pipeline = ZIO_WRITE_PIPELINE;
2167 2170                          zio->io_bp_override = NULL;
2168 2171                          BP_ZERO(bp);
2169 2172                          ddt_exit(ddt);
2170 2173                          return (ZIO_PIPELINE_CONTINUE);
2171 2174                  }
2172 2175  
2173 2176                  dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2174 2177                      zio->io_orig_size, &czp, NULL,
2175 2178                      zio_ddt_ditto_write_done, dde, zio->io_priority,
2176 2179                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2177 2180  
2178 2181                  zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2179 2182                  dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2180 2183          }
2181 2184  
                   /*
                    * Three cases for the zp_copies slot:
                    *  - a copy already exists (nonzero birth) and/or a lead write is
                    *    in flight: fill our bp from an existing copy, then either
                    *    become a parent of the lead zio or take a reference;
                    *  - we have an override bp: adopt it as the phys entry and take
                    *    a reference;
                    *  - otherwise: create the child write and make it the lead zio.
                    */
2182 2185          if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2183 2186                  if (ddp->ddp_phys_birth != 0)
2184 2187                          ddt_bp_fill(ddp, bp, txg);
2185 2188                  if (dde->dde_lead_zio[p] != NULL)
2186 2189                          zio_add_child(zio, dde->dde_lead_zio[p]);
2187 2190                  else
2188 2191                          ddt_phys_addref(ddp);
2189 2192          } else if (zio->io_bp_override) {
2190 2193                  ASSERT(bp->blk_birth == txg);
2191 2194                  ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2192 2195                  ddt_phys_fill(ddp, bp);
2193 2196                  ddt_phys_addref(ddp);
2194 2197          } else {
2195 2198                  cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2196 2199                      zio->io_orig_size, zp, zio_ddt_child_write_ready,
2197 2200                      zio_ddt_child_write_done, dde, zio->io_priority,
2198 2201                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2199 2202  
2200 2203                  zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2201 2204                  dde->dde_lead_zio[p] = cio;
2202 2205          }
2203 2206  
2204 2207          ddt_exit(ddt);
2205 2208  
                   /* Any child writes are issued only after the ddt lock is dropped. */
2206 2209          if (cio)
2207 2210                  zio_nowait(cio);
2208 2211          if (dio)
2209 2212                  zio_nowait(dio);
2210 2213  
2211 2214          return (ZIO_PIPELINE_CONTINUE);
2212 2215  }
2213 2216  
2214 2217  ddt_entry_t *freedde; /* for debugging */
2215 2218  
2216 2219  static int
2217 2220  zio_ddt_free(zio_t *zio)
2218 2221  {
                   /*
                    * Pipeline stage for freeing a dedup block: under the ddt lock,
                    * look up the bp's DDT entry and drop one reference from the phys
                    * entry that ddt_phys_select() picks for this bp.
                    */
2219 2222          spa_t *spa = zio->io_spa;
2220 2223          blkptr_t *bp = zio->io_bp;
2221 2224          ddt_t *ddt = ddt_select(spa, bp);
2222 2225          ddt_entry_t *dde;
2223 2226          ddt_phys_t *ddp;
2224 2227  
2225 2228          ASSERT(BP_GET_DEDUP(bp));
2226 2229          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2227 2230  
2228 2231          ddt_enter(ddt);
                   /* The entry is also stashed in the global 'freedde' (debug aid). */
2229 2232          freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2230 2233          ddp = ddt_phys_select(dde, bp);
2231 2234          ddt_phys_decref(ddp);
2232 2235          ddt_exit(ddt);
2233 2236  
2234 2237          return (ZIO_PIPELINE_CONTINUE);
2235 2238  }
2236 2239  
2237 2240  /*
2238 2241   * ==========================================================================
2239 2242   * Allocate and free blocks
2240 2243   * ==========================================================================
2241 2244   */
2242 2245  static int
2243 2246  zio_dva_allocate(zio_t *zio)
2244 2247  {
                   /*
                    * Pipeline stage that allocates zp_copies DVAs of io_size bytes
                    * for the zio's (still empty) bp from the pool's normal class.
                    * On ENOSPC for a block larger than SPA_MINBLOCKSIZE it returns
                    * the result of zio_write_gang_block(); any other failure is
                    * recorded in io_error.
                    */
2245 2248          spa_t *spa = zio->io_spa;
2246 2249          metaslab_class_t *mc = spa_normal_class(spa);
2247 2250          blkptr_t *bp = zio->io_bp;
2248 2251          int error;
2249 2252          int flags = 0;
2250 2253  
                   /* A zio with no gang leader becomes its own gang leader. */
2251 2254          if (zio->io_gang_leader == NULL) {
2252 2255                  ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2253 2256                  zio->io_gang_leader = zio;
2254 2257          }
2255 2258  
2256 2259          ASSERT(BP_IS_HOLE(bp));
2257 2260          ASSERT0(BP_GET_NDVAS(bp));
2258 2261          ASSERT3U(zio->io_prop.zp_copies, >, 0);
2259 2262          ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2260 2263          ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2261 2264  
2262 2265          /*
2263 2266           * The dump device does not support gang blocks so allocation on
2264 2267           * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2265 2268           * the "fast" gang feature.
2266 2269           */
2267 2270          flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2268 2271          flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2269 2272              METASLAB_GANG_CHILD : 0;
2270 2273          error = metaslab_alloc(spa, mc, zio->io_size, bp,
2271 2274              zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2272 2275  
2273 2276          if (error) {
                           /*
                            * NOTE(review): io_size is passed to "%llu" without a
                            * cast -- confirm the argument type matches on every
                            * supported data model.
                            */
2274 2277                  spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2275 2278                      "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2276 2279                      error);
2277 2280                  if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2278 2281                          return (zio_write_gang_block(zio));
2279 2282                  zio->io_error = error;
2280 2283          }
2281 2284  
2282 2285          return (ZIO_PIPELINE_CONTINUE);
2283 2286  }
2284 2287  
2285 2288  static int
2286 2289  zio_dva_free(zio_t *zio)
2287 2290  {
                   /*
                    * Pipeline stage: hand the zio's bp to metaslab_free() for
                    * io_txg.  No error is recorded; the pipeline always continues.
                    */
2288 2291          metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2289 2292  
2290 2293          return (ZIO_PIPELINE_CONTINUE);
2291 2294  }
2292 2295  
2293 2296  static int
2294 2297  zio_dva_claim(zio_t *zio)
2295 2298  {
                   /*
                    * Pipeline stage: claim the zio's bp via metaslab_claim().  A
                    * failure is stored in io_error; the pipeline continues either
                    * way.
                    */
2296 2299          int error;
2297 2300  
2298 2301          error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2299 2302          if (error)
2300 2303                  zio->io_error = error;
2301 2304  
2302 2305          return (ZIO_PIPELINE_CONTINUE);
2303 2306  }
2304 2307  
2305 2308  /*
2306 2309   * Undo an allocation.  This is used by zio_done() when an I/O fails
2307 2310   * and we want to give back the block we just allocated.
2308 2311   * This handles both normal blocks and gang blocks.
2309 2312   */
2310 2313  static void
2311 2314  zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2312 2315  {
                   /*
                    * zio: the I/O whose allocation is being undone.
                    * gn:  gang node describing bp's children, or NULL if none.
                    * bp:  block pointer to give back; holes are skipped.
                    */
2313 2316          ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2314 2317          ASSERT(zio->io_bp_override == NULL);
2315 2318  
2316 2319          if (!BP_IS_HOLE(bp))
2317 2320                  metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2318 2321  
                   /*
                    * With a gang node, recurse into each of the SPA_GBH_NBLKPTRS
                    * child slots, pairing the child's gang node with the matching
                    * block pointer in the gang header.
                    */
2319 2322          if (gn != NULL) {
2320 2323                  for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2321 2324                          zio_dva_unallocate(zio, gn->gn_child[g],
2322 2325                              &gn->gn_gbh->zg_blkptr[g]);
2323 2326                  }
2324 2327          }
2325 2328  }
2326 2329  
2327 2330  /*
2328 2331   * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2329 2332   */
2330 2333  int
2331 2334  zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2332 2335      uint64_t size, boolean_t use_slog)
2333 2336  {
                   /*
                    * 'error' starts out nonzero so that, when use_slog is false,
                    * the normal-class allocation below is still attempted.
                    */
2334 2337          int error = 1;
2335 2338  
2336 2339          ASSERT(txg > spa_syncing_txg(spa));
2337 2340  
2338 2341          /*
2339 2342           * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2340 2343           * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2341 2344           * when allocating them.
2342 2345           */
2343 2346          if (use_slog) {
2344 2347                  error = metaslab_alloc(spa, spa_log_class(spa), size,
2345 2348                      new_bp, 1, txg, old_bp,
2346 2349                      METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2347 2350          }
2348 2351  
                   /* Fall back to the normal class if the log class was skipped/failed. */
2349 2352          if (error) {
2350 2353                  error = metaslab_alloc(spa, spa_normal_class(spa), size,
2351 2354                      new_bp, 1, txg, old_bp,
2352 2355                      METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2353 2356          }
2354 2357  
                   /*
                    * On success, fill in the rest of new_bp: uncompressed (lsize ==
                    * psize == size), intent-log type at level 0, no dedup, host byte
                    * order, and a ZILOG2 or ZILOG checksum chosen by pool version.
                    */
2355 2358          if (error == 0) {
2356 2359                  BP_SET_LSIZE(new_bp, size);
2357 2360                  BP_SET_PSIZE(new_bp, size);
2358 2361                  BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2359 2362                  BP_SET_CHECKSUM(new_bp,
2360 2363                      spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2361 2364                      ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2362 2365                  BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2363 2366                  BP_SET_LEVEL(new_bp, 0);
2364 2367                  BP_SET_DEDUP(new_bp, 0);
2365 2368                  BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2366 2369          }
2367 2370  
2368 2371          return (error);
2369 2372  }
2370 2373  
2371 2374  /*
2372 2375   * Free an intent log block.
2373 2376   */
2374 2377  void
2375 2378  zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2376 2379  {
                   /*
                    * Thin wrapper around zio_free() that first asserts the bp is an
                    * intent-log block and not a gang block.
                    */
2377 2380          ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2378 2381          ASSERT(!BP_IS_GANG(bp));
2379 2382  
2380 2383          zio_free(spa, txg, bp);
2381 2384  }
2382 2385  
2383 2386  /*
2384 2387   * ==========================================================================
2385 2388   * Read and write to physical devices
2386 2389   * ==========================================================================
2387 2390   */
2388 2391  static int
2389 2392  zio_vdev_io_start(zio_t *zio)
2390 2393  {
                   /*
                    * Pipeline stage that starts the I/O on the zio's vdev.  With no
                    * vdev the zio is handed to the mirror ops.  Otherwise the I/O is
                    * padded to the top-level vdev's alignment if needed, unnecessary
                    * repair writes are bypassed, leaf reads/writes go through the
                    * vdev cache and queue, and finally the vdev's own io_start op is
                    * invoked.
                    */
2391 2394          vdev_t *vd = zio->io_vd;
2392 2395          uint64_t align;
2393 2396          spa_t *spa = zio->io_spa;
2394 2397  
2395 2398          ASSERT(zio->io_error == 0);
2396 2399          ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2397 2400  
2398 2401          if (vd == NULL) {
                           /*
                            * Take SCL_ZIO as reader unless the zio is flagged as a
                            * config writer; zio_vdev_io_assess() has the matching
                            * spa_config_exit().
                            */
2399 2402                  if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2400 2403                          spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2401 2404  
2402 2405                  /*
2403 2406                   * The mirror_ops handle multiple DVAs in a single BP.
2404 2407                   */
2405 2408                  return (vdev_mirror_ops.vdev_op_io_start(zio));
2406 2409          }
2407 2410  
2408 2411          /*
2409 2412           * We keep track of time-sensitive I/Os so that the scan thread
2410 2413           * can quickly react to certain workloads.  In particular, we care
2411 2414           * about non-scrubbing, top-level reads and writes with the following
2412 2415           * characteristics:
2413 2416           *      - synchronous writes of user data to non-slog devices
2414 2417           *      - any reads of user data
2415 2418           * When these conditions are met, adjust the timestamp of spa_last_io
2416 2419           * which allows the scan thread to adjust its workload accordingly.
2417 2420           */
2418 2421          if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2419 2422              vd == vd->vdev_top && !vd->vdev_islog &&
2420 2423              zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2421 2424              zio->io_txg != spa_syncing_txg(spa)) {
2422 2425                  uint64_t old = spa->spa_last_io;
2423 2426                  uint64_t new = ddi_get_lbolt64();
                           /* Best-effort update: the result of the CAS is ignored. */
2424 2427                  if (old != new)
2425 2428                          (void) atomic_cas_64(&spa->spa_last_io, old, new);
2426 2429          }
2427 2430  
2428 2431          align = 1ULL << vd->vdev_top->vdev_ashift;
2429 2432  
                   /*
                    * If io_size is not a multiple of the alignment, switch to a
                    * rounded-up buffer.  For writes, copy the data in and zero the
                    * padding.  The buffer is pushed as a transform with zio_subblock
                    * as its callback.
                    */
2430 2433          if (P2PHASE(zio->io_size, align) != 0) {
2431 2434                  uint64_t asize = P2ROUNDUP(zio->io_size, align);
2432 2435                  char *abuf = zio_buf_alloc(asize);
2433 2436                  ASSERT(vd == vd->vdev_top);
2434 2437                  if (zio->io_type == ZIO_TYPE_WRITE) {
2435 2438                          bcopy(zio->io_data, abuf, zio->io_size);
2436 2439                          bzero(abuf + zio->io_size, asize - zio->io_size);
2437 2440                  }
2438 2441                  zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2439 2442          }
2440 2443  
2441 2444          ASSERT(P2PHASE(zio->io_offset, align) == 0);
2442 2445          ASSERT(P2PHASE(zio->io_size, align) == 0);
2443 2446          VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2444 2447  
2445 2448          /*
2446 2449           * If this is a repair I/O, and there's no self-healing involved --
2447 2450           * that is, we're just resilvering what we expect to resilver --
2448 2451           * then don't do the I/O unless zio's txg is actually in vd's DTL.
2449 2452           * This prevents spurious resilvering with nested replication.
2450 2453           * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2451 2454           * A is out of date, we'll read from C+D, then use the data to
2452 2455           * resilver A+B -- but we don't actually want to resilver B, just A.
2453 2456           * The top-level mirror has no way to know this, so instead we just
2454 2457           * discard unnecessary repairs as we work our way down the vdev tree.
2455 2458           * The same logic applies to any form of nested replication:
2456 2459           * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2457 2460           */
2458 2461          if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2459 2462              !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2460 2463              zio->io_txg != 0 && /* not a delegated i/o */
2461 2464              !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2462 2465                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2463 2466                  zio_vdev_io_bypass(zio);
2464 2467                  return (ZIO_PIPELINE_CONTINUE);
2465 2468          }
2466 2469  
2467 2470          if (vd->vdev_ops->vdev_op_leaf &&
2468 2471              (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2469 2472  
                           /* A read for which vdev_cache_read() returns 0 needs no I/O. */
2470 2473                  if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
2471 2474                          return (ZIO_PIPELINE_CONTINUE);
2472 2475  
                           /*
                            * vdev_queue_io() may hand back a different zio to issue,
                            * or NULL, in which case this stage stops here.
                            */
2473 2476                  if ((zio = vdev_queue_io(zio)) == NULL)
2474 2477                          return (ZIO_PIPELINE_STOP);
2475 2478  
2476 2479                  if (!vdev_accessible(vd, zio)) {
2477 2480                          zio->io_error = SET_ERROR(ENXIO);
2478 2481                          zio_interrupt(zio);
2479 2482                          return (ZIO_PIPELINE_STOP);
2480 2483                  }
2481 2484          }
2482 2485  
2483 2486          return (vd->vdev_ops->vdev_op_io_start(zio));
2484 2487  }
2485 2488  
2486 2489  static int
2487 2490  zio_vdev_io_done(zio_t *zio)
2488 2491  {
                   /*
                    * Pipeline stage run once the vdev I/O (and all vdev children)
                    * has completed.  For leaf vdevs it notifies the queue and cache
                    * layers and applies fault injection, then it calls the vdev's
                    * io_done op.  With no vdev, the mirror ops are used.
                    */
2489 2492          vdev_t *vd = zio->io_vd;
2490 2493          vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2491 2494          boolean_t unexpected_error = B_FALSE;
2492 2495  
2493 2496          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2494 2497                  return (ZIO_PIPELINE_STOP);
2495 2498  
2496 2499          ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2497 2500  
2498 2501          if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2499 2502  
2500 2503                  vdev_queue_io_done(zio);
2501 2504  
2502 2505                  if (zio->io_type == ZIO_TYPE_WRITE)
2503 2506                          vdev_cache_write(zio);
2504 2507  
2505 2508                  if (zio_injection_enabled && zio->io_error == 0)
2506 2509                          zio->io_error = zio_handle_device_injection(vd,
2507 2510                              zio, EIO);
2508 2511  
2509 2512                  if (zio_injection_enabled && zio->io_error == 0)
2510 2513                          zio->io_error = zio_handle_label_injection(zio, EIO);
2511 2514  
                           /*
                            * An error on an inaccessible vdev becomes ENXIO; an
                            * error on an accessible one is unexpected and triggers
                            * a probe below.
                            */
2512 2515                  if (zio->io_error) {
2513 2516                          if (!vdev_accessible(vd, zio)) {
2514 2517                                  zio->io_error = SET_ERROR(ENXIO);
2515 2518                          } else {
2516 2519                                  unexpected_error = B_TRUE;
2517 2520                          }
2518 2521                  }
2519 2522          }
2520 2523  
2521 2524          ops->vdev_op_io_done(zio);
2522 2525  
                   /* The probe is started only after the vdev's io_done op has run. */
2523 2526          if (unexpected_error)
2524 2527                  VERIFY(vdev_probe(vd, zio) == NULL);
2525 2528  
2526 2529          return (ZIO_PIPELINE_CONTINUE);
2527 2530  }
2528 2531  
2529 2532  /*
2530 2533   * For non-raidz ZIOs, we can just copy aside the bad data read from the
2531 2534   * disk, and use that to finish the checksum ereport later.
2532 2535   */
2533 2536  static void
2534 2537  zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2535 2538      const void *good_buf)
2536 2539  {
                   /*
                    * zcr_cbdata is the copy of the zio's data that
                    * zio_vsd_default_cksum_report() set aside; it is handed to the
                    * ereport together with good_buf.
                    */
2537 2540          /* no processing needed */
2538 2541          zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2539 2542  }
2540 2543  
2541 2544  /*ARGSUSED*/
2542 2545  void
2543 2546  zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2544 2547  {
                   /*
                    * Default checksum-report setup: copy the zio's current data
                    * into a freshly allocated buffer and record it in the report
                    * along with the finish and free callbacks.  The third argument
                    * is unused.
                    */
2545 2548          void *buf = zio_buf_alloc(zio->io_size);
2546 2549  
2547 2550          bcopy(zio->io_data, buf, zio->io_size);
2548 2551  
                   /*
                    * zcr_cbinfo holds the buffer size; presumably zcr_free is later
                    * invoked with (zcr_cbdata, zcr_cbinfo) -- confirm at the caller.
                    */
2549 2552          zcr->zcr_cbinfo = zio->io_size;
2550 2553          zcr->zcr_cbdata = buf;
2551 2554          zcr->zcr_finish = zio_vsd_default_cksum_finish;
2552 2555          zcr->zcr_free = zio_buf_free;
2553 2556  }
2554 2557  
2555 2558  static int
2556 2559  zio_vdev_io_assess(zio_t *zio)
2557 2560  {
                   /*
                    * Pipeline stage that evaluates the outcome of the vdev I/O:
                    * releases per-I/O state, applies fault injection, retries a
                    * failed vdev-less I/O once, normalizes leaf errors to ENXIO when
                    * the device is inaccessible, and switches to the interlock
                    * pipeline if an error remains.
                    */
2558 2561          vdev_t *vd = zio->io_vd;
2559 2562  
2560 2563          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2561 2564                  return (ZIO_PIPELINE_STOP);
2562 2565  
                   /* Drops the SCL_ZIO hold taken in zio_vdev_io_start() for vd == NULL. */
2563 2566          if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2564 2567                  spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2565 2568  
2566 2569          if (zio->io_vsd != NULL) {
2567 2570                  zio->io_vsd_ops->vsd_free(zio);
2568 2571                  zio->io_vsd = NULL;
2569 2572          }
2570 2573  
2571 2574          if (zio_injection_enabled && zio->io_error == 0)
2572 2575                  zio->io_error = zio_handle_fault_injection(zio, EIO);
2573 2576  
2574 2577          /*
2575 2578           * If the I/O failed, determine whether we should attempt to retry it.
2576 2579           *
2577 2580           * On retry, we cut in line in the issue queue, since we don't want
2578 2581           * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2579 2582           */
2580 2583          if (zio->io_error && vd == NULL &&
2581 2584              !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2582 2585                  ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2583 2586                  ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2584 2587                  zio->io_error = 0;
                           /* Setting IO_RETRY here prevents a second retry (see test above). */
2585 2588                  zio->io_flags |= ZIO_FLAG_IO_RETRY |
2586 2589                      ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
                           /*
                            * Rewind io_stage to the bit just below VDEV_IO_START;
                            * presumably the pipeline then resumes at VDEV_IO_START
                            * -- confirm against the pipeline executor.
                            */
2587 2590                  zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2588 2591                  zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2589 2592                      zio_requeue_io_start_cut_in_line);
2590 2593                  return (ZIO_PIPELINE_STOP);
2591 2594          }
2592 2595  
2593 2596          /*
2594 2597           * If we got an error on a leaf device, convert it to ENXIO
2595 2598           * if the device is not accessible at all.
2596 2599           */
2597 2600          if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2598 2601              !vdev_accessible(vd, zio))
2599 2602                  zio->io_error = SET_ERROR(ENXIO);
2600 2603  
2601 2604          /*
2602 2605           * If we can't write to an interior vdev (mirror or RAID-Z),
2603 2606           * set vdev_cant_write so that we stop trying to allocate from it.
2604 2607           */
2605 2608          if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2606 2609              vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2607 2610                  vd->vdev_cant_write = B_TRUE;
2608 2611          }
2609 2612  
2610 2613          if (zio->io_error)
2611 2614                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2612 2615  
2613 2616          return (ZIO_PIPELINE_CONTINUE);
2614 2617  }
2615 2618  
2616 2619  void
2617 2620  zio_vdev_io_reissue(zio_t *zio)
2618 2621  {
                   /*
                    * Step io_stage back one bit from VDEV_IO_START; presumably this
                    * makes the pipeline run VDEV_IO_START again -- confirm against
                    * the pipeline executor.  The zio must be error-free.
                    */
2619 2622          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2620 2623          ASSERT(zio->io_error == 0);
2621 2624  
2622 2625          zio->io_stage >>= 1;
2623 2626  }
2624 2627  
2625 2628  void
2626 2629  zio_vdev_io_redone(zio_t *zio)
2627 2630  {
                   /*
                    * Step io_stage back one bit from VDEV_IO_DONE; presumably this
                    * makes the pipeline run VDEV_IO_DONE again -- confirm against
                    * the pipeline executor.
                    */
2628 2631          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2629 2632  
2630 2633          zio->io_stage >>= 1;
2631 2634  }
2632 2635  
2633 2636  void
2634 2637  zio_vdev_io_bypass(zio_t *zio)
2635 2638  {
                   /*
                    * Mark the zio as bypassed and move io_stage to the bit just
                    * below VDEV_IO_ASSESS; presumably the pipeline then skips ahead
                    * to VDEV_IO_ASSESS -- confirm against the pipeline executor.
                    * Only valid from VDEV_IO_START with no error.
                    */
2636 2639          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2637 2640          ASSERT(zio->io_error == 0);
2638 2641  
2639 2642          zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2640 2643          zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2641 2644  }
2642 2645  
2643 2646  /*
2644 2647   * ==========================================================================
2645 2648   * Generate and verify checksums
2646 2649   * ==========================================================================
2647 2650   */
2648 2651  static int
2649 2652  zio_checksum_generate(zio_t *zio)
2650 2653  {
                   /*
                    * Pipeline stage that picks the checksum algorithm for this zio
                    * and computes it over io_data/io_size.  Always returns
                    * ZIO_PIPELINE_CONTINUE.
                    */
2651 2654          blkptr_t *bp = zio->io_bp;
2652 2655          enum zio_checksum checksum;
2653 2656  
2654 2657          if (bp == NULL) {
2655 2658                  /*
2656 2659                   * This is zio_write_phys().
2657 2660                   * We're either generating a label checksum, or none at all.
2658 2661                   */
2659 2662                  checksum = zio->io_prop.zp_checksum;
2660 2663  
2661 2664                  if (checksum == ZIO_CHECKSUM_OFF)
2662 2665                          return (ZIO_PIPELINE_CONTINUE);
2663 2666  
2664 2667                  ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2665 2668          } else {
                           /*
                            * A gang-child zio on a gang bp uses the gang-header
                            * checksum; everything else uses the checksum recorded
                            * in the bp.
                            */
2666 2669                  if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2667 2670                          ASSERT(!IO_IS_ALLOCATING(zio));
2668 2671                          checksum = ZIO_CHECKSUM_GANG_HEADER;
2669 2672                  } else {
2670 2673                          checksum = BP_GET_CHECKSUM(bp);
2671 2674                  }
2672 2675          }
2673 2676  
2674 2677          zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2675 2678  
2676 2679          return (ZIO_PIPELINE_CONTINUE);
2677 2680  }
2678 2681  
2679 2682  static int
2680 2683  zio_checksum_verify(zio_t *zio)
2681 2684  {
                   /*
                    * Pipeline stage that verifies the checksum of data just read.
                    * A mismatch is stored in io_error and, for non-speculative I/O,
                    * starts a checksum ereport.  Always returns
                    * ZIO_PIPELINE_CONTINUE.
                    */
2682 2685          zio_bad_cksum_t info;
2683 2686          blkptr_t *bp = zio->io_bp;
2684 2687          int error;
2685 2688  
2686 2689          ASSERT(zio->io_vd != NULL);
2687 2690  
2688 2691          if (bp == NULL) {
2689 2692                  /*
2690 2693                   * This is zio_read_phys().
2691 2694                   * We're either verifying a label checksum, or nothing at all.
2692 2695                   */
2693 2696                  if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2694 2697                          return (ZIO_PIPELINE_CONTINUE);
2695 2698  
2696 2699                  ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2697 2700          }
2698 2701  
2699 2702          if ((error = zio_checksum_error(zio, &info)) != 0) {
2700 2703                  zio->io_error = error;
                           /* Speculative I/Os record the error but post no ereport. */
2701 2704                  if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2702 2705                          zfs_ereport_start_checksum(zio->io_spa,
2703 2706                              zio->io_vd, zio, zio->io_offset,
2704 2707                              zio->io_size, NULL, &info);
2705 2708                  }
2706 2709          }
2707 2710  
2708 2711          return (ZIO_PIPELINE_CONTINUE);
2709 2712  }
2710 2713  
2711 2714  /*
2712 2715   * Called by RAID-Z to ensure we don't compute the checksum twice.
2713 2716   */
2714 2717  void
2715 2718  zio_checksum_verified(zio_t *zio)
2716 2719  {
                   /* Remove the CHECKSUM_VERIFY stage from this zio's pipeline mask. */
2717 2720          zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2718 2721  }
2719 2722  
2720 2723  /*
2721 2724   * ==========================================================================
2722 2725   * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2723 2726   * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2724 2727   * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2725 2728   * indicate errors that are specific to one I/O, and most likely permanent.
2726 2729   * Any other error is presumed to be worse because we weren't expecting it.
2727 2730   * ==========================================================================
2728 2731   */
2729 2732  int
2730 2733  zio_worst_error(int e1, int e2)
2731 2734  {
                   /*
                    * Returns whichever of e1/e2 ranks worse.  An error's rank is its
                    * index in zio_error_rank[]; an error not in the table runs off
                    * the end of the loop and so gets the highest rank (the table
                    * length).  On a tie, e2 is returned.
                    */
2732 2735          static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2733 2736          int r1, r2;
2734 2737  
2735 2738          for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2736 2739                  if (e1 == zio_error_rank[r1])
2737 2740                          break;
2738 2741  
2739 2742          for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2740 2743                  if (e2 == zio_error_rank[r2])
2741 2744                          break;
2742 2745  
2743 2746          return (r1 > r2 ? e1 : e2);
2744 2747  }
2745 2748  
2746 2749  /*
2747 2750   * ==========================================================================
2748 2751   * I/O completion
2749 2752   * ==========================================================================
2750 2753   */
2751 2754  static int
2752 2755  zio_ready(zio_t *zio)
2753 2756  {
                   /*
                    * Pipeline stage that marks the zio "ready": waits for gang and
                    * DDT children to become ready, runs the io_ready callback if
                    * one is set, snapshots the bp into io_bp_copy, and notifies the
                    * parents that were on the parent list at that moment.
                    */
2754 2757          blkptr_t *bp = zio->io_bp;
2755 2758          zio_t *pio, *pio_next;
2756 2759  
2757 2760          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2758 2761              zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2759 2762                  return (ZIO_PIPELINE_STOP);
2760 2763  
2761 2764          if (zio->io_ready) {
2762 2765                  ASSERT(IO_IS_ALLOCATING(zio));
2763 2766                  ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2764 2767                      (zio->io_flags & ZIO_FLAG_NOPWRITE));
2765 2768                  ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2766 2769  
2767 2770                  zio->io_ready(zio);
2768 2771          }
2769 2772  
                   /* Snapshot the bp unless io_bp already points at io_bp_copy. */
2770 2773          if (bp != NULL && bp != &zio->io_bp_copy)
2771 2774                  zio->io_bp_copy = *bp;
2772 2775  
2773 2776          if (zio->io_error)
2774 2777                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2775 2778  
                   /*
                    * Set the ready state and fetch the first parent under io_lock,
                    * so both happen atomically with respect to other io_lock
                    * holders.
                    */
2776 2779          mutex_enter(&zio->io_lock);
2777 2780          zio->io_state[ZIO_WAIT_READY] = 1;
2778 2781          pio = zio_walk_parents(zio);
2779 2782          mutex_exit(&zio->io_lock);
2780 2783  
2781 2784          /*
2782 2785           * As we notify zio's parents, new parents could be added.
2783 2786           * New parents go to the head of zio's io_parent_list, however,
2784 2787           * so we will (correctly) not notify them.  The remainder of zio's
2785 2788           * io_parent_list, from 'pio_next' onward, cannot change because
2786 2789           * all parents must wait for us to be done before they can be done.
2787 2790           */
2788 2791          for (; pio != NULL; pio = pio_next) {
2789 2792                  pio_next = zio_walk_parents(zio);
2790 2793                  zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2791 2794          }
2792 2795  
                   /*
                    * NODATA handling: a gang bp drops the NODATA flag; otherwise the
                    * vdev I/O stages are removed from the pipeline.
                    * NOTE(review): bp is NULL-checked earlier in this function but
                    * is dereferenced here unconditionally; presumably NODATA zios
                    * always carry a bp -- confirm.
                    */
2793 2796          if (zio->io_flags & ZIO_FLAG_NODATA) {
2794 2797                  if (BP_IS_GANG(bp)) {
2795 2798                          zio->io_flags &= ~ZIO_FLAG_NODATA;
2796 2799                  } else {
2797 2800                          ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2798 2801                          zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2799 2802                  }
2800 2803          }
2801 2804  
2802 2805          if (zio_injection_enabled &&
2803 2806              zio->io_spa->spa_syncing_txg == zio->io_txg)
2804 2807                  zio_handle_ignored_writes(zio);
2805 2808  
2806 2809          return (ZIO_PIPELINE_CONTINUE);
2807 2810  }
2808 2811  
/*
 * Pipeline stage that completes, retries, or suspends this zio.
 *
 * In order, this function:
 *   - returns early if any zio_wait_for_children() call for ZIO_WAIT_DONE
 *     returns nonzero;
 *   - sanity-checks the block pointer and inherits vdev/gang/ddt child
 *     errors;
 *   - finishes pending checksum reports, pops transforms, and updates
 *     vdev stats;
 *   - posts ereports and chooses io_reexecute bits for a failed logical zio;
 *   - inherits logical child errors and releases gang state;
 *   - either arranges reexecution or suspension, or else runs the io_done
 *     callback, notifies and unlinks every parent, and then wakes the
 *     waiter or destroys the zio.
 *
 * Every return path returns ZIO_PIPELINE_STOP.
 */
2809 2812  static int
2810 2813  zio_done(zio_t *zio)
2811 2814  {
2812 2815          spa_t *spa = zio->io_spa;
2813 2816          zio_t *lio = zio->io_logical;
2814 2817          blkptr_t *bp = zio->io_bp;
2815 2818          vdev_t *vd = zio->io_vd;
2816 2819          uint64_t psize = zio->io_size;
2817 2820          zio_t *pio, *pio_next;
2818 2821  
2819 2822          /*
2820 2823           * If our children haven't all completed,
2821 2824           * wait for them and then repeat this pipeline stage.
2822 2825           */
2823 2826          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2824 2827              zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2825 2828              zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
2826 2829              zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2827 2830                  return (ZIO_PIPELINE_STOP);
2828 2831  
                   /* Past this point every child counter is expected to be zero. */
2829 2832          for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2830 2833                  for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2831 2834                          ASSERT(zio->io_children[c][w] == 0);
2832 2835  
                   /*
                    * Consistency checks on the block pointer.  All are ASSERTs
                    * except the nop-write comparison against io_bp_orig, which
                    * is a VERIFY.
                    */
2833 2836          if (bp != NULL) {
2834 2837                  ASSERT(bp->blk_pad[0] == 0);
2835 2838                  ASSERT(bp->blk_pad[1] == 0);
2836 2839                  ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2837 2840                      (bp == zio_unique_parent(zio)->io_bp));
2838 2841                  if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2839 2842                      zio->io_bp_override == NULL &&
2840 2843                      !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2841 2844                          ASSERT(!BP_SHOULD_BYTESWAP(bp));
2842 2845                          ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
2843 2846                          ASSERT(BP_COUNT_GANG(bp) == 0 ||
2844 2847                              (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2845 2848                  }
2846 2849                  if (zio->io_flags & ZIO_FLAG_NOPWRITE)
2847 2850                          VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
2848 2851          }
2849 2852  
2850 2853          /*
2851 2854           * If there were child vdev/gang/ddt errors, they apply to us now.
2852 2855           */
2853 2856          zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
2854 2857          zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
2855 2858          zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
2856 2859  
2857 2860          /*
2858 2861           * If the I/O on the transformed data was successful, generate any
2859 2862           * checksum reports now while we still have the transformed data.
2860 2863           */
2861 2864          if (zio->io_error == 0) {
2862 2865                  while (zio->io_cksum_report != NULL) {
2863 2866                          zio_cksum_report_t *zcr = zio->io_cksum_report;
2864 2867                          uint64_t align = zcr->zcr_align;
2865 2868                          uint64_t asize = P2ROUNDUP(psize, align);
2866 2869                          char *abuf = zio->io_data;
2867 2870  
                                   /*
                                    * When psize is not already a multiple of the
                                    * report's alignment, hand zcr_finish() a
                                    * zero-padded copy instead of io_data itself.
                                    */
2868 2871                          if (asize != psize) {
2869 2872                                  abuf = zio_buf_alloc(asize);
2870 2873                                  bcopy(zio->io_data, abuf, psize);
2871 2874                                  bzero(abuf + psize, asize - psize);
2872 2875                          }
2873 2876  
                                   /* Unlink the report from the list before finishing it. */
2874 2877                          zio->io_cksum_report = zcr->zcr_next;
2875 2878                          zcr->zcr_next = NULL;
2876 2879                          zcr->zcr_finish(zcr, abuf);
2877 2880                          zfs_ereport_free_checksum(zcr);
2878 2881  
2879 2882                          if (asize != psize)
2880 2883                                  zio_buf_free(abuf, asize);
2881 2884                  }
2882 2885          }
2883 2886  
2884 2887          zio_pop_transforms(zio);        /* note: may set zio->io_error */
2885 2888  
2886 2889          vdev_stat_update(zio, psize);
2887 2890  
2888 2891          if (zio->io_error) {
2889 2892                  /*
2890 2893                   * If this I/O is attached to a particular vdev,
2891 2894                   * generate an error message describing the I/O failure
2892 2895                   * at the block level.  We ignore these errors if the
2893 2896                   * device is currently unavailable.
2894 2897                   */
2895 2898                  if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
2896 2899                          zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
2897 2900  
2898 2901                  if ((zio->io_error == EIO || !(zio->io_flags &
2899 2902                      (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
2900 2903                      zio == lio) {
2901 2904                          /*
2902 2905                           * For logical I/O requests, tell the SPA to log the
2903 2906                           * error and generate a logical data ereport.
2904 2907                           */
2905 2908                          spa_log_error(spa, zio);
2906 2909                          zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
2907 2910                              0, 0);
2908 2911                  }
2909 2912          }
2910 2913  
2911 2914          if (zio->io_error && zio == lio) {
2912 2915                  /*
2913 2916                   * Determine whether zio should be reexecuted.  This will
2914 2917                   * propagate all the way to the root via zio_notify_parent().
2915 2918                   */
2916 2919                  ASSERT(vd == NULL && bp != NULL);
2917 2920                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2918 2921  
                           /*
                            * An allocating zio that is not allowed to fail is
                            * reexecuted immediately, except on ENOSPC, where it
                            * is suspended instead.
                            */
2919 2922                  if (IO_IS_ALLOCATING(zio) &&
2920 2923                      !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
2921 2924                          if (zio->io_error != ENOSPC)
2922 2925                                  zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2923 2926                          else
2924 2927                                  zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2925 2928                  }
2926 2929  
                           /*
                            * A read or free that failed with ENXIO is suspended,
                            * provided it is not flagged ZIO_FLAG_SCAN_THREAD, the
                            * spa load state is SPA_LOAD_NONE, and the failure mode
                            * is not ZIO_FAILURE_MODE_CONTINUE.
                            */
2927 2930                  if ((zio->io_type == ZIO_TYPE_READ ||
2928 2931                      zio->io_type == ZIO_TYPE_FREE) &&
2929 2932                      !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
2930 2933                      zio->io_error == ENXIO &&
2931 2934                      spa_load_state(spa) == SPA_LOAD_NONE &&
2932 2935                      spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2933 2936                          zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2934 2937  
                           /*
                            * Fallback: a zio that may not fail and has no
                            * reexecute bit set yet is suspended.
                            */
2935 2938                  if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2936 2939                          zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2937 2940  
2938 2941                  /*
2939 2942                   * Here is a possibly good place to attempt to do
2940 2943                   * either combinatorial reconstruction or error correction
2941 2944                   * based on checksums.  It also might be a good place
2942 2945                   * to send out preliminary ereports before we suspend
2943 2946                   * processing.
2944 2947                   */
2945 2948          }
2946 2949  
2947 2950          /*
2948 2951           * If there were logical child errors, they apply to us now.
2949 2952           * We defer this until now to avoid conflating logical child
2950 2953           * errors with errors that happened to the zio itself when
2951 2954           * updating vdev stats and reporting FMA events above.
2952 2955           */
2953 2956          zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
2954 2957  
                   /*
                    * An allocating gang leader that failed or will be reexecuted
                    * unallocates its DVAs, unless it is a rewrite or a nop-write.
                    */
2955 2958          if ((zio->io_error || zio->io_reexecute) &&
2956 2959              IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
2957 2960              !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
2958 2961                  zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2959 2962  
2960 2963          zio_gang_tree_free(&zio->io_gang_tree);
2961 2964  
2962 2965          /*
2963 2966           * Godfather I/Os should never suspend.
2964 2967           */
2965 2968          if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
2966 2969              (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
2967 2970                  zio->io_reexecute = 0;
2968 2971  
2969 2972          if (zio->io_reexecute) {
2970 2973                  /*
2971 2974                   * This is a logical I/O that wants to reexecute.
2972 2975                   *
2973 2976                   * Reexecute is top-down.  When an i/o fails, if it's not
2974 2977                   * the root, it simply notifies its parent and sticks around.
2975 2978                   * The parent, seeing that it still has children in zio_done(),
2976 2979                   * does the same.  This percolates all the way up to the root.
2977 2980                   * The root i/o will reexecute or suspend the entire tree.
2978 2981                   *
2979 2982                   * This approach ensures that zio_reexecute() honors
2980 2983                   * all the original i/o dependency relationships, e.g.
2981 2984                   * parents not executing until children are ready.
2982 2985                   */
2983 2986                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2984 2987  
2985 2988                  zio->io_gang_leader = NULL;
2986 2989  
2987 2990                  mutex_enter(&zio->io_lock);
2988 2991                  zio->io_state[ZIO_WAIT_DONE] = 1;
2989 2992                  mutex_exit(&zio->io_lock);
2990 2993  
2991 2994                  /*
2992 2995                   * "The Godfather" I/O monitors its children but is
2993 2996                   * not a true parent to them. It will track them through
2994 2997                   * the pipeline but severs its ties whenever they get into
2995 2998                   * trouble (e.g. suspended). This allows "The Godfather"
2996 2999                   * I/O to return status without blocking.
2997 3000                   */
2998 3001                  for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
                                   /*
                                    * io_walk_link is read before the walk advances.
                                    * NOTE(review): presumably it is the link for
                                    * 'pio' at this point -- confirm against
                                    * zio_walk_parents().
                                    */
2999 3002                          zio_link_t *zl = zio->io_walk_link;
3000 3003                          pio_next = zio_walk_parents(zio);
3001 3004  
3002 3005                          if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3003 3006                              (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3004 3007                                  zio_remove_child(pio, zio, zl);
3005 3008                                  zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3006 3009                          }
3007 3010                  }
3008 3011  
3009 3012                  if ((pio = zio_unique_parent(zio)) != NULL) {
3010 3013                          /*
3011 3014                           * We're not a root i/o, so there's nothing to do
3012 3015                           * but notify our parent.  Don't propagate errors
3013 3016                           * upward since we haven't permanently failed yet.
3014 3017                           */
3015 3018                          ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3016 3019                          zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3017 3020                          zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3018 3021                  } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3019 3022                          /*
3020 3023                           * We'd fail again if we reexecuted now, so suspend
3021 3024                           * until conditions improve (e.g. device comes online).
3022 3025                           */
3023 3026                          zio_suspend(spa, zio);
3024 3027                  } else {
3025 3028                          /*
3026 3029                           * Reexecution is potentially a huge amount of work.
3027 3030                           * Hand it off to the otherwise-unused claim taskq.
3028 3031                           */
3029 3032                          ASSERT(zio->io_tqent.tqent_next == NULL);
3030 3033                          spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
3031 3034                              ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
3032 3035                              0, &zio->io_tqent);
3033 3036                  }
3034 3037                  return (ZIO_PIPELINE_STOP);
3035 3038          }
3036 3039  
                   /*
                    * Normal completion path: no children remain, nothing is to be
                    * reexecuted, and any remaining error is only legal for a zio
                    * that is allowed to fail.
                    */
3037 3040          ASSERT(zio->io_child_count == 0);
3038 3041          ASSERT(zio->io_reexecute == 0);
3039 3042          ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3040 3043  
3041 3044          /*
3042 3045           * Report any checksum errors, since the I/O is complete.
3043 3046           */
3044 3047          while (zio->io_cksum_report != NULL) {
3045 3048                  zio_cksum_report_t *zcr = zio->io_cksum_report;
3046 3049                  zio->io_cksum_report = zcr->zcr_next;
3047 3050                  zcr->zcr_next = NULL;
3048 3051                  zcr->zcr_finish(zcr, NULL);
3049 3052                  zfs_ereport_free_checksum(zcr);
3050 3053          }
3051 3054  
3052 3055          /*
3053 3056           * It is the responsibility of the done callback to ensure that this
3054 3057           * particular zio is no longer discoverable for adoption, and as
3055 3058           * such, cannot acquire any new parents.
3056 3059           */
3057 3060          if (zio->io_done)
3058 3061                  zio->io_done(zio);
3059 3062  
3060 3063          mutex_enter(&zio->io_lock);
3061 3064          zio->io_state[ZIO_WAIT_DONE] = 1;
3062 3065          mutex_exit(&zio->io_lock);
3063 3066  
                   /* Unlink from, and notify, every parent. */
3064 3067          for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3065 3068                  zio_link_t *zl = zio->io_walk_link;
3066 3069                  pio_next = zio_walk_parents(zio);
3067 3070                  zio_remove_child(pio, zio, zl);
3068 3071                  zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3069 3072          }
3070 3073  
                   /*
                    * If io_waiter is set, clear io_executor and broadcast on io_cv
                    * under io_lock; the zio is not destroyed here in that case.
                    * Otherwise destroy it now.
                    */
3071 3074          if (zio->io_waiter != NULL) {
3072 3075                  mutex_enter(&zio->io_lock);
3073 3076                  zio->io_executor = NULL;
3074 3077                  cv_broadcast(&zio->io_cv);
3075 3078                  mutex_exit(&zio->io_lock);
3076 3079          } else {
3077 3080                  zio_destroy(zio);
3078 3081          }
3079 3082  
3080 3083          return (ZIO_PIPELINE_STOP);
3081 3084  }
3082 3085  
3083 3086  /*
3084 3087   * ==========================================================================
3085 3088   * I/O pipeline definition
3086 3089   * ==========================================================================
3087 3090   */
/*
 * Table of pipeline stage functions.  The first slot is NULL and the last
 * entry is zio_done().  NOTE(review): the order presumably has to match the
 * stage numbering used to index this table elsewhere -- confirm before
 * reordering or inserting entries.
 */
3088 3091  static zio_pipe_stage_t *zio_pipeline[] = {
3089 3092          NULL,
3090 3093          zio_read_bp_init,
3091 3094          zio_free_bp_init,
3092 3095          zio_issue_async,
3093 3096          zio_write_bp_init,
3094 3097          zio_checksum_generate,
3095 3098          zio_nop_write,
3096 3099          zio_ddt_read_start,
3097 3100          zio_ddt_read_done,
3098 3101          zio_ddt_write,
3099 3102          zio_ddt_free,
3100 3103          zio_gang_assemble,
3101 3104          zio_gang_issue,
3102 3105          zio_dva_allocate,
3103 3106          zio_dva_free,
3104 3107          zio_dva_claim,
3105 3108          zio_ready,
3106 3109          zio_vdev_io_start,
3107 3110          zio_vdev_io_done,
3108 3111          zio_vdev_io_assess,
3109 3112          zio_checksum_verify,
3110 3113          zio_done
3111 3114  };
3112 3115  
3113 3116  /*
            * Decide whether bookmark zb1 comes before bookmark zb2.
            *
            * dnp is the dnode for zb1->zb_object; it may be NULL, in which case
            * the answer is always B_FALSE (unless zb2 is in the deadlist object,
            * which is checked first and yields B_TRUE).
            *
            * Both bookmarks must belong to the same objset and zb2 must be a
            * level-0 bookmark; both conditions are only ASSERTed.
            *
            * Returns B_TRUE if zb1 is before zb2, B_FALSE otherwise.
            */
3114 3117  boolean_t
3115 3118  zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
3116 3119      const zbookmark_t *zb2)
3117 3120  {
3118 3121          uint64_t zb1nextL0, zb2thisobj;
3119 3122  
3120 3123          ASSERT(zb1->zb_objset == zb2->zb_objset);
3121 3124          ASSERT(zb2->zb_level == 0);
3122 3125  
3123 3126          /*
3124 3127           * A bookmark in the deadlist is considered to be after
3125 3128           * everything else.
3126 3129           */
3127 3130          if (zb2->zb_object == DMU_DEADLIST_OBJECT)
3128 3131                  return (B_TRUE);
3129 3132  
3130 3133          /* The objset_phys_t isn't before anything. */
3131 3134          if (dnp == NULL)
3132 3135                  return (B_FALSE);
3133 3136  
                   /*
                    * Scale (zb1's blkid + 1) by one indirection factor per level of
                    * zb1.  NOTE(review): presumably this is the first level-0 block
                    * id past everything zb1 covers -- confirm the meaning of
                    * dn_indblkshift - SPA_BLKPTRSHIFT.
                    */
3134 3137          zb1nextL0 = (zb1->zb_blkid + 1) <<
3135 3138              ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3136 3139  
                   /*
                    * Use zb2's object number if it is nonzero; otherwise derive one
                    * from zb2's blkid.  NOTE(review): the shift presumably converts a
                    * block id into the first dnode number in that block -- confirm.
                    */
3137 3140          zb2thisobj = zb2->zb_object ? zb2->zb_object :
3138 3141              zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3139 3142  
                   /*
                    * If zb1 is in the meta-dnode object, convert zb1nextL0 from
                    * blocks to an object number using dnp's data block size and
                    * compare object numbers.
                    */
3140 3143          if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3141 3144                  uint64_t nextobj = zb1nextL0 *
3142 3145                      (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3143 3146                  return (nextobj <= zb2thisobj);
3144 3147          }
3145 3148  
                   /*
                    * Otherwise order by object number first; within the same object,
                    * compare block ranges.
                    */
3146 3149          if (zb1->zb_object < zb2thisobj)
3147 3150                  return (B_TRUE);
3148 3151          if (zb1->zb_object > zb2thisobj)
3149 3152                  return (B_FALSE);
3150 3153          if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3151 3154                  return (B_FALSE);
3152 3155          return (zb1nextL0 <= zb2->zb_blkid);
3153 3156  }

↓ open down ↓

1950 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX