illumos-gate Wdiff usr/src/uts/common/fs/zfs/zio.c

Print this page

arc_get_data_buf should be more aggressive in eviction when memory is unavailable

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/zio.c
          +++ new/usr/src/uts/common/fs/zfs/zio.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/fm/fs/zfs.h>
  29   29  #include <sys/spa.h>
  30   30  #include <sys/txg.h>
  31   31  #include <sys/spa_impl.h>
  32   32  #include <sys/vdev_impl.h>
  33   33  #include <sys/zio_impl.h>
  34   34  #include <sys/zio_compress.h>
  35   35  #include <sys/zio_checksum.h>
  36   36  #include <sys/dmu_objset.h>
  37   37  #include <sys/arc.h>
  38   38  #include <sys/ddt.h>
  39   39  #include <sys/blkptr.h>
  40   40  #include <sys/zfeature.h>
  41   41  
  42   42  /*
  43   43   * ==========================================================================
  44   44   * I/O type descriptions
  45   45   * ==========================================================================
  46   46   */
  47   47  const char *zio_type_name[ZIO_TYPES] = {
  48   48          "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
  49   49          "zio_ioctl"
  50   50  };
  51   51  
  52   52  /*
  53   53   * ==========================================================================
  54   54   * I/O kmem caches
  55   55   * ==========================================================================
  56   56   */
  57   57  kmem_cache_t *zio_cache;
  58   58  kmem_cache_t *zio_link_cache;
  59   59  kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  60   60  kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  61   61  
  62   62  #ifdef _KERNEL
  63   63  extern vmem_t *zio_alloc_arena;
  64   64  #endif
  65   65  
  66   66  #define ZIO_PIPELINE_CONTINUE           0x100
  67   67  #define ZIO_PIPELINE_STOP               0x101
  68   68  
  69   69  /*
  70   70   * The following actions directly affect the spa's sync-to-convergence logic.
  71   71   * The values below define the sync pass when we start performing the action.
  72   72   * Care should be taken when changing these values as they directly impact
  73   73   * spa_sync() performance. Tuning these values may introduce subtle performance
  74   74   * pathologies and should only be done in the context of performance analysis.
  75   75   * These tunables will eventually be removed and replaced with #defines once
  76   76   * enough analysis has been done to determine optimal values.
  77   77   *
  78   78   * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  79   79   * regular blocks are not deferred.
  80   80   */
  /* Sync pass in which frees start being deferred. */
  int zfs_sync_pass_deferred_free = 2;
  /* Sync pass in which compression stops being applied. */
  int zfs_sync_pass_dont_compress = 5;
  /* Sync pass in which new bps start being rewritten. */
  int zfs_sync_pass_rewrite = 2;
  84   84  
  85   85  /*
  86   86   * An allocating zio is one that either currently has the DVA allocate
  87   87   * stage set or will have it later in its lifetime.
  88   88   */
  89   89  #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
  90   90  
  91   91  boolean_t       zio_requeue_io_start_cut_in_line = B_TRUE;
  92   92  
  93   93  #ifdef ZFS_DEBUG
  94   94  int zio_buf_debug_limit = 16384;
  95   95  #else
  96   96  int zio_buf_debug_limit = 0;
  97   97  #endif
  98   98  
  99   99  void
 100  100  zio_init(void)
 101  101  {
 102  102          size_t c;
 103  103          vmem_t *data_alloc_arena = NULL;
 104  104  
 105  105  #ifdef _KERNEL
 106  106          data_alloc_arena = zio_alloc_arena;
 107  107  #endif
 108  108          zio_cache = kmem_cache_create("zio_cache",
 109  109              sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 110  110          zio_link_cache = kmem_cache_create("zio_link_cache",
 111  111              sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 112  112  
 113  113          /*
 114  114           * For small buffers, we want a cache for each multiple of
 115  115           * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
 116  116           * for each quarter-power of 2.
 117  117           */
 118  118          for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 119  119                  size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 120  120                  size_t p2 = size;
 121  121                  size_t align = 0;
 122  122                  size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 123  123  
 124  124                  while (p2 & (p2 - 1))
 125  125                          p2 &= p2 - 1;
 126  126  
 127  127  #ifndef _KERNEL
 128  128                  /*
 129  129                   * If we are using watchpoints, put each buffer on its own page,
 130  130                   * to eliminate the performance overhead of trapping to the
 131  131                   * kernel when modifying a non-watched buffer that shares the
 132  132                   * page with a watched buffer.
 133  133                   */
 134  134                  if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 135  135                          continue;
 136  136  #endif
 137  137                  if (size <= 4 * SPA_MINBLOCKSIZE) {
 138  138                          align = SPA_MINBLOCKSIZE;
 139  139                  } else if (IS_P2ALIGNED(size, p2 >> 2)) {
 140  140                          align = MIN(p2 >> 2, PAGESIZE);
 141  141                  }
 142  142  
 143  143                  if (align != 0) {
 144  144                          char name[36];
 145  145                          (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 146  146                          zio_buf_cache[c] = kmem_cache_create(name, size,
 147  147                              align, NULL, NULL, NULL, NULL, NULL, cflags);
 148  148  
 149  149                          /*
 150  150                           * Since zio_data bufs do not appear in crash dumps, we
 151  151                           * pass KMC_NOTOUCH so that no allocator metadata is
 152  152                           * stored with the buffers.
 153  153                           */
 154  154                          (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 155  155                          zio_data_buf_cache[c] = kmem_cache_create(name, size,
 156  156                              align, NULL, NULL, NULL, NULL, data_alloc_arena,
 157  157                              cflags | KMC_NOTOUCH);
 158  158                  }
 159  159          }
 160  160  
 161  161          while (--c != 0) {
 162  162                  ASSERT(zio_buf_cache[c] != NULL);
 163  163                  if (zio_buf_cache[c - 1] == NULL)
 164  164                          zio_buf_cache[c - 1] = zio_buf_cache[c];
 165  165  
 166  166                  ASSERT(zio_data_buf_cache[c] != NULL);
 167  167                  if (zio_data_buf_cache[c - 1] == NULL)
 168  168                          zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 169  169          }
 170  170  
 171  171          zio_inject_init();
 172  172  }
 173  173  
 174  174  void
 175  175  zio_fini(void)
 176  176  {
 177  177          size_t c;
 178  178          kmem_cache_t *last_cache = NULL;
 179  179          kmem_cache_t *last_data_cache = NULL;
 180  180  
 181  181          for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 182  182                  if (zio_buf_cache[c] != last_cache) {
 183  183                          last_cache = zio_buf_cache[c];
 184  184                          kmem_cache_destroy(zio_buf_cache[c]);
 185  185                  }
 186  186                  zio_buf_cache[c] = NULL;
 187  187  
 188  188                  if (zio_data_buf_cache[c] != last_data_cache) {
 189  189                          last_data_cache = zio_data_buf_cache[c];
 190  190                          kmem_cache_destroy(zio_data_buf_cache[c]);
 191  191                  }
 192  192                  zio_data_buf_cache[c] = NULL;
 193  193          }
 194  194  
 195  195          kmem_cache_destroy(zio_link_cache);
 196  196          kmem_cache_destroy(zio_cache);
 197  197  
 198  198          zio_inject_fini();
 199  199  }
 200  200  
 201  201  /*
 202  202   * ==========================================================================
 203  203   * Allocate and free I/O buffers
 204  204   * ==========================================================================
 205  205   */
 206  206  
 207  207  /*
 208  208   * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 209  209   * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 210  210   * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 211  211   * excess / transient data in-core during a crashdump.
 212  212   */
 213  213  void *

↓ open down ↓

213 lines elided

↑ open up ↑

 214  214  zio_buf_alloc(size_t size)
 215  215  {
 216  216          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 217  217  
 218  218          ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 219  219  
 220  220          return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 221  221  }
 222  222  
 223  223  /*
      224 + * Same as zio_buf_alloc, but won't sleep in case memory cannot be allocated
      225 + * and will instead return immediately with a failure.
      226 + */
      227 +void *
      228 +zio_buf_alloc_canfail(size_t size)
      229 +{
      230 +        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
      231 +
      232 +        ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
      233 +
      234 +        return (kmem_cache_alloc(zio_buf_cache[c], KM_NOSLEEP | KM_NORMALPRI));
      235 +}
      236 +
      237 +/*
 224  238   * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 225  239   * crashdump if the kernel panics.  This exists so that we will limit the amount
 226  240   * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 227  241   * of kernel heap dumped to disk when the kernel panics)
 228  242   */
 229  243  void *
 230  244  zio_data_buf_alloc(size_t size)
 231  245  {
 232  246          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 233  247  
 234  248          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 235  249  
 236  250          return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 237  251  }
 238  252  
      253 +/*
      254 + * Same as zio_data_buf_alloc, but won't sleep in case memory cannot be
      255 + * allocated and will instead return immediately with a failure.
      256 + */
      257 +void *
      258 +zio_data_buf_alloc_canfail(size_t size)
      259 +{
      260 +        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
      261 +
      262 +        ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
      263 +
      264 +        return (kmem_cache_alloc(zio_data_buf_cache[c],
      265 +            KM_NOSLEEP | KM_NORMALPRI));
      266 +}
      267 +
 239  268  void
 240  269  zio_buf_free(void *buf, size_t size)
 241  270  {
 242  271          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 243  272  
 244  273          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 245  274  
 246  275          kmem_cache_free(zio_buf_cache[c], buf);
 247  276  }
 248  277

 249  278  void
 250  279  zio_data_buf_free(void *buf, size_t size)
 251  280  {
 252  281          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 253  282  
 254  283          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 255  284  
 256  285          kmem_cache_free(zio_data_buf_cache[c], buf);
 257  286  }
 258  287  
 259  288  /*
 260  289   * ==========================================================================
 261  290   * Push and pop I/O transform buffers
 262  291   * ==========================================================================
 263  292   */
 264  293  static void
 265  294  zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 266  295          zio_transform_func_t *transform)
 267  296  {
 268  297          zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 269  298  
 270  299          zt->zt_orig_data = zio->io_data;
 271  300          zt->zt_orig_size = zio->io_size;
 272  301          zt->zt_bufsize = bufsize;
 273  302          zt->zt_transform = transform;
 274  303  
 275  304          zt->zt_next = zio->io_transform_stack;
 276  305          zio->io_transform_stack = zt;
 277  306  
 278  307          zio->io_data = data;
 279  308          zio->io_size = size;
 280  309  }
 281  310  
 282  311  static void
 283  312  zio_pop_transforms(zio_t *zio)
 284  313  {
 285  314          zio_transform_t *zt;
 286  315  
 287  316          while ((zt = zio->io_transform_stack) != NULL) {
 288  317                  if (zt->zt_transform != NULL)
 289  318                          zt->zt_transform(zio,
 290  319                              zt->zt_orig_data, zt->zt_orig_size);
 291  320  
 292  321                  if (zt->zt_bufsize != 0)
 293  322                          zio_buf_free(zio->io_data, zt->zt_bufsize);
 294  323  
 295  324                  zio->io_data = zt->zt_orig_data;
 296  325                  zio->io_size = zt->zt_orig_size;
 297  326                  zio->io_transform_stack = zt->zt_next;
 298  327  
 299  328                  kmem_free(zt, sizeof (zio_transform_t));
 300  329          }
 301  330  }
 302  331  
 303  332  /*
 304  333   * ==========================================================================
 305  334   * I/O transform callbacks for subblocks and decompression
 306  335   * ==========================================================================
 307  336   */
 308  337  static void
 309  338  zio_subblock(zio_t *zio, void *data, uint64_t size)
 310  339  {
 311  340          ASSERT(zio->io_size > size);
 312  341  
 313  342          if (zio->io_type == ZIO_TYPE_READ)
 314  343                  bcopy(zio->io_data, data, size);
 315  344  }
 316  345  
 317  346  static void
 318  347  zio_decompress(zio_t *zio, void *data, uint64_t size)
 319  348  {
 320  349          if (zio->io_error == 0 &&
 321  350              zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 322  351              zio->io_data, data, zio->io_size, size) != 0)
 323  352                  zio->io_error = SET_ERROR(EIO);
 324  353  }
 325  354  
 326  355  /*
 327  356   * ==========================================================================
 328  357   * I/O parent/child relationships and pipeline interlocks
 329  358   * ==========================================================================
 330  359   */
 331  360  /*
 332  361   * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 333  362   *        continue calling these functions until they return NULL.
 334  363   *        Otherwise, the next caller will pick up the list walk in
 335  364   *        some indeterminate state.  (Otherwise every caller would
 336  365   *        have to pass in a cookie to keep the state represented by
 337  366   *        io_walk_link, which gets annoying.)
 338  367   */
 339  368  zio_t *
 340  369  zio_walk_parents(zio_t *cio)
 341  370  {
 342  371          zio_link_t *zl = cio->io_walk_link;
 343  372          list_t *pl = &cio->io_parent_list;
 344  373  
 345  374          zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 346  375          cio->io_walk_link = zl;
 347  376  
 348  377          if (zl == NULL)
 349  378                  return (NULL);
 350  379  
 351  380          ASSERT(zl->zl_child == cio);
 352  381          return (zl->zl_parent);
 353  382  }
 354  383  
 355  384  zio_t *
 356  385  zio_walk_children(zio_t *pio)
 357  386  {
 358  387          zio_link_t *zl = pio->io_walk_link;
 359  388          list_t *cl = &pio->io_child_list;
 360  389  
 361  390          zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 362  391          pio->io_walk_link = zl;
 363  392  
 364  393          if (zl == NULL)
 365  394                  return (NULL);
 366  395  
 367  396          ASSERT(zl->zl_parent == pio);
 368  397          return (zl->zl_child);
 369  398  }
 370  399  
 371  400  zio_t *
 372  401  zio_unique_parent(zio_t *cio)
 373  402  {
 374  403          zio_t *pio = zio_walk_parents(cio);
 375  404  
 376  405          VERIFY(zio_walk_parents(cio) == NULL);
 377  406          return (pio);
 378  407  }
 379  408  
 380  409  void
 381  410  zio_add_child(zio_t *pio, zio_t *cio)
 382  411  {
 383  412          zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 384  413  
 385  414          /*
 386  415           * Logical I/Os can have logical, gang, or vdev children.
 387  416           * Gang I/Os can have gang or vdev children.
 388  417           * Vdev I/Os can only have vdev children.
 389  418           * The following ASSERT captures all of these constraints.
 390  419           */
 391  420          ASSERT(cio->io_child_type <= pio->io_child_type);
 392  421  
 393  422          zl->zl_parent = pio;
 394  423          zl->zl_child = cio;
 395  424  
 396  425          mutex_enter(&cio->io_lock);
 397  426          mutex_enter(&pio->io_lock);
 398  427  
 399  428          ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 400  429  
 401  430          for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 402  431                  pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 403  432  
 404  433          list_insert_head(&pio->io_child_list, zl);
 405  434          list_insert_head(&cio->io_parent_list, zl);
 406  435  
 407  436          pio->io_child_count++;
 408  437          cio->io_parent_count++;
 409  438  
 410  439          mutex_exit(&pio->io_lock);
 411  440          mutex_exit(&cio->io_lock);
 412  441  }
 413  442  
 414  443  static void
 415  444  zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 416  445  {
 417  446          ASSERT(zl->zl_parent == pio);
 418  447          ASSERT(zl->zl_child == cio);
 419  448  
 420  449          mutex_enter(&cio->io_lock);
 421  450          mutex_enter(&pio->io_lock);
 422  451  
 423  452          list_remove(&pio->io_child_list, zl);
 424  453          list_remove(&cio->io_parent_list, zl);
 425  454  
 426  455          pio->io_child_count--;
 427  456          cio->io_parent_count--;
 428  457  
 429  458          mutex_exit(&pio->io_lock);
 430  459          mutex_exit(&cio->io_lock);
 431  460  
 432  461          kmem_cache_free(zio_link_cache, zl);
 433  462  }
 434  463  
 435  464  static boolean_t
 436  465  zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 437  466  {
 438  467          uint64_t *countp = &zio->io_children[child][wait];
 439  468          boolean_t waiting = B_FALSE;
 440  469  
 441  470          mutex_enter(&zio->io_lock);
 442  471          ASSERT(zio->io_stall == NULL);
 443  472          if (*countp != 0) {
 444  473                  zio->io_stage >>= 1;
 445  474                  zio->io_stall = countp;
 446  475                  waiting = B_TRUE;
 447  476          }
 448  477          mutex_exit(&zio->io_lock);
 449  478  
 450  479          return (waiting);
 451  480  }
 452  481  
 453  482  static void
 454  483  zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 455  484  {
 456  485          uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 457  486          int *errorp = &pio->io_child_error[zio->io_child_type];
 458  487  
 459  488          mutex_enter(&pio->io_lock);
 460  489          if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 461  490                  *errorp = zio_worst_error(*errorp, zio->io_error);
 462  491          pio->io_reexecute |= zio->io_reexecute;
 463  492          ASSERT3U(*countp, >, 0);
 464  493  
 465  494          (*countp)--;
 466  495  
 467  496          if (*countp == 0 && pio->io_stall == countp) {
 468  497                  pio->io_stall = NULL;
 469  498                  mutex_exit(&pio->io_lock);
 470  499                  zio_execute(pio);
 471  500          } else {
 472  501                  mutex_exit(&pio->io_lock);
 473  502          }
 474  503  }
 475  504  
 476  505  static void
 477  506  zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 478  507  {
 479  508          if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 480  509                  zio->io_error = zio->io_child_error[c];
 481  510  }
 482  511  
 483  512  /*
 484  513   * ==========================================================================
 485  514   * Create the various types of I/O (read, write, free, etc)
 486  515   * ==========================================================================
 487  516   */
 488  517  static zio_t *
 489  518  zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 490  519      void *data, uint64_t size, zio_done_func_t *done, void *private,
 491  520      zio_type_t type, zio_priority_t priority, enum zio_flag flags,
 492  521      vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
 493  522      enum zio_stage stage, enum zio_stage pipeline)
 494  523  {
 495  524          zio_t *zio;
 496  525  
 497  526          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 498  527          ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 499  528          ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 500  529  
 501  530          ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 502  531          ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 503  532          ASSERT(vd || stage == ZIO_STAGE_OPEN);
 504  533  
 505  534          zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 506  535          bzero(zio, sizeof (zio_t));
 507  536  
 508  537          mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 509  538          cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 510  539  
 511  540          list_create(&zio->io_parent_list, sizeof (zio_link_t),
 512  541              offsetof(zio_link_t, zl_parent_node));
 513  542          list_create(&zio->io_child_list, sizeof (zio_link_t),
 514  543              offsetof(zio_link_t, zl_child_node));
 515  544  
 516  545          if (vd != NULL)
 517  546                  zio->io_child_type = ZIO_CHILD_VDEV;
 518  547          else if (flags & ZIO_FLAG_GANG_CHILD)
 519  548                  zio->io_child_type = ZIO_CHILD_GANG;
 520  549          else if (flags & ZIO_FLAG_DDT_CHILD)
 521  550                  zio->io_child_type = ZIO_CHILD_DDT;
 522  551          else
 523  552                  zio->io_child_type = ZIO_CHILD_LOGICAL;
 524  553  
 525  554          if (bp != NULL) {
 526  555                  zio->io_bp = (blkptr_t *)bp;
 527  556                  zio->io_bp_copy = *bp;
 528  557                  zio->io_bp_orig = *bp;
 529  558                  if (type != ZIO_TYPE_WRITE ||
 530  559                      zio->io_child_type == ZIO_CHILD_DDT)
 531  560                          zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 532  561                  if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 533  562                          zio->io_logical = zio;
 534  563                  if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 535  564                          pipeline |= ZIO_GANG_STAGES;
 536  565          }
 537  566  
 538  567          zio->io_spa = spa;
 539  568          zio->io_txg = txg;
 540  569          zio->io_done = done;
 541  570          zio->io_private = private;
 542  571          zio->io_type = type;
 543  572          zio->io_priority = priority;
 544  573          zio->io_vd = vd;
 545  574          zio->io_offset = offset;
 546  575          zio->io_orig_data = zio->io_data = data;
 547  576          zio->io_orig_size = zio->io_size = size;
 548  577          zio->io_orig_flags = zio->io_flags = flags;
 549  578          zio->io_orig_stage = zio->io_stage = stage;
 550  579          zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 551  580  
 552  581          zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 553  582          zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 554  583  
 555  584          if (zb != NULL)
 556  585                  zio->io_bookmark = *zb;
 557  586  
 558  587          if (pio != NULL) {
 559  588                  if (zio->io_logical == NULL)
 560  589                          zio->io_logical = pio->io_logical;
 561  590                  if (zio->io_child_type == ZIO_CHILD_GANG)
 562  591                          zio->io_gang_leader = pio->io_gang_leader;
 563  592                  zio_add_child(pio, zio);
 564  593          }
 565  594  
 566  595          return (zio);
 567  596  }
 568  597  
 569  598  static void
 570  599  zio_destroy(zio_t *zio)
 571  600  {
 572  601          list_destroy(&zio->io_parent_list);
 573  602          list_destroy(&zio->io_child_list);
 574  603          mutex_destroy(&zio->io_lock);
 575  604          cv_destroy(&zio->io_cv);
 576  605          kmem_cache_free(zio_cache, zio);
 577  606  }
 578  607  
 579  608  zio_t *
 580  609  zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 581  610      void *private, enum zio_flag flags)
 582  611  {
 583  612          zio_t *zio;
 584  613  
 585  614          zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 586  615              ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 587  616              ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 588  617  
 589  618          return (zio);
 590  619  }
 591  620  
 592  621  zio_t *
 593  622  zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 594  623  {
 595  624          return (zio_null(NULL, spa, NULL, done, private, flags));
 596  625  }
 597  626  
 598  627  zio_t *
 599  628  zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 600  629      void *data, uint64_t size, zio_done_func_t *done, void *private,
 601  630      zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 602  631  {
 603  632          zio_t *zio;
 604  633  
 605  634          zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 606  635              data, size, done, private,
 607  636              ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 608  637              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 609  638              ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 610  639  
 611  640          return (zio);
 612  641  }
 613  642  
 614  643  zio_t *
 615  644  zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 616  645      void *data, uint64_t size, const zio_prop_t *zp,
 617  646      zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
 618  647      void *private,
 619  648      zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 620  649  {
 621  650          zio_t *zio;
 622  651  
 623  652          ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 624  653              zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 625  654              zp->zp_compress >= ZIO_COMPRESS_OFF &&
 626  655              zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 627  656              DMU_OT_IS_VALID(zp->zp_type) &&
 628  657              zp->zp_level < 32 &&
 629  658              zp->zp_copies > 0 &&
 630  659              zp->zp_copies <= spa_max_replication(spa));
 631  660  
 632  661          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 633  662              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 634  663              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 635  664              ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 636  665  
 637  666          zio->io_ready = ready;
 638  667          zio->io_physdone = physdone;
 639  668          zio->io_prop = *zp;
 640  669  
 641  670          /*
 642  671           * Data can be NULL if we are going to call zio_write_override() to
 643  672           * provide the already-allocated BP.  But we may need the data to
 644  673           * verify a dedup hit (if requested).  In this case, don't try to
 645  674           * dedup (just take the already-allocated BP verbatim).
 646  675           */
 647  676          if (data == NULL && zio->io_prop.zp_dedup_verify) {
 648  677                  zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 649  678          }
 650  679  
 651  680          return (zio);
 652  681  }
 653  682  
 654  683  zio_t *
 655  684  zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 656  685      uint64_t size, zio_done_func_t *done, void *private,
 657  686      zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 658  687  {
 659  688          zio_t *zio;
 660  689  
 661  690          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 662  691              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 663  692              ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 664  693  
 665  694          return (zio);
 666  695  }
 667  696  
 668  697  void
 669  698  zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 670  699  {
 671  700          ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 672  701          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 673  702          ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 674  703          ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 675  704  
 676  705          /*
 677  706           * We must reset the io_prop to match the values that existed
 678  707           * when the bp was first written by dmu_sync() keeping in mind
 679  708           * that nopwrite and dedup are mutually exclusive.
 680  709           */
 681  710          zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 682  711          zio->io_prop.zp_nopwrite = nopwrite;
 683  712          zio->io_prop.zp_copies = copies;
 684  713          zio->io_bp_override = bp;
 685  714  }
 686  715  
 687  716  void
 688  717  zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 689  718  {
                   /*
                    * Free the block described by bp on behalf of txg.  Depending
                    * on the checks below the free is either performed right away
                    * (synchronously, via zio_free_sync() + zio_wait()) or queued
                    * on the per-txg free bplist of the spa.
                    */
 690  719  
 691  720          /*
 692  721           * The check for EMBEDDED is a performance optimization.  We
 693  722           * process the free here (by ignoring it) rather than
 694  723           * putting it on the list and then processing it in zio_free_sync().
 695  724           */
 696  725          if (BP_IS_EMBEDDED(bp))
 697  726                  return;
 698  727          metaslab_check_free(spa, bp);
 699  728  
 700  729          /*
 701  730           * Frees that are for the currently-syncing txg, are not going to be
 702  731           * deferred, and which will not need to do a read (i.e. not GANG or
 703  732           * DEDUP), can be processed immediately.  Otherwise, put them on the
 704  733           * in-memory list for later processing.
 705  734           */
 706  735          if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
 707  736              txg != spa->spa_syncing_txg ||
 708  737              spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
                           /* The list slot is selected by the low bits of txg. */
 709  738                  bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 710  739          } else {
                           /* VERIFY0: a non-zero error from the free is fatal. */
 711  740                  VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
 712  741          }
 713  742  }
 714  743  
 715  744  zio_t *
 716  745  zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 717  746      enum zio_flag flags)
 718  747  {
                   /*
                    * Create a ZIO_TYPE_FREE zio for bp and return it without
                    * issuing it.  The asserts require a non-hole bp, that txg is
                    * the syncing txg, and that the current sync pass is still
                    * below zfs_sync_pass_deferred_free.
                    */
 719  748          zio_t *zio;
 720  749          enum zio_stage stage = ZIO_FREE_PIPELINE;
 721  750  
 722  751          ASSERT(!BP_IS_HOLE(bp));
 723  752          ASSERT(spa_syncing_txg(spa) == txg);
 724  753          ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 725  754  
                   /* Embedded bps get a null zio instead of a free zio. */
 726  755          if (BP_IS_EMBEDDED(bp))
 727  756                  return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 728  757  
 729  758          metaslab_check_free(spa, bp);
 730  759          arc_freed(spa, bp);
 731  760  
 732  761          /*
 733  762           * GANG and DEDUP blocks can induce a read (for the gang block header,
 734  763           * or the DDT), so issue them asynchronously so that this thread is
 735  764           * not tied up.
 736  765           */
 737  766          if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 738  767                  stage |= ZIO_STAGE_ISSUE_ASYNC;
 739  768  
                   /* 'stage' is the pipeline mask; no data, done callback or zb. */
 740  769          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 741  770              NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 742  771              NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 743  772  
 744  773          return (zio);
 745  774  }
 746  775  
 747  776  zio_t *
 748  777  zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 749  778      zio_done_func_t *done, void *private, enum zio_flag flags)
 750  779  {
                   /*
                    * Create a ZIO_TYPE_CLAIM zio for bp running
                    * ZIO_CLAIM_PIPELINE and return it without issuing it.
                    * For an embedded bp a null zio is returned instead; note that
                    * in that case done/private/flags are not passed along.
                    */
 751  780          zio_t *zio;
 752  781  
 753  782          dprintf_bp(bp, "claiming in txg %llu", txg);
 754  783  
 755  784          if (BP_IS_EMBEDDED(bp))
 756  785                  return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 757  786  
 758  787          /*
 759  788           * A claim is an allocation of a specific block.  Claims are needed
 760  789           * to support immediate writes in the intent log.  The issue is that
 761  790           * immediate writes contain committed data, but in a txg that was
 762  791           * *not* committed.  Upon opening the pool after an unclean shutdown,
 763  792           * the intent log claims all blocks that contain immediate write data
 764  793           * so that the SPA knows they're in use.
 765  794           *
 766  795           * All claims *must* be resolved in the first txg -- before the SPA
 767  796           * starts allocating blocks -- so that nothing is allocated twice.
 768  797           * If txg == 0 we just verify that the block is claimable.
 769  798           */
 770  799          ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 771  800          ASSERT(txg == spa_first_txg(spa) || txg == 0);
                   /* A dedup bp is tolerated only when the pool is not writeable. */
 772  801          ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 773  802  
 774  803          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 775  804              done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 776  805              NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 777  806  
 778  807          return (zio);
 779  808  }
 780  809  
 781  810  zio_t *
 782  811  zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 783  812      zio_done_func_t *done, void *private, enum zio_flag flags)
 784  813  {
                   /*
                    * Build the zio(s) needed to send ioctl 'cmd' to vd.
                    *
                    * A vdev without children gets a single ZIO_TYPE_IOCTL zio
                    * carrying cmd in io_cmd.  For a vdev with children, a null
                    * zio is created as the parent and this function recurses
                    * into every child vdev, issuing each resulting zio with
                    * zio_nowait().  In both cases the top-level zio is returned
                    * to the caller, who is responsible for issuing it.
                    */
 785  814          zio_t *zio;
 786  815          int c;
 787  816  
 788  817          if (vd->vdev_children == 0) {
 789  818                  zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 790  819                      ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 791  820                      ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 792  821  
 793  822                  zio->io_cmd = cmd;
 794  823          } else {
                           /*
                            * done/private go to the per-child zios below, not to
                            * this null parent.
                            */
 795  824                  zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 796  825  
 797  826                  for (c = 0; c < vd->vdev_children; c++)
 798  827                          zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 799  828                              done, private, flags));
 800  829          }
 801  830  
 802  831          return (zio);
 803  832  }
 804  833  
 805  834  zio_t *
 806  835  zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 807  836      void *data, int checksum, zio_done_func_t *done, void *private,
 808  837      zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 809  838  {
                   /*
                    * Create a ZIO_TYPE_READ zio addressed directly at
                    * [offset, offset + size) of vdev vd, with ZIO_FLAG_PHYSICAL
                    * added to the caller's flags.  No bp or bookmark is attached.
                    * The zio is returned without being issued.
                    *
                    * The asserts require vd to have no children and the range to
                    * fit inside vdev_psize.  When 'labels' is set the range must
                    * additionally end within the first VDEV_LABEL_START_SIZE
                    * bytes or start within the last VDEV_LABEL_END_SIZE bytes.
                    */
 810  839          zio_t *zio;
 811  840  
 812  841          ASSERT(vd->vdev_children == 0);
 813  842          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 814  843              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 815  844          ASSERT3U(offset + size, <=, vd->vdev_psize);
 816  845  
 817  846          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 818  847              ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 819  848              NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 820  849  
                   /* There is no bp here, so record the checksum type on the zio. */
 821  850          zio->io_prop.zp_checksum = checksum;
 822  851  
 823  852          return (zio);
 824  853  }
 825  854  
 826  855  zio_t *
 827  856  zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 828  857      void *data, int checksum, zio_done_func_t *done, void *private,
 829  858      zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 830  859  {
                   /*
                    * Create a ZIO_TYPE_WRITE zio addressed directly at
                    * [offset, offset + size) of vdev vd, with ZIO_FLAG_PHYSICAL
                    * added to the caller's flags.  No bp or bookmark is attached.
                    * The zio is returned without being issued.
                    *
                    * The asserted preconditions are the same as in
                    * zio_read_phys(): vd has no children, the range fits in
                    * vdev_psize, and with 'labels' set the range lies in the
                    * leading or trailing label area.
                    */
 831  860          zio_t *zio;
 832  861  
 833  862          ASSERT(vd->vdev_children == 0);
 834  863          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 835  864              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 836  865          ASSERT3U(offset + size, <=, vd->vdev_psize);
 837  866  
 838  867          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 839  868              ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
 840  869              NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 841  870  
 842  871          zio->io_prop.zp_checksum = checksum;
 843  872  
 844  873          if (zio_checksum_table[checksum].ci_eck) {
 845  874                  /*
 846  875                   * zec checksums are necessarily destructive -- they modify
 847  876                   * the end of the write buffer to hold the verifier/checksum.
 848  877                   * Therefore, we must make a local copy in case the data is
 849  878                   * being written to multiple places in parallel.
 850  879                   */
 851  880                  void *wbuf = zio_buf_alloc(size);
 852  881                  bcopy(data, wbuf, size);
                           /*
                            * The copy is pushed as a transform with no callback.
                            * NOTE(review): wbuf is presumably freed when the
                            * transform stack is popped -- not visible here.
                            */
 853  882                  zio_push_transform(zio, wbuf, size, size, NULL);
 854  883          }
 855  884  
 856  885          return (zio);
 857  886  }
 858  887  
 859  888  /*
 860  889   * Create a child I/O to do some work for us.
            *
            * The child is created under pio, inherits pio's spa, txg and bookmark,
            * and targets vdev vd at the given offset.  vd must be a direct child of
            * pio's vdev (or of the root vdev when pio has no vdev); this is
            * asserted.  The new zio is returned without being issued.
 861  890   */
 862  891  zio_t *
 863  892  zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 864  893          void *data, uint64_t size, int type, zio_priority_t priority,
 865  894          enum zio_flag flags, zio_done_func_t *done, void *private)
 866  895  {
 867  896          enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 868  897          zio_t *zio;
 869  898  
 870  899          ASSERT(vd->vdev_parent ==
 871  900              (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 872  901  
 873  902          if (type == ZIO_TYPE_READ && bp != NULL) {
 874  903                  /*
 875  904                   * If we have the bp, then the child should perform the
 876  905                   * checksum and the parent need not.  This pushes error
 877  906                   * detection as close to the leaves as possible and
 878  907                   * eliminates redundant checksums in the interior nodes.
 879  908                   */
 880  909                  pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 881  910                  pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 882  911          }
 883  912  
                   /* For a vdev without children, skip past the leading labels. */
 884  913          if (vd->vdev_children == 0)
 885  914                  offset += VDEV_LABEL_START_SIZE;
 886  915  
 887  916          flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 888  917  
 889  918          /*
 890  919           * If we've decided to do a repair, the write is not speculative --
 891  920           * even if the original read was.
 892  921           */
 893  922          if (flags & ZIO_FLAG_IO_REPAIR)
 894  923                  flags &= ~ZIO_FLAG_SPECULATIVE;
 895  924  
                   /*
                    * The initial stage is the bit just below VDEV_IO_START, so
                    * the first stage zio_execute() considers for this zio is
                    * VDEV_IO_START.
                    */
 896  925          zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 897  926              done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 898  927              ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 899  928  
                   /*
                    * Inherit the parent's physdone callback and, for leaf vdevs,
                    * count this child on its logical zio (when one is set).
                    */
 900  929          zio->io_physdone = pio->io_physdone;
 901  930          if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
 902  931                  zio->io_logical->io_phys_children++;
 903  932  
 904  933          return (zio);
 905  934  }
 906  935  
 907  936  zio_t *
 908  937  zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 909  938          int type, zio_priority_t priority, enum zio_flag flags,
 910  939          zio_done_func_t *done, void *private)
 911  940  {
                   /*
                    * Create a parentless zio against leaf vdev vd (asserted) at
                    * the given offset.  ZIO_FLAG_CANFAIL, ZIO_FLAG_DONT_RETRY and
                    * ZIO_FLAG_DELEGATED are always added to the caller's flags.
                    * No bp, txg or bookmark is attached.  As in
                    * zio_vdev_child_io(), the initial stage is the bit just below
                    * VDEV_IO_START.  The zio is returned without being issued.
                    */
 912  941          zio_t *zio;
 913  942  
 914  943          ASSERT(vd->vdev_ops->vdev_op_leaf);
 915  944  
 916  945          zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 917  946              data, size, done, private, type, priority,
 918  947              flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 919  948              vd, offset, NULL,
 920  949              ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 921  950  
 922  951          return (zio);
 923  952  }
 924  953  
 925  954  void
 926  955  zio_flush(zio_t *zio, vdev_t *vd)
 927  956  {
                   /*
                    * Issue a DKIOCFLUSHWRITECACHE ioctl to vd as a child of zio,
                    * without waiting for it.  No done callback is installed; the
                    * flags mark the ioctl as allowed to fail, not to be retried,
                    * and not to propagate (ZIO_FLAG_DONT_PROPAGATE).
                    */
 928  957          zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 929  958              NULL, NULL,
 930  959              ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 931  960  }
 932  961  
 933  962  void
 934  963  zio_shrink(zio_t *zio, uint64_t size)
 935  964  {
                   /*
                    * Reduce the size of a zio to 'size' bytes.  The asserts
                    * require that the zio has no executor yet, that its size has
                    * not been changed already (io_orig_size == io_size), and that
                    * the new size does not exceed the current one.
                    */
 936  965          ASSERT(zio->io_executor == NULL);
 937  966          ASSERT(zio->io_orig_size == zio->io_size);
 938  967          ASSERT(size <= zio->io_size);
 939  968  
 940  969          /*
 941  970           * We don't shrink for raidz because of problems with the
 942  971           * reconstruction when reading back less than the block size.
 943  972           * Note, BP_IS_RAIDZ() assumes no compression.
 944  973           */
 945  974          ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
                   /* For raidz bps this function is a no-op apart from the asserts. */
 946  975          if (!BP_IS_RAIDZ(zio->io_bp))
 947  976                  zio->io_orig_size = zio->io_size = size;
 948  977  }
 949  978  
 950  979  /*
 951  980   * ==========================================================================
 952  981   * Prepare to read and write logical blocks
 953  982   * ==========================================================================
 954  983   */
 955  984  
 956  985  static int
 957  986  zio_read_bp_init(zio_t *zio)
 958  987  {
                   /*
                    * Pipeline stage: prepare a read zio based on its block
                    * pointer.  Always returns ZIO_PIPELINE_CONTINUE.
                    */
 959  988          blkptr_t *bp = zio->io_bp;
 960  989  
                   /*
                    * Compressed logical reads (unless ZIO_FLAG_RAW is set) read
                    * into a temporary psize-byte buffer; zio_decompress is
                    * registered as the transform callback for that buffer.
                    */
 961  990          if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 962  991              zio->io_child_type == ZIO_CHILD_LOGICAL &&
 963  992              !(zio->io_flags & ZIO_FLAG_RAW)) {
 964  993                  uint64_t psize =
 965  994                      BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 966  995                  void *cbuf = zio_buf_alloc(psize);
 967  996  
 968  997                  zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 969  998          }
 970  999  
                   /*
                    * Embedded data is decoded straight out of the bp into
                    * io_data and the pipeline is switched to
                    * ZIO_INTERLOCK_PIPELINE.  Any other embedded type is not
                    * expected here (asserted).
                    */
 971 1000          if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 972 1001                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 973 1002                  decode_embedded_bp_compressed(bp, zio->io_data);
 974 1003          } else {
 975 1004                  ASSERT(!BP_IS_EMBEDDED(bp));
 976 1005          }
 977 1006  
                   /* Level-0 blocks of non-metadata types get ZIO_FLAG_DONT_CACHE. */
 978 1007          if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 979 1008                  zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 980 1009  
                   /* So do DDT ZAP blocks. */
 981 1010          if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
 982 1011                  zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 983 1012  
                   /*
                    * Logical reads of dedup blocks use the DDT read pipeline.
                    * Note that this assignment comes last and so overrides the
                    * pipeline chosen above.
                    */
 984 1013          if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 985 1014                  zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 986 1015  
 987 1016          return (ZIO_PIPELINE_CONTINUE);
 988 1017  }
 989 1018  
 990 1019  static int
 991 1020  zio_write_bp_init(zio_t *zio)
 992 1021  {
                   /*
                    * Pipeline stage: prepare the block pointer of a write zio and
                    * select the pipeline it will follow.  In order, this handles
                    * an override bp, decides whether to compress (possibly
                    * embedding the data in the bp), chooses between rewriting in
                    * place and allocating anew, and finally fills in the bp
                    * properties.
                    *
                    * Returns ZIO_PIPELINE_STOP when the zio must wait for its
                    * children, ZIO_PIPELINE_CONTINUE otherwise.
                    */
 993 1022          spa_t *spa = zio->io_spa;
 994 1023          zio_prop_t *zp = &zio->io_prop;
 995 1024          enum zio_compress compress = zp->zp_compress;
 996 1025          blkptr_t *bp = zio->io_bp;
 997 1026          uint64_t lsize = zio->io_size;
 998 1027          uint64_t psize = lsize;
                   /* Stays 1 unless this turns out to be a same-txg rewrite below. */
 999 1028          int pass = 1;
1000 1029  
1001 1030          /*
1002 1031           * If our children haven't all reached the ready stage,
1003 1032           * wait for them and then repeat this pipeline stage.
1004 1033           */
1005 1034          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1006 1035              zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1007 1036                  return (ZIO_PIPELINE_STOP);
1008 1037  
                   /* Nothing to prepare for zios that do not allocate. */
1009 1038          if (!IO_IS_ALLOCATING(zio))
1010 1039                  return (ZIO_PIPELINE_CONTINUE);
1011 1040  
1012 1041          ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1013 1042  
1014 1043          if (zio->io_bp_override) {
1015 1044                  ASSERT(bp->blk_birth != zio->io_txg);
1016 1045                  ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1017 1046  
                           /* Adopt the override bp wholesale. */
1018 1047                  *bp = *zio->io_bp_override;
1019 1048                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1020 1049  
1021 1050                  if (BP_IS_EMBEDDED(bp))
1022 1051                          return (ZIO_PIPELINE_CONTINUE);
1023 1052  
1024 1053                  /*
1025 1054                   * If we've been overridden and nopwrite is set then
1026 1055                   * set the flag accordingly to indicate that a nopwrite
1027 1056                   * has already occurred.
1028 1057                   */
1029 1058                  if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1030 1059                          ASSERT(!zp->zp_dedup);
1031 1060                          zio->io_flags |= ZIO_FLAG_NOPWRITE;
1032 1061                          return (ZIO_PIPELINE_CONTINUE);
1033 1062                  }
1034 1063  
1035 1064                  ASSERT(!zp->zp_nopwrite);
1036 1065  
1037 1066                  if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1038 1067                          return (ZIO_PIPELINE_CONTINUE);
1039 1068  
1040 1069                  ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1041 1070                      zp->zp_dedup_verify);
1042 1071  
                           /*
                            * Dedup is requested.  If the override bp was written
                            * with the checksum dedup wants, mark it as dedup and
                            * add the DDT write stage.  Otherwise drop the override
                            * and fall through to a regular write.
                            */
1043 1072                  if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1044 1073                          BP_SET_DEDUP(bp, 1);
1045 1074                          zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1046 1075                          return (ZIO_PIPELINE_CONTINUE);
1047 1076                  }
1048 1077                  zio->io_bp_override = NULL;
1049 1078                  BP_ZERO(bp);
1050 1079          }
1051 1080  
1052 1081          if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1053 1082                  /*
1054 1083                   * We're rewriting an existing block, which means we're
1055 1084                   * working on behalf of spa_sync().  For spa_sync() to
1056 1085                   * converge, it must eventually be the case that we don't
1057 1086                   * have to allocate new blocks.  But compression changes
1058 1087                   * the blocksize, which forces a reallocate, and makes
1059 1088                   * convergence take longer.  Therefore, after the first
1060 1089                   * few passes, stop compressing to ensure convergence.
1061 1090                   */
1062 1091                  pass = spa_sync_pass(spa);
1063 1092  
1064 1093                  ASSERT(zio->io_txg == spa_syncing_txg(spa));
1065 1094                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1066 1095                  ASSERT(!BP_GET_DEDUP(bp));
1067 1096  
1068 1097                  if (pass >= zfs_sync_pass_dont_compress)
1069 1098                          compress = ZIO_COMPRESS_OFF;
1070 1099  
1071 1100                  /* Make sure someone doesn't change their mind on overwrites */
1072 1101                  ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1073 1102                      spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1074 1103          }
1075 1104  
1076 1105          if (compress != ZIO_COMPRESS_OFF) {
1077 1106                  void *cbuf = zio_buf_alloc(lsize);
1078 1107                  psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
                           /*
                            * A result of 0 or lsize means the compressed buffer is
                            * not used; psize keeps that value for the checks below.
                            * NOTE(review): presumably 0 means an all-zero buffer
                            * and lsize means "no gain" -- confirm against
                            * zio_compress_data().
                            */
1079 1108                  if (psize == 0 || psize == lsize) {
1080 1109                          compress = ZIO_COMPRESS_OFF;
1081 1110                          zio_buf_free(cbuf, lsize);
1082 1111                  } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1083 1112                      zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1084 1113                      spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
                                   /*
                                    * The compressed data is small enough to be
                                    * stored inside the bp itself: encode it there
                                    * and skip the rest of the write pipeline.
                                    */
1085 1114                          encode_embedded_bp_compressed(bp,
1086 1115                              cbuf, compress, lsize, psize);
1087 1116                          BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1088 1117                          BP_SET_TYPE(bp, zio->io_prop.zp_type);
1089 1118                          BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1090 1119                          zio_buf_free(cbuf, lsize);
1091 1120                          bp->blk_birth = zio->io_txg;
1092 1121                          zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1093 1122                          ASSERT(spa_feature_is_active(spa,
1094 1123                              SPA_FEATURE_EMBEDDED_DATA));
1095 1124                          return (ZIO_PIPELINE_CONTINUE);
1096 1125                  } else {
1097 1126                          /*
1098 1127                           * Round up compressed size to MINBLOCKSIZE and
1099 1128                           * zero the tail.
1100 1129                           */
1101 1130                          size_t rounded =
1102 1131                              P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
1103 1132                          if (rounded > psize) {
1104 1133                                  bzero((char *)cbuf + psize, rounded - psize);
1105 1134                                  psize = rounded;
1106 1135                          }
                                   /* Rounding may have eaten the whole saving. */
1107 1136                          if (psize == lsize) {
1108 1137                                  compress = ZIO_COMPRESS_OFF;
1109 1138                                  zio_buf_free(cbuf, lsize);
1110 1139                          } else {
1111 1140                                  zio_push_transform(zio, cbuf,
1112 1141                                      psize, lsize, NULL);
1113 1142                          }
1114 1143                  }
1115 1144          }
1116 1145  
1117 1146          /*
1118 1147           * The final pass of spa_sync() must be all rewrites, but the first
1119 1148           * few passes offer a trade-off: allocating blocks defers convergence,
1120 1149           * but newly allocated blocks are sequential, so they can be written
1121 1150           * to disk faster.  Therefore, we allow the first few passes of
1122 1151           * spa_sync() to allocate new blocks, but force rewrites after that.
1123 1152           * There should only be a handful of blocks after pass 1 in any case.
1124 1153           */
1125 1154          if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1126 1155              BP_GET_PSIZE(bp) == psize &&
1127 1156              pass >= zfs_sync_pass_rewrite) {
1128 1157                  ASSERT(psize != 0);
                           /* Keep whichever gang stages the pipeline already had. */
1129 1158                  enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1130 1159                  zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1131 1160                  zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1132 1161          } else {
1133 1162                  BP_ZERO(bp);
1134 1163                  zio->io_pipeline = ZIO_WRITE_PIPELINE;
1135 1164          }
1136 1165  
1137 1166          if (psize == 0) {
                           /*
                            * Nothing to write.  The zeroed bp gets its size, type,
                            * level and birth txg filled in only when the original
                            * bp had a birth txg and the hole_birth feature is
                            * active; either way the write pipeline is skipped.
                            */
1138 1167                  if (zio->io_bp_orig.blk_birth != 0 &&
1139 1168                      spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1140 1169                          BP_SET_LSIZE(bp, lsize);
1141 1170                          BP_SET_TYPE(bp, zp->zp_type);
1142 1171                          BP_SET_LEVEL(bp, zp->zp_level);
1143 1172                          BP_SET_BIRTH(bp, zio->io_txg, 0);
1144 1173                  }
1145 1174                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1146 1175          } else {
1147 1176                  ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1148 1177                  BP_SET_LSIZE(bp, lsize);
1149 1178                  BP_SET_TYPE(bp, zp->zp_type);
1150 1179                  BP_SET_LEVEL(bp, zp->zp_level);
1151 1180                  BP_SET_PSIZE(bp, psize);
1152 1181                  BP_SET_COMPRESS(bp, compress);
1153 1182                  BP_SET_CHECKSUM(bp, zp->zp_checksum);
1154 1183                  BP_SET_DEDUP(bp, zp->zp_dedup);
1155 1184                  BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
                           /* Dedup replaces the pipeline; nopwrite only adds a stage. */
1156 1185                  if (zp->zp_dedup) {
1157 1186                          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1158 1187                          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1159 1188                          zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1160 1189                  }
1161 1190                  if (zp->zp_nopwrite) {
1162 1191                          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1163 1192                          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1164 1193                          zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1165 1194                  }
1166 1195          }
1167 1196  
1168 1197          return (ZIO_PIPELINE_CONTINUE);
1169 1198  }
1170 1199  
1171 1200  static int
1172 1201  zio_free_bp_init(zio_t *zio)
1173 1202  {
                   /*
                    * Pipeline stage: a logical free of a dedup block is switched
                    * to ZIO_DDT_FREE_PIPELINE; every other free keeps its current
                    * pipeline.  Always returns ZIO_PIPELINE_CONTINUE.
                    */
1174 1203          blkptr_t *bp = zio->io_bp;
1175 1204  
1176 1205          if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1177 1206                  if (BP_GET_DEDUP(bp))
1178 1207                          zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1179 1208          }
1180 1209  
1181 1210          return (ZIO_PIPELINE_CONTINUE);
1182 1211  }
1183 1212  
1184 1213  /*
1185 1214   * ==========================================================================
1186 1215   * Execute the I/O pipeline
1187 1216   * ==========================================================================
1188 1217   */
1189 1218  
1190 1219  static void
1191 1220  zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1192 1221  {
                   /*
                    * Hand zio over to one of the spa's zio taskqs, where
                    * zio_execute() will be run on it.  The taskq is selected by
                    * the pair (t, q): t starts out as the zio's I/O type and q is
                    * the requested taskq type; both may be adjusted below.
                    * 'cutinline' requests TQ_FRONT dispatch.
                    */
1193 1222          spa_t *spa = zio->io_spa;
1194 1223          zio_type_t t = zio->io_type;
1195 1224          int flags = (cutinline ? TQ_FRONT : 0);
1196 1225  
1197 1226          /*
1198 1227           * If we're a config writer or a probe, the normal issue and
1199 1228           * interrupt threads may all be blocked waiting for the config lock.
1200 1229           * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1201 1230           */
1202 1231          if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1203 1232                  t = ZIO_TYPE_NULL;
1204 1233  
1205 1234          /*
1206 1235           * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1207 1236           */
1208 1237          if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1209 1238                  t = ZIO_TYPE_NULL;
1210 1239  
1211 1240          /*
1212 1241           * If this is a high priority I/O, then use the high priority taskq if
1213 1242           * available.
1214 1243           */
                   /*
                    * NOTE(review): this relies on the high-priority variant of
                    * taskq type q being q + 1 in zio_taskq_type_t -- confirm
                    * against the enum definition.
                    */
1215 1244          if (zio->io_priority == ZIO_PRIORITY_NOW &&
1216 1245              spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1217 1246                  q++;
1218 1247  
1219 1248          ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1220 1249  
1221 1250          /*
1222 1251           * NB: We are assuming that the zio can only be dispatched
1223 1252           * to a single taskq at a time.  It would be a grievous error
1224 1253           * to dispatch the zio to another taskq at the same time.
1225 1254           */
1226 1255          ASSERT(zio->io_tqent.tqent_next == NULL);
                   /* The taskq entry embedded in the zio (io_tqent) is used. */
1227 1256          spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1228 1257              flags, &zio->io_tqent);
1229 1258  }
1230 1259  
1231 1260  static boolean_t
1232 1261  zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1233 1262  {
                   /*
                    * Return B_TRUE if the thread recorded as the zio's executor
                    * is a member of any taskq of type q, for any I/O type, in the
                    * zio's spa; B_FALSE otherwise.
                    */
1234 1263          kthread_t *executor = zio->io_executor;
1235 1264          spa_t *spa = zio->io_spa;
1236 1265  
                   /* Scan every I/O type's set of taskqs of the requested type. */
1237 1266          for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1238 1267                  spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1239 1268                  uint_t i;
1240 1269                  for (i = 0; i < tqs->stqs_count; i++) {
1241 1270                          if (taskq_member(tqs->stqs_taskq[i], executor))
1242 1271                                  return (B_TRUE);
1243 1272                  }
1244 1273          }
1245 1274  
1246 1275          return (B_FALSE);
1247 1276  }
1248 1277  
1249 1278  static int
1250 1279  zio_issue_async(zio_t *zio)
1251 1280  {
                   /*
                    * Pipeline stage: hand the zio to an issue taskq (without
                    * cutting in line) and stop executing it on this thread.
                    * Always returns ZIO_PIPELINE_STOP.
                    */
1252 1281          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1253 1282  
1254 1283          return (ZIO_PIPELINE_STOP);
1255 1284  }
1256 1285  
1257 1286  void
1258 1287  zio_interrupt(zio_t *zio)
1259 1288  {
                   /*
                    * Hand the zio to an interrupt taskq (without cutting in
                    * line); its pipeline is then run from there.
                    */
1260 1289          zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1261 1290  }
1262 1291  
1263 1292  /*
1264 1293   * Execute the I/O pipeline until one of the following occurs:
1265 1294   *
1266 1295   *      (1) the I/O completes
1267 1296   *      (2) the pipeline stalls waiting for dependent child I/Os
1268 1297   *      (3) the I/O issues, so we're waiting for an I/O completion interrupt
1269 1298   *      (4) the I/O is delegated by vdev-level caching or aggregation
1270 1299   *      (5) the I/O is deferred due to vdev-level queueing
1271 1300   *      (6) the I/O is handed off to another thread.
1272 1301   *
1273 1302   * In all cases, the pipeline stops whenever there's no CPU work; it never
1274 1303   * burns a thread in cv_wait().
1275 1304   *
1276 1305   * There's no locking on io_stage because there's no legitimate way
1277 1306   * for multiple threads to be attempting to process the same I/O.
1278 1307   */
            /* Table of stage functions, indexed by the bit position of the stage. */
1279 1308  static zio_pipe_stage_t *zio_pipeline[];
1280 1309  
1281 1310  void
1282 1311  zio_execute(zio_t *zio)
1283 1312  {
                   /* Record the calling thread as the one running this zio. */
1284 1313          zio->io_executor = curthread;
1285 1314  
1286 1315          while (zio->io_stage < ZIO_STAGE_DONE) {
1287 1316                  enum zio_stage pipeline = zio->io_pipeline;
1288 1317                  enum zio_stage stage = zio->io_stage;
1289 1318                  int rv;
1290 1319  
1291 1320                  ASSERT(!MUTEX_HELD(&zio->io_lock));
1292 1321                  ASSERT(ISP2(stage));
1293 1322                  ASSERT(zio->io_stall == NULL);
1294 1323  
                           /*
                            * Advance to the next stage bit above the current one
                            * that is set in this zio's pipeline mask.
                            */
1295 1324                  do {
1296 1325                          stage <<= 1;
1297 1326                  } while ((stage & pipeline) == 0);
1298 1327  
1299 1328                  ASSERT(stage <= ZIO_STAGE_DONE);
1300 1329  
1301 1330                  /*
1302 1331                   * If we are in interrupt context and this pipeline stage
1303 1332                   * will grab a config lock that is held across I/O,
1304 1333                   * or may wait for an I/O that needs an interrupt thread
1305 1334                   * to complete, issue async to avoid deadlock.
1306 1335                   *
1307 1336                   * For VDEV_IO_START, we cut in line so that the io will
1308 1337                   * be sent to disk promptly.
1309 1338                   */
1310 1339                  if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1311 1340                      zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1312 1341                          boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1313 1342                              zio_requeue_io_start_cut_in_line : B_FALSE;
                                   /*
                                    * io_stage has not been advanced yet, so the
                                    * issue thread will pick the same stage again.
                                    */
1314 1343                          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1315 1344                          return;
1316 1345                  }
1317 1346  
1318 1347                  zio->io_stage = stage;
                           /* highbit64() is 1-based, hence the - 1 for the index. */
1319 1348                  rv = zio_pipeline[highbit64(stage) - 1](zio);
1320 1349  
1321 1350                  if (rv == ZIO_PIPELINE_STOP)
1322 1351                          return;
1323 1352  
1324 1353                  ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1325 1354          }
1326 1355  }
1327 1356  
1328 1357  /*
1329 1358   * ==========================================================================
1330 1359   * Initiate I/O, either sync or async
1331 1360   * ==========================================================================
1332 1361   */
1333 1362  int
1334 1363  zio_wait(zio_t *zio)
1335 1364  {
                   /*
                    * Run a not-yet-started zio and block until it has finished.
                    * Returns the zio's io_error.  The zio is destroyed before
                    * returning, so the caller must not touch it afterwards.
                    */
1336 1365          int error;
1337 1366  
1338 1367          ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1339 1368          ASSERT(zio->io_executor == NULL);
1340 1369  
1341 1370          zio->io_waiter = curthread;
1342 1371  
1343 1372          zio_execute(zio);
1344 1373  
                   /*
                    * Sleep on io_cv until io_executor is NULL again.
                    * NOTE(review): presumably the completion path clears
                    * io_executor and signals io_cv -- not visible here.
                    */
1345 1374          mutex_enter(&zio->io_lock);
1346 1375          while (zio->io_executor != NULL)
1347 1376                  cv_wait(&zio->io_cv, &zio->io_lock);
1348 1377          mutex_exit(&zio->io_lock);
1349 1378  
                   /* Fetch the error before the zio is destroyed. */
1350 1379          error = zio->io_error;
1351 1380          zio_destroy(zio);
1352 1381  
1353 1382          return (error);
1354 1383  }
1355 1384  
1356 1385  void
1357 1386  zio_nowait(zio_t *zio)
1358 1387  {
                   /*
                    * Start executing a zio without waiting for it to finish.
                    * The zio must not have an executor yet (asserted).
                    */
1359 1388          ASSERT(zio->io_executor == NULL);
1360 1389  
1361 1390          if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1362 1391              zio_unique_parent(zio) == NULL) {
1363 1392                  /*
1364 1393                   * This is a logical async I/O with no parent to wait for it.
1365 1394                   * We add it to the spa_async_root_zio "Godfather" I/O which
1366 1395                   * will ensure they complete prior to unloading the pool.
1367 1396                   */
1368 1397                  spa_t *spa = zio->io_spa;
1369 1398  
                           /* The root zio is picked from an array by CPU_SEQID. */
1370 1399                  zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
1371 1400          }
1372 1401  
1373 1402          zio_execute(zio);
1374 1403  }
1375 1404  
1376 1405  /*
1377 1406   * ==========================================================================
1378 1407   * Reexecute or suspend/resume failed I/O
1379 1408   * ==========================================================================
1380 1409   */
1381 1410  
/*
 * Rewind 'pio' to the state saved in its io_orig_* fields, clear its error
 * and wait/child-error bookkeeping, and execute it again -- after first
 * doing the same, recursively, for each child returned by
 * zio_walk_children().
 */
static void
zio_reexecute(zio_t *pio)
{
        zio_t *cio, *cio_next;

        ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
        ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
        ASSERT(pio->io_gang_leader == NULL);
        ASSERT(pio->io_gang_tree == NULL);

        /* Restore flags, stage and pipeline from the saved originals. */
        pio->io_flags = pio->io_orig_flags;
        pio->io_stage = pio->io_orig_stage;
        pio->io_pipeline = pio->io_orig_pipeline;
        pio->io_reexecute = 0;
        /* Record on the zio itself that it has been reexecuted. */
        pio->io_flags |= ZIO_FLAG_REEXECUTED;
        pio->io_error = 0;
        /* Clear the per-wait-type state and the per-child-type errors. */
        for (int w = 0; w < ZIO_WAIT_TYPES; w++)
                pio->io_state[w] = 0;
        for (int c = 0; c < ZIO_CHILD_TYPES; c++)
                pio->io_child_error[c] = 0;

        /* An allocating zio starts over with a zeroed block pointer. */
        if (IO_IS_ALLOCATING(pio))
                BP_ZERO(pio->io_bp);

        /*
         * As we reexecute pio's children, new children could be created.
         * New children go to the head of pio's io_child_list, however,
         * so we will (correctly) not reexecute them.  The key is that
         * the remainder of pio's io_child_list, from 'cio_next' onward,
         * cannot be affected by any side effects of reexecuting 'cio'.
         */
        for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
                /* Fetch the successor before 'cio' is reexecuted. */
                cio_next = zio_walk_children(pio);
                /*
                 * Under io_lock, count this child again in pio's
                 * io_children[] for its child type and every wait type.
                 */
                mutex_enter(&pio->io_lock);
                for (int w = 0; w < ZIO_WAIT_TYPES; w++)
                        pio->io_children[cio->io_child_type][w]++;
                mutex_exit(&pio->io_lock);
                zio_reexecute(cio);
        }

        /*
         * Now that all children have been reexecuted, execute the parent.
         * We don't reexecute "The Godfather" I/O here as it's the
         * responsibility of the caller to wait on him.
         */
        if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
                zio_execute(pio);
}
1430 1459  
1431 1460  void
1432 1461  zio_suspend(spa_t *spa, zio_t *zio)
1433 1462  {
1434 1463          if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1435 1464                  fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1436 1465                      "failure and the failure mode property for this pool "
1437 1466                      "is set to panic.", spa_name(spa));
1438 1467  
1439 1468          zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1440 1469  
1441 1470          mutex_enter(&spa->spa_suspend_lock);
1442 1471  
1443 1472          if (spa->spa_suspend_zio_root == NULL)
1444 1473                  spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1445 1474                      ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1446 1475                      ZIO_FLAG_GODFATHER);
1447 1476  
1448 1477          spa->spa_suspended = B_TRUE;
1449 1478  
1450 1479          if (zio != NULL) {
1451 1480                  ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1452 1481                  ASSERT(zio != spa->spa_suspend_zio_root);
1453 1482                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1454 1483                  ASSERT(zio_unique_parent(zio) == NULL);
1455 1484                  ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1456 1485                  zio_add_child(spa->spa_suspend_zio_root, zio);
1457 1486          }
1458 1487  
1459 1488          mutex_exit(&spa->spa_suspend_lock);
1460 1489  }
1461 1490  
1462 1491  int
1463 1492  zio_resume(spa_t *spa)
1464 1493  {
1465 1494          zio_t *pio;
1466 1495  
1467 1496          /*
1468 1497           * Reexecute all previously suspended i/o.
1469 1498           */
1470 1499          mutex_enter(&spa->spa_suspend_lock);
1471 1500          spa->spa_suspended = B_FALSE;
1472 1501          cv_broadcast(&spa->spa_suspend_cv);
1473 1502          pio = spa->spa_suspend_zio_root;
1474 1503          spa->spa_suspend_zio_root = NULL;
1475 1504          mutex_exit(&spa->spa_suspend_lock);
1476 1505  
1477 1506          if (pio == NULL)
1478 1507                  return (0);
1479 1508  
1480 1509          zio_reexecute(pio);
1481 1510          return (zio_wait(pio));
1482 1511  }
1483 1512  
1484 1513  void
1485 1514  zio_resume_wait(spa_t *spa)
1486 1515  {
1487 1516          mutex_enter(&spa->spa_suspend_lock);
1488 1517          while (spa_suspended(spa))
1489 1518                  cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1490 1519          mutex_exit(&spa->spa_suspend_lock);
1491 1520  }
1492 1521  
1493 1522  /*
1494 1523   * ==========================================================================
1495 1524   * Gang blocks.
1496 1525   *
1497 1526   * A gang block is a collection of small blocks that looks to the DMU
1498 1527   * like one large block.  When zio_dva_allocate() cannot find a block
1499 1528   * of the requested size, due to either severe fragmentation or the pool
1500 1529   * being nearly full, it calls zio_write_gang_block() to construct the
1501 1530   * block from smaller fragments.
1502 1531   *
1503 1532   * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1504 1533   * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1505 1534   * an indirect block: it's an array of block pointers.  It consumes
1506 1535   * only one sector and hence is allocatable regardless of fragmentation.
1507 1536   * The gang header's bps point to its gang members, which hold the data.
1508 1537   *
1509 1538   * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1510 1539   * as the verifier to ensure uniqueness of the SHA256 checksum.
1511 1540   * Critically, the gang block bp's blk_cksum is the checksum of the data,
1512 1541   * not the gang header.  This ensures that data block signatures (needed for
1513 1542   * deduplication) are independent of how the block is physically stored.
1514 1543   *
1515 1544   * Gang blocks can be nested: a gang member may itself be a gang block.
1516 1545   * Thus every gang block is a tree in which root and all interior nodes are
1517 1546   * gang headers, and the leaves are normal blocks that contain user data.
1518 1547   * The root of the gang tree is called the gang leader.
1519 1548   *
1520 1549   * To perform any operation (read, rewrite, free, claim) on a gang block,
1521 1550   * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1522 1551   * in the io_gang_tree field of the original logical i/o by recursively
1523 1552   * reading the gang leader and all gang headers below it.  This yields
1524 1553   * an in-core tree containing the contents of every gang header and the
1525 1554   * bps for every constituent of the gang block.
1526 1555   *
1527 1556   * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1528 1557   * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1529 1558   * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1530 1559   * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1531 1560   * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1532 1561   * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1533 1562   * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1534 1563   * of the gang header plus zio_checksum_compute() of the data to update the
1535 1564   * gang header's blk_cksum as described above.
1536 1565   *
1537 1566   * The two-phase assemble/issue model solves the problem of partial failure --
1538 1567   * what if you'd freed part of a gang block but then couldn't read the
1539 1568   * gang header for another part?  Assembling the entire gang tree first
1540 1569   * ensures that all the necessary gang header I/O has succeeded before
1541 1570   * starting the actual work of free, claim, or write.  Once the gang tree
1542 1571   * is assembled, free and claim are in-memory operations that cannot fail.
1543 1572   *
1544 1573   * In the event that a gang write fails, zio_dva_unallocate() walks the
1545 1574   * gang tree to immediately free (i.e. insert back into the space map)
1546 1575   * everything we've allocated.  This ensures that we don't get ENOSPC
1547 1576   * errors during repeated suspend/resume cycles due to a flaky device.
1548 1577   *
1549 1578   * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1550 1579   * the gang tree, we won't modify the block, so we can safely defer the free
1551 1580   * (knowing that the block is still intact).  If we *can* assemble the gang
1552 1581   * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1553 1582   * each constituent bp and we can allocate a new block on the next sync pass.
1554 1583   *
1555 1584   * In all cases, the gang tree allows complete recovery from partial failure.
1556 1585   * ==========================================================================
1557 1586   */
1558 1587  
1559 1588  static zio_t *
1560 1589  zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1561 1590  {
1562 1591          if (gn != NULL)
1563 1592                  return (pio);
1564 1593  
1565 1594          return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1566 1595              NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1567 1596              &pio->io_bookmark));
1568 1597  }
1569 1598  
1570 1599  zio_t *
1571 1600  zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1572 1601  {
1573 1602          zio_t *zio;
1574 1603  
1575 1604          if (gn != NULL) {
1576 1605                  zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1577 1606                      gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1578 1607                      ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1579 1608                  /*
1580 1609                   * As we rewrite each gang header, the pipeline will compute
1581 1610                   * a new gang block header checksum for it; but no one will
1582 1611                   * compute a new data checksum, so we do that here.  The one
1583 1612                   * exception is the gang leader: the pipeline already computed
1584 1613                   * its data checksum because that stage precedes gang assembly.
1585 1614                   * (Presently, nothing actually uses interior data checksums;
1586 1615                   * this is just good hygiene.)
1587 1616                   */
1588 1617                  if (gn != pio->io_gang_leader->io_gang_tree) {
1589 1618                          zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1590 1619                              data, BP_GET_PSIZE(bp));
1591 1620                  }
1592 1621                  /*
1593 1622                   * If we are here to damage data for testing purposes,
1594 1623                   * leave the GBH alone so that we can detect the damage.
1595 1624                   */
1596 1625                  if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1597 1626                          zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1598 1627          } else {
1599 1628                  zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1600 1629                      data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1601 1630                      ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1602 1631          }
1603 1632  
1604 1633          return (zio);
1605 1634  }
1606 1635  
1607 1636  /* ARGSUSED */
1608 1637  zio_t *
1609 1638  zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1610 1639  {
1611 1640          return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1612 1641              ZIO_GANG_CHILD_FLAGS(pio)));
1613 1642  }
1614 1643  
1615 1644  /* ARGSUSED */
1616 1645  zio_t *
1617 1646  zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1618 1647  {
1619 1648          return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1620 1649              NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1621 1650  }
1622 1651  
/*
 * Per-operation callbacks applied to each bp while walking a gang tree.
 * zio_gang_tree_issue() indexes this table with the gang leader's io_type,
 * so the entry order must follow the zio type numbering.  The NULL slots
 * are types for which no gang callback is provided.
 * NOTE(review): the first and last slots presumably correspond to the
 * null and ioctl zio types -- confirm against the zio type enum.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
        NULL,
        zio_read_gang,
        zio_rewrite_gang,
        zio_free_gang,
        zio_claim_gang,
        NULL
};
1631 1660  
1632 1661  static void zio_gang_tree_assemble_done(zio_t *zio);
1633 1662  
1634 1663  static zio_gang_node_t *
1635 1664  zio_gang_node_alloc(zio_gang_node_t **gnpp)
1636 1665  {
1637 1666          zio_gang_node_t *gn;
1638 1667  
1639 1668          ASSERT(*gnpp == NULL);
1640 1669  
1641 1670          gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1642 1671          gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1643 1672          *gnpp = gn;
1644 1673  
1645 1674          return (gn);
1646 1675  }
1647 1676  
1648 1677  static void
1649 1678  zio_gang_node_free(zio_gang_node_t **gnpp)
1650 1679  {
1651 1680          zio_gang_node_t *gn = *gnpp;
1652 1681  
1653 1682          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1654 1683                  ASSERT(gn->gn_child[g] == NULL);
1655 1684  
1656 1685          zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1657 1686          kmem_free(gn, sizeof (*gn));
1658 1687          *gnpp = NULL;
1659 1688  }
1660 1689  
1661 1690  static void
1662 1691  zio_gang_tree_free(zio_gang_node_t **gnpp)
1663 1692  {
1664 1693          zio_gang_node_t *gn = *gnpp;
1665 1694  
1666 1695          if (gn == NULL)
1667 1696                  return;
1668 1697  
1669 1698          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1670 1699                  zio_gang_tree_free(&gn->gn_child[g]);
1671 1700  
1672 1701          zio_gang_node_free(gnpp);
1673 1702  }
1674 1703  
1675 1704  static void
1676 1705  zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1677 1706  {
1678 1707          zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1679 1708  
1680 1709          ASSERT(gio->io_gang_leader == gio);
1681 1710          ASSERT(BP_IS_GANG(bp));
1682 1711  
1683 1712          zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1684 1713              SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1685 1714              gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1686 1715  }
1687 1716  
1688 1717  static void
1689 1718  zio_gang_tree_assemble_done(zio_t *zio)
1690 1719  {
1691 1720          zio_t *gio = zio->io_gang_leader;
1692 1721          zio_gang_node_t *gn = zio->io_private;
1693 1722          blkptr_t *bp = zio->io_bp;
1694 1723  
1695 1724          ASSERT(gio == zio_unique_parent(zio));
1696 1725          ASSERT(zio->io_child_count == 0);
1697 1726  
1698 1727          if (zio->io_error)
1699 1728                  return;
1700 1729  
1701 1730          if (BP_SHOULD_BYTESWAP(bp))
1702 1731                  byteswap_uint64_array(zio->io_data, zio->io_size);
1703 1732  
1704 1733          ASSERT(zio->io_data == gn->gn_gbh);
1705 1734          ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1706 1735          ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1707 1736  
1708 1737          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1709 1738                  blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1710 1739                  if (!BP_IS_GANG(gbp))
1711 1740                          continue;
1712 1741                  zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1713 1742          }
1714 1743  }
1715 1744  
1716 1745  static void
1717 1746  zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1718 1747  {
1719 1748          zio_t *gio = pio->io_gang_leader;
1720 1749          zio_t *zio;
1721 1750  
1722 1751          ASSERT(BP_IS_GANG(bp) == !!gn);
1723 1752          ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1724 1753          ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1725 1754  
1726 1755          /*
1727 1756           * If you're a gang header, your data is in gn->gn_gbh.
1728 1757           * If you're a gang member, your data is in 'data' and gn == NULL.
1729 1758           */
1730 1759          zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1731 1760  
1732 1761          if (gn != NULL) {
1733 1762                  ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1734 1763  
1735 1764                  for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1736 1765                          blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1737 1766                          if (BP_IS_HOLE(gbp))
1738 1767                                  continue;
1739 1768                          zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1740 1769                          data = (char *)data + BP_GET_PSIZE(gbp);
1741 1770                  }
1742 1771          }
1743 1772  
1744 1773          if (gn == gio->io_gang_tree)
1745 1774                  ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1746 1775  
1747 1776          if (zio != pio)
1748 1777                  zio_nowait(zio);
1749 1778  }
1750 1779  
1751 1780  static int
1752 1781  zio_gang_assemble(zio_t *zio)
1753 1782  {
1754 1783          blkptr_t *bp = zio->io_bp;
1755 1784  
1756 1785          ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1757 1786          ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1758 1787  
1759 1788          zio->io_gang_leader = zio;
1760 1789  
1761 1790          zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1762 1791  
1763 1792          return (ZIO_PIPELINE_CONTINUE);
1764 1793  }
1765 1794  
1766 1795  static int
1767 1796  zio_gang_issue(zio_t *zio)
1768 1797  {
1769 1798          blkptr_t *bp = zio->io_bp;
1770 1799  
1771 1800          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1772 1801                  return (ZIO_PIPELINE_STOP);
1773 1802  
1774 1803          ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1775 1804          ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1776 1805  
1777 1806          if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1778 1807                  zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1779 1808          else
1780 1809                  zio_gang_tree_free(&zio->io_gang_tree);
1781 1810  
1782 1811          zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1783 1812  
1784 1813          return (ZIO_PIPELINE_CONTINUE);
1785 1814  }
1786 1815  
1787 1816  static void
1788 1817  zio_write_gang_member_ready(zio_t *zio)
1789 1818  {
1790 1819          zio_t *pio = zio_unique_parent(zio);
1791 1820          zio_t *gio = zio->io_gang_leader;
1792 1821          dva_t *cdva = zio->io_bp->blk_dva;
1793 1822          dva_t *pdva = pio->io_bp->blk_dva;
1794 1823          uint64_t asize;
1795 1824  
1796 1825          if (BP_IS_HOLE(zio->io_bp))
1797 1826                  return;
1798 1827  
1799 1828          ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1800 1829  
1801 1830          ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1802 1831          ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1803 1832          ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1804 1833          ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1805 1834          ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1806 1835  
1807 1836          mutex_enter(&pio->io_lock);
1808 1837          for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1809 1838                  ASSERT(DVA_GET_GANG(&pdva[d]));
1810 1839                  asize = DVA_GET_ASIZE(&pdva[d]);
1811 1840                  asize += DVA_GET_ASIZE(&cdva[d]);
1812 1841                  DVA_SET_ASIZE(&pdva[d], asize);
1813 1842          }
1814 1843          mutex_exit(&pio->io_lock);
1815 1844  }
1816 1845  
1817 1846  static int
1818 1847  zio_write_gang_block(zio_t *pio)
1819 1848  {
1820 1849          spa_t *spa = pio->io_spa;
1821 1850          blkptr_t *bp = pio->io_bp;
1822 1851          zio_t *gio = pio->io_gang_leader;
1823 1852          zio_t *zio;
1824 1853          zio_gang_node_t *gn, **gnpp;
1825 1854          zio_gbh_phys_t *gbh;
1826 1855          uint64_t txg = pio->io_txg;
1827 1856          uint64_t resid = pio->io_size;
1828 1857          uint64_t lsize;
1829 1858          int copies = gio->io_prop.zp_copies;
1830 1859          int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1831 1860          zio_prop_t zp;
1832 1861          int error;
1833 1862  
1834 1863          error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1835 1864              bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1836 1865              METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1837 1866          if (error) {
1838 1867                  pio->io_error = error;
1839 1868                  return (ZIO_PIPELINE_CONTINUE);
1840 1869          }
1841 1870  
1842 1871          if (pio == gio) {
1843 1872                  gnpp = &gio->io_gang_tree;
1844 1873          } else {
1845 1874                  gnpp = pio->io_private;
1846 1875                  ASSERT(pio->io_ready == zio_write_gang_member_ready);
1847 1876          }
1848 1877  
1849 1878          gn = zio_gang_node_alloc(gnpp);
1850 1879          gbh = gn->gn_gbh;
1851 1880          bzero(gbh, SPA_GANGBLOCKSIZE);
1852 1881  
1853 1882          /*
1854 1883           * Create the gang header.
1855 1884           */
1856 1885          zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1857 1886              pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1858 1887  
1859 1888          /*
1860 1889           * Create and nowait the gang children.
1861 1890           */
1862 1891          for (int g = 0; resid != 0; resid -= lsize, g++) {
1863 1892                  lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1864 1893                      SPA_MINBLOCKSIZE);
1865 1894                  ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1866 1895  
1867 1896                  zp.zp_checksum = gio->io_prop.zp_checksum;
1868 1897                  zp.zp_compress = ZIO_COMPRESS_OFF;
1869 1898                  zp.zp_type = DMU_OT_NONE;
1870 1899                  zp.zp_level = 0;
1871 1900                  zp.zp_copies = gio->io_prop.zp_copies;
1872 1901                  zp.zp_dedup = B_FALSE;
1873 1902                  zp.zp_dedup_verify = B_FALSE;
1874 1903                  zp.zp_nopwrite = B_FALSE;
1875 1904  
1876 1905                  zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1877 1906                      (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1878 1907                      zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
1879 1908                      pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1880 1909                      &pio->io_bookmark));
1881 1910          }
1882 1911  
1883 1912          /*
1884 1913           * Set pio's pipeline to just wait for zio to finish.
1885 1914           */
1886 1915          pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1887 1916  
1888 1917          zio_nowait(zio);
1889 1918  
1890 1919          return (ZIO_PIPELINE_CONTINUE);
1891 1920  }
1892 1921  
1893 1922  /*
1894 1923   * The zio_nop_write stage in the pipeline determines if allocating
1895 1924   * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1896 1925   * such as SHA256, we can compare the checksums of the new data and the old
1897 1926   * to determine if allocating a new block is required.  The nopwrite
1898 1927   * feature can handle writes in either syncing or open context (i.e. zil
1899 1928   * writes) and as a result is mutually exclusive with dedup.
1900 1929   */
1901 1930  static int
1902 1931  zio_nop_write(zio_t *zio)
1903 1932  {
1904 1933          blkptr_t *bp = zio->io_bp;
1905 1934          blkptr_t *bp_orig = &zio->io_bp_orig;
1906 1935          zio_prop_t *zp = &zio->io_prop;
1907 1936  
1908 1937          ASSERT(BP_GET_LEVEL(bp) == 0);
1909 1938          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1910 1939          ASSERT(zp->zp_nopwrite);
1911 1940          ASSERT(!zp->zp_dedup);
1912 1941          ASSERT(zio->io_bp_override == NULL);
1913 1942          ASSERT(IO_IS_ALLOCATING(zio));
1914 1943  
1915 1944          /*
1916 1945           * Check to see if the original bp and the new bp have matching
1917 1946           * characteristics (i.e. same checksum, compression algorithms, etc).
1918 1947           * If they don't then just continue with the pipeline which will
1919 1948           * allocate a new bp.
1920 1949           */
1921 1950          if (BP_IS_HOLE(bp_orig) ||
1922 1951              !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1923 1952              BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1924 1953              BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1925 1954              BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1926 1955              zp->zp_copies != BP_GET_NDVAS(bp_orig))
1927 1956                  return (ZIO_PIPELINE_CONTINUE);
1928 1957  
1929 1958          /*
1930 1959           * If the checksums match then reset the pipeline so that we
1931 1960           * avoid allocating a new bp and issuing any I/O.
1932 1961           */
1933 1962          if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
1934 1963                  ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
1935 1964                  ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
1936 1965                  ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
1937 1966                  ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
1938 1967                  ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
1939 1968                      sizeof (uint64_t)) == 0);
1940 1969  
1941 1970                  *bp = *bp_orig;
1942 1971                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1943 1972                  zio->io_flags |= ZIO_FLAG_NOPWRITE;
1944 1973          }
1945 1974  
1946 1975          return (ZIO_PIPELINE_CONTINUE);
1947 1976  }
1948 1977  
1949 1978  /*
1950 1979   * ==========================================================================
1951 1980   * Dedup
1952 1981   * ==========================================================================
1953 1982   */
1954 1983  static void
1955 1984  zio_ddt_child_read_done(zio_t *zio)
1956 1985  {
1957 1986          blkptr_t *bp = zio->io_bp;
1958 1987          ddt_entry_t *dde = zio->io_private;
1959 1988          ddt_phys_t *ddp;
1960 1989          zio_t *pio = zio_unique_parent(zio);
1961 1990  
1962 1991          mutex_enter(&pio->io_lock);
1963 1992          ddp = ddt_phys_select(dde, bp);
1964 1993          if (zio->io_error == 0)
1965 1994                  ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
1966 1995          if (zio->io_error == 0 && dde->dde_repair_data == NULL)
1967 1996                  dde->dde_repair_data = zio->io_data;
1968 1997          else
1969 1998                  zio_buf_free(zio->io_data, zio->io_size);
1970 1999          mutex_exit(&pio->io_lock);
1971 2000  }
1972 2001  
1973 2002  static int
1974 2003  zio_ddt_read_start(zio_t *zio)
1975 2004  {
1976 2005          blkptr_t *bp = zio->io_bp;
1977 2006  
1978 2007          ASSERT(BP_GET_DEDUP(bp));
1979 2008          ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1980 2009          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1981 2010  
1982 2011          if (zio->io_child_error[ZIO_CHILD_DDT]) {
1983 2012                  ddt_t *ddt = ddt_select(zio->io_spa, bp);
1984 2013                  ddt_entry_t *dde = ddt_repair_start(ddt, bp);
1985 2014                  ddt_phys_t *ddp = dde->dde_phys;
1986 2015                  ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
1987 2016                  blkptr_t blk;
1988 2017  
1989 2018                  ASSERT(zio->io_vsd == NULL);
1990 2019                  zio->io_vsd = dde;
1991 2020  
1992 2021                  if (ddp_self == NULL)
1993 2022                          return (ZIO_PIPELINE_CONTINUE);
1994 2023  
1995 2024                  for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1996 2025                          if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
1997 2026                                  continue;
1998 2027                          ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
1999 2028                              &blk);
2000 2029                          zio_nowait(zio_read(zio, zio->io_spa, &blk,
2001 2030                              zio_buf_alloc(zio->io_size), zio->io_size,
2002 2031                              zio_ddt_child_read_done, dde, zio->io_priority,
2003 2032                              ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2004 2033                              &zio->io_bookmark));
2005 2034                  }
2006 2035                  return (ZIO_PIPELINE_CONTINUE);
2007 2036          }
2008 2037  
2009 2038          zio_nowait(zio_read(zio, zio->io_spa, bp,
2010 2039              zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2011 2040              ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2012 2041  
2013 2042          return (ZIO_PIPELINE_CONTINUE);
2014 2043  }
2015 2044  
2016 2045  static int
2017 2046  zio_ddt_read_done(zio_t *zio)
2018 2047  {
2019 2048          blkptr_t *bp = zio->io_bp;
2020 2049  
2021 2050          if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2022 2051                  return (ZIO_PIPELINE_STOP);
2023 2052  
2024 2053          ASSERT(BP_GET_DEDUP(bp));
2025 2054          ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2026 2055          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2027 2056  
2028 2057          if (zio->io_child_error[ZIO_CHILD_DDT]) {
2029 2058                  ddt_t *ddt = ddt_select(zio->io_spa, bp);
2030 2059                  ddt_entry_t *dde = zio->io_vsd;
2031 2060                  if (ddt == NULL) {
2032 2061                          ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2033 2062                          return (ZIO_PIPELINE_CONTINUE);
2034 2063                  }
2035 2064                  if (dde == NULL) {
2036 2065                          zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2037 2066                          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2038 2067                          return (ZIO_PIPELINE_STOP);
2039 2068                  }
2040 2069                  if (dde->dde_repair_data != NULL) {
2041 2070                          bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2042 2071                          zio->io_child_error[ZIO_CHILD_DDT] = 0;
2043 2072                  }
2044 2073                  ddt_repair_done(ddt, dde);
2045 2074                  zio->io_vsd = NULL;
2046 2075          }
2047 2076  
2048 2077          ASSERT(zio->io_vsd == NULL);
2049 2078  
2050 2079          return (ZIO_PIPELINE_CONTINUE);
2051 2080  }
2052 2081  
2053 2082  static boolean_t
2054 2083  zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2055 2084  {
2056 2085          spa_t *spa = zio->io_spa;
2057 2086  
2058 2087          /*
2059 2088           * Note: we compare the original data, not the transformed data,
2060 2089           * because when zio->io_bp is an override bp, we will not have
2061 2090           * pushed the I/O transforms.  That's an important optimization
2062 2091           * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2063 2092           */
2064 2093          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2065 2094                  zio_t *lio = dde->dde_lead_zio[p];
2066 2095  
2067 2096                  if (lio != NULL) {
2068 2097                          return (lio->io_orig_size != zio->io_orig_size ||
2069 2098                              bcmp(zio->io_orig_data, lio->io_orig_data,
2070 2099                              zio->io_orig_size) != 0);
2071 2100                  }
2072 2101          }
2073 2102  
2074 2103          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2075 2104                  ddt_phys_t *ddp = &dde->dde_phys[p];
2076 2105  
2077 2106                  if (ddp->ddp_phys_birth != 0) {
2078 2107                          arc_buf_t *abuf = NULL;
2079 2108                          uint32_t aflags = ARC_WAIT;
2080 2109                          blkptr_t blk = *zio->io_bp;
2081 2110                          int error;
2082 2111  
2083 2112                          ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2084 2113  
2085 2114                          ddt_exit(ddt);
2086 2115  
2087 2116                          error = arc_read(NULL, spa, &blk,
2088 2117                              arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2089 2118                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2090 2119                              &aflags, &zio->io_bookmark);
2091 2120  
2092 2121                          if (error == 0) {
2093 2122                                  if (arc_buf_size(abuf) != zio->io_orig_size ||
2094 2123                                      bcmp(abuf->b_data, zio->io_orig_data,
2095 2124                                      zio->io_orig_size) != 0)
2096 2125                                          error = SET_ERROR(EEXIST);
2097 2126                                  VERIFY(arc_buf_remove_ref(abuf, &abuf));
2098 2127                          }
2099 2128  
2100 2129                          ddt_enter(ddt);
2101 2130                          return (error != 0);
2102 2131                  }
2103 2132          }
2104 2133  
2105 2134          return (B_FALSE);
2106 2135  }
2107 2136  
2108 2137  static void
2109 2138  zio_ddt_child_write_ready(zio_t *zio)
2110 2139  {
2111 2140          int p = zio->io_prop.zp_copies;
2112 2141          ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2113 2142          ddt_entry_t *dde = zio->io_private;
2114 2143          ddt_phys_t *ddp = &dde->dde_phys[p];
2115 2144          zio_t *pio;
2116 2145  
2117 2146          if (zio->io_error)
2118 2147                  return;
2119 2148  
2120 2149          ddt_enter(ddt);
2121 2150  
2122 2151          ASSERT(dde->dde_lead_zio[p] == zio);
2123 2152  
2124 2153          ddt_phys_fill(ddp, zio->io_bp);
2125 2154  
2126 2155          while ((pio = zio_walk_parents(zio)) != NULL)
2127 2156                  ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2128 2157  
2129 2158          ddt_exit(ddt);
2130 2159  }
2131 2160  
2132 2161  static void
2133 2162  zio_ddt_child_write_done(zio_t *zio)
2134 2163  {
2135 2164          int p = zio->io_prop.zp_copies;
2136 2165          ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2137 2166          ddt_entry_t *dde = zio->io_private;
2138 2167          ddt_phys_t *ddp = &dde->dde_phys[p];
2139 2168  
2140 2169          ddt_enter(ddt);
2141 2170  
2142 2171          ASSERT(ddp->ddp_refcnt == 0);
2143 2172          ASSERT(dde->dde_lead_zio[p] == zio);
2144 2173          dde->dde_lead_zio[p] = NULL;
2145 2174  
2146 2175          if (zio->io_error == 0) {
2147 2176                  while (zio_walk_parents(zio) != NULL)
2148 2177                          ddt_phys_addref(ddp);
2149 2178          } else {
2150 2179                  ddt_phys_clear(ddp);
2151 2180          }
2152 2181  
2153 2182          ddt_exit(ddt);
2154 2183  }
2155 2184  
2156 2185  static void
2157 2186  zio_ddt_ditto_write_done(zio_t *zio)
2158 2187  {
2159 2188          int p = DDT_PHYS_DITTO;
2160 2189          zio_prop_t *zp = &zio->io_prop;
2161 2190          blkptr_t *bp = zio->io_bp;
2162 2191          ddt_t *ddt = ddt_select(zio->io_spa, bp);
2163 2192          ddt_entry_t *dde = zio->io_private;
2164 2193          ddt_phys_t *ddp = &dde->dde_phys[p];
2165 2194          ddt_key_t *ddk = &dde->dde_key;
2166 2195  
2167 2196          ddt_enter(ddt);
2168 2197  
2169 2198          ASSERT(ddp->ddp_refcnt == 0);
2170 2199          ASSERT(dde->dde_lead_zio[p] == zio);
2171 2200          dde->dde_lead_zio[p] = NULL;
2172 2201  
2173 2202          if (zio->io_error == 0) {
2174 2203                  ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2175 2204                  ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2176 2205                  ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2177 2206                  if (ddp->ddp_phys_birth != 0)
2178 2207                          ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2179 2208                  ddt_phys_fill(ddp, bp);
2180 2209          }
2181 2210  
2182 2211          ddt_exit(ddt);
2183 2212  }
2184 2213  
2185 2214  static int
2186 2215  zio_ddt_write(zio_t *zio)
2187 2216  {
2188 2217          spa_t *spa = zio->io_spa;
2189 2218          blkptr_t *bp = zio->io_bp;
2190 2219          uint64_t txg = zio->io_txg;
2191 2220          zio_prop_t *zp = &zio->io_prop;
2192 2221          int p = zp->zp_copies;
2193 2222          int ditto_copies;
2194 2223          zio_t *cio = NULL;
2195 2224          zio_t *dio = NULL;
2196 2225          ddt_t *ddt = ddt_select(spa, bp);
2197 2226          ddt_entry_t *dde;
2198 2227          ddt_phys_t *ddp;
2199 2228  
2200 2229          ASSERT(BP_GET_DEDUP(bp));
2201 2230          ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2202 2231          ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2203 2232  
2204 2233          ddt_enter(ddt);
2205 2234          dde = ddt_lookup(ddt, bp, B_TRUE);
2206 2235          ddp = &dde->dde_phys[p];
2207 2236  
2208 2237          if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2209 2238                  /*
2210 2239                   * If we're using a weak checksum, upgrade to a strong checksum
2211 2240                   * and try again.  If we're already using a strong checksum,
2212 2241                   * we can't resolve it, so just convert to an ordinary write.
2213 2242                   * (And automatically e-mail a paper to Nature?)
2214 2243                   */
2215 2244                  if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2216 2245                          zp->zp_checksum = spa_dedup_checksum(spa);
2217 2246                          zio_pop_transforms(zio);
2218 2247                          zio->io_stage = ZIO_STAGE_OPEN;
2219 2248                          BP_ZERO(bp);
2220 2249                  } else {
2221 2250                          zp->zp_dedup = B_FALSE;
2222 2251                  }
2223 2252                  zio->io_pipeline = ZIO_WRITE_PIPELINE;
2224 2253                  ddt_exit(ddt);
2225 2254                  return (ZIO_PIPELINE_CONTINUE);
2226 2255          }
2227 2256  
2228 2257          ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2229 2258          ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2230 2259  
2231 2260          if (ditto_copies > ddt_ditto_copies_present(dde) &&
2232 2261              dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2233 2262                  zio_prop_t czp = *zp;
2234 2263  
2235 2264                  czp.zp_copies = ditto_copies;
2236 2265  
2237 2266                  /*
2238 2267                   * If we arrived here with an override bp, we won't have run
2239 2268                   * the transform stack, so we won't have the data we need to
2240 2269                   * generate a child i/o.  So, toss the override bp and restart.
2241 2270                   * This is safe, because using the override bp is just an
2242 2271                   * optimization; and it's rare, so the cost doesn't matter.
2243 2272                   */
2244 2273                  if (zio->io_bp_override) {
2245 2274                          zio_pop_transforms(zio);
2246 2275                          zio->io_stage = ZIO_STAGE_OPEN;
2247 2276                          zio->io_pipeline = ZIO_WRITE_PIPELINE;
2248 2277                          zio->io_bp_override = NULL;
2249 2278                          BP_ZERO(bp);
2250 2279                          ddt_exit(ddt);
2251 2280                          return (ZIO_PIPELINE_CONTINUE);
2252 2281                  }
2253 2282  
2254 2283                  dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2255 2284                      zio->io_orig_size, &czp, NULL, NULL,
2256 2285                      zio_ddt_ditto_write_done, dde, zio->io_priority,
2257 2286                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2258 2287  
2259 2288                  zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2260 2289                  dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2261 2290          }
2262 2291  
2263 2292          if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2264 2293                  if (ddp->ddp_phys_birth != 0)
2265 2294                          ddt_bp_fill(ddp, bp, txg);
2266 2295                  if (dde->dde_lead_zio[p] != NULL)
2267 2296                          zio_add_child(zio, dde->dde_lead_zio[p]);
2268 2297                  else
2269 2298                          ddt_phys_addref(ddp);
2270 2299          } else if (zio->io_bp_override) {
2271 2300                  ASSERT(bp->blk_birth == txg);
2272 2301                  ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2273 2302                  ddt_phys_fill(ddp, bp);
2274 2303                  ddt_phys_addref(ddp);
2275 2304          } else {
2276 2305                  cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2277 2306                      zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2278 2307                      zio_ddt_child_write_done, dde, zio->io_priority,
2279 2308                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2280 2309  
2281 2310                  zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2282 2311                  dde->dde_lead_zio[p] = cio;
2283 2312          }
2284 2313  
2285 2314          ddt_exit(ddt);
2286 2315  
2287 2316          if (cio)
2288 2317                  zio_nowait(cio);
2289 2318          if (dio)
2290 2319                  zio_nowait(dio);
2291 2320  
2292 2321          return (ZIO_PIPELINE_CONTINUE);
2293 2322  }
2294 2323  
2295 2324  ddt_entry_t *freedde; /* for debugging */
2296 2325  
2297 2326  static int
2298 2327  zio_ddt_free(zio_t *zio)
2299 2328  {
2300 2329          spa_t *spa = zio->io_spa;
2301 2330          blkptr_t *bp = zio->io_bp;
2302 2331          ddt_t *ddt = ddt_select(spa, bp);
2303 2332          ddt_entry_t *dde;
2304 2333          ddt_phys_t *ddp;
2305 2334  
2306 2335          ASSERT(BP_GET_DEDUP(bp));
2307 2336          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2308 2337  
2309 2338          ddt_enter(ddt);
2310 2339          freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2311 2340          ddp = ddt_phys_select(dde, bp);
2312 2341          ddt_phys_decref(ddp);
2313 2342          ddt_exit(ddt);
2314 2343  
2315 2344          return (ZIO_PIPELINE_CONTINUE);
2316 2345  }
2317 2346  
2318 2347  /*
2319 2348   * ==========================================================================
2320 2349   * Allocate and free blocks
2321 2350   * ==========================================================================
2322 2351   */
2323 2352  static int
2324 2353  zio_dva_allocate(zio_t *zio)
2325 2354  {
2326 2355          spa_t *spa = zio->io_spa;
2327 2356          metaslab_class_t *mc = spa_normal_class(spa);
2328 2357          blkptr_t *bp = zio->io_bp;
2329 2358          int error;
2330 2359          int flags = 0;
2331 2360  
2332 2361          if (zio->io_gang_leader == NULL) {
2333 2362                  ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2334 2363                  zio->io_gang_leader = zio;
2335 2364          }
2336 2365  
2337 2366          ASSERT(BP_IS_HOLE(bp));
2338 2367          ASSERT0(BP_GET_NDVAS(bp));
2339 2368          ASSERT3U(zio->io_prop.zp_copies, >, 0);
2340 2369          ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2341 2370          ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2342 2371  
2343 2372          /*
2344 2373           * The dump device does not support gang blocks so allocation on
2345 2374           * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2346 2375           * the "fast" gang feature.
2347 2376           */
2348 2377          flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2349 2378          flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2350 2379              METASLAB_GANG_CHILD : 0;
2351 2380          error = metaslab_alloc(spa, mc, zio->io_size, bp,
2352 2381              zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2353 2382  
2354 2383          if (error) {
2355 2384                  spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2356 2385                      "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2357 2386                      error);
2358 2387                  if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2359 2388                          return (zio_write_gang_block(zio));
2360 2389                  zio->io_error = error;
2361 2390          }
2362 2391  
2363 2392          return (ZIO_PIPELINE_CONTINUE);
2364 2393  }
2365 2394  
2366 2395  static int
2367 2396  zio_dva_free(zio_t *zio)
2368 2397  {
2369 2398          metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2370 2399  
2371 2400          return (ZIO_PIPELINE_CONTINUE);
2372 2401  }
2373 2402  
2374 2403  static int
2375 2404  zio_dva_claim(zio_t *zio)
2376 2405  {
2377 2406          int error;
2378 2407  
2379 2408          error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2380 2409          if (error)
2381 2410                  zio->io_error = error;
2382 2411  
2383 2412          return (ZIO_PIPELINE_CONTINUE);
2384 2413  }
2385 2414  
2386 2415  /*
2387 2416   * Undo an allocation.  This is used by zio_done() when an I/O fails
2388 2417   * and we want to give back the block we just allocated.
2389 2418   * This handles both normal blocks and gang blocks.
2390 2419   */
2391 2420  static void
2392 2421  zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2393 2422  {
2394 2423          ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2395 2424          ASSERT(zio->io_bp_override == NULL);
2396 2425  
2397 2426          if (!BP_IS_HOLE(bp))
2398 2427                  metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2399 2428  
2400 2429          if (gn != NULL) {
2401 2430                  for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2402 2431                          zio_dva_unallocate(zio, gn->gn_child[g],
2403 2432                              &gn->gn_gbh->zg_blkptr[g]);
2404 2433                  }
2405 2434          }
2406 2435  }
2407 2436  
2408 2437  /*
2409 2438   * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2410 2439   */
2411 2440  int
2412 2441  zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2413 2442      uint64_t size, boolean_t use_slog)
2414 2443  {
2415 2444          int error = 1;
2416 2445  
2417 2446          ASSERT(txg > spa_syncing_txg(spa));
2418 2447  
2419 2448          /*
2420 2449           * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2421 2450           * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2422 2451           * when allocating them.
2423 2452           */
2424 2453          if (use_slog) {
2425 2454                  error = metaslab_alloc(spa, spa_log_class(spa), size,
2426 2455                      new_bp, 1, txg, old_bp,
2427 2456                      METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2428 2457          }
2429 2458  
2430 2459          if (error) {
2431 2460                  error = metaslab_alloc(spa, spa_normal_class(spa), size,
2432 2461                      new_bp, 1, txg, old_bp,
2433 2462                      METASLAB_HINTBP_AVOID);
2434 2463          }
2435 2464  
2436 2465          if (error == 0) {
2437 2466                  BP_SET_LSIZE(new_bp, size);
2438 2467                  BP_SET_PSIZE(new_bp, size);
2439 2468                  BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2440 2469                  BP_SET_CHECKSUM(new_bp,
2441 2470                      spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2442 2471                      ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2443 2472                  BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2444 2473                  BP_SET_LEVEL(new_bp, 0);
2445 2474                  BP_SET_DEDUP(new_bp, 0);
2446 2475                  BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2447 2476          }
2448 2477  
2449 2478          return (error);
2450 2479  }
2451 2480  
2452 2481  /*
2453 2482   * Free an intent log block.
2454 2483   */
2455 2484  void
2456 2485  zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2457 2486  {
2458 2487          ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2459 2488          ASSERT(!BP_IS_GANG(bp));
2460 2489  
2461 2490          zio_free(spa, txg, bp);
2462 2491  }
2463 2492  
2464 2493  /*
2465 2494   * ==========================================================================
2466 2495   * Read and write to physical devices
2467 2496   * ==========================================================================
2468 2497   */
2469 2498  
2470 2499  
2471 2500  /*
2472 2501   * Issue an I/O to the underlying vdev. Typically the issue pipeline
2473 2502   * stops after this stage and will resume upon I/O completion.
2474 2503   * However, there are instances where the vdev layer may need to
2475 2504   * continue the pipeline when an I/O was not issued. Since the I/O
2476 2505   * that was sent to the vdev layer might be different than the one
2477 2506   * currently active in the pipeline (see vdev_queue_io()), we explicitly
2478 2507   * force the underlying vdev layers to call either zio_execute() or
2479 2508   * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2480 2509   */
2481 2510  static int
2482 2511  zio_vdev_io_start(zio_t *zio)
2483 2512  {
2484 2513          vdev_t *vd = zio->io_vd;
2485 2514          uint64_t align;
2486 2515          spa_t *spa = zio->io_spa;
2487 2516  
2488 2517          ASSERT(zio->io_error == 0);
2489 2518          ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2490 2519  
2491 2520          if (vd == NULL) {
2492 2521                  if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2493 2522                          spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2494 2523  
2495 2524                  /*
2496 2525                   * The mirror_ops handle multiple DVAs in a single BP.
2497 2526                   */
2498 2527                  vdev_mirror_ops.vdev_op_io_start(zio);
2499 2528                  return (ZIO_PIPELINE_STOP);
2500 2529          }
2501 2530  
2502 2531          /*
2503 2532           * We keep track of time-sensitive I/Os so that the scan thread
2504 2533           * can quickly react to certain workloads.  In particular, we care
2505 2534           * about non-scrubbing, top-level reads and writes with the following
2506 2535           * characteristics:
2507 2536           *      - synchronous writes of user data to non-slog devices
2508 2537           *      - any reads of user data
2509 2538           * When these conditions are met, adjust the timestamp of spa_last_io
2510 2539           * which allows the scan thread to adjust its workload accordingly.
2511 2540           */
2512 2541          if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2513 2542              vd == vd->vdev_top && !vd->vdev_islog &&
2514 2543              zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2515 2544              zio->io_txg != spa_syncing_txg(spa)) {
2516 2545                  uint64_t old = spa->spa_last_io;
2517 2546                  uint64_t new = ddi_get_lbolt64();
2518 2547                  if (old != new)
2519 2548                          (void) atomic_cas_64(&spa->spa_last_io, old, new);
2520 2549          }
2521 2550  
2522 2551          align = 1ULL << vd->vdev_top->vdev_ashift;
2523 2552  
2524 2553          if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2525 2554              P2PHASE(zio->io_size, align) != 0) {
2526 2555                  /* Transform logical writes to be a full physical block size. */
2527 2556                  uint64_t asize = P2ROUNDUP(zio->io_size, align);
2528 2557                  char *abuf = zio_buf_alloc(asize);
2529 2558                  ASSERT(vd == vd->vdev_top);
2530 2559                  if (zio->io_type == ZIO_TYPE_WRITE) {
2531 2560                          bcopy(zio->io_data, abuf, zio->io_size);
2532 2561                          bzero(abuf + zio->io_size, asize - zio->io_size);
2533 2562                  }
2534 2563                  zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2535 2564          }
2536 2565  
2537 2566          /*
2538 2567           * If this is not a physical io, make sure that it is properly aligned
2539 2568           * before proceeding.
2540 2569           */
2541 2570          if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2542 2571                  ASSERT0(P2PHASE(zio->io_offset, align));
2543 2572                  ASSERT0(P2PHASE(zio->io_size, align));
2544 2573          } else {
2545 2574                  /*
2546 2575                   * For physical writes, we allow 512b aligned writes and assume
2547 2576                   * the device will perform a read-modify-write as necessary.
2548 2577                   */
2549 2578                  ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2550 2579                  ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2551 2580          }
2552 2581  
2553 2582          VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2554 2583  
2555 2584          /*
2556 2585           * If this is a repair I/O, and there's no self-healing involved --
2557 2586           * that is, we're just resilvering what we expect to resilver --
2558 2587           * then don't do the I/O unless zio's txg is actually in vd's DTL.
2559 2588           * This prevents spurious resilvering with nested replication.
2560 2589           * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2561 2590           * A is out of date, we'll read from C+D, then use the data to
2562 2591           * resilver A+B -- but we don't actually want to resilver B, just A.
2563 2592           * The top-level mirror has no way to know this, so instead we just
2564 2593           * discard unnecessary repairs as we work our way down the vdev tree.
2565 2594           * The same logic applies to any form of nested replication:
2566 2595           * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2567 2596           */
2568 2597          if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2569 2598              !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2570 2599              zio->io_txg != 0 && /* not a delegated i/o */
2571 2600              !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2572 2601                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2573 2602                  zio_vdev_io_bypass(zio);
2574 2603                  return (ZIO_PIPELINE_CONTINUE);
2575 2604          }
2576 2605  
2577 2606          if (vd->vdev_ops->vdev_op_leaf &&
2578 2607              (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2579 2608  
2580 2609                  if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2581 2610                          return (ZIO_PIPELINE_CONTINUE);
2582 2611  
2583 2612                  if ((zio = vdev_queue_io(zio)) == NULL)
2584 2613                          return (ZIO_PIPELINE_STOP);
2585 2614  
2586 2615                  if (!vdev_accessible(vd, zio)) {
2587 2616                          zio->io_error = SET_ERROR(ENXIO);
2588 2617                          zio_interrupt(zio);
2589 2618                          return (ZIO_PIPELINE_STOP);
2590 2619                  }
2591 2620          }
2592 2621  
2593 2622          vd->vdev_ops->vdev_op_io_start(zio);
2594 2623          return (ZIO_PIPELINE_STOP);
2595 2624  }
2596 2625  
2597 2626  static int
2598 2627  zio_vdev_io_done(zio_t *zio)
2599 2628  {
2600 2629          vdev_t *vd = zio->io_vd;
2601 2630          vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2602 2631          boolean_t unexpected_error = B_FALSE;
2603 2632  
2604 2633          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2605 2634                  return (ZIO_PIPELINE_STOP);
2606 2635  
2607 2636          ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2608 2637  
2609 2638          if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2610 2639  
2611 2640                  vdev_queue_io_done(zio);
2612 2641  
2613 2642                  if (zio->io_type == ZIO_TYPE_WRITE)
2614 2643                          vdev_cache_write(zio);
2615 2644  
2616 2645                  if (zio_injection_enabled && zio->io_error == 0)
2617 2646                          zio->io_error = zio_handle_device_injection(vd,
2618 2647                              zio, EIO);
2619 2648  
2620 2649                  if (zio_injection_enabled && zio->io_error == 0)
2621 2650                          zio->io_error = zio_handle_label_injection(zio, EIO);
2622 2651  
2623 2652                  if (zio->io_error) {
2624 2653                          if (!vdev_accessible(vd, zio)) {
2625 2654                                  zio->io_error = SET_ERROR(ENXIO);
2626 2655                          } else {
2627 2656                                  unexpected_error = B_TRUE;
2628 2657                          }
2629 2658                  }
2630 2659          }
2631 2660  
2632 2661          ops->vdev_op_io_done(zio);
2633 2662  
2634 2663          if (unexpected_error)
2635 2664                  VERIFY(vdev_probe(vd, zio) == NULL);
2636 2665  
2637 2666          return (ZIO_PIPELINE_CONTINUE);
2638 2667  }
2639 2668  
2640 2669  /*
2641 2670   * For non-raidz ZIOs, we can just copy aside the bad data read from the
2642 2671   * disk, and use that to finish the checksum ereport later.
2643 2672   */
2644 2673  static void
2645 2674  zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2646 2675      const void *good_buf)
2647 2676  {
2648 2677          /* no processing needed */
2649 2678          zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2650 2679  }
2651 2680  
2652 2681  /*ARGSUSED*/
2653 2682  void
2654 2683  zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2655 2684  {
2656 2685          void *buf = zio_buf_alloc(zio->io_size);
2657 2686  
2658 2687          bcopy(zio->io_data, buf, zio->io_size);
2659 2688  
2660 2689          zcr->zcr_cbinfo = zio->io_size;
2661 2690          zcr->zcr_cbdata = buf;
2662 2691          zcr->zcr_finish = zio_vsd_default_cksum_finish;
2663 2692          zcr->zcr_free = zio_buf_free;
2664 2693  }
2665 2694  
2666 2695  static int
2667 2696  zio_vdev_io_assess(zio_t *zio)
2668 2697  {
2669 2698          vdev_t *vd = zio->io_vd;
2670 2699  
2671 2700          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2672 2701                  return (ZIO_PIPELINE_STOP);
2673 2702  
2674 2703          if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2675 2704                  spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2676 2705  
2677 2706          if (zio->io_vsd != NULL) {
2678 2707                  zio->io_vsd_ops->vsd_free(zio);
2679 2708                  zio->io_vsd = NULL;
2680 2709          }
2681 2710  
2682 2711          if (zio_injection_enabled && zio->io_error == 0)
2683 2712                  zio->io_error = zio_handle_fault_injection(zio, EIO);
2684 2713  
2685 2714          /*
2686 2715           * If the I/O failed, determine whether we should attempt to retry it.
2687 2716           *
2688 2717           * On retry, we cut in line in the issue queue, since we don't want
2689 2718           * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2690 2719           */
2691 2720          if (zio->io_error && vd == NULL &&
2692 2721              !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2693 2722                  ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2694 2723                  ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2695 2724                  zio->io_error = 0;
2696 2725                  zio->io_flags |= ZIO_FLAG_IO_RETRY |
2697 2726                      ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2698 2727                  zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2699 2728                  zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2700 2729                      zio_requeue_io_start_cut_in_line);
2701 2730                  return (ZIO_PIPELINE_STOP);
2702 2731          }
2703 2732  
2704 2733          /*
2705 2734           * If we got an error on a leaf device, convert it to ENXIO
2706 2735           * if the device is not accessible at all.
2707 2736           */
2708 2737          if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2709 2738              !vdev_accessible(vd, zio))
2710 2739                  zio->io_error = SET_ERROR(ENXIO);
2711 2740  
2712 2741          /*
2713 2742           * If we can't write to an interior vdev (mirror or RAID-Z),
2714 2743           * set vdev_cant_write so that we stop trying to allocate from it.
2715 2744           */
2716 2745          if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2717 2746              vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2718 2747                  vd->vdev_cant_write = B_TRUE;
2719 2748          }
2720 2749  
2721 2750          if (zio->io_error)
2722 2751                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2723 2752  
2724 2753          if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2725 2754              zio->io_physdone != NULL) {
2726 2755                  ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2727 2756                  ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2728 2757                  zio->io_physdone(zio->io_logical);
2729 2758          }
2730 2759  
2731 2760          return (ZIO_PIPELINE_CONTINUE);
2732 2761  }
2733 2762  
2734 2763  void
2735 2764  zio_vdev_io_reissue(zio_t *zio)
2736 2765  {
2737 2766          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2738 2767          ASSERT(zio->io_error == 0);
2739 2768  
2740 2769          zio->io_stage >>= 1;
2741 2770  }
2742 2771  
2743 2772  void
2744 2773  zio_vdev_io_redone(zio_t *zio)
2745 2774  {
2746 2775          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2747 2776  
2748 2777          zio->io_stage >>= 1;
2749 2778  }
2750 2779  
2751 2780  void
2752 2781  zio_vdev_io_bypass(zio_t *zio)
2753 2782  {
2754 2783          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2755 2784          ASSERT(zio->io_error == 0);
2756 2785  
2757 2786          zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2758 2787          zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2759 2788  }
2760 2789  
2761 2790  /*
2762 2791   * ==========================================================================
2763 2792   * Generate and verify checksums
2764 2793   * ==========================================================================
2765 2794   */
2766 2795  static int
2767 2796  zio_checksum_generate(zio_t *zio)
2768 2797  {
2769 2798          blkptr_t *bp = zio->io_bp;
2770 2799          enum zio_checksum checksum;
2771 2800  
2772 2801          if (bp == NULL) {
2773 2802                  /*
2774 2803                   * This is zio_write_phys().
2775 2804                   * We're either generating a label checksum, or none at all.
2776 2805                   */
2777 2806                  checksum = zio->io_prop.zp_checksum;
2778 2807  
2779 2808                  if (checksum == ZIO_CHECKSUM_OFF)
2780 2809                          return (ZIO_PIPELINE_CONTINUE);
2781 2810  
2782 2811                  ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2783 2812          } else {
2784 2813                  if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2785 2814                          ASSERT(!IO_IS_ALLOCATING(zio));
2786 2815                          checksum = ZIO_CHECKSUM_GANG_HEADER;
2787 2816                  } else {
2788 2817                          checksum = BP_GET_CHECKSUM(bp);
2789 2818                  }
2790 2819          }
2791 2820  
2792 2821          zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2793 2822  
2794 2823          return (ZIO_PIPELINE_CONTINUE);
2795 2824  }
2796 2825  
2797 2826  static int
2798 2827  zio_checksum_verify(zio_t *zio)
2799 2828  {
2800 2829          zio_bad_cksum_t info;
2801 2830          blkptr_t *bp = zio->io_bp;
2802 2831          int error;
2803 2832  
2804 2833          ASSERT(zio->io_vd != NULL);
2805 2834  
2806 2835          if (bp == NULL) {
2807 2836                  /*
2808 2837                   * This is zio_read_phys().
2809 2838                   * We're either verifying a label checksum, or nothing at all.
2810 2839                   */
2811 2840                  if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2812 2841                          return (ZIO_PIPELINE_CONTINUE);
2813 2842  
2814 2843                  ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2815 2844          }
2816 2845  
2817 2846          if ((error = zio_checksum_error(zio, &info)) != 0) {
2818 2847                  zio->io_error = error;
2819 2848                  if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2820 2849                          zfs_ereport_start_checksum(zio->io_spa,
2821 2850                              zio->io_vd, zio, zio->io_offset,
2822 2851                              zio->io_size, NULL, &info);
2823 2852                  }
2824 2853          }
2825 2854  
2826 2855          return (ZIO_PIPELINE_CONTINUE);
2827 2856  }
2828 2857  
2829 2858  /*
2830 2859   * Called by RAID-Z to ensure we don't compute the checksum twice.
2831 2860   */
2832 2861  void
2833 2862  zio_checksum_verified(zio_t *zio)
2834 2863  {
2835 2864          zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2836 2865  }
2837 2866  
/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */

/*
 * Return the rank of 'error': its index in the rank table, or the table
 * length for any error that is not listed (i.e. worse than all of them).
 */
static size_t
zio_error_rank_of(int error)
{
        static const int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
        const size_t nranks =
            sizeof (zio_error_rank) / sizeof (zio_error_rank[0]);
        size_t r;

        for (r = 0; r < nranks; r++) {
                if (error == zio_error_rank[r])
                        break;
        }

        return (r);
}

/*
 * Return whichever of e1 and e2 ranks worse.  When both have the same
 * rank (including two different unlisted errors), e2 is returned.
 */
int
zio_worst_error(int e1, int e2)
{
        return (zio_error_rank_of(e1) > zio_error_rank_of(e2) ? e1 : e2);
}
2863 2892  
2864 2893  /*
2865 2894   * ==========================================================================
2866 2895   * I/O completion
2867 2896   * ==========================================================================
2868 2897   */
2869 2898  static int
2870 2899  zio_ready(zio_t *zio)
2871 2900  {
2872 2901          blkptr_t *bp = zio->io_bp;
2873 2902          zio_t *pio, *pio_next;
2874 2903  
2875 2904          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2876 2905              zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2877 2906                  return (ZIO_PIPELINE_STOP);
2878 2907  
2879 2908          if (zio->io_ready) {
2880 2909                  ASSERT(IO_IS_ALLOCATING(zio));
2881 2910                  ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2882 2911                      (zio->io_flags & ZIO_FLAG_NOPWRITE));
2883 2912                  ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2884 2913  
2885 2914                  zio->io_ready(zio);
2886 2915          }
2887 2916  
2888 2917          if (bp != NULL && bp != &zio->io_bp_copy)
2889 2918                  zio->io_bp_copy = *bp;
2890 2919  
2891 2920          if (zio->io_error)
2892 2921                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2893 2922  
2894 2923          mutex_enter(&zio->io_lock);
2895 2924          zio->io_state[ZIO_WAIT_READY] = 1;
2896 2925          pio = zio_walk_parents(zio);
2897 2926          mutex_exit(&zio->io_lock);
2898 2927  
2899 2928          /*
2900 2929           * As we notify zio's parents, new parents could be added.
2901 2930           * New parents go to the head of zio's io_parent_list, however,
2902 2931           * so we will (correctly) not notify them.  The remainder of zio's
2903 2932           * io_parent_list, from 'pio_next' onward, cannot change because
2904 2933           * all parents must wait for us to be done before they can be done.
2905 2934           */
2906 2935          for (; pio != NULL; pio = pio_next) {
2907 2936                  pio_next = zio_walk_parents(zio);
2908 2937                  zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2909 2938          }
2910 2939  
2911 2940          if (zio->io_flags & ZIO_FLAG_NODATA) {
2912 2941                  if (BP_IS_GANG(bp)) {
2913 2942                          zio->io_flags &= ~ZIO_FLAG_NODATA;
2914 2943                  } else {
2915 2944                          ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2916 2945                          zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2917 2946                  }
2918 2947          }
2919 2948  
2920 2949          if (zio_injection_enabled &&
2921 2950              zio->io_spa->spa_syncing_txg == zio->io_txg)
2922 2951                  zio_handle_ignored_writes(zio);
2923 2952  
2924 2953          return (ZIO_PIPELINE_CONTINUE);
2925 2954  }
2926 2955  
/*
 * Final pipeline stage for a zio.
 *
 * In order, this function:
 *   - stops (ZIO_PIPELINE_STOP) while any vdev, gang, DDT or logical
 *     child is still outstanding;
 *   - inherits vdev/gang/DDT child errors;
 *   - finishes pending checksum reports, pops transforms and updates
 *     vdev statistics;
 *   - on error, posts ereports and, for a logical zio, decides whether
 *     the I/O should be reexecuted now or suspended;
 *   - inherits logical child errors and unallocates DVAs for a failed
 *     allocating gang leader;
 *   - if reexecution was requested, notifies the parent, suspends, or
 *     dispatches zio_reexecute() and returns without completing;
 *   - otherwise runs the io_done callback, detaches from and notifies
 *     all parents, and either wakes the waiter or destroys the zio.
 *
 * Every path returns ZIO_PIPELINE_STOP.
 */
static int
zio_done(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        zio_t *lio = zio->io_logical;
        blkptr_t *bp = zio->io_bp;
        vdev_t *vd = zio->io_vd;
        uint64_t psize = zio->io_size;
        zio_t *pio, *pio_next;

        /*
         * If our children haven't all completed,
         * wait for them and then repeat this pipeline stage.
         */
        if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
            zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
                return (ZIO_PIPELINE_STOP);

        /* Past this point no child of any type may be outstanding. */
        for (int c = 0; c < ZIO_CHILD_TYPES; c++)
                for (int w = 0; w < ZIO_WAIT_TYPES; w++)
                        ASSERT(zio->io_children[c][w] == 0);

        /* Debug-time sanity checks on the block pointer. */
        if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
                ASSERT(bp->blk_pad[0] == 0);
                ASSERT(bp->blk_pad[1] == 0);
                ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
                    (bp == zio_unique_parent(zio)->io_bp));
                if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
                    zio->io_bp_override == NULL &&
                    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
                        ASSERT(!BP_SHOULD_BYTESWAP(bp));
                        ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
                        ASSERT(BP_COUNT_GANG(bp) == 0 ||
                            (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
                }
                if (zio->io_flags & ZIO_FLAG_NOPWRITE)
                        VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
        }

        /*
         * If there were child vdev/gang/ddt errors, they apply to us now.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
        zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
        zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

        /*
         * If the I/O on the transformed data was successful, generate any
         * checksum reports now while we still have the transformed data.
         */
        if (zio->io_error == 0) {
                while (zio->io_cksum_report != NULL) {
                        zio_cksum_report_t *zcr = zio->io_cksum_report;
                        uint64_t align = zcr->zcr_align;
                        uint64_t asize = P2ROUNDUP(psize, align);
                        char *abuf = zio->io_data;

                        /*
                         * The report wants a buffer rounded up to its
                         * alignment: copy the data into a temporary
                         * buffer and zero-fill the tail.
                         */
                        if (asize != psize) {
                                abuf = zio_buf_alloc(asize);
                                bcopy(zio->io_data, abuf, psize);
                                bzero(abuf + psize, asize - psize);
                        }

                        /* Unlink the report before finishing and freeing it. */
                        zio->io_cksum_report = zcr->zcr_next;
                        zcr->zcr_next = NULL;
                        zcr->zcr_finish(zcr, abuf);
                        zfs_ereport_free_checksum(zcr);

                        if (asize != psize)
                                zio_buf_free(abuf, asize);
                }
        }

        zio_pop_transforms(zio);        /* note: may set zio->io_error */

        vdev_stat_update(zio, psize);

        if (zio->io_error) {
                /*
                 * If this I/O is attached to a particular vdev,
                 * generate an error message describing the I/O failure
                 * at the block level.  We ignore these errors if the
                 * device is currently unavailable.
                 */
                if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
                        zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

                if ((zio->io_error == EIO || !(zio->io_flags &
                    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
                    zio == lio) {
                        /*
                         * For logical I/O requests, tell the SPA to log the
                         * error and generate a logical data ereport.
                         */
                        spa_log_error(spa, zio);
                        zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
                            0, 0);
                }
        }

        if (zio->io_error && zio == lio) {
                /*
                 * Determine whether zio should be reexecuted.  This will
                 * propagate all the way to the root via zio_notify_parent().
                 */
                ASSERT(vd == NULL && bp != NULL);
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

                /*
                 * A failed allocating I/O that is not allowed to fail is
                 * retried immediately, unless it failed with ENOSPC, in
                 * which case it is suspended instead.
                 */
                if (IO_IS_ALLOCATING(zio) &&
                    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
                        if (zio->io_error != ENOSPC)
                                zio->io_reexecute |= ZIO_REEXECUTE_NOW;
                        else
                                zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
                }

                /*
                 * A non-scan read or free that hit ENXIO suspends, provided
                 * no pool load is in progress and the pool's failmode is
                 * not "continue".
                 */
                if ((zio->io_type == ZIO_TYPE_READ ||
                    zio->io_type == ZIO_TYPE_FREE) &&
                    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
                    zio->io_error == ENXIO &&
                    spa_load_state(spa) == SPA_LOAD_NONE &&
                    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

                /* Anything else that must not fail also suspends. */
                if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
                        zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

                /*
                 * Here is a possibly good place to attempt to do
                 * either combinatorial reconstruction or error correction
                 * based on checksums.  It also might be a good place
                 * to send out preliminary ereports before we suspend
                 * processing.
                 */
        }

        /*
         * If there were logical child errors, they apply to us now.
         * We defer this until now to avoid conflating logical child
         * errors with errors that happened to the zio itself when
         * updating vdev stats and reporting FMA events above.
         */
        zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

        /*
         * A failed (or to-be-reexecuted) allocating gang leader gives back
         * the DVAs it allocated, except for rewrites and nop-writes.
         */
        if ((zio->io_error || zio->io_reexecute) &&
            IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
            !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
                zio_dva_unallocate(zio, zio->io_gang_tree, bp);

        zio_gang_tree_free(&zio->io_gang_tree);

        /*
         * Godfather I/Os should never suspend.
         */
        if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
            (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
                zio->io_reexecute = 0;

        if (zio->io_reexecute) {
                /*
                 * This is a logical I/O that wants to reexecute.
                 *
                 * Reexecute is top-down.  When an i/o fails, if it's not
                 * the root, it simply notifies its parent and sticks around.
                 * The parent, seeing that it still has children in zio_done(),
                 * does the same.  This percolates all the way up to the root.
                 * The root i/o will reexecute or suspend the entire tree.
                 *
                 * This approach ensures that zio_reexecute() honors
                 * all the original i/o dependency relationships, e.g.
                 * parents not executing until children are ready.
                 */
                ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

                zio->io_gang_leader = NULL;

                mutex_enter(&zio->io_lock);
                zio->io_state[ZIO_WAIT_DONE] = 1;
                mutex_exit(&zio->io_lock);

                /*
                 * "The Godfather" I/O monitors its children but is
                 * not a true parent to them. It will track them through
                 * the pipeline but severs its ties whenever they get into
                 * trouble (e.g. suspended). This allows "The Godfather"
                 * I/O to return status without blocking.
                 */
                for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
                        /*
                         * Capture the link for 'pio' and fetch the next
                         * parent before the link may be removed below.
                         */
                        zio_link_t *zl = zio->io_walk_link;
                        pio_next = zio_walk_parents(zio);

                        if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
                            (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
                                zio_remove_child(pio, zio, zl);
                                zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
                        }
                }

                if ((pio = zio_unique_parent(zio)) != NULL) {
                        /*
                         * We're not a root i/o, so there's nothing to do
                         * but notify our parent.  Don't propagate errors
                         * upward since we haven't permanently failed yet.
                         */
                        ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
                        zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
                        zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
                } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
                        /*
                         * We'd fail again if we reexecuted now, so suspend
                         * until conditions improve (e.g. device comes online).
                         */
                        zio_suspend(spa, zio);
                } else {
                        /*
                         * Reexecution is potentially a huge amount of work.
                         * Hand it off to the otherwise-unused claim taskq.
                         */
                        ASSERT(zio->io_tqent.tqent_next == NULL);
                        spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
                            ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
                            0, &zio->io_tqent);
                }
                return (ZIO_PIPELINE_STOP);
        }

        /* Normal completion: no children, no reexecute, errors only if CANFAIL. */
        ASSERT(zio->io_child_count == 0);
        ASSERT(zio->io_reexecute == 0);
        ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

        /*
         * Report any checksum errors, since the I/O is complete.
         */
        while (zio->io_cksum_report != NULL) {
                zio_cksum_report_t *zcr = zio->io_cksum_report;
                zio->io_cksum_report = zcr->zcr_next;
                zcr->zcr_next = NULL;
                zcr->zcr_finish(zcr, NULL);
                zfs_ereport_free_checksum(zcr);
        }

        /*
         * It is the responsibility of the done callback to ensure that this
         * particular zio is no longer discoverable for adoption, and as
         * such, cannot acquire any new parents.
         */
        if (zio->io_done)
                zio->io_done(zio);

        mutex_enter(&zio->io_lock);
        zio->io_state[ZIO_WAIT_DONE] = 1;
        mutex_exit(&zio->io_lock);

        /* Detach from every parent and tell each one that we are done. */
        for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
                zio_link_t *zl = zio->io_walk_link;
                pio_next = zio_walk_parents(zio);
                zio_remove_child(pio, zio, zl);
                zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
        }

        /*
         * If a thread is waiting on this zio, wake it and leave the zio
         * alive; otherwise nobody else will look at the zio, so destroy
         * it here.
         */
        if (zio->io_waiter != NULL) {
                mutex_enter(&zio->io_lock);
                zio->io_executor = NULL;
                cv_broadcast(&zio->io_cv);
                mutex_exit(&zio->io_lock);
        } else {
                zio_destroy(zio);
        }

        return (ZIO_PIPELINE_STOP);
}
3200 3229  
3201 3230  /*
3202 3231   * ==========================================================================
3203 3232   * I/O pipeline definition
3204 3233   * ==========================================================================
3205 3234   */
/*
 * Table of pipeline stage handlers, one function per stage, with a NULL
 * entry in slot 0.
 *
 * NOTE(review): the order of entries presumably has to mirror the bit
 * order of the ZIO_STAGE_* values (the code above steps between stages by
 * shifting io_stage one bit), so do not insert or reorder entries without
 * updating the stage definitions to match -- confirm against zio_impl.h.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
        NULL,
        zio_read_bp_init,
        zio_free_bp_init,
        zio_issue_async,
        zio_write_bp_init,
        zio_checksum_generate,
        zio_nop_write,
        zio_ddt_read_start,
        zio_ddt_read_done,
        zio_ddt_write,
        zio_ddt_free,
        zio_gang_assemble,
        zio_gang_issue,
        zio_dva_allocate,
        zio_dva_free,
        zio_dva_claim,
        zio_ready,
        zio_vdev_io_start,
        zio_vdev_io_done,
        zio_vdev_io_assess,
        zio_checksum_verify,
        zio_done
};
3230 3259  
3231 3260  /* dnp is the dnode for zb1->zb_object */
3232 3261  boolean_t
3233 3262  zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3234 3263      const zbookmark_phys_t *zb2)
3235 3264  {
3236 3265          uint64_t zb1nextL0, zb2thisobj;
3237 3266  
3238 3267          ASSERT(zb1->zb_objset == zb2->zb_objset);
3239 3268          ASSERT(zb2->zb_level == 0);
3240 3269  
3241 3270          /* The objset_phys_t isn't before anything. */
3242 3271          if (dnp == NULL)
3243 3272                  return (B_FALSE);
3244 3273  
3245 3274          zb1nextL0 = (zb1->zb_blkid + 1) <<
3246 3275              ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3247 3276  
3248 3277          zb2thisobj = zb2->zb_object ? zb2->zb_object :
3249 3278              zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3250 3279  
3251 3280          if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3252 3281                  uint64_t nextobj = zb1nextL0 *
3253 3282                      (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3254 3283                  return (nextobj <= zb2thisobj);
3255 3284          }
3256 3285  
3257 3286          if (zb1->zb_object < zb2thisobj)
3258 3287                  return (B_TRUE);
3259 3288          if (zb1->zb_object > zb2thisobj)
3260 3289                  return (B_FALSE);
3261 3290          if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3262 3291                  return (B_FALSE);
3263 3292          return (zb1nextL0 <= zb2->zb_blkid);
3264 3293  }

↓ open down ↓

3016 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX