illumos-gate Wdiff usr/src/uts/common/fs/zfs/vdev_queue.c

Print this page

10703 smatch unreachable code checking needs reworking
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/vdev_queue.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_queue.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *

↓ open down ↓

18 lines elided

↑ open up ↑

  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  28   28   * Copyright (c) 2014 Integros [integros.com]
       29 + * Copyright 2019 Joyent, Inc.
  29   30   */
  30   31  
  31   32  #include <sys/zfs_context.h>
  32   33  #include <sys/vdev_impl.h>
  33   34  #include <sys/spa_impl.h>
  34   35  #include <sys/zio.h>
  35   36  #include <sys/avl.h>
  36   37  #include <sys/dsl_pool.h>
  37   38  #include <sys/metaslab_impl.h>
  38   39  #include <sys/abd.h>

  39   40  
  40   41  /*
  41   42   * ZFS I/O Scheduler
  42   43   * ---------------
  43   44   *
  44   45   * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
  45   46   * I/O scheduler determines when and in what order those operations are
  46   47   * issued.  The I/O scheduler divides operations into five I/O classes
  47   48   * prioritized in the following order: sync read, sync write, async read,
  48   49   * async write, and scrub/resilver.  Each queue defines the minimum and
  49   50   * maximum number of concurrent operations that may be issued to the device.
  50   51   * In addition, the device has an aggregate maximum. Note that the sum of the
  51   52   * per-queue minimums must not exceed the aggregate maximum, and if the
  52   53   * aggregate maximum is equal to or greater than the sum of the per-queue
  53   54   * maximums, the per-queue minimum has no effect.
  54   55   *
  55   56   * For many physical devices, throughput increases with the number of
  56   57   * concurrent operations, but latency typically suffers. Further, physical
  57   58   * devices typically have a limit at which more concurrent operations have no
  58   59   * effect on throughput or can actually cause it to decrease.
  59   60   *
  60   61   * The scheduler selects the next operation to issue by first looking for an
  61   62   * I/O class whose minimum has not been satisfied. Once all are satisfied and
  62   63   * the aggregate maximum has not been hit, the scheduler looks for classes
  63   64   * whose maximum has not been satisfied. Iteration through the I/O classes is
  64   65   * done in the order specified above. No further operations are issued if the
  65   66   * aggregate maximum number of concurrent operations has been hit or if there
  66   67   * are no operations queued for an I/O class that has not hit its maximum.
  67   68   * Every time an i/o is queued or an operation completes, the I/O scheduler
  68   69   * looks for new operations to issue.
  69   70   *
  70   71   * All I/O classes have a fixed maximum number of outstanding operations
  71   72   * except for the async write class. Asynchronous writes represent the data
  72   73   * that is committed to stable storage during the syncing stage for
  73   74   * transaction groups (see txg.c). Transaction groups enter the syncing state
  74   75   * periodically so the number of queued async writes will quickly burst up and
  75   76   * then bleed down to zero. Rather than servicing them as quickly as possible,
  76   77   * the I/O scheduler changes the maximum number of active async write i/os
  77   78   * according to the amount of dirty data in the pool (see dsl_pool.c). Since
  78   79   * both throughput and latency typically increase with the number of
  79   80   * concurrent operations issued to physical devices, reducing the burstiness
  80   81   * in the number of concurrent operations also stabilizes the response time of
  81   82   * operations from other -- and in particular synchronous -- queues. In broad
  82   83   * strokes, the I/O scheduler will issue more concurrent operations from the
  83   84   * async write queue as there's more dirty data in the pool.
  84   85   *
  85   86   * Async Writes
  86   87   *
  87   88   * The number of concurrent operations issued for the async write I/O class
  88   89   * follows a piece-wise linear function defined by a few adjustable points.
  89   90   *
  90   91   *        |                   o---------| <-- zfs_vdev_async_write_max_active
  91   92   *   ^    |                  /^         |
  92   93   *   |    |                 / |         |
  93   94   * active |                /  |         |
  94   95   *  I/O   |               /   |         |
  95   96   * count  |              /    |         |
  96   97   *        |             /     |         |
  97   98   *        |------------o      |         | <-- zfs_vdev_async_write_min_active
  98   99   *       0|____________^______|_________|
  99  100   *        0%           |      |       100% of zfs_dirty_data_max
 100  101   *                     |      |
 101  102   *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
 102  103   *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
 103  104   *
 104  105   * Until the amount of dirty data exceeds a minimum percentage of the dirty
 105  106   * data allowed in the pool, the I/O scheduler will limit the number of
 106  107   * concurrent operations to the minimum. As that threshold is crossed, the
 107  108   * number of concurrent operations issued increases linearly to the maximum at
 108  109   * the specified maximum percentage of the dirty data allowed in the pool.
 109  110   *
 110  111   * Ideally, the amount of dirty data on a busy pool will stay in the sloped
 111  112   * part of the function between zfs_vdev_async_write_active_min_dirty_percent
 112  113   * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
 113  114   * maximum percentage, this indicates that the rate of incoming data is
 114  115   * greater than the rate that the backend storage can handle. In this case, we
 115  116   * must further throttle incoming writes (see dmu_tx_delay() for details).
 116  117   */
 117  118  
 118  119  /*
 119  120   * The maximum number of i/os active to each device.  Ideally, this will be >=
 120  121   * the sum of each queue's max_active.  It must be at least the sum of each
 121  122   * queue's min_active.
 122  123   */
 123  124  uint32_t zfs_vdev_max_active = 1000;
 124  125  
 125  126  /*
 126  127   * Per-queue limits on the number of i/os active to each device.  If the
 127  128   * sum of the queue's max_active is < zfs_vdev_max_active, then the
 128  129   * min_active comes into play.  We will send min_active from each queue,
 129  130   * and then select from queues in the order defined by zio_priority_t.
 130  131   *
 131  132   * In general, smaller max_active's will lead to lower latency of synchronous
 132  133   * operations.  Larger max_active's may lead to higher overall throughput,
 133  134   * depending on underlying storage.
 134  135   *
 135  136   * The ratio of the queues' max_actives determines the balance of performance
 136  137   * between reads, writes, and scrubs.  E.g., increasing
 137  138   * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
 138  139   * more quickly, but reads and writes to have higher latency and lower
 139  140   * throughput.
 140  141   */
 141  142  uint32_t zfs_vdev_sync_read_min_active = 10;
 142  143  uint32_t zfs_vdev_sync_read_max_active = 10;
 143  144  uint32_t zfs_vdev_sync_write_min_active = 10;
 144  145  uint32_t zfs_vdev_sync_write_max_active = 10;
 145  146  uint32_t zfs_vdev_async_read_min_active = 1;
 146  147  uint32_t zfs_vdev_async_read_max_active = 3;
 147  148  uint32_t zfs_vdev_async_write_min_active = 1;
 148  149  uint32_t zfs_vdev_async_write_max_active = 10;
 149  150  uint32_t zfs_vdev_scrub_min_active = 1;
 150  151  uint32_t zfs_vdev_scrub_max_active = 2;
 151  152  uint32_t zfs_vdev_removal_min_active = 1;
 152  153  uint32_t zfs_vdev_removal_max_active = 2;
 153  154  uint32_t zfs_vdev_initializing_min_active = 1;
 154  155  uint32_t zfs_vdev_initializing_max_active = 1;
 155  156  
 156  157  /*
 157  158   * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
 158  159   * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
 159  160   * zfs_vdev_async_write_active_max_dirty_percent, use
 160  161   * zfs_vdev_async_write_max_active. The value is linearly interpolated
 161  162   * between min and max.
 162  163   */
 163  164  int zfs_vdev_async_write_active_min_dirty_percent = 30;
 164  165  int zfs_vdev_async_write_active_max_dirty_percent = 60;
 165  166  
 166  167  /*
 167  168   * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
 168  169   * For read I/Os, we also aggregate across small adjacency gaps; for writes
 169  170   * we include spans of optional I/Os to aid aggregation at the disk even when
 170  171   * they aren't able to help us aggregate at this level.
 171  172   */
 172  173  int zfs_vdev_aggregation_limit = 1 << 20;
 173  174  int zfs_vdev_read_gap_limit = 32 << 10;
 174  175  int zfs_vdev_write_gap_limit = 4 << 10;
 175  176  
 176  177  /*
 177  178   * Define the queue depth percentage for each top-level. This percentage is
 178  179   * used in conjunction with zfs_vdev_async_write_max_active to determine how
 179  180   * many allocations a specific top-level vdev should handle. Once the queue
 180  181   * depth reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active /
 181  182   * 100 then the allocator will stop allocating blocks on that top-level device.
 182  183   * The default kernel setting is 1000% which will yield 100 allocations per
 183  184   * device. For userland testing, the default setting is 300% which equates
 184  185   * to 30 allocations per device.
 185  186   */
 186  187  #ifdef _KERNEL
 187  188  int zfs_vdev_queue_depth_pct = 1000;
 188  189  #else
 189  190  int zfs_vdev_queue_depth_pct = 300;
 190  191  #endif
 191  192  
 192  193  /*
 193  194   * When performing allocations for a given metaslab, we want to make sure that
 194  195   * there are enough IOs to aggregate together to improve throughput. We want to
 195  196   * ensure that there are at least 128k worth of IOs that can be aggregated, and
 196  197   * we assume that the average allocation size is 4k, so we need the queue depth
 197  198   * to be 32 per allocator to get good aggregation of sequential writes.
 198  199   */
 199  200  int zfs_vdev_def_queue_depth = 32;
 200  201  
 201  202  
 202  203  int
 203  204  vdev_queue_offset_compare(const void *x1, const void *x2)
 204  205  {
 205  206          const zio_t *z1 = (const zio_t *)x1;
 206  207          const zio_t *z2 = (const zio_t *)x2;
 207  208  
 208  209          int cmp = AVL_CMP(z1->io_offset, z2->io_offset);
 209  210  
 210  211          if (likely(cmp))
 211  212                  return (cmp);
 212  213  
 213  214          return (AVL_PCMP(z1, z2));
 214  215  }
 215  216  
 216  217  static inline avl_tree_t *
 217  218  vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
 218  219  {
 219  220          return (&vq->vq_class[p].vqc_queued_tree);
 220  221  }
 221  222  
 222  223  static inline avl_tree_t *
 223  224  vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
 224  225  {
 225  226          ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
 226  227          if (t == ZIO_TYPE_READ)
 227  228                  return (&vq->vq_read_offset_tree);
 228  229          else
 229  230                  return (&vq->vq_write_offset_tree);
 230  231  }
 231  232  
 232  233  int
 233  234  vdev_queue_timestamp_compare(const void *x1, const void *x2)
 234  235  {
 235  236          const zio_t *z1 = (const zio_t *)x1;
 236  237          const zio_t *z2 = (const zio_t *)x2;
 237  238  
 238  239          int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp);
 239  240  
 240  241          if (likely(cmp))
 241  242                  return (cmp);
 242  243  
 243  244          return (AVL_PCMP(z1, z2));
 244  245  }
 245  246  
 246  247  void
 247  248  vdev_queue_init(vdev_t *vd)
 248  249  {
 249  250          vdev_queue_t *vq = &vd->vdev_queue;
 250  251  
 251  252          mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 252  253          vq->vq_vdev = vd;
 253  254  
 254  255          avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
 255  256              sizeof (zio_t), offsetof(struct zio, io_queue_node));
 256  257          avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
 257  258              vdev_queue_offset_compare, sizeof (zio_t),
 258  259              offsetof(struct zio, io_offset_node));
 259  260          avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
 260  261              vdev_queue_offset_compare, sizeof (zio_t),
 261  262              offsetof(struct zio, io_offset_node));
 262  263  
 263  264          for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 264  265                  int (*compfn) (const void *, const void *);
 265  266  
 266  267                  /*
 267  268                   * The synchronous i/o queues are dispatched in FIFO rather
 268  269                   * than LBA order.  This provides more consistent latency for
 269  270                   * these i/os.
 270  271                   */
 271  272                  if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
 272  273                          compfn = vdev_queue_timestamp_compare;
 273  274                  else
 274  275                          compfn = vdev_queue_offset_compare;
 275  276  
 276  277                  avl_create(vdev_queue_class_tree(vq, p), compfn,
 277  278                      sizeof (zio_t), offsetof(struct zio, io_queue_node));
 278  279          }
 279  280  
 280  281          vq->vq_last_offset = 0;
 281  282  }
 282  283  
 283  284  void
 284  285  vdev_queue_fini(vdev_t *vd)
 285  286  {
 286  287          vdev_queue_t *vq = &vd->vdev_queue;
 287  288  
 288  289          for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
 289  290                  avl_destroy(vdev_queue_class_tree(vq, p));
 290  291          avl_destroy(&vq->vq_active_tree);
 291  292          avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
 292  293          avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 293  294  
 294  295          mutex_destroy(&vq->vq_lock);
 295  296  }
 296  297  
 297  298  static void
 298  299  vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 299  300  {
 300  301          spa_t *spa = zio->io_spa;
 301  302  
 302  303          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 303  304          avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 304  305          avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 305  306  
 306  307          mutex_enter(&spa->spa_iokstat_lock);
 307  308          spa->spa_queue_stats[zio->io_priority].spa_queued++;
 308  309          if (spa->spa_iokstat != NULL)
 309  310                  kstat_waitq_enter(spa->spa_iokstat->ks_data);
 310  311          mutex_exit(&spa->spa_iokstat_lock);
 311  312  }
 312  313  
 313  314  static void
 314  315  vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 315  316  {
 316  317          spa_t *spa = zio->io_spa;
 317  318  
 318  319          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 319  320          avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 320  321          avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 321  322  
 322  323          mutex_enter(&spa->spa_iokstat_lock);
 323  324          ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
 324  325          spa->spa_queue_stats[zio->io_priority].spa_queued--;
 325  326          if (spa->spa_iokstat != NULL)
 326  327                  kstat_waitq_exit(spa->spa_iokstat->ks_data);
 327  328          mutex_exit(&spa->spa_iokstat_lock);
 328  329  }
 329  330  
 330  331  static void
 331  332  vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 332  333  {
 333  334          spa_t *spa = zio->io_spa;
 334  335          ASSERT(MUTEX_HELD(&vq->vq_lock));
 335  336          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 336  337          vq->vq_class[zio->io_priority].vqc_active++;
 337  338          avl_add(&vq->vq_active_tree, zio);
 338  339  
 339  340          mutex_enter(&spa->spa_iokstat_lock);
 340  341          spa->spa_queue_stats[zio->io_priority].spa_active++;
 341  342          if (spa->spa_iokstat != NULL)
 342  343                  kstat_runq_enter(spa->spa_iokstat->ks_data);
 343  344          mutex_exit(&spa->spa_iokstat_lock);
 344  345  }
 345  346  
 346  347  static void
 347  348  vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 348  349  {
 349  350          spa_t *spa = zio->io_spa;
 350  351          ASSERT(MUTEX_HELD(&vq->vq_lock));
 351  352          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 352  353          vq->vq_class[zio->io_priority].vqc_active--;
 353  354          avl_remove(&vq->vq_active_tree, zio);
 354  355  
 355  356          mutex_enter(&spa->spa_iokstat_lock);
 356  357          ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
 357  358          spa->spa_queue_stats[zio->io_priority].spa_active--;
 358  359          if (spa->spa_iokstat != NULL) {
 359  360                  kstat_io_t *ksio = spa->spa_iokstat->ks_data;
 360  361  
 361  362                  kstat_runq_exit(spa->spa_iokstat->ks_data);
 362  363                  if (zio->io_type == ZIO_TYPE_READ) {
 363  364                          ksio->reads++;
 364  365                          ksio->nread += zio->io_size;
 365  366                  } else if (zio->io_type == ZIO_TYPE_WRITE) {
 366  367                          ksio->writes++;
 367  368                          ksio->nwritten += zio->io_size;
 368  369                  }
 369  370          }
 370  371          mutex_exit(&spa->spa_iokstat_lock);
 371  372  }
 372  373  
 373  374  static void
 374  375  vdev_queue_agg_io_done(zio_t *aio)
 375  376  {
 376  377          if (aio->io_type == ZIO_TYPE_READ) {
 377  378                  zio_t *pio;
 378  379                  zio_link_t *zl = NULL;
 379  380                  while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
 380  381                          abd_copy_off(pio->io_abd, aio->io_abd,
 381  382                              0, pio->io_offset - aio->io_offset, pio->io_size);
 382  383                  }
 383  384          }
 384  385  
 385  386          abd_free(aio->io_abd);
 386  387  }
 387  388  
 388  389  static int
 389  390  vdev_queue_class_min_active(zio_priority_t p)
 390  391  {
 391  392          switch (p) {
 392  393          case ZIO_PRIORITY_SYNC_READ:
 393  394                  return (zfs_vdev_sync_read_min_active);
 394  395          case ZIO_PRIORITY_SYNC_WRITE:
 395  396                  return (zfs_vdev_sync_write_min_active);
 396  397          case ZIO_PRIORITY_ASYNC_READ:
 397  398                  return (zfs_vdev_async_read_min_active);

↓ open down ↓

359 lines elided

↑ open up ↑

 398  399          case ZIO_PRIORITY_ASYNC_WRITE:
 399  400                  return (zfs_vdev_async_write_min_active);
 400  401          case ZIO_PRIORITY_SCRUB:
 401  402                  return (zfs_vdev_scrub_min_active);
 402  403          case ZIO_PRIORITY_REMOVAL:
 403  404                  return (zfs_vdev_removal_min_active);
 404  405          case ZIO_PRIORITY_INITIALIZING:
 405  406                  return (zfs_vdev_initializing_min_active);
 406  407          default:
 407  408                  panic("invalid priority %u", p);
 408      -                return (0);
 409  409          }
 410  410  }
 411  411  
 412  412  static int
 413  413  vdev_queue_max_async_writes(spa_t *spa)
 414  414  {
 415  415          int writes;
 416  416          uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
 417  417          uint64_t min_bytes = zfs_dirty_data_max *
 418  418              zfs_vdev_async_write_active_min_dirty_percent / 100;

 419  419          uint64_t max_bytes = zfs_dirty_data_max *
 420  420              zfs_vdev_async_write_active_max_dirty_percent / 100;
 421  421  
 422  422          /*
 423  423           * Sync tasks correspond to interactive user actions. To reduce the
 424  424           * execution time of those actions we push data out as fast as possible.
 425  425           */
 426  426          if (spa_has_pending_synctask(spa)) {
 427  427                  return (zfs_vdev_async_write_max_active);
 428  428          }
 429  429  
 430  430          if (dirty < min_bytes)
 431  431                  return (zfs_vdev_async_write_min_active);
 432  432          if (dirty > max_bytes)
 433  433                  return (zfs_vdev_async_write_max_active);
 434  434  
 435  435          /*
 436  436           * linear interpolation:
 437  437           * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
 438  438           * move right by min_bytes
 439  439           * move up by min_writes
 440  440           */
 441  441          writes = (dirty - min_bytes) *
 442  442              (zfs_vdev_async_write_max_active -
 443  443              zfs_vdev_async_write_min_active) /
 444  444              (max_bytes - min_bytes) +
 445  445              zfs_vdev_async_write_min_active;
 446  446          ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
 447  447          ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
 448  448          return (writes);
 449  449  }
 450  450  
 451  451  static int
 452  452  vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 453  453  {
 454  454          switch (p) {
 455  455          case ZIO_PRIORITY_SYNC_READ:
 456  456                  return (zfs_vdev_sync_read_max_active);
 457  457          case ZIO_PRIORITY_SYNC_WRITE:
 458  458                  return (zfs_vdev_sync_write_max_active);
 459  459          case ZIO_PRIORITY_ASYNC_READ:
 460  460                  return (zfs_vdev_async_read_max_active);

↓ open down ↓

42 lines elided

↑ open up ↑

 461  461          case ZIO_PRIORITY_ASYNC_WRITE:
 462  462                  return (vdev_queue_max_async_writes(spa));
 463  463          case ZIO_PRIORITY_SCRUB:
 464  464                  return (zfs_vdev_scrub_max_active);
 465  465          case ZIO_PRIORITY_REMOVAL:
 466  466                  return (zfs_vdev_removal_max_active);
 467  467          case ZIO_PRIORITY_INITIALIZING:
 468  468                  return (zfs_vdev_initializing_max_active);
 469  469          default:
 470  470                  panic("invalid priority %u", p);
 471      -                return (0);
 472  471          }
 473  472  }
 474  473  
 475  474  /*
 476  475   * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if
 477  476   * there is no eligible class.
 478  477   */
 479  478  static zio_priority_t
 480  479  vdev_queue_class_to_issue(vdev_queue_t *vq)
 481  480  {

 482  481          spa_t *spa = vq->vq_vdev->vdev_spa;
 483  482          zio_priority_t p;
 484  483  
 485  484          if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 486  485                  return (ZIO_PRIORITY_NUM_QUEUEABLE);
 487  486  
 488  487          /* find a queue that has not reached its minimum # outstanding i/os */
 489  488          for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 490  489                  if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 491  490                      vq->vq_class[p].vqc_active <
 492  491                      vdev_queue_class_min_active(p))
 493  492                          return (p);
 494  493          }
 495  494  
 496  495          /*
 497  496           * If we haven't found a queue, look for one that hasn't reached its
 498  497           * maximum # outstanding i/os.
 499  498           */
 500  499          for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 501  500                  if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 502  501                      vq->vq_class[p].vqc_active <
 503  502                      vdev_queue_class_max_active(spa, p))
 504  503                          return (p);
 505  504          }
 506  505  
 507  506          /* No eligible queued i/os */
 508  507          return (ZIO_PRIORITY_NUM_QUEUEABLE);
 509  508  }
 510  509  
 511  510  /*
 512  511   * Compute the range spanned by two i/os, which is the endpoint of the last
 513  512   * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 514  513   * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 515  514   * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 516  515   */
 517  516  #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
 518  517  #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 519  518  
 520  519  static zio_t *
 521  520  vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 522  521  {
 523  522          zio_t *first, *last, *aio, *dio, *mandatory, *nio;
 524  523          zio_link_t *zl = NULL;
 525  524          uint64_t maxgap = 0;
 526  525          uint64_t size;
 527  526          boolean_t stretch = B_FALSE;
 528  527          avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
 529  528          enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 530  529  
 531  530          if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 532  531                  return (NULL);
 533  532  
 534  533          first = last = zio;
 535  534  
 536  535          if (zio->io_type == ZIO_TYPE_READ)
 537  536                  maxgap = zfs_vdev_read_gap_limit;
 538  537  
 539  538          /*
 540  539           * We can aggregate I/Os that are sufficiently adjacent and of
 541  540           * the same flavor, as expressed by the AGG_INHERIT flags.
 542  541           * The latter requirement is necessary so that certain
 543  542           * attributes of the I/O, such as whether it's a normal I/O
 544  543           * or a scrub/resilver, can be preserved in the aggregate.
 545  544           * We can include optional I/Os, but don't allow them
 546  545           * to begin a range as they add no benefit in that situation.
 547  546           */
 548  547  
 549  548          /*
 550  549           * We keep track of the last non-optional I/O.
 551  550           */
 552  551          mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 553  552  
 554  553          /*
 555  554           * Walk backwards through sufficiently contiguous I/Os
 556  555           * recording the last non-optional I/O.
 557  556           */
 558  557          while ((dio = AVL_PREV(t, first)) != NULL &&
 559  558              (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 560  559              IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
 561  560              IO_GAP(dio, first) <= maxgap &&
 562  561              dio->io_type == zio->io_type) {
 563  562                  first = dio;
 564  563                  if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
 565  564                          mandatory = first;
 566  565          }
 567  566  
 568  567          /*
 569  568           * Skip any initial optional I/Os.
 570  569           */
 571  570          while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
 572  571                  first = AVL_NEXT(t, first);
 573  572                  ASSERT(first != NULL);
 574  573          }
 575  574  
 576  575          /*
 577  576           * Walk forward through sufficiently contiguous I/Os.
 578  577           * The aggregation limit does not apply to optional i/os, so that
 579  578           * we can issue contiguous writes even if they are larger than the
 580  579           * aggregation limit.
 581  580           */
 582  581          while ((dio = AVL_NEXT(t, last)) != NULL &&
 583  582              (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 584  583              (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
 585  584              (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
 586  585              IO_GAP(last, dio) <= maxgap &&
 587  586              dio->io_type == zio->io_type) {
 588  587                  last = dio;
 589  588                  if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
 590  589                          mandatory = last;
 591  590          }
 592  591  
 593  592          /*
 594  593           * Now that we've established the range of the I/O aggregation
 595  594           * we must decide what to do with trailing optional I/Os.
 596  595           * For reads, there's nothing to do. While we are unable to
 597  596           * aggregate further, it's possible that a trailing optional
 598  597           * I/O would allow the underlying device to aggregate with
 599  598           * subsequent I/Os. We must therefore determine if the next
 600  599           * non-optional I/O is close enough to make aggregation
 601  600           * worthwhile.
 602  601           */
 603  602          if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
 604  603                  zio_t *nio = last;
 605  604                  while ((dio = AVL_NEXT(t, nio)) != NULL &&
 606  605                      IO_GAP(nio, dio) == 0 &&
 607  606                      IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
 608  607                          nio = dio;
 609  608                          if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
 610  609                                  stretch = B_TRUE;
 611  610                                  break;
 612  611                          }
 613  612                  }
 614  613          }
 615  614  
 616  615          if (stretch) {
 617  616                  /*
 618  617                   * We are going to include an optional io in our aggregated
 619  618                   * span, thus closing the write gap.  Only mandatory i/os can
 620  619                   * start aggregated spans, so make sure that the next i/o
 621  620                   * after our span is mandatory.
 622  621                   */
 623  622                  dio = AVL_NEXT(t, last);
 624  623                  dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
 625  624          } else {
 626  625                  /* do not include the optional i/o */
 627  626                  while (last != mandatory && last != first) {
 628  627                          ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
 629  628                          last = AVL_PREV(t, last);
 630  629                          ASSERT(last != NULL);
 631  630                  }
 632  631          }
 633  632  
 634  633          if (first == last)
 635  634                  return (NULL);
 636  635  
 637  636          size = IO_SPAN(first, last);
 638  637          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 639  638  
 640  639          aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
 641  640              abd_alloc_for_io(size, B_TRUE), size, first->io_type,
 642  641              zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 643  642              vdev_queue_agg_io_done, NULL);
 644  643          aio->io_timestamp = first->io_timestamp;
 645  644  
 646  645          nio = first;
 647  646          do {
 648  647                  dio = nio;
 649  648                  nio = AVL_NEXT(t, dio);
 650  649                  ASSERT3U(dio->io_type, ==, aio->io_type);
 651  650  
 652  651                  if (dio->io_flags & ZIO_FLAG_NODATA) {
 653  652                          ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
 654  653                          abd_zero_off(aio->io_abd,
 655  654                              dio->io_offset - aio->io_offset, dio->io_size);
 656  655                  } else if (dio->io_type == ZIO_TYPE_WRITE) {
 657  656                          abd_copy_off(aio->io_abd, dio->io_abd,
 658  657                              dio->io_offset - aio->io_offset, 0, dio->io_size);
 659  658                  }
 660  659  
 661  660                  zio_add_child(dio, aio);
 662  661                  vdev_queue_io_remove(vq, dio);
 663  662          } while (dio != last);
 664  663  
 665  664          /*
 666  665           * We need to drop the vdev queue's lock to avoid a deadlock that we
 667  666           * could encounter since this I/O will complete immediately.
 668  667           */
 669  668          mutex_exit(&vq->vq_lock);
 670  669          while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
 671  670                  zio_vdev_io_bypass(dio);
 672  671                  zio_execute(dio);
 673  672          }
 674  673          mutex_enter(&vq->vq_lock);
 675  674  
 676  675          return (aio);
 677  676  }
 678  677  
 679  678  static zio_t *
 680  679  vdev_queue_io_to_issue(vdev_queue_t *vq)
 681  680  {
 682  681          zio_t *zio, *aio;
 683  682          zio_priority_t p;
 684  683          avl_index_t idx;
 685  684          avl_tree_t *tree;
 686  685          zio_t search;
 687  686  
 688  687  again:
 689  688          ASSERT(MUTEX_HELD(&vq->vq_lock));
 690  689  
 691  690          p = vdev_queue_class_to_issue(vq);
 692  691  
 693  692          if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
 694  693                  /* No eligible queued i/os */
 695  694                  return (NULL);
 696  695          }
 697  696  
 698  697          /*
 699  698           * For LBA-ordered queues (async / scrub / initializing), issue the
 700  699           * i/o which follows the most recently issued i/o in LBA (offset) order.
 701  700           *
 702  701           * For FIFO queues (sync), issue the i/o with the lowest timestamp.
 703  702           */
 704  703          tree = vdev_queue_class_tree(vq, p);
 705  704          search.io_timestamp = 0;
 706  705          search.io_offset = vq->vq_last_offset - 1;
 707  706          VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
 708  707          zio = avl_nearest(tree, idx, AVL_AFTER);
 709  708          if (zio == NULL)
 710  709                  zio = avl_first(tree);
 711  710          ASSERT3U(zio->io_priority, ==, p);
 712  711  
 713  712          aio = vdev_queue_aggregate(vq, zio);
 714  713          if (aio != NULL)
 715  714                  zio = aio;
 716  715          else
 717  716                  vdev_queue_io_remove(vq, zio);
 718  717  
 719  718          /*
 720  719           * If the I/O is or was optional and therefore has no data, we need to
 721  720           * simply discard it. We need to drop the vdev queue's lock to avoid a
 722  721           * deadlock that we could encounter since this I/O will complete
 723  722           * immediately.
 724  723           */
 725  724          if (zio->io_flags & ZIO_FLAG_NODATA) {
 726  725                  mutex_exit(&vq->vq_lock);
 727  726                  zio_vdev_io_bypass(zio);
 728  727                  zio_execute(zio);
 729  728                  mutex_enter(&vq->vq_lock);
 730  729                  goto again;
 731  730          }
 732  731  
 733  732          vdev_queue_pending_add(vq, zio);
 734  733          vq->vq_last_offset = zio->io_offset + zio->io_size;
 735  734  
 736  735          return (zio);
 737  736  }
 738  737  
 739  738  zio_t *
 740  739  vdev_queue_io(zio_t *zio)
 741  740  {
 742  741          vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 743  742          zio_t *nio;
 744  743  
 745  744          if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 746  745                  return (zio);
 747  746  
 748  747          /*
 749  748           * Children i/os inherent their parent's priority, which might
 750  749           * not match the child's i/o type.  Fix it up here.
 751  750           */
 752  751          if (zio->io_type == ZIO_TYPE_READ) {
 753  752                  if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 754  753                      zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 755  754                      zio->io_priority != ZIO_PRIORITY_SCRUB &&
 756  755                      zio->io_priority != ZIO_PRIORITY_REMOVAL &&
 757  756                      zio->io_priority != ZIO_PRIORITY_INITIALIZING)
 758  757                          zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 759  758          } else {
 760  759                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 761  760                  if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 762  761                      zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
 763  762                      zio->io_priority != ZIO_PRIORITY_REMOVAL &&
 764  763                      zio->io_priority != ZIO_PRIORITY_INITIALIZING)
 765  764                          zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 766  765          }
 767  766  
 768  767          zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 769  768  
 770  769          mutex_enter(&vq->vq_lock);
 771  770          zio->io_timestamp = gethrtime();
 772  771          vdev_queue_io_add(vq, zio);
 773  772          nio = vdev_queue_io_to_issue(vq);
 774  773          mutex_exit(&vq->vq_lock);
 775  774  
 776  775          if (nio == NULL)
 777  776                  return (NULL);
 778  777  
 779  778          if (nio->io_done == vdev_queue_agg_io_done) {
 780  779                  zio_nowait(nio);
 781  780                  return (NULL);
 782  781          }
 783  782  
 784  783          return (nio);
 785  784  }
 786  785  
 787  786  void
 788  787  vdev_queue_io_done(zio_t *zio)
 789  788  {
 790  789          vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 791  790          zio_t *nio;
 792  791  
 793  792          mutex_enter(&vq->vq_lock);
 794  793  
 795  794          vdev_queue_pending_remove(vq, zio);
 796  795  
 797  796          vq->vq_io_complete_ts = gethrtime();
 798  797  
 799  798          while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 800  799                  mutex_exit(&vq->vq_lock);
 801  800                  if (nio->io_done == vdev_queue_agg_io_done) {
 802  801                          zio_nowait(nio);
 803  802                  } else {
 804  803                          zio_vdev_io_reissue(nio);
 805  804                          zio_execute(nio);
 806  805                  }
 807  806                  mutex_enter(&vq->vq_lock);
 808  807          }
 809  808  
 810  809          mutex_exit(&vq->vq_lock);
 811  810  }
 812  811  
 813  812  void
 814  813  vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 815  814  {
 816  815          vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 817  816          avl_tree_t *tree;
 818  817  
 819  818          /*
 820  819           * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
 821  820           * code to issue IOs without adding them to the vdev queue. In this
 822  821           * case, the zio is already going to be issued as quickly as possible
 823  822           * and so it doesn't need any reprioitization to help.
 824  823           */
 825  824          if (zio->io_priority == ZIO_PRIORITY_NOW)
 826  825                  return;
 827  826  
 828  827          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 829  828          ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 830  829  
 831  830          if (zio->io_type == ZIO_TYPE_READ) {
 832  831                  if (priority != ZIO_PRIORITY_SYNC_READ &&
 833  832                      priority != ZIO_PRIORITY_ASYNC_READ &&
 834  833                      priority != ZIO_PRIORITY_SCRUB)
 835  834                          priority = ZIO_PRIORITY_ASYNC_READ;
 836  835          } else {
 837  836                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 838  837                  if (priority != ZIO_PRIORITY_SYNC_WRITE &&
 839  838                      priority != ZIO_PRIORITY_ASYNC_WRITE)
 840  839                          priority = ZIO_PRIORITY_ASYNC_WRITE;
 841  840          }
 842  841  
 843  842          mutex_enter(&vq->vq_lock);
 844  843  
 845  844          /*
 846  845           * If the zio is in none of the queues we can simply change
 847  846           * the priority. If the zio is waiting to be submitted we must
 848  847           * remove it from the queue and re-insert it with the new priority.
 849  848           * Otherwise, the zio is currently active and we cannot change its
 850  849           * priority.
 851  850           */
 852  851          tree = vdev_queue_class_tree(vq, zio->io_priority);
 853  852          if (avl_find(tree, zio, NULL) == zio) {
 854  853                  spa_t *spa = zio->io_spa;
 855  854                  zio_priority_t oldpri = zio->io_priority;
 856  855  
 857  856                  avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 858  857                  zio->io_priority = priority;
 859  858                  avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 860  859  
 861  860                  mutex_enter(&spa->spa_iokstat_lock);
 862  861                  ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0);
 863  862                  spa->spa_queue_stats[oldpri].spa_queued--;
 864  863                  spa->spa_queue_stats[zio->io_priority].spa_queued++;
 865  864                  mutex_exit(&spa->spa_iokstat_lock);
 866  865          } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
 867  866                  zio->io_priority = priority;
 868  867          }
 869  868  
 870  869          mutex_exit(&vq->vq_lock);
 871  870  }
 872  871  
 873  872  /*
 874  873   * As these two methods are only used for load calculations we're not
 875  874   * concerned if we get an incorrect value on 32bit platforms due to lack of
 876  875   * vq_lock mutex use here, instead we prefer to keep it lock free for
 877  876   * performance.
 878  877   */
 879  878  int
 880  879  vdev_queue_length(vdev_t *vd)
 881  880  {
 882  881          return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
 883  882  }
 884  883  
 885  884  uint64_t
 886  885  vdev_queue_last_offset(vdev_t *vd)
 887  886  {
 888  887          return (vd->vdev_queue.vq_last_offset);
 889  888  }

↓ open down ↓

408 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX