Print this page
3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
↓ open down ↓ 423 lines elided ↑ open up ↑
 424  424                  buf += col->rc_size;
 425  425          }
 426  426          ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 427  427  }
 428  428  
 429  429  static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 430  430          vdev_raidz_map_free_vsd,
 431  431          vdev_raidz_cksum_report
 432  432  };
 433  433  
      434 +/*
      435 + * Divides the IO evenly across all child vdevs; usually, dcols is
      436 + * the number of children in the target vdev.
      437 + */
 434  438  static raidz_map_t *
 435  439  vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 436  440      uint64_t nparity)
 437  441  {
 438  442          raidz_map_t *rm;
      443 +        /* The starting RAIDZ (parent) vdev sector of the block. */
 439  444          uint64_t b = zio->io_offset >> unit_shift;
      445 +        /* The zio's size in units of the vdev's minimum sector size. */
 440  446          uint64_t s = zio->io_size >> unit_shift;
      447 +        /* The first column for this stripe. */
 441  448          uint64_t f = b % dcols;
      449 +        /* The starting byte offset on each child vdev. */
 442  450          uint64_t o = (b / dcols) << unit_shift;
 443  451          uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 444  452  
      453 +        /*
      454 +         * "Quotient": The number of data sectors for this stripe on all but
      455 +         * the "big column" child vdevs that also contain "remainder" data.
      456 +         */
 445  457          q = s / (dcols - nparity);
      458 +
      459 +        /*
      460 +         * "Remainder": The number of partial stripe data sectors in this I/O.
      461 +         * This will add a sector to some, but not all, child vdevs.
      462 +         */
 446  463          r = s - q * (dcols - nparity);
      464 +
      465 +        /* The number of "big columns" - those which contain remainder data. */
 447  466          bc = (r == 0 ? 0 : r + nparity);
      467 +
      468 +        /*
      469 +         * The total number of data and parity sectors associated with
      470 +         * this I/O.
      471 +         */
 448  472          tot = s + nparity * (q + (r == 0 ? 0 : 1));
 449  473  
      474 +        /* acols: The columns that will be accessed. */
      475 +        /* scols: The columns that will be accessed or skipped. */
 450  476          if (q == 0) {
      477 +                /* Our I/O request doesn't span all child vdevs. */
 451  478                  acols = bc;
 452  479                  scols = MIN(dcols, roundup(bc, nparity + 1));
 453  480          } else {
 454  481                  acols = dcols;
 455  482                  scols = dcols;
 456  483          }
 457  484  
 458  485          ASSERT3U(acols, <=, scols);
 459  486  
 460  487          rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
↓ open down ↓ 1053 lines elided ↑ open up ↑
1514 1541  static void
1515 1542  vdev_raidz_child_done(zio_t *zio)
1516 1543  {
1517 1544          raidz_col_t *rc = zio->io_private;
1518 1545  
1519 1546          rc->rc_error = zio->io_error;
1520 1547          rc->rc_tried = 1;
1521 1548          rc->rc_skipped = 0;
1522 1549  }
1523 1550  
     1551 +/*
     1552 + * Start an IO operation on a RAIDZ VDev.
     1553 + *
     1554 + * Outline:
     1555 + * - For write operations:
     1556 + *   1. Generate the parity data
     1557 + *   2. Create child zio write operations to each column's vdev, for both
     1558 + *      data and parity.
     1559 + *   3. If the column skips any sectors for padding, create optional dummy
     1560 + *      write zio children for those areas to improve aggregation continuity.
     1561 + * - For read operations:
     1562 + *   1. Create child zio read operations to each data column's vdev to read
     1563 + *      the range of data required for zio.
     1564 + *   2. If this is a scrub or resilver operation, or if any of the data
     1565 + *      vdevs have had errors, then create zio read operations to the parity
     1566 + *      columns' VDevs as well.
     1567 + */
1524 1568  static int
1525 1569  vdev_raidz_io_start(zio_t *zio)
1526 1570  {
1527 1571          vdev_t *vd = zio->io_vd;
1528 1572          vdev_t *tvd = vd->vdev_top;
1529 1573          vdev_t *cvd;
1530 1574          raidz_map_t *rm;
1531 1575          raidz_col_t *rc;
1532 1576          int c, i;
1533 1577  
↓ open down ↓ 320 lines elided ↑ open up ↑
1854 1898          }
1855 1899          n--;
1856 1900  done:
1857 1901          for (i = 0; i < n; i++) {
1858 1902                  zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1859 1903          }
1860 1904  
1861 1905          return (ret);
1862 1906  }
1863 1907  
     1908 +/*
     1909 + * Complete an IO operation on a RAIDZ VDev.
     1910 + *
     1911 + * Outline:
     1912 + * - For write operations:
     1913 + *   1. Check for errors on the child IOs.
     1914 + *   2. Return, setting an error code if too few child VDevs were written
     1915 + *      to reconstruct the data later.  Note that partial writes are
     1916 + *      considered successful if they can be reconstructed at all.
     1917 + * - For read operations:
     1918 + *   1. Check for errors on the child IOs.
     1919 + *   2. If data errors occurred:
     1920 + *      a. Try to reassemble the data from the parity available.
     1921 + *      b. If we haven't yet read the parity drives, read them now.
     1922 + *      c. If all parity drives have been read but the data still doesn't
     1923 + *         reassemble with a correct checksum, then try combinatorial
     1924 + *         reconstruction.
     1925 + *      d. If that doesn't work, return an error.
     1926 + *   3. If there were unexpected errors or this is a resilver operation,
     1927 + *      rewrite the vdevs that had errors.
     1928 + */
1864 1929  static void
1865 1930  vdev_raidz_io_done(zio_t *zio)
1866 1931  {
1867 1932          vdev_t *vd = zio->io_vd;
1868 1933          vdev_t *cvd;
1869 1934          raidz_map_t *rm = zio->io_vsd;
1870 1935          raidz_col_t *rc;
1871 1936          int unexpected_errors = 0;
1872 1937          int parity_errors = 0;
1873 1938          int parity_untried = 0;
↓ open down ↓ 278 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX