Print this page
3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>


 414                 size += rm->rm_col[c].rc_size;
 415 
 416         buf = rm->rm_datacopy = zio_buf_alloc(size);
 417 
 418         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 419                 raidz_col_t *col = &rm->rm_col[c];
 420 
 421                 bcopy(col->rc_data, buf, col->rc_size);
 422                 col->rc_data = buf;
 423 
 424                 buf += col->rc_size;
 425         }
 426         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 427 }
 428 
 429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 430         vdev_raidz_map_free_vsd,  /* raidz vsd hooks, initialized positionally */
 431         vdev_raidz_cksum_report   /* NOTE(review): member order not visible here; confirm */
 432 };
 433 




 434 static raidz_map_t *
 435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 436     uint64_t nparity)
 437 {
 438         raidz_map_t *rm;

 439         uint64_t b = zio->io_offset >> unit_shift;

 440         uint64_t s = zio->io_size >> unit_shift;

 441         uint64_t f = b % dcols;

 442         uint64_t o = (b / dcols) << unit_shift;
 443         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 444 




 445         q = s / (dcols - nparity);





 446         r = s - q * (dcols - nparity);


 447         bc = (r == 0 ? 0 : r + nparity);





 448         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 449 


 450         if (q == 0) {

 451                 acols = bc;
 452                 scols = MIN(dcols, roundup(bc, nparity + 1));
 453         } else {
 454                 acols = dcols;
 455                 scols = dcols;
 456         }
 457 
 458         ASSERT3U(acols, <=, scols);
 459 
 460         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 461 
 462         rm->rm_cols = acols;
 463         rm->rm_scols = scols;
 464         rm->rm_bigcols = bc;
 465         rm->rm_skipstart = bc;
 466         rm->rm_missingdata = 0;
 467         rm->rm_missingparity = 0;
 468         rm->rm_firstdatacol = nparity;
 469         rm->rm_datacopy = NULL;
 470         rm->rm_reports = 0;


1504         uint64_t cols = vd->vdev_children;
1505         uint64_t nparity = vd->vdev_nparity;
1506 
1507         asize = ((psize - 1) >> ashift) + 1;
1508         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1509         asize = roundup(asize, nparity + 1) << ashift;
1510 
1511         return (asize);
1512 }
1513 
1514 static void
1515 vdev_raidz_child_done(zio_t *zio)
1516 {
1517         raidz_col_t *rc = zio->io_private;  /* column record carried by the zio */
1518         /* Record this zio's outcome in its column; presumably a done callback. */
1519         rc->rc_error = zio->io_error;       /* copy the zio's error code */
1520         rc->rc_tried = 1;                   /* flag the column as tried */
1521         rc->rc_skipped = 0;                 /* clear the skipped flag */
1522 }
1523 

















1524 static int
1525 vdev_raidz_io_start(zio_t *zio)
1526 {
1527         vdev_t *vd = zio->io_vd;
1528         vdev_t *tvd = vd->vdev_top;
1529         vdev_t *cvd;
1530         raidz_map_t *rm;
1531         raidz_col_t *rc;
1532         int c, i;
1533 
1534         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1535             vd->vdev_nparity);
1536 
1537         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1538 
1539         if (zio->io_type == ZIO_TYPE_WRITE) {
1540                 vdev_raidz_generate_parity(rm);
1541 
1542                 for (c = 0; c < rm->rm_cols; c++) {
1543                         rc = &rm->rm_col[c];


1844                                  */
1845                                 for (c = tgts[current - 1] + 1;
1846                                     rm->rm_col[c].rc_error != 0; c++)
1847                                         continue;
1848 
1849                                 tgts[current] = c;
1850                                 current++;
1851 
1852                         } while (current != n);
1853                 }
1854         }
1855         n--;
1856 done:
1857         for (i = 0; i < n; i++) {
1858                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1859         }
1860 
1861         return (ret);
1862 }
1863 





















1864 static void
1865 vdev_raidz_io_done(zio_t *zio)
1866 {
1867         vdev_t *vd = zio->io_vd;
1868         vdev_t *cvd;
1869         raidz_map_t *rm = zio->io_vsd;
1870         raidz_col_t *rc;
1871         int unexpected_errors = 0;
1872         int parity_errors = 0;
1873         int parity_untried = 0;
1874         int data_errors = 0;
1875         int total_errors = 0;
1876         int n, c;
1877         int tgts[VDEV_RAIDZ_MAXPARITY];
1878         int code;
1879 
1880         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1881 
1882         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1883         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);




 414                 size += rm->rm_col[c].rc_size;
 415 
 416         buf = rm->rm_datacopy = zio_buf_alloc(size);
 417 
 418         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 419                 raidz_col_t *col = &rm->rm_col[c];
 420 
 421                 bcopy(col->rc_data, buf, col->rc_size);
 422                 col->rc_data = buf;
 423 
 424                 buf += col->rc_size;
 425         }
 426         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 427 }
 428 
 429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 430         vdev_raidz_map_free_vsd,  /* raidz vsd hooks, initialized positionally */
 431         vdev_raidz_cksum_report   /* NOTE(review): member order not visible here; confirm */
 432 };
 433 
 434 /*
 435  * Divides the IO evenly across all child vdevs; usually, dcols is
 436  * the number of children in the target vdev.
 437  */
 438 static raidz_map_t *
 439 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 440     uint64_t nparity)
 441 {
 442         raidz_map_t *rm;
 443         /* The starting RAIDZ (parent) vdev sector of the block. */
 444         uint64_t b = zio->io_offset >> unit_shift;
 445         /* The zio's size in units of the vdev's minimum sector size. */
 446         uint64_t s = zio->io_size >> unit_shift;
 447         /* The first column for this stripe. */
 448         uint64_t f = b % dcols;
 449         /* The starting byte offset on each child vdev. */
 450         uint64_t o = (b / dcols) << unit_shift;
 451         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 452 
 453         /*
 454          * "Quotient": The number of data sectors for this stripe on all but
 455          * the "big column" child vdevs that also contain "remainder" data.
 456          */
 457         q = s / (dcols - nparity);
 458 
 459         /*
 460          * "Remainder": The number of partial stripe data sectors in this I/O.
 461          * This will add a sector to some, but not all, child vdevs.
 462          */
 463         r = s - q * (dcols - nparity);
 464 
 465         /* The number of "big columns" - those which contain remainder data. */
 466         bc = (r == 0 ? 0 : r + nparity);
 467 
 468         /*
 469          * The total number of data and parity sectors associated with
 470          * this I/O.
 471          */
 472         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 473 
 474         /* acols: The columns that will be accessed. */
 475         /* scols: The columns that will be accessed or skipped. */
 476         if (q == 0) {
 477                 /* Our I/O request doesn't span all child vdevs. */
 478                 acols = bc;
 479                 scols = MIN(dcols, roundup(bc, nparity + 1));
 480         } else {
 481                 acols = dcols;
 482                 scols = dcols;
 483         }
 484 
 485         ASSERT3U(acols, <=, scols);
 486 
 487         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 488 
 489         rm->rm_cols = acols;
 490         rm->rm_scols = scols;
 491         rm->rm_bigcols = bc;
 492         rm->rm_skipstart = bc;
 493         rm->rm_missingdata = 0;
 494         rm->rm_missingparity = 0;
 495         rm->rm_firstdatacol = nparity;
 496         rm->rm_datacopy = NULL;
 497         rm->rm_reports = 0;


1531         uint64_t cols = vd->vdev_children;
1532         uint64_t nparity = vd->vdev_nparity;
1533 
1534         asize = ((psize - 1) >> ashift) + 1;
1535         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1536         asize = roundup(asize, nparity + 1) << ashift;
1537 
1538         return (asize);
1539 }
1540 
1541 static void
1542 vdev_raidz_child_done(zio_t *zio)
1543 {
1544         raidz_col_t *rc = zio->io_private;  /* column record carried by the zio */
1545         /* Record this zio's outcome in its column; presumably a done callback. */
1546         rc->rc_error = zio->io_error;       /* copy the zio's error code */
1547         rc->rc_tried = 1;                   /* flag the column as tried */
1548         rc->rc_skipped = 0;                 /* clear the skipped flag */
1549 }
1550 
1551 /*
1552  * Start an IO operation on a RAIDZ VDev
1553  *
1554  * Outline:
1555  * - For write operations:
1556  *   1. Generate the parity data
1557  *   2. Create child zio write operations to each column's vdev, for both
1558  *      data and parity.
1559  *   3. If the column skips any sectors for padding, create optional dummy
1560  *      write zio children for those areas to improve aggregation continuity.
1561  * - For read operations:
1562  *   1. Create child zio read operations to each data column's vdev to read
1563  *      the range of data required for zio.
1564  *   2. If this is a scrub or resilver operation, or if any of the data
1565  *      vdevs have had errors, then create zio read operations to the parity
1566  *      columns' VDevs as well.
1567  */
1568 static int
1569 vdev_raidz_io_start(zio_t *zio)
1570 {
1571         vdev_t *vd = zio->io_vd;
1572         vdev_t *tvd = vd->vdev_top;
1573         vdev_t *cvd;
1574         raidz_map_t *rm;
1575         raidz_col_t *rc;
1576         int c, i;
1577 
1578         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1579             vd->vdev_nparity);
1580 
1581         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1582 
1583         if (zio->io_type == ZIO_TYPE_WRITE) {
1584                 vdev_raidz_generate_parity(rm);
1585 
1586                 for (c = 0; c < rm->rm_cols; c++) {
1587                         rc = &rm->rm_col[c];


1888                                  */
1889                                 for (c = tgts[current - 1] + 1;
1890                                     rm->rm_col[c].rc_error != 0; c++)
1891                                         continue;
1892 
1893                                 tgts[current] = c;
1894                                 current++;
1895 
1896                         } while (current != n);
1897                 }
1898         }
1899         n--;
1900 done:
1901         for (i = 0; i < n; i++) {
1902                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1903         }
1904 
1905         return (ret);
1906 }
1907 
1908 /*
1909  * Complete an IO operation on a RAIDZ VDev
1910  *
1911  * Outline:
1912  * - For write operations:
1913  *   1. Check for errors on the child IOs.
1914  *   2. Return, setting an error code if so few child VDevs were written
1915  *      that the data cannot be reconstructed later.  Note that partial
1916  *      writes are considered successful if they can be reconstructed at all.
1917  * - For read operations:
1918  *   1. Check for errors on the child IOs.
1919  *   2. If data errors occurred:
1920  *      a. Try to reassemble the data from the parity available.
1921  *      b. If we haven't yet read the parity drives, read them now.
1922  *      c. If all parity drives have been read but the data still doesn't
1923  *         reassemble with a correct checksum, then try combinatorial
1924  *         reconstruction.
1925  *      d. If that doesn't work, return an error.
1926  *   3. If there were unexpected errors or this is a resilver operation,
1927  *      rewrite the vdevs that had errors.
1928  */
1929 static void
1930 vdev_raidz_io_done(zio_t *zio)
1931 {
1932         vdev_t *vd = zio->io_vd;
1933         vdev_t *cvd;
1934         raidz_map_t *rm = zio->io_vsd;
1935         raidz_col_t *rc;
1936         int unexpected_errors = 0;
1937         int parity_errors = 0;
1938         int parity_untried = 0;
1939         int data_errors = 0;
1940         int total_errors = 0;
1941         int n, c;
1942         int tgts[VDEV_RAIDZ_MAXPARITY];
1943         int code;
1944 
1945         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1946 
1947         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1948         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);