414 size += rm->rm_col[c].rc_size;
415
416 buf = rm->rm_datacopy = zio_buf_alloc(size);
417
418 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 raidz_col_t *col = &rm->rm_col[c];
420
421 bcopy(col->rc_data, buf, col->rc_size);
422 col->rc_data = buf;
423
424 buf += col->rc_size;
425 }
426 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 }
428
429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
430 vdev_raidz_map_free_vsd,
431 vdev_raidz_cksum_report
432 };
433
434 static raidz_map_t *
435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
436 uint64_t nparity)
437 {
438 raidz_map_t *rm;
439 uint64_t b = zio->io_offset >> unit_shift;
440 uint64_t s = zio->io_size >> unit_shift;
441 uint64_t f = b % dcols;
442 uint64_t o = (b / dcols) << unit_shift;
443 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
444
445 q = s / (dcols - nparity);
446 r = s - q * (dcols - nparity);
447 bc = (r == 0 ? 0 : r + nparity);
448 tot = s + nparity * (q + (r == 0 ? 0 : 1));
449
450 if (q == 0) {
451 acols = bc;
452 scols = MIN(dcols, roundup(bc, nparity + 1));
453 } else {
454 acols = dcols;
455 scols = dcols;
456 }
457
458 ASSERT3U(acols, <=, scols);
459
460 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
461
462 rm->rm_cols = acols;
463 rm->rm_scols = scols;
464 rm->rm_bigcols = bc;
465 rm->rm_skipstart = bc;
466 rm->rm_missingdata = 0;
467 rm->rm_missingparity = 0;
468 rm->rm_firstdatacol = nparity;
469 rm->rm_datacopy = NULL;
470 rm->rm_reports = 0;
1504 uint64_t cols = vd->vdev_children;
1505 uint64_t nparity = vd->vdev_nparity;
1506
1507 asize = ((psize - 1) >> ashift) + 1;
1508 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1509 asize = roundup(asize, nparity + 1) << ashift;
1510
1511 return (asize);
1512 }
1513
1514 static void
1515 vdev_raidz_child_done(zio_t *zio)
1516 {
1517 raidz_col_t *rc = zio->io_private;
1518
1519 rc->rc_error = zio->io_error;
1520 rc->rc_tried = 1;
1521 rc->rc_skipped = 0;
1522 }
1523
1524 static int
1525 vdev_raidz_io_start(zio_t *zio)
1526 {
1527 vdev_t *vd = zio->io_vd;
1528 vdev_t *tvd = vd->vdev_top;
1529 vdev_t *cvd;
1530 raidz_map_t *rm;
1531 raidz_col_t *rc;
1532 int c, i;
1533
1534 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1535 vd->vdev_nparity);
1536
1537 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1538
1539 if (zio->io_type == ZIO_TYPE_WRITE) {
1540 vdev_raidz_generate_parity(rm);
1541
1542 for (c = 0; c < rm->rm_cols; c++) {
1543 rc = &rm->rm_col[c];
1844 */
1845 for (c = tgts[current - 1] + 1;
1846 rm->rm_col[c].rc_error != 0; c++)
1847 continue;
1848
1849 tgts[current] = c;
1850 current++;
1851
1852 } while (current != n);
1853 }
1854 }
1855 n--;
1856 done:
1857 for (i = 0; i < n; i++) {
1858 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1859 }
1860
1861 return (ret);
1862 }
1863
1864 static void
1865 vdev_raidz_io_done(zio_t *zio)
1866 {
1867 vdev_t *vd = zio->io_vd;
1868 vdev_t *cvd;
1869 raidz_map_t *rm = zio->io_vsd;
1870 raidz_col_t *rc;
1871 int unexpected_errors = 0;
1872 int parity_errors = 0;
1873 int parity_untried = 0;
1874 int data_errors = 0;
1875 int total_errors = 0;
1876 int n, c;
1877 int tgts[VDEV_RAIDZ_MAXPARITY];
1878 int code;
1879
1880 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1881
1882 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1883 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
|
414 size += rm->rm_col[c].rc_size;
415
416 buf = rm->rm_datacopy = zio_buf_alloc(size);
417
418 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 raidz_col_t *col = &rm->rm_col[c];
420
421 bcopy(col->rc_data, buf, col->rc_size);
422 col->rc_data = buf;
423
424 buf += col->rc_size;
425 }
426 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 }
428
429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
430 vdev_raidz_map_free_vsd,
431 vdev_raidz_cksum_report
432 };
433
434 /*
435 * Divides the IO evenly across all child vdevs; usually, dcols is
436 * the number of children in the target vdev.
437 */
438 static raidz_map_t *
439 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
440 uint64_t nparity)
441 {
442 raidz_map_t *rm;
443 /* The starting RAIDZ (parent) vdev sector of the block. */
444 uint64_t b = zio->io_offset >> unit_shift;
445 /* The zio's size in units of the vdev's minimum sector size */
446 uint64_t s = zio->io_size >> unit_shift;
447 /* The first column for this stripe. */
448 uint64_t f = b % dcols;
449 /* The starting byte offset on each child vdev. */
450 uint64_t o = (b / dcols) << unit_shift;
451 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
452
453 /*
454 * "Quotient": The number of data sectors for this stripe on all but
455 * the "big column" child vdevs that also contain "remainder" data.
456 */
457 q = s / (dcols - nparity);
458
459 /*
460 * "Remainder": The number of partial stripe data sectors in this I/O.
461 * This will add a sector to some, but not all, child vdevs.
462 */
463 r = s - q * (dcols - nparity);
464
465 /* The number of "big columns" - those which contain remainder data. */
466 bc = (r == 0 ? 0 : r + nparity);
467
468 /*
469 * The total number of data and parity sectors associated with
470 * this I/O.
471 */
472 tot = s + nparity * (q + (r == 0 ? 0 : 1));
473
474 /* acols: The columns that will be accessed. */
475 /* scols: The columns that will be accessed or skipped. */
476 if (q == 0) {
477 /* Our I/O request doesn't span all child vdevs. */
478 acols = bc;
479 scols = MIN(dcols, roundup(bc, nparity + 1));
480 } else {
481 acols = dcols;
482 scols = dcols;
483 }
484
485 ASSERT3U(acols, <=, scols);
486
487 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
488
489 rm->rm_cols = acols;
490 rm->rm_scols = scols;
491 rm->rm_bigcols = bc;
492 rm->rm_skipstart = bc;
493 rm->rm_missingdata = 0;
494 rm->rm_missingparity = 0;
495 rm->rm_firstdatacol = nparity;
496 rm->rm_datacopy = NULL;
497 rm->rm_reports = 0;
1531 uint64_t cols = vd->vdev_children;
1532 uint64_t nparity = vd->vdev_nparity;
1533
1534 asize = ((psize - 1) >> ashift) + 1;
1535 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1536 asize = roundup(asize, nparity + 1) << ashift;
1537
1538 return (asize);
1539 }
1540
1541 static void
1542 vdev_raidz_child_done(zio_t *zio)
1543 {
1544 raidz_col_t *rc = zio->io_private;
1545
1546 rc->rc_error = zio->io_error;
1547 rc->rc_tried = 1;
1548 rc->rc_skipped = 0;
1549 }
1550
1551 /*
1552 * Start an IO operation on a RAIDZ VDev
1553 *
1554 * Outline:
1555 * - For write operations:
1556 * 1. Generate the parity data
1557 * 2. Create child zio write operations to each column's vdev, for both
1558 * data and parity.
1559 * 3. If the column skips any sectors for padding, create optional dummy
1560 * write zio children for those areas to improve aggregation continuity.
1561 * - For read operations:
1562 * 1. Create child zio read operations to each data column's vdev to read
1563 * the range of data required for zio.
1564 * 2. If this is a scrub or resilver operation, or if any of the data
1565 * vdevs have had errors, then create zio read operations to the parity
1566 * columns' VDevs as well.
1567 */
1568 static int
1569 vdev_raidz_io_start(zio_t *zio)
1570 {
1571 vdev_t *vd = zio->io_vd;
1572 vdev_t *tvd = vd->vdev_top;
1573 vdev_t *cvd;
1574 raidz_map_t *rm;
1575 raidz_col_t *rc;
1576 int c, i;
1577
1578 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1579 vd->vdev_nparity);
1580
1581 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1582
1583 if (zio->io_type == ZIO_TYPE_WRITE) {
1584 vdev_raidz_generate_parity(rm);
1585
1586 for (c = 0; c < rm->rm_cols; c++) {
1587 rc = &rm->rm_col[c];
1888 */
1889 for (c = tgts[current - 1] + 1;
1890 rm->rm_col[c].rc_error != 0; c++)
1891 continue;
1892
1893 tgts[current] = c;
1894 current++;
1895
1896 } while (current != n);
1897 }
1898 }
1899 n--;
1900 done:
1901 for (i = 0; i < n; i++) {
1902 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1903 }
1904
1905 return (ret);
1906 }
1907
1908 /*
1909 * Complete an IO operation on a RAIDZ VDev
1910 *
1911 * Outline:
1912 * - For write operations:
1913 * 1. Check for errors on the child IOs.
1914 * 2. Return, setting an error code if too few child VDevs were written
1915 * to reconstruct the data later. Note that partial writes are
1916 * considered successful if they can be reconstructed at all.
1917 * - For read operations:
1918 * 1. Check for errors on the child IOs.
1919 * 2. If data errors occurred:
1920 * a. Try to reassemble the data from the parity available.
1921 * b. If we haven't yet read the parity drives, read them now.
1922 * c. If all parity drives have been read but the data still doesn't
1923 * reassemble with a correct checksum, then try combinatorial
1924 * reconstruction.
1925 * d. If that doesn't work, return an error.
1926 * 3. If there were unexpected errors or this is a resilver operation,
1927 * rewrite the vdevs that had errors.
1928 */
1929 static void
1930 vdev_raidz_io_done(zio_t *zio)
1931 {
1932 vdev_t *vd = zio->io_vd;
1933 vdev_t *cvd;
1934 raidz_map_t *rm = zio->io_vsd;
1935 raidz_col_t *rc;
1936 int unexpected_errors = 0;
1937 int parity_errors = 0;
1938 int parity_untried = 0;
1939 int data_errors = 0;
1940 int total_errors = 0;
1941 int n, c;
1942 int tgts[VDEV_RAIDZ_MAXPARITY];
1943 int code;
1944
1945 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1946
1947 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1948 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
|