Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dbuf.c
          +++ new/usr/src/uts/common/fs/zfs/dbuf.c
↓ open down ↓ 833 lines elided ↑ open up ↑
 834  834                  /*
 835  835                   * If we are receiving, we expect there to be no dbufs in
 836  836                   * the range to be freed, because receive modifies each
 837  837                   * block at most once, and in offset order.  If this is
 838  838                   * not the case, it can lead to performance problems,
 839  839                   * so note that we unexpectedly took the slow path.
 840  840                   */
 841  841                  atomic_inc_64(&zfs_free_range_recv_miss);
 842  842          }
 843  843  
 844      -        for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
      844 +        for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
 845  845                  db_next = list_next(&dn->dn_dbufs, db);
 846  846                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 847  847  
 848  848                  if (db->db_level == 1 &&
 849  849                      db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
 850  850                          mutex_enter(&db->db_mtx);
 851  851                          if (db->db_last_dirty &&
 852  852                              db->db_last_dirty->dr_txg < txg) {
 853  853                                  dbuf_add_ref(db, FTAG);
 854  854                                  mutex_exit(&db->db_mtx);
↓ open down ↓ 325 lines elided ↑ open up ↑
1180 1180                          }
1181 1181                          ASSERT(data_old != NULL);
1182 1182                  }
1183 1183                  dr->dt.dl.dr_data = data_old;
1184 1184          } else {
1185 1185                  mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1186 1186                  list_create(&dr->dt.di.dr_children,
1187 1187                      sizeof (dbuf_dirty_record_t),
1188 1188                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
1189 1189          }
     1190 +        if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
     1191 +                dr->dr_accounted = db->db.db_size;
1190 1192          dr->dr_dbuf = db;
1191 1193          dr->dr_txg = tx->tx_txg;
1192 1194          dr->dr_next = *drp;
1193 1195          *drp = dr;
1194 1196  
1195 1197          /*
1196 1198           * We could have been freed_in_flight between the dbuf_noread
1197 1199           * and dbuf_dirty.  We win, as though the dbuf_noread() had
1198 1200           * happened after the free.
1199 1201           */
↓ open down ↓ 63 lines elided ↑ open up ↑
1263 1265                          parent_held = TRUE;
1264 1266                  }
1265 1267                  if (drop_struct_lock)
1266 1268                          rw_exit(&dn->dn_struct_rwlock);
1267 1269                  ASSERT3U(db->db_level+1, ==, parent->db_level);
1268 1270                  di = dbuf_dirty(parent, tx);
1269 1271                  if (parent_held)
1270 1272                          dbuf_rele(parent, FTAG);
1271 1273  
1272 1274                  mutex_enter(&db->db_mtx);
1273      -                /*  possible race with dbuf_undirty() */
     1275 +                /*
     1276 +                 * Since we've dropped the mutex, it's possible that
     1277 +                 * dbuf_undirty() might have changed this out from under us.
     1278 +                 */
1274 1279                  if (db->db_last_dirty == dr ||
1275 1280                      dn->dn_object == DMU_META_DNODE_OBJECT) {
1276 1281                          mutex_enter(&di->dt.di.dr_mtx);
1277 1282                          ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1278 1283                          ASSERT(!list_link_active(&dr->dr_dirty_node));
1279 1284                          list_insert_tail(&di->dt.di.dr_children, dr);
1280 1285                          mutex_exit(&di->dt.di.dr_mtx);
1281 1286                          dr->dr_parent = di;
1282 1287                  }
1283 1288                  mutex_exit(&db->db_mtx);
↓ open down ↓ 49 lines elided ↑ open up ↑
 1333 1338           * holders, but it is untested in that scenario, as the ZPL and
1334 1339           * ztest have additional locking (the range locks) that prevents
1335 1340           * that type of concurrent access.
1336 1341           */
1337 1342          ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1338 1343  
1339 1344          dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1340 1345  
1341 1346          ASSERT(db->db.db_size != 0);
1342 1347  
1343      -        /* XXX would be nice to fix up dn_towrite_space[] */
     1348 +        /*
     1349 +         * Any space we accounted for in dp_dirty_* will be cleaned up by
     1350 +         * dsl_pool_sync().  This is relatively rare so the discrepancy
     1351 +         * is not a big deal.
     1352 +         */
1344 1353  
1345 1354          *drp = dr->dr_next;
1346 1355  
1347 1356          /*
1348 1357           * Note that there are three places in dbuf_dirty()
1349 1358           * where this dirty record may be put on a list.
1350 1359           * Make sure to do a list_remove corresponding to
1351 1360           * every one of those list_insert calls.
1352 1361           */
1353 1362          if (dr->dr_parent) {
↓ open down ↓ 159 lines elided ↑ open up ↑
1513 1522          ASSERT(db->db_buf == NULL);
1514 1523          dbuf_set_data(db, buf);
1515 1524          db->db_state = DB_FILL;
1516 1525          mutex_exit(&db->db_mtx);
1517 1526          (void) dbuf_dirty(db, tx);
1518 1527          dbuf_fill_done(db, tx);
1519 1528  }
1520 1529  
1521 1530  /*
1522 1531   * "Clear" the contents of this dbuf.  This will mark the dbuf
1523      - * EVICTING and clear *most* of its references.  Unfortunetely,
     1532 + * EVICTING and clear *most* of its references.  Unfortunately,
1524 1533   * when we are not holding the dn_dbufs_mtx, we can't clear the
1525 1534   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1526 1535   * in this case.  For callers from the DMU we will usually see:
1527 1536   *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1528 1537   * For the arc callback, we will usually see:
1529 1538   *      dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1530 1539   * Sometimes, though, we will get a mix of these two:
1531 1540   *      DMU: dbuf_clear()->arc_buf_evict()
1532 1541   *      ARC: dbuf_do_evict()->dbuf_destroy()
1533 1542   */
↓ open down ↓ 166 lines elided ↑ open up ↑
1700 1709                  db->db_state = DB_UNCACHED;
1701 1710                  /* the bonus dbuf is not placed in the hash table */
1702 1711                  arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1703 1712                  return (db);
1704 1713          } else if (blkid == DMU_SPILL_BLKID) {
1705 1714                  db->db.db_size = (blkptr != NULL) ?
1706 1715                      BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1707 1716                  db->db.db_offset = 0;
1708 1717          } else {
1709 1718                  int blocksize =
1710      -                    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
     1719 +                    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1711 1720                  db->db.db_size = blocksize;
1712 1721                  db->db.db_offset = db->db_blkid * blocksize;
1713 1722          }
1714 1723  
1715 1724          /*
1716 1725           * Hold the dn_dbufs_mtx while we get the new dbuf
1717 1726           * in the hash table *and* added to the dbufs list.
1718 1727           * This prevents a possible deadlock with someone
1719 1728           * trying to look up this dbuf before its added to the
1720 1729           * dn_dbufs list.
↓ open down ↓ 88 lines elided ↑ open up ↑
1809 1818          ASSERT(db->db.db_data == NULL);
1810 1819          ASSERT(db->db_hash_next == NULL);
1811 1820          ASSERT(db->db_blkptr == NULL);
1812 1821          ASSERT(db->db_data_pending == NULL);
1813 1822  
1814 1823          kmem_cache_free(dbuf_cache, db);
1815 1824          arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1816 1825  }
1817 1826  
1818 1827  void
1819      -dbuf_prefetch(dnode_t *dn, uint64_t blkid)
     1828 +dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1820 1829  {
1821 1830          dmu_buf_impl_t *db = NULL;
1822 1831          blkptr_t *bp = NULL;
1823 1832  
1824 1833          ASSERT(blkid != DMU_BONUS_BLKID);
1825 1834          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1826 1835  
1827 1836          if (dnode_block_freed(dn, blkid))
1828 1837                  return;
1829 1838  
↓ open down ↓ 3 lines elided ↑ open up ↑
1833 1842                   * This dbuf is already in the cache.  We assume that
1834 1843                   * it is already CACHED, or else about to be either
1835 1844                   * read or filled.
1836 1845                   */
1837 1846                  mutex_exit(&db->db_mtx);
1838 1847                  return;
1839 1848          }
1840 1849  
1841 1850          if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1842 1851                  if (bp && !BP_IS_HOLE(bp)) {
1843      -                        int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1844      -                            ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1845 1852                          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1846 1853                          uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1847 1854                          zbookmark_t zb;
1848 1855  
1849 1856                          SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1850 1857                              dn->dn_object, 0, blkid);
1851 1858  
1852 1859                          (void) arc_read(NULL, dn->dn_objset->os_spa,
1853      -                            bp, NULL, NULL, priority,
     1860 +                            bp, NULL, NULL, prio,
1854 1861                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1855 1862                              &aflags, &zb);
1856 1863                  }
1857 1864                  if (db)
1858 1865                          dbuf_rele(db, NULL);
1859 1866          }
1860 1867  }
1861 1868  
1862 1869  /*
1863 1870   * Returns with db_holds incremented, and db_mtx not held.
↓ open down ↓ 660 lines elided ↑ open up ↑
2524 2531                          fill += ibp->blk_fill;
2525 2532                  }
2526 2533          }
2527 2534          DB_DNODE_EXIT(db);
2528 2535  
2529 2536          bp->blk_fill = fill;
2530 2537  
2531 2538          mutex_exit(&db->db_mtx);
2532 2539  }
2533 2540  
     2541 +/*
     2542 + * The SPA will call this callback several times for each zio - once
     2543 + * for every physical child i/o (zio->io_phys_children times).  This
     2544 + * allows the DMU to monitor the progress of each logical i/o.  For example,
     2545 + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
     2546 + * block.  There may be a long delay before all copies/fragments are completed,
     2547 + * so this callback allows us to retire dirty space gradually, as the physical
     2548 + * i/os complete.
     2549 + */
 2534 2550  /* ARGSUSED */
 2535 2551  static void
     2552 +dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
     2553 +{
                /* buf is unused here (hence ARGSUSED); arg is the dirty dmu_buf_impl_t. */
     2554 +        dmu_buf_impl_t *db = arg;
     2555 +        objset_t *os = db->db_objset;
     2556 +        dsl_pool_t *dp = dmu_objset_pool(os);
     2557 +        dbuf_dirty_record_t *dr;
                /* NOTE(review): the = 0 initializer is redundant; delta is
                 * unconditionally assigned below before its only use. */
     2558 +        int delta = 0;
     2559 +
     2560 +        dr = db->db_data_pending;
                /* The pending dirty record must belong to the txg this zio is syncing. */
     2561 +        ASSERT3U(dr->dr_txg, ==, zio->io_txg);
     2562 +
     2563 +        /*
     2564 +         * The callback will be called io_phys_children times.  Retire one
     2565 +         * portion of our dirty space each time we are called.  Any rounding
     2566 +         * error will be cleaned up by dsl_pool_sync()'s call to
     2567 +         * dsl_pool_undirty_space().
     2568 +         */
     2569 +        delta = dr->dr_accounted / zio->io_phys_children;
     2570 +        dsl_pool_undirty_space(dp, delta, zio->io_txg);
     2571 +}
     2572 +
     2573 +/* ARGSUSED */
     2574 +static void
2536 2575  dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2537 2576  {
2538 2577          dmu_buf_impl_t *db = vdb;
2539 2578          blkptr_t *bp = zio->io_bp;
2540 2579          blkptr_t *bp_orig = &zio->io_bp_orig;
2541 2580          uint64_t txg = zio->io_txg;
2542 2581          dbuf_dirty_record_t **drp, *dr;
2543 2582  
2544 2583          ASSERT0(zio->io_error);
2545 2584          ASSERT(db->db_blkptr == bp);
↓ open down ↓ 72 lines elided ↑ open up ↑
2618 2657                  DB_DNODE_EXIT(db);
2619 2658                  mutex_destroy(&dr->dt.di.dr_mtx);
2620 2659                  list_destroy(&dr->dt.di.dr_children);
2621 2660          }
2622 2661          kmem_free(dr, sizeof (dbuf_dirty_record_t));
2623 2662  
2624 2663          cv_broadcast(&db->db_changed);
2625 2664          ASSERT(db->db_dirtycnt > 0);
2626 2665          db->db_dirtycnt -= 1;
2627 2666          db->db_data_pending = NULL;
     2667 +
2628 2668          dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2629 2669  }
2630 2670  
            /*
             * zio ready-callback for NOFILL dbufs: forwards to dbuf_write_ready()
             * with a NULL arc buf, passing the dbuf stashed in zio->io_private.
             */
 2631 2671  static void
 2632 2672  dbuf_write_nofill_ready(zio_t *zio)
 2633 2673  {
 2634 2674          dbuf_write_ready(zio, NULL, zio->io_private);
 2635 2675  }
2636 2676  
2637 2677  static void
↓ open down ↓ 98 lines elided ↑ open up ↑
2736 2776                  wp_flag = WP_SPILL;
2737 2777          wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2738 2778  
2739 2779          dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2740 2780          DB_DNODE_EXIT(db);
2741 2781  
2742 2782          if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2743 2783                  ASSERT(db->db_state != DB_NOFILL);
2744 2784                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
2745 2785                      db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2746      -                    dbuf_write_override_ready, dbuf_write_override_done, dr,
2747      -                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
     2786 +                    dbuf_write_override_ready, NULL, dbuf_write_override_done,
     2787 +                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2748 2788                  mutex_enter(&db->db_mtx);
2749 2789                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2750 2790                  zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2751 2791                      dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2752 2792                  mutex_exit(&db->db_mtx);
2753 2793          } else if (db->db_state == DB_NOFILL) {
2754 2794                  ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2755 2795                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
2756 2796                      db->db_blkptr, NULL, db->db.db_size, &zp,
2757      -                    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
     2797 +                    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2758 2798                      ZIO_PRIORITY_ASYNC_WRITE,
2759 2799                      ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2760 2800          } else {
2761 2801                  ASSERT(arc_released(data));
2762 2802                  dr->dr_zio = arc_write(zio, os->os_spa, txg,
2763 2803                      db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2764 2804                      DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2765      -                    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
2766      -                    ZIO_FLAG_MUSTSUCCEED, &zb);
     2805 +                    dbuf_write_physdone, dbuf_write_done, db,
     2806 +                    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2767 2807          }
2768 2808  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX