dlpx-os-diff Cdiff usr/src/uts/common/fs/zfs/dbuf.c

Print this page

4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>


*** 839,849 ****
                   * so note that we unexpectedly took the slow path.
                   */
                  atomic_inc_64(&zfs_free_range_recv_miss);
          }
  
!         for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
                  db_next = list_next(&dn->dn_dbufs, db);
                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
  
                  if (db->db_level == 1 &&
                      db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
--- 839,849 ----
                   * so note that we unexpectedly took the slow path.
                   */
                  atomic_inc_64(&zfs_free_range_recv_miss);
          }
  
!         for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
                  db_next = list_next(&dn->dn_dbufs, db);
                  ASSERT(db->db_blkid != DMU_BONUS_BLKID);
  
                  if (db->db_level == 1 &&
                      db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
*** 1185,1194 ****
--- 1185,1196 ----
                  mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
                  list_create(&dr->dt.di.dr_children,
                      sizeof (dbuf_dirty_record_t),
                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
          }
+         if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+                 dr->dr_accounted = db->db.db_size;
          dr->dr_dbuf = db;
          dr->dr_txg = tx->tx_txg;
          dr->dr_next = *drp;
          *drp = dr;
  
*** 1268,1278 ****
                  di = dbuf_dirty(parent, tx);
                  if (parent_held)
                          dbuf_rele(parent, FTAG);
  
                  mutex_enter(&db->db_mtx);
!                 /*  possible race with dbuf_undirty() */
                  if (db->db_last_dirty == dr ||
                      dn->dn_object == DMU_META_DNODE_OBJECT) {
                          mutex_enter(&di->dt.di.dr_mtx);
                          ASSERT3U(di->dr_txg, ==, tx->tx_txg);
                          ASSERT(!list_link_active(&dr->dr_dirty_node));
--- 1270,1283 ----
                  di = dbuf_dirty(parent, tx);
                  if (parent_held)
                          dbuf_rele(parent, FTAG);
  
                  mutex_enter(&db->db_mtx);
!                 /*
!                  * Since we've dropped the mutex, it's possible that
!                  * dbuf_undirty() might have changed this out from under us.
!                  */
                  if (db->db_last_dirty == dr ||
                      dn->dn_object == DMU_META_DNODE_OBJECT) {
                          mutex_enter(&di->dt.di.dr_mtx);
                          ASSERT3U(di->dr_txg, ==, tx->tx_txg);
                          ASSERT(!list_link_active(&dr->dr_dirty_node));
*** 1338,1348 ****
  
          dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
  
          ASSERT(db->db.db_size != 0);
  
!         /* XXX would be nice to fix up dn_towrite_space[] */
  
          *drp = dr->dr_next;
  
          /*
           * Note that there are three places in dbuf_dirty()
--- 1343,1357 ----
  
          dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
  
          ASSERT(db->db.db_size != 0);
  
!         /*
!          * Any space we accounted for in dp_dirty_* will be cleaned up by
!          * dsl_pool_sync().  This is relatively rare so the discrepancy
!          * is not a big deal.
!          */
  
          *drp = dr->dr_next;
  
          /*
           * Note that there are three places in dbuf_dirty()
*** 1518,1528 ****
          dbuf_fill_done(db, tx);
  }
  
  /*
   * "Clear" the contents of this dbuf.  This will mark the dbuf
!  * EVICTING and clear *most* of its references.  Unfortunetely,
   * when we are not holding the dn_dbufs_mtx, we can't clear the
   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
   * in this case.  For callers from the DMU we will usually see:
   *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
   * For the arc callback, we will usually see:
--- 1527,1537 ----
          dbuf_fill_done(db, tx);
  }
  
  /*
   * "Clear" the contents of this dbuf.  This will mark the dbuf
!  * EVICTING and clear *most* of its references.  Unfortunately,
   * when we are not holding the dn_dbufs_mtx, we can't clear the
   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
   * in this case.  For callers from the DMU we will usually see:
   *      dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
   * For the arc callback, we will usually see:
*** 1705,1715 ****
                  db->db.db_size = (blkptr != NULL) ?
                      BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
                  db->db.db_offset = 0;
          } else {
                  int blocksize =
!                     db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
                  db->db.db_size = blocksize;
                  db->db.db_offset = db->db_blkid * blocksize;
          }
  
          /*
--- 1714,1724 ----
                  db->db.db_size = (blkptr != NULL) ?
                      BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
                  db->db.db_offset = 0;
          } else {
                  int blocksize =
!                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
                  db->db.db_size = blocksize;
                  db->db.db_offset = db->db_blkid * blocksize;
          }
  
          /*
*** 1814,1824 ****
          kmem_cache_free(dbuf_cache, db);
          arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
  }
  
  void
! dbuf_prefetch(dnode_t *dn, uint64_t blkid)
  {
          dmu_buf_impl_t *db = NULL;
          blkptr_t *bp = NULL;
  
          ASSERT(blkid != DMU_BONUS_BLKID);
--- 1823,1833 ----
          kmem_cache_free(dbuf_cache, db);
          arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
  }
  
  void
! dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
  {
          dmu_buf_impl_t *db = NULL;
          blkptr_t *bp = NULL;
  
          ASSERT(blkid != DMU_BONUS_BLKID);
*** 1838,1858 ****
                  return;
          }
  
          if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
                  if (bp && !BP_IS_HOLE(bp)) {
-                         int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
-                             ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
                          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                          uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                          zbookmark_t zb;
  
                          SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                              dn->dn_object, 0, blkid);
  
                          (void) arc_read(NULL, dn->dn_objset->os_spa,
!                             bp, NULL, NULL, priority,
                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                              &aflags, &zb);
                  }
                  if (db)
                          dbuf_rele(db, NULL);
--- 1847,1865 ----
                  return;
          }
  
          if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
                  if (bp && !BP_IS_HOLE(bp)) {
                          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                          uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                          zbookmark_t zb;
  
                          SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                              dn->dn_object, 0, blkid);
  
                          (void) arc_read(NULL, dn->dn_objset->os_spa,
!                             bp, NULL, NULL, prio,
                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                              &aflags, &zb);
                  }
                  if (db)
                          dbuf_rele(db, NULL);
*** 2529,2540 ****
--- 2536,2579 ----
          bp->blk_fill = fill;
  
          mutex_exit(&db->db_mtx);
  }
  
+ /*
+  * The SPA will call this callback several times for each zio - once
+  * for every physical child i/o (zio->io_phys_children times).  This
+  * allows the DMU to monitor the progress of each logical i/o.  For example,
+  * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+  * block.  There may be a long delay before all copies/fragments are completed,
+  * so this callback allows us to retire dirty space gradually, as the physical
+  * i/os complete.
+  *
+  * zio is the zio this callback is invoked for; its io_txg and
+  * io_phys_children fields are read here.  buf is not referenced.  arg is
+  * the dmu_buf_impl_t being written; its db_data_pending dirty record
+  * supplies the dr_accounted amount that is retired piecewise through
+  * dsl_pool_undirty_space().
+  */
  /* ARGSUSED */
  static void
+ dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+ {
+         dmu_buf_impl_t *db = arg;
+         objset_t *os = db->db_objset;
+         dsl_pool_t *dp = dmu_objset_pool(os);
+         dbuf_dirty_record_t *dr;
+         int delta = 0;
+ 
+         dr = db->db_data_pending;
+         ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+ 
+         /*
+          * The callback will be called io_phys_children times.  Retire one
+          * portion of our dirty space each time we are called.  Any rounding
+          * error will be cleaned up by dsl_pool_sync()'s call to
+          * dsl_pool_undirty_space().
+          *
+          * The quotient is stored in an int and the division truncates,
+          * so the per-call deltas may sum to less than dr_accounted.
+          * NOTE(review): this assumes io_phys_children is nonzero whenever
+          * the callback runs, and that dr_accounted fits in an int after
+          * division -- confirm against the zio pipeline and the type of
+          * dr_accounted.
+          */
+         delta = dr->dr_accounted / zio->io_phys_children;
+         dsl_pool_undirty_space(dp, delta, zio->io_txg);
+ }
+ 
+ /* ARGSUSED */
+ static void
  dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
  {
          dmu_buf_impl_t *db = vdb;
          blkptr_t *bp = zio->io_bp;
          blkptr_t *bp_orig = &zio->io_bp_orig;
*** 2623,2632 ****
--- 2662,2672 ----
  
          cv_broadcast(&db->db_changed);
          ASSERT(db->db_dirtycnt > 0);
          db->db_dirtycnt -= 1;
          db->db_data_pending = NULL;
+ 
          dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
  }
  
  static void
  dbuf_write_nofill_ready(zio_t *zio)
*** 2741,2768 ****
  
          if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
                  ASSERT(db->db_state != DB_NOFILL);
                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
                      db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
!                     dbuf_write_override_ready, dbuf_write_override_done, dr,
!                     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                  mutex_enter(&db->db_mtx);
                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                  zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
                      dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
                  mutex_exit(&db->db_mtx);
          } else if (db->db_state == DB_NOFILL) {
                  ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
                      db->db_blkptr, NULL, db->db.db_size, &zp,
!                     dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
                      ZIO_PRIORITY_ASYNC_WRITE,
                      ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
          } else {
                  ASSERT(arc_released(data));
                  dr->dr_zio = arc_write(zio, os->os_spa, txg,
                      db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
                      DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
!                     dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
!                     ZIO_FLAG_MUSTSUCCEED, &zb);
          }
  }
--- 2781,2808 ----
  
          if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
                  ASSERT(db->db_state != DB_NOFILL);
                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
                      db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
!                     dbuf_write_override_ready, NULL, dbuf_write_override_done,
!                     dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                  mutex_enter(&db->db_mtx);
                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                  zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
                      dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
                  mutex_exit(&db->db_mtx);
          } else if (db->db_state == DB_NOFILL) {
                  ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
                  dr->dr_zio = zio_write(zio, os->os_spa, txg,
                      db->db_blkptr, NULL, db->db.db_size, &zp,
!                     dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
                      ZIO_PRIORITY_ASYNC_WRITE,
                      ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
          } else {
                  ASSERT(arc_released(data));
                  dr->dr_zio = arc_write(zio, os->os_spa, txg,
                      db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
                      DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
!                     dbuf_write_physdone, dbuf_write_done, db,
!                     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
          }
  }