4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>


 354         int err;
 355 
 356         DB_DNODE_ENTER(db);
 357         dn = DB_DNODE(db);
 358         err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 359         DB_DNODE_EXIT(db);
 360 
 361         return (err);
 362 }
 363 
 364 /*
 365  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 366  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 367  * and can induce severe lock contention when writing to several files
 368  * whose dnodes are in the same block.
 369  */
 370 static int
 371 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 372     int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 373 {
 374         dsl_pool_t *dp = NULL;
 375         dmu_buf_t **dbp;
 376         uint64_t blkid, nblks, i;
 377         uint32_t dbuf_flags;
 378         int err;
 379         zio_t *zio;
 380         hrtime_t start;
 381 
 382         ASSERT(length <= DMU_MAX_ACCESS);
 383 
 384         dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 385         if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
 386                 dbuf_flags |= DB_RF_NOPREFETCH;
 387 
 388         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 389         if (dn->dn_datablkshift) {
 390                 int blkshift = dn->dn_datablkshift;
 391                 nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
 392                     P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
 393         } else {
 394                 if (offset + length > dn->dn_datablksz) {
 395                         zfs_panic_recover("zfs: accessing past end of object "
 396                             "%llx/%llx (size=%u access=%llu+%llu)",
 397                             (longlong_t)dn->dn_objset->
 398                             os_dsl_dataset->ds_object,
 399                             (longlong_t)dn->dn_object, dn->dn_datablksz,
 400                             (longlong_t)offset, (longlong_t)length);
 401                         rw_exit(&dn->dn_struct_rwlock);
 402                         return (SET_ERROR(EIO));
 403                 }
 404                 nblks = 1;
 405         }
 406         dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 407 
 408         if (dn->dn_objset->os_dsl_dataset)
 409                 dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
 410         start = gethrtime();
 411         zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 412         blkid = dbuf_whichblock(dn, offset);
 413         for (i = 0; i < nblks; i++) {
 414                 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 415                 if (db == NULL) {
 416                         rw_exit(&dn->dn_struct_rwlock);
 417                         dmu_buf_rele_array(dbp, nblks, tag);
 418                         zio_nowait(zio);
 419                         return (SET_ERROR(EIO));
 420                 }
 421                 /* initiate async i/o */
 422                 if (read) {
 423                         (void) dbuf_read(db, zio, dbuf_flags);
 424                 }
 425                 dbp[i] = &db->db;
 426         }
 427         rw_exit(&dn->dn_struct_rwlock);
 428 
 429         /* wait for async i/o */
 430         err = zio_wait(zio);
 431         /* track read overhead when we are in sync context */
 432         if (dp && dsl_pool_sync_context(dp))
 433                 dp->dp_read_overhead += gethrtime() - start;
 434         if (err) {
 435                 dmu_buf_rele_array(dbp, nblks, tag);
 436                 return (err);
 437         }
 438 
 439         /* wait for other io to complete */
 440         if (read) {
 441                 for (i = 0; i < nblks; i++) {
 442                         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 443                         mutex_enter(&db->db_mtx);
 444                         while (db->db_state == DB_READ ||
 445                             db->db_state == DB_FILL)
 446                                 cv_wait(&db->db_changed, &db->db_mtx);
 447                         if (db->db_state == DB_UNCACHED)
 448                                 err = SET_ERROR(EIO);
 449                         mutex_exit(&db->db_mtx);
 450                         if (err) {
 451                                 dmu_buf_rele_array(dbp, nblks, tag);
 452                                 return (err);
 453                         }


 495         return (err);
 496 }
 497 
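As a quick sanity check of the nblks arithmetic in dmu_buf_hold_array_by_dnode() above, the small standalone sketch below reproduces the P2ALIGN/P2ROUNDUP computation using the usual sys/sysmacros.h macro forms; the 128K block size and the offset/length values are arbitrary illustrations, not taken from this changeset.

/*
 * Standalone userland sketch (not part of dmu.c): verify the block-count
 * arithmetic used above.
 */
#include <stdio.h>
#include <stdint.h>

#define P2ALIGN(x, align)       ((x) & -(align))
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        uint64_t offset = 100000, length = 300000;
        int blkshift = 17;                      /* 128K data blocks */

        uint64_t nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
            P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;

        /* [100000, 400000) touches blocks 0 through 3, so nblks == 4. */
        printf("nblks = %llu\n", (unsigned long long)nblks);
        return (0);
}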
 498 void
 499 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 500 {
 501         int i;
 502         dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 503 
 504         if (numbufs == 0)
 505                 return;
 506 
 507         for (i = 0; i < numbufs; i++) {
 508                 if (dbp[i])
 509                         dbuf_rele(dbp[i], tag);
 510         }
 511 
 512         kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 513 }
 514 










 515 void
 516 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 517 {
 518         dnode_t *dn;
 519         uint64_t blkid;
 520         int nblks, i, err;
 521 
 522         if (zfs_prefetch_disable)
 523                 return;
 524 
 525         if (len == 0) {  /* they're interested in the bonus buffer */
 526                 dn = DMU_META_DNODE(os);
 527 
 528                 if (object == 0 || object >= DN_MAX_OBJECT)
 529                         return;
 530 
 531                 rw_enter(&dn->dn_struct_rwlock, RW_READER);
 532                 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
 533                 dbuf_prefetch(dn, blkid);
 534                 rw_exit(&dn->dn_struct_rwlock);
 535                 return;
 536         }
 537 
 538         /*
 539          * XXX - Note, if the dnode for the requested object is not
 540          * already cached, we will do a *synchronous* read in the
 541          * dnode_hold() call.  The same is true for any indirects.
 542          */
 543         err = dnode_hold(os, object, FTAG, &dn);
 544         if (err != 0)
 545                 return;
 546 
 547         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 548         if (dn->dn_datablkshift) {
 549                 int blkshift = dn->dn_datablkshift;
 550                 nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
 551                     P2ALIGN(offset, 1<<blkshift)) >> blkshift;
 552         } else {
 553                 nblks = (offset < dn->dn_datablksz);
 554         }
 555 
 556         if (nblks != 0) {
 557                 blkid = dbuf_whichblock(dn, offset);
 558                 for (i = 0; i < nblks; i++)
 559                         dbuf_prefetch(dn, blkid+i);
 560         }
 561 
 562         rw_exit(&dn->dn_struct_rwlock);
 563 
 564         dnode_rele(dn, FTAG);
 565 }
 566 
 567 /*
 568  * Get the next "chunk" of file data to free.  We traverse the file from
 569  * the end so that the file gets shorter over time (if we crash in the
 570  * middle, this will leave us in a better state).  We find allocated file
 571  * data by simply searching the allocated level 1 indirects.
 572  *
 573  * On input, *start should be the first offset that does not need to be
 574  * freed (e.g. "offset + length").  On return, *start will be the first
 575  * offset that should be freed.
 576  */
 577 static int
 578 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 579 {


1339 {
1340         dmu_sync_arg_t *dsa;
1341         dmu_tx_t *tx;
1342 
1343         tx = dmu_tx_create(os);
1344         dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1345         if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1346                 dmu_tx_abort(tx);
1347                 /* Make zl_get_data do txg_wait_synced() */
1348                 return (SET_ERROR(EIO));
1349         }
1350 
1351         dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1352         dsa->dsa_dr = NULL;
1353         dsa->dsa_done = done;
1354         dsa->dsa_zgd = zgd;
1355         dsa->dsa_tx = tx;
1356 
1357         zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1358             zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1359             dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
1360             ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1361 
1362         return (0);
1363 }
1364 
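The transaction handling in dmu_sync_late_arrival() above follows the standard DMU pattern: create the tx, declare holds, assign it, and then either commit once the work is done or abort on failure (here the commit is deferred to the write's done callback). A minimal, hypothetical sketch of the common synchronous form follows; do_update() and the hold parameters are placeholders, not code from this changeset.

/*
 * Hypothetical sketch of the usual dmu_tx lifecycle; do_update() is a
 * placeholder for whatever modification the tx covers.
 */
static int
example_tx_update(objset_t *os, uint64_t object, uint64_t off, int len)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_write(tx, object, off, len);        /* declare the writes */
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
                dmu_tx_abort(tx);       /* assign failed: abort, never commit */
                return (err);
        }
        do_update(os, object, off, len, tx);            /* placeholder work */
        dmu_tx_commit(tx);              /* assigned: must be committed */
        return (0);
}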
1365 /*
1366  * Intent log support: sync the block associated with db to disk.
1367  * N.B. and XXX: the caller is responsible for making sure that the
1368  * data isn't changing while dmu_sync() is writing it.
1369  *
1370  * Return values:
1371  *
1372  *      EEXIST: this txg has already been synced, so there's nothing to do.
1373  *              The caller should not log the write.
1374  *
1375  *      ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1376  *              The caller should not log the write.
1377  *
1378  *      EALREADY: this block is already in the process of being synced.
1379  *              The caller should track its progress (somehow).


1479                  * We have already issued a sync write for this buffer,
1480                  * or this buffer has already been synced.  It could not
1481                  * have been dirtied since, or we would have cleared the state.
1482                  */
1483                 mutex_exit(&db->db_mtx);
1484                 return (SET_ERROR(EALREADY));
1485         }
1486 
1487         ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1488         dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1489         mutex_exit(&db->db_mtx);
1490 
1491         dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1492         dsa->dsa_dr = dr;
1493         dsa->dsa_done = done;
1494         dsa->dsa_zgd = zgd;
1495         dsa->dsa_tx = NULL;
1496 
1497         zio_nowait(arc_write(pio, os->os_spa, txg,
1498             bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1499             DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
1500             dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

1501 
1502         return (0);
1503 }
1504 
1505 int
1506 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1507         dmu_tx_t *tx)
1508 {
1509         dnode_t *dn;
1510         int err;
1511 
1512         err = dnode_hold(os, object, FTAG, &dn);
1513         if (err)
1514                 return (err);
1515         err = dnode_set_blksz(dn, size, ibs, tx);
1516         dnode_rele(dn, FTAG);
1517         return (err);
1518 }
1519 
1520 void




 354         int err;
 355 
 356         DB_DNODE_ENTER(db);
 357         dn = DB_DNODE(db);
 358         err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 359         DB_DNODE_EXIT(db);
 360 
 361         return (err);
 362 }
 363 
 364 /*
 365  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 366  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 367  * and can induce severe lock contention when writing to several files
 368  * whose dnodes are in the same block.
 369  */
 370 static int
 371 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 372     int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 373 {

 374         dmu_buf_t **dbp;
 375         uint64_t blkid, nblks, i;
 376         uint32_t dbuf_flags;
 377         int err;
 378         zio_t *zio;

 379 
 380         ASSERT(length <= DMU_MAX_ACCESS);
 381 
 382         dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 383         if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
 384                 dbuf_flags |= DB_RF_NOPREFETCH;
 385 
 386         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 387         if (dn->dn_datablkshift) {
 388                 int blkshift = dn->dn_datablkshift;
 389                 nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
 390                     P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
 391         } else {
 392                 if (offset + length > dn->dn_datablksz) {
 393                         zfs_panic_recover("zfs: accessing past end of object "
 394                             "%llx/%llx (size=%u access=%llu+%llu)",
 395                             (longlong_t)dn->dn_objset->
 396                             os_dsl_dataset->ds_object,
 397                             (longlong_t)dn->dn_object, dn->dn_datablksz,
 398                             (longlong_t)offset, (longlong_t)length);
 399                         rw_exit(&dn->dn_struct_rwlock);
 400                         return (SET_ERROR(EIO));
 401                 }
 402                 nblks = 1;
 403         }
 404         dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 405 



 406         zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 407         blkid = dbuf_whichblock(dn, offset);
 408         for (i = 0; i < nblks; i++) {
 409                 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 410                 if (db == NULL) {
 411                         rw_exit(&dn->dn_struct_rwlock);
 412                         dmu_buf_rele_array(dbp, nblks, tag);
 413                         zio_nowait(zio);
 414                         return (SET_ERROR(EIO));
 415                 }
 416                 /* initiate async i/o */
 417                 if (read) {
 418                         (void) dbuf_read(db, zio, dbuf_flags);
 419                 }
 420                 dbp[i] = &db->db;
 421         }
 422         rw_exit(&dn->dn_struct_rwlock);
 423 
 424         /* wait for async i/o */
 425         err = zio_wait(zio);



 426         if (err) {
 427                 dmu_buf_rele_array(dbp, nblks, tag);
 428                 return (err);
 429         }
 430 
 431         /* wait for other io to complete */
 432         if (read) {
 433                 for (i = 0; i < nblks; i++) {
 434                         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 435                         mutex_enter(&db->db_mtx);
 436                         while (db->db_state == DB_READ ||
 437                             db->db_state == DB_FILL)
 438                                 cv_wait(&db->db_changed, &db->db_mtx);
 439                         if (db->db_state == DB_UNCACHED)
 440                                 err = SET_ERROR(EIO);
 441                         mutex_exit(&db->db_mtx);
 442                         if (err) {
 443                                 dmu_buf_rele_array(dbp, nblks, tag);
 444                                 return (err);
 445                         }


 487         return (err);
 488 }
 489 
 490 void
 491 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 492 {
 493         int i;
 494         dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 495 
 496         if (numbufs == 0)
 497                 return;
 498 
 499         for (i = 0; i < numbufs; i++) {
 500                 if (dbp[i])
 501                         dbuf_rele(dbp[i], tag);
 502         }
 503 
 504         kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 505 }
 506 
 507 /*
 508  * Issue prefetch i/os for the given blocks.
 509  *
 510  * Note: The assumption is that we *know* these blocks will be needed
 511  * almost immediately.  Therefore, the prefetch i/os will be issued at
 512  * ZIO_PRIORITY_SYNC_READ
 513  *
 514  * Note: indirect blocks and other metadata will be read synchronously,
 515  * causing this function to block if they are not already cached.
 516  */
 517 void
 518 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 519 {
 520         dnode_t *dn;
 521         uint64_t blkid;
 522         int nblks, err;
 523 
 524         if (zfs_prefetch_disable)
 525                 return;
 526 
 527         if (len == 0) {  /* they're interested in the bonus buffer */
 528                 dn = DMU_META_DNODE(os);
 529 
 530                 if (object == 0 || object >= DN_MAX_OBJECT)
 531                         return;
 532 
 533                 rw_enter(&dn->dn_struct_rwlock, RW_READER);
 534                 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
 535                 dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
 536                 rw_exit(&dn->dn_struct_rwlock);
 537                 return;
 538         }
 539 
 540         /*
 541          * XXX - Note, if the dnode for the requested object is not
 542          * already cached, we will do a *synchronous* read in the
 543          * dnode_hold() call.  The same is true for any indirects.
 544          */
 545         err = dnode_hold(os, object, FTAG, &dn);
 546         if (err != 0)
 547                 return;
 548 
 549         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 550         if (dn->dn_datablkshift) {
 551                 int blkshift = dn->dn_datablkshift;
 552                 nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
 553                     P2ALIGN(offset, 1 << blkshift)) >> blkshift;
 554         } else {
 555                 nblks = (offset < dn->dn_datablksz);
 556         }
 557 
 558         if (nblks != 0) {
 559                 blkid = dbuf_whichblock(dn, offset);
 560                 for (int i = 0; i < nblks; i++)
 561                         dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
 562         }
 563 
 564         rw_exit(&dn->dn_struct_rwlock);
 565 
 566         dnode_rele(dn, FTAG);
 567 }
 568 
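As a usage illustration of dmu_prefetch() and the priority note above, a hypothetical sequential read-ahead caller might look like the sketch below; the function name and the read-ahead policy are assumptions, only the dmu_prefetch() call itself comes from the code above.

/*
 * Hypothetical caller sketch: after servicing a read of [off, off + len),
 * prefetch the region a sequential reader is likely to want next.  Data
 * blocks are prefetched asynchronously at ZIO_PRIORITY_SYNC_READ, but an
 * uncached dnode or indirect block is read synchronously (see the note
 * above), so avoid calling this while holding hot locks.
 */
static void
example_readahead(objset_t *os, uint64_t object, uint64_t off, uint64_t len)
{
        dmu_prefetch(os, object, off + len, len);
}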
 569 /*
 570  * Get the next "chunk" of file data to free.  We traverse the file from
 571  * the end so that the file gets shorter over time (if we crash in the
 572  * middle, this will leave us in a better state).  We find allocated file
 573  * data by simply searching the allocated level 1 indirects.
 574  *
 575  * On input, *start should be the first offset that does not need to be
 576  * freed (e.g. "offset + length").  On return, *start will be the first
 577  * offset that should be freed.
 578  */
 579 static int
 580 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 581 {
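To make the contract above concrete, a hypothetical caller that frees a range in end-to-start chunks might look like the sketch below; free_chunk() is a placeholder for the per-chunk transaction and free, and the loop structure only illustrates the *start in/out convention, it is not the actual caller in this file.

/*
 * Hypothetical sketch: free [offset, offset + length) from the end,
 * one chunk at a time, using the *start contract described above.
 */
static int
example_free_backwards(dnode_t *dn, uint64_t offset, uint64_t length)
{
        uint64_t chunk_end = offset + length;
        int err = 0;

        while (err == 0 && chunk_end > offset) {
                /* on input: first offset that does NOT need to be freed */
                uint64_t chunk_begin = chunk_end;

                err = get_next_chunk(dn, &chunk_begin, offset);
                if (err != 0 || chunk_begin == chunk_end)
                        break;          /* error, or nothing left to free */
                /* on return: chunk_begin is the first offset to free */
                err = free_chunk(dn, chunk_begin, chunk_end - chunk_begin);
                chunk_end = chunk_begin;
        }
        return (err);
}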


1341 {
1342         dmu_sync_arg_t *dsa;
1343         dmu_tx_t *tx;
1344 
1345         tx = dmu_tx_create(os);
1346         dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1347         if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1348                 dmu_tx_abort(tx);
1349                 /* Make zl_get_data do txg_wait_synced() */
1350                 return (SET_ERROR(EIO));
1351         }
1352 
1353         dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1354         dsa->dsa_dr = NULL;
1355         dsa->dsa_done = done;
1356         dsa->dsa_zgd = zgd;
1357         dsa->dsa_tx = tx;
1358 
1359         zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1360             zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1361             dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
1362             ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1363 
1364         return (0);
1365 }
1366 
1367 /*
1368  * Intent log support: sync the block associated with db to disk.
1369  * N.B. and XXX: the caller is responsible for making sure that the
1370  * data isn't changing while dmu_sync() is writing it.
1371  *
1372  * Return values:
1373  *
1374  *      EEXIST: this txg has already been synced, so there's nothing to do.
1375  *              The caller should not log the write.
1376  *
1377  *      ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1378  *              The caller should not log the write.
1379  *
1380  *      EALREADY: this block is already in the process of being synced.
1381  *              The caller should track its progress (somehow).
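A hedged sketch of how a caller might act on these return values follows; the enum, the function name, and the fallback choices are invented for illustration, based only on the comment above (and on the late-arrival path's txg_wait_synced() note), not on the actual ZIL callers.

/*
 * Illustration only: map dmu_sync()'s documented return values onto the
 * caller's possible actions.
 */
typedef enum { LOG_WRITE, SKIP_LOG, TRACK_IN_FLIGHT, WAIT_FOR_TXG } sync_action_t;

static sync_action_t
example_dmu_sync_dispatch(int error)
{
        switch (error) {
        case 0:
                return (LOG_WRITE);     /* issued; log once the zio completes */
        case EEXIST:                    /* already synced in an earlier txg */
        case ENOENT:                    /* block was freed under us */
                return (SKIP_LOG);
        case EALREADY:                  /* a sync write is already in flight */
                return (TRACK_IN_FLIGHT);
        default:                        /* e.g. EIO: fall back to a txg wait */
                return (WAIT_FOR_TXG);
        }
}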


1481                  * We have already issued a sync write for this buffer,
1482                  * or this buffer has already been synced.  It could not
1483                  * have been dirtied since, or we would have cleared the state.
1484                  */
1485                 mutex_exit(&db->db_mtx);
1486                 return (SET_ERROR(EALREADY));
1487         }
1488 
1489         ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1490         dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1491         mutex_exit(&db->db_mtx);
1492 
1493         dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1494         dsa->dsa_dr = dr;
1495         dsa->dsa_done = done;
1496         dsa->dsa_zgd = zgd;
1497         dsa->dsa_tx = NULL;
1498 
1499         zio_nowait(arc_write(pio, os->os_spa, txg,
1500             bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1501             DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
1502             NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
1503             ZIO_FLAG_CANFAIL, &zb));
1504 
1505         return (0);
1506 }
1507 
1508 int
1509 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1510         dmu_tx_t *tx)
1511 {
1512         dnode_t *dn;
1513         int err;
1514 
1515         err = dnode_hold(os, object, FTAG, &dn);
1516         if (err)
1517                 return (err);
1518         err = dnode_set_blksz(dn, size, ibs, tx);
1519         dnode_rele(dn, FTAG);
1520         return (err);
1521 }
1522 
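For completeness, a hypothetical use of the wrapper above: changing an (empty) object's data block size to 128K inside its own transaction. The bonus hold and ibs == 0 (leave the indirect block shift unchanged) are illustrative assumptions, not taken from this changeset.

/*
 * Hypothetical sketch: set a freshly created object's block size to 128K.
 */
static int
example_set_blocksize(objset_t *os, uint64_t object)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_bonus(tx, object);          /* hold the object's dnode */
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
        err = dmu_object_set_blocksize(os, object, 128ULL << 10, 0, tx);
        dmu_tx_commit(tx);
        return (err);
}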
1523 void