4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

          --- old/usr/src/uts/common/fs/zfs/dmu.c
          +++ new/usr/src/uts/common/fs/zfs/dmu.c
... 363 lines elided ...
 364  364  /*
 365  365   * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 366  366   * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 367  367   * and can induce severe lock contention when writing to several files
 368  368   * whose dnodes are in the same block.
 369  369   */
 370  370  static int
 371  371  dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 372  372      int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 373  373  {
 374      -        dsl_pool_t *dp = NULL;
 375  374          dmu_buf_t **dbp;
 376  375          uint64_t blkid, nblks, i;
 377  376          uint32_t dbuf_flags;
 378  377          int err;
 379  378          zio_t *zio;
 380      -        hrtime_t start;
 381  379  
 382  380          ASSERT(length <= DMU_MAX_ACCESS);
 383  381  
 384  382          dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 385  383          if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
 386  384                  dbuf_flags |= DB_RF_NOPREFETCH;
 387  385  
 388  386          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 389  387          if (dn->dn_datablkshift) {
 390  388                  int blkshift = dn->dn_datablkshift;
... 7 lines elided ...
 398  396                              os_dsl_dataset->ds_object,
 399  397                              (longlong_t)dn->dn_object, dn->dn_datablksz,
 400  398                              (longlong_t)offset, (longlong_t)length);
 401  399                          rw_exit(&dn->dn_struct_rwlock);
 402  400                          return (SET_ERROR(EIO));
 403  401                  }
 404  402                  nblks = 1;
 405  403          }
 406  404          dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 407  405  
 408      -        if (dn->dn_objset->os_dsl_dataset)
 409      -                dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
 410      -        start = gethrtime();
 411  406          zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 412  407          blkid = dbuf_whichblock(dn, offset);
 413  408          for (i = 0; i < nblks; i++) {
 414  409                  dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 415  410                  if (db == NULL) {
 416  411                          rw_exit(&dn->dn_struct_rwlock);
 417  412                          dmu_buf_rele_array(dbp, nblks, tag);
 418  413                          zio_nowait(zio);
 419  414                          return (SET_ERROR(EIO));
 420  415                  }
 421  416                  /* initiate async i/o */
 422  417                  if (read) {
 423  418                          (void) dbuf_read(db, zio, dbuf_flags);
 424  419                  }
 425  420                  dbp[i] = &db->db;
 426  421          }
 427  422          rw_exit(&dn->dn_struct_rwlock);
 428  423  
 429  424          /* wait for async i/o */
 430  425          err = zio_wait(zio);
 431      -        /* track read overhead when we are in sync context */
 432      -        if (dp && dsl_pool_sync_context(dp))
 433      -                dp->dp_read_overhead += gethrtime() - start;
 434  426          if (err) {
 435  427                  dmu_buf_rele_array(dbp, nblks, tag);
 436  428                  return (err);
 437  429          }
 438  430  
 439  431          /* wait for other io to complete */
 440  432          if (read) {
 441  433                  for (i = 0; i < nblks; i++) {
 442  434                          dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 443  435                          mutex_enter(&db->db_mtx);
... 61 lines elided ...
 505  497                  return;
 506  498  
 507  499          for (i = 0; i < numbufs; i++) {
 508  500                  if (dbp[i])
 509  501                          dbuf_rele(dbp[i], tag);
 510  502          }
 511  503  
 512  504          kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 513  505  }
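
The hold/read loop in dmu_buf_hold_array_by_dnode() above follows the usual root-zio pattern: each dbuf_read() is issued as an async child of a ZIO_FLAG_CANFAIL root zio, and the caller blocks exactly once in zio_wait(). A minimal sketch of that pattern (hypothetical variable names, not part of the diff):

        /* Sketch only: issue child reads under one root zio, wait once. */
        zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
        for (uint64_t i = 0; i < nblks; i++)
                (void) dbuf_read(dbs[i], rio, DB_RF_CANFAIL);  /* async child i/o */
        int err = zio_wait(rio);       /* waits for all child reads to complete */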
 514  506  
      507 +/*
      508 + * Issue prefetch i/os for the given blocks.
      509 + *
      510 + * Note: The assumption is that we *know* these blocks will be needed
      511 + * almost immediately.  Therefore, the prefetch i/os will be issued at
      512 + * ZIO_PRIORITY_SYNC_READ
      513 + *
      514 + * Note: indirect blocks and other metadata will be read synchronously,
      515 + * causing this function to block if they are not already cached.
      516 + */
 515  517  void
 516  518  dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 517  519  {
 518  520          dnode_t *dn;
 519  521          uint64_t blkid;
 520      -        int nblks, i, err;
      522 +        int nblks, err;
 521  523  
 522  524          if (zfs_prefetch_disable)
 523  525                  return;
 524  526  
 525  527          if (len == 0) {  /* they're interested in the bonus buffer */
 526  528                  dn = DMU_META_DNODE(os);
 527  529  
 528  530                  if (object == 0 || object >= DN_MAX_OBJECT)
 529  531                          return;
 530  532  
 531  533                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 532  534                  blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
 533      -                dbuf_prefetch(dn, blkid);
      535 +                dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
 534  536                  rw_exit(&dn->dn_struct_rwlock);
 535  537                  return;
 536  538          }
 537  539  
 538  540          /*
 539  541           * XXX - Note, if the dnode for the requested object is not
 540  542           * already cached, we will do a *synchronous* read in the
 541  543           * dnode_hold() call.  The same is true for any indirects.
 542  544           */
 543  545          err = dnode_hold(os, object, FTAG, &dn);
 544  546          if (err != 0)
 545  547                  return;
 546  548  
 547  549          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 548  550          if (dn->dn_datablkshift) {
 549  551                  int blkshift = dn->dn_datablkshift;
 550      -                nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
 551      -                    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
      552 +                nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
      553 +                    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
 552  554          } else {
 553  555                  nblks = (offset < dn->dn_datablksz);
 554  556          }
 555  557  
 556  558          if (nblks != 0) {
 557  559                  blkid = dbuf_whichblock(dn, offset);
 558      -                for (i = 0; i < nblks; i++)
 559      -                        dbuf_prefetch(dn, blkid+i);
      560 +                for (int i = 0; i < nblks; i++)
      561 +                        dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
 560  562          }
 561  563  
 562  564          rw_exit(&dn->dn_struct_rwlock);
 563  565  
 564  566          dnode_rele(dn, FTAG);
 565  567  }
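
As the new block comment above notes, dmu_prefetch() now treats its targets as needed almost immediately, so each dbuf_prefetch() call carries ZIO_PRIORITY_SYNC_READ. A rough usage sketch (hypothetical caller, object, and sizes; only dmu_prefetch() itself comes from the diff):

        /*
         * Sketch only: warm the first 1 MB of an object before a streaming
         * read.  Assumes a held objset_t *os and a valid object number.
         */
        dmu_prefetch(os, object, 0, 1 << 20);

        /*
         * With 128K (1 << 17) data blocks this covers
         * (P2ROUNDUP(0 + (1 << 20), 1 << 17) - P2ALIGN(0, 1 << 17)) >> 17 = 8
         * blocks, so eight prefetch i/os are issued at ZIO_PRIORITY_SYNC_READ.
         */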
 566  568  
 567  569  /*
 568  570   * Get the next "chunk" of file data to free.  We traverse the file from
 569  571   * the end so that the file gets shorter over time (if we crashes in the
... 779 lines elided ...
1349 1351          }
1350 1352  
1351 1353          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1352 1354          dsa->dsa_dr = NULL;
1353 1355          dsa->dsa_done = done;
1354 1356          dsa->dsa_zgd = zgd;
1355 1357          dsa->dsa_tx = tx;
1356 1358  
1357 1359          zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1358 1360              zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1359      -            dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
     1361 +            dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
1360 1362              ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1361 1363  
1362 1364          return (0);
1363 1365  }
1364 1366  
1365 1367  /*
1366 1368   * Intent log support: sync the block associated with db to disk.
1367 1369   * N.B. and XXX: the caller is responsible for making sure that the
1368 1370   * data isn't changing while dmu_sync() is writing it.
1369 1371   *
... 119 lines elided ...
1489 1491          mutex_exit(&db->db_mtx);
1490 1492  
1491 1493          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1492 1494          dsa->dsa_dr = dr;
1493 1495          dsa->dsa_done = done;
1494 1496          dsa->dsa_zgd = zgd;
1495 1497          dsa->dsa_tx = NULL;
1496 1498  
1497 1499          zio_nowait(arc_write(pio, os->os_spa, txg,
1498 1500              bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1499      -            DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
1500      -            dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
     1501 +            DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
     1502 +            NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
     1503 +            ZIO_FLAG_CANFAIL, &zb));
1501 1504  
1502 1505          return (0);
1503 1506  }
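
Both write paths above (zio_write() in dmu_sync_late_arrival() and arc_write() in dmu_sync()) gain an extra NULL between the ready and done callbacks. Under this changeset that middle slot appears to be the new physical-write-completion callback used by the reworked write throttle accounting; callers that do not need per-write accounting pass NULL. The prototype below sketches the assumed shape only, with an illustrative parameter name rather than text copied from zio.h:

        /* Sketch of the assumed extended prototype (names illustrative): */
        extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg,
            blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp,
            zio_done_func_t *ready, zio_done_func_t *physdone,
            zio_done_func_t *done, void *private, zio_priority_t priority,
            enum zio_flag flags, const zbookmark_t *zb);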
1504 1507  
1505 1508  int
1506 1509  dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1507 1510          dmu_tx_t *tx)
1508 1511  {
1509 1512          dnode_t *dn;
1510 1513          int err;
... 317 lines elided ...