354 int err;
355
356 DB_DNODE_ENTER(db);
357 dn = DB_DNODE(db);
358 err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
359 DB_DNODE_EXIT(db);
360
361 return (err);
362 }
363
364 /*
365 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
366 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
367 * and can induce severe lock contention when writing to several files
368 * whose dnodes are in the same block.
369 */
370 static int
371 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
372 int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
373 {
374 dsl_pool_t *dp = NULL;
375 dmu_buf_t **dbp;
376 uint64_t blkid, nblks, i;
377 uint32_t dbuf_flags;
378 int err;
379 zio_t *zio;
380 hrtime_t start;
381
382 ASSERT(length <= DMU_MAX_ACCESS);
383
384 dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
385 if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
386 dbuf_flags |= DB_RF_NOPREFETCH;
387
388 rw_enter(&dn->dn_struct_rwlock, RW_READER);
389 if (dn->dn_datablkshift) {
390 int blkshift = dn->dn_datablkshift;
391 nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
392 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
393 } else {
394 if (offset + length > dn->dn_datablksz) {
395 zfs_panic_recover("zfs: accessing past end of object "
396 "%llx/%llx (size=%u access=%llu+%llu)",
397 (longlong_t)dn->dn_objset->
398 os_dsl_dataset->ds_object,
399 (longlong_t)dn->dn_object, dn->dn_datablksz,
400 (longlong_t)offset, (longlong_t)length);
401 rw_exit(&dn->dn_struct_rwlock);
402 return (SET_ERROR(EIO));
403 }
404 nblks = 1;
405 }
406 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
407
408 if (dn->dn_objset->os_dsl_dataset)
409 dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
410 start = gethrtime();
411 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
412 blkid = dbuf_whichblock(dn, offset);
413 for (i = 0; i < nblks; i++) {
414 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
415 if (db == NULL) {
416 rw_exit(&dn->dn_struct_rwlock);
417 dmu_buf_rele_array(dbp, nblks, tag);
418 zio_nowait(zio);
419 return (SET_ERROR(EIO));
420 }
421 /* initiate async i/o */
422 if (read) {
423 (void) dbuf_read(db, zio, dbuf_flags);
424 }
425 dbp[i] = &db->db;
426 }
427 rw_exit(&dn->dn_struct_rwlock);
428
429 /* wait for async i/o */
430 err = zio_wait(zio);
431 /* track read overhead when we are in sync context */
432 if (dp && dsl_pool_sync_context(dp))
433 dp->dp_read_overhead += gethrtime() - start;
434 if (err) {
435 dmu_buf_rele_array(dbp, nblks, tag);
436 return (err);
437 }
438
439 /* wait for other io to complete */
440 if (read) {
441 for (i = 0; i < nblks; i++) {
442 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
443 mutex_enter(&db->db_mtx);
444 while (db->db_state == DB_READ ||
445 db->db_state == DB_FILL)
446 cv_wait(&db->db_changed, &db->db_mtx);
447 if (db->db_state == DB_UNCACHED)
448 err = SET_ERROR(EIO);
449 mutex_exit(&db->db_mtx);
450 if (err) {
451 dmu_buf_rele_array(dbp, nblks, tag);
452 return (err);
453 }
495 return (err);
496 }
497
498 void
499 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
500 {
501 int i;
502 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
503
504 if (numbufs == 0)
505 return;
506
507 for (i = 0; i < numbufs; i++) {
508 if (dbp[i])
509 dbuf_rele(dbp[i], tag);
510 }
511
512 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
513 }
514
515 void
516 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
517 {
518 dnode_t *dn;
519 uint64_t blkid;
520 int nblks, i, err;
521
522 if (zfs_prefetch_disable)
523 return;
524
525 if (len == 0) { /* they're interested in the bonus buffer */
526 dn = DMU_META_DNODE(os);
527
528 if (object == 0 || object >= DN_MAX_OBJECT)
529 return;
530
531 rw_enter(&dn->dn_struct_rwlock, RW_READER);
532 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
533 dbuf_prefetch(dn, blkid);
534 rw_exit(&dn->dn_struct_rwlock);
535 return;
536 }
537
538 /*
539 * XXX - Note, if the dnode for the requested object is not
540 * already cached, we will do a *synchronous* read in the
541 * dnode_hold() call. The same is true for any indirects.
542 */
543 err = dnode_hold(os, object, FTAG, &dn);
544 if (err != 0)
545 return;
546
547 rw_enter(&dn->dn_struct_rwlock, RW_READER);
548 if (dn->dn_datablkshift) {
549 int blkshift = dn->dn_datablkshift;
550 nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
551 P2ALIGN(offset, 1<<blkshift)) >> blkshift;
552 } else {
553 nblks = (offset < dn->dn_datablksz);
554 }
555
556 if (nblks != 0) {
557 blkid = dbuf_whichblock(dn, offset);
558 for (i = 0; i < nblks; i++)
559 dbuf_prefetch(dn, blkid+i);
560 }
561
562 rw_exit(&dn->dn_struct_rwlock);
563
564 dnode_rele(dn, FTAG);
565 }
566
567 /*
568 * Get the next "chunk" of file data to free. We traverse the file from
569 * the end so that the file gets shorter over time (if we crash in the
570 * middle, this will leave us in a better state). We find allocated file
571 * data by simply searching the allocated level 1 indirects.
572 *
573 * On input, *start should be the first offset that does not need to be
574 * freed (e.g. "offset + length"). On return, *start will be the first
575 * offset that should be freed.
576 */
577 static int
578 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
579 {
1339 {
1340 dmu_sync_arg_t *dsa;
1341 dmu_tx_t *tx;
1342
1343 tx = dmu_tx_create(os);
1344 dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1345 if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1346 dmu_tx_abort(tx);
1347 /* Make zl_get_data do txg_wait_synced() */
1348 return (SET_ERROR(EIO));
1349 }
1350
1351 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1352 dsa->dsa_dr = NULL;
1353 dsa->dsa_done = done;
1354 dsa->dsa_zgd = zgd;
1355 dsa->dsa_tx = tx;
1356
1357 zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1358 zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1359 dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
1360 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1361
1362 return (0);
1363 }
1364
1365 /*
1366 * Intent log support: sync the block associated with db to disk.
1367 * N.B. and XXX: the caller is responsible for making sure that the
1368 * data isn't changing while dmu_sync() is writing it.
1369 *
1370 * Return values:
1371 *
1372 * EEXIST: this txg has already been synced, so there's nothing to do.
1373 * The caller should not log the write.
1374 *
1375 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1376 * The caller should not log the write.
1377 *
1378 * EALREADY: this block is already in the process of being synced.
1379 * The caller should track its progress (somehow).
1479 * We have already issued a sync write for this buffer,
1480 * or this buffer has already been synced. It could not
1481 * have been dirtied since, or we would have cleared the state.
1482 */
1483 mutex_exit(&db->db_mtx);
1484 return (SET_ERROR(EALREADY));
1485 }
1486
1487 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1488 dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1489 mutex_exit(&db->db_mtx);
1490
1491 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1492 dsa->dsa_dr = dr;
1493 dsa->dsa_done = done;
1494 dsa->dsa_zgd = zgd;
1495 dsa->dsa_tx = NULL;
1496
1497 zio_nowait(arc_write(pio, os->os_spa, txg,
1498 bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1499 DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
1500 dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
1501
1502 return (0);
1503 }
1504
1505 int
1506 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1507 dmu_tx_t *tx)
1508 {
1509 dnode_t *dn;
1510 int err;
1511
1512 err = dnode_hold(os, object, FTAG, &dn);
1513 if (err)
1514 return (err);
1515 err = dnode_set_blksz(dn, size, ibs, tx);
1516 dnode_rele(dn, FTAG);
1517 return (err);
1518 }
1519
1520 void
|
354 int err;
355
356 DB_DNODE_ENTER(db);
357 dn = DB_DNODE(db);
358 err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
359 DB_DNODE_EXIT(db);
360
361 return (err);
362 }
363
364 /*
365 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
366 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
367 * and can induce severe lock contention when writing to several files
368 * whose dnodes are in the same block.
369 */
370 static int
371 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
372 int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
373 {
374 dmu_buf_t **dbp;
375 uint64_t blkid, nblks, i;
376 uint32_t dbuf_flags;
377 int err;
378 zio_t *zio;
379
380 ASSERT(length <= DMU_MAX_ACCESS);
381
382 dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
383 if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
384 dbuf_flags |= DB_RF_NOPREFETCH;
385
386 rw_enter(&dn->dn_struct_rwlock, RW_READER);
387 if (dn->dn_datablkshift) {
388 int blkshift = dn->dn_datablkshift;
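		/*
		 * The block count is the span between the block-aligned end
		 * and start of the requested range, in units of the data
		 * block size.  For example, with 128K blocks (blkshift = 17),
		 * offset = 100K and length = 200K cover the aligned range
		 * [0, 384K), so nblks = 3.
		 */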
389 nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
390 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
391 } else {
392 if (offset + length > dn->dn_datablksz) {
393 zfs_panic_recover("zfs: accessing past end of object "
394 "%llx/%llx (size=%u access=%llu+%llu)",
395 (longlong_t)dn->dn_objset->
396 os_dsl_dataset->ds_object,
397 (longlong_t)dn->dn_object, dn->dn_datablksz,
398 (longlong_t)offset, (longlong_t)length);
399 rw_exit(&dn->dn_struct_rwlock);
400 return (SET_ERROR(EIO));
401 }
402 nblks = 1;
403 }
404 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
405
406 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
407 blkid = dbuf_whichblock(dn, offset);
408 for (i = 0; i < nblks; i++) {
409 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
410 if (db == NULL) {
411 rw_exit(&dn->dn_struct_rwlock);
412 dmu_buf_rele_array(dbp, nblks, tag);
413 zio_nowait(zio);
414 return (SET_ERROR(EIO));
415 }
416 /* initiate async i/o */
417 if (read) {
418 (void) dbuf_read(db, zio, dbuf_flags);
419 }
420 dbp[i] = &db->db;
421 }
422 rw_exit(&dn->dn_struct_rwlock);
423
424 /* wait for async i/o */
425 err = zio_wait(zio);
426 if (err) {
427 dmu_buf_rele_array(dbp, nblks, tag);
428 return (err);
429 }
430
431 /* wait for other io to complete */
432 if (read) {
433 for (i = 0; i < nblks; i++) {
434 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
435 mutex_enter(&db->db_mtx);
436 while (db->db_state == DB_READ ||
437 db->db_state == DB_FILL)
438 cv_wait(&db->db_changed, &db->db_mtx);
439 if (db->db_state == DB_UNCACHED)
440 err = SET_ERROR(EIO);
441 mutex_exit(&db->db_mtx);
442 if (err) {
443 dmu_buf_rele_array(dbp, nblks, tag);
444 return (err);
445 }
487 return (err);
488 }
489
490 void
491 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
492 {
493 int i;
494 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
495
496 if (numbufs == 0)
497 return;
498
499 for (i = 0; i < numbufs; i++) {
500 if (dbp[i])
501 dbuf_rele(dbp[i], tag);
502 }
503
504 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
505 }
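
/*
 * Illustrative sketch, not part of the original file: how a caller that
 * already holds a dnode might pair dmu_buf_hold_array_by_dnode() with
 * dmu_buf_rele_array(), in the spirit of the "take a held dnode" note
 * above.  It assumes the elided tail of dmu_buf_hold_array_by_dnode()
 * fills in *numbufsp and *dbpp on success, and that DMU_READ_PREFETCH is
 * the counterpart of the DMU_READ_NO_PREFETCH flag seen earlier.  Since
 * the function is static, real callers would reach it through the
 * dmu_buf_hold_array*() wrappers.
 */
static int
example_read_held_dnode(dnode_t *dn, uint64_t offset, uint64_t length)
{
	dmu_buf_t **dbp;
	int numbufs, err;

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, TRUE, FTAG,
	    &numbufs, &dbp, DMU_READ_PREFETCH);
	if (err != 0)
		return (err);

	/* each dbp[i]->db_data / dbp[i]->db_size describes one cached block */

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
}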
506
507 /*
508 * Issue prefetch i/os for the given blocks.
509 *
510 * Note: The assumption is that we *know* these blocks will be needed
511 * almost immediately. Therefore, the prefetch i/os will be issued at
512 * ZIO_PRIORITY_SYNC_READ.
513 *
514 * Note: indirect blocks and other metadata will be read synchronously,
515 * causing this function to block if they are not already cached.
516 */
517 void
518 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
519 {
520 dnode_t *dn;
521 uint64_t blkid;
522 int nblks, err;
523
524 if (zfs_prefetch_disable)
525 return;
526
527 if (len == 0) { /* they're interested in the bonus buffer */
528 dn = DMU_META_DNODE(os);
529
530 if (object == 0 || object >= DN_MAX_OBJECT)
531 return;
532
533 rw_enter(&dn->dn_struct_rwlock, RW_READER);
534 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
535 dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
536 rw_exit(&dn->dn_struct_rwlock);
537 return;
538 }
539
540 /*
541 * XXX - Note, if the dnode for the requested object is not
542 * already cached, we will do a *synchronous* read in the
543 * dnode_hold() call. The same is true for any indirects.
544 */
545 err = dnode_hold(os, object, FTAG, &dn);
546 if (err != 0)
547 return;
548
549 rw_enter(&dn->dn_struct_rwlock, RW_READER);
550 if (dn->dn_datablkshift) {
551 int blkshift = dn->dn_datablkshift;
552 nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
553 P2ALIGN(offset, 1 << blkshift)) >> blkshift;
554 } else {
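		/*
		 * No data block shift means the object consists of a single
		 * (not necessarily power-of-two sized) data block, so there
		 * is exactly one block to prefetch iff offset falls inside it.
		 */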
555 nblks = (offset < dn->dn_datablksz);
556 }
557
558 if (nblks != 0) {
559 blkid = dbuf_whichblock(dn, offset);
560 for (int i = 0; i < nblks; i++)
561 dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
562 }
563
564 rw_exit(&dn->dn_struct_rwlock);
565
566 dnode_rele(dn, FTAG);
567 }
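
/*
 * Illustrative sketch, not part of the original file: a caller that knows
 * it is about to read the start of an object could warm the cache as
 * below.  The object number, offset and length are made up; per the
 * comment above, the dnode and indirect reads issued on the way are
 * synchronous, so this is only cheap when that metadata is already cached.
 */
static void
example_prefetch(objset_t *os, uint64_t object)
{
	/* prefetch the first 1MB of file data */
	dmu_prefetch(os, object, 0, 1ULL << 20);

	/* len == 0 prefetches the dnode block holding the object's bonus buffer */
	dmu_prefetch(os, object, 0, 0);
}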
568
569 /*
570 * Get the next "chunk" of file data to free. We traverse the file from
571 * the end so that the file gets shorter over time (if we crash in the
572 * middle, this will leave us in a better state). We find allocated file
573 * data by simply searching the allocated level 1 indirects.
574 *
575 * On input, *start should be the first offset that does not need to be
576 * freed (e.g. "offset + length"). On return, *start will be the first
577 * offset that should be freed.
578 */
579 static int
580 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
581 {
1341 {
1342 dmu_sync_arg_t *dsa;
1343 dmu_tx_t *tx;
1344
1345 tx = dmu_tx_create(os);
1346 dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1347 if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1348 dmu_tx_abort(tx);
1349 /* Make zl_get_data do txg_wait_synced() */
1350 return (SET_ERROR(EIO));
1351 }
1352
1353 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1354 dsa->dsa_dr = NULL;
1355 dsa->dsa_done = done;
1356 dsa->dsa_zgd = zgd;
1357 dsa->dsa_tx = tx;
1358
1359 zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1360 zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1361 dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
1362 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1363
1364 return (0);
1365 }
1366
1367 /*
1368 * Intent log support: sync the block associated with db to disk.
1369 * N.B. and XXX: the caller is responsible for making sure that the
1370 * data isn't changing while dmu_sync() is writing it.
1371 *
1372 * Return values:
1373 *
1374 * EEXIST: this txg has already been synced, so there's nothing to do.
1375 * The caller should not log the write.
1376 *
1377 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1378 * The caller should not log the write.
1379 *
1380 * EALREADY: this block is already in the process of being synced.
1381 * The caller should track its progress (somehow).
1481 * We have already issued a sync write for this buffer,
1482 * or this buffer has already been synced. It could not
1483 * have been dirtied since, or we would have cleared the state.
1484 */
1485 mutex_exit(&db->db_mtx);
1486 return (SET_ERROR(EALREADY));
1487 }
1488
1489 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1490 dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1491 mutex_exit(&db->db_mtx);
1492
1493 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1494 dsa->dsa_dr = dr;
1495 dsa->dsa_done = done;
1496 dsa->dsa_zgd = zgd;
1497 dsa->dsa_tx = NULL;
1498
1499 zio_nowait(arc_write(pio, os->os_spa, txg,
1500 bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1501 DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
1502 NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
1503 ZIO_FLAG_CANFAIL, &zb));
1504
1505 return (0);
1506 }
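
/*
 * Illustrative sketch, not part of the original file: how a caller might
 * dispatch on the dmu_sync() return values documented above.  Only the
 * error mapping is shown; the zgd/intent-log plumbing around a real call,
 * and the additional return values elided from the comment block, are
 * omitted.
 */
static void
example_handle_dmu_sync_result(int err)
{
	switch (err) {
	case EEXIST:	/* this txg was already synced */
	case ENOENT:	/* the block was dbuf_free_range()'d */
		/* nothing left to do; do not log the write */
		break;
	case EALREADY:
		/* a sync write is already in flight; track its progress */
		break;
	case 0:
		/* the write was issued; the done callback reports completion */
		break;
	default:
		/* an I/O or transaction error from the sync attempt */
		break;
	}
}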
1507
1508 int
1509 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1510 dmu_tx_t *tx)
1511 {
1512 dnode_t *dn;
1513 int err;
1514
1515 err = dnode_hold(os, object, FTAG, &dn);
1516 if (err)
1517 return (err);
1518 err = dnode_set_blksz(dn, size, ibs, tx);
1519 dnode_rele(dn, FTAG);
1520 return (err);
1521 }
1522
1523 void