/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";

static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT0(len % 8);

	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}

static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records, and DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
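	 *
	 * For example, a free of (object 5, offset 0, length 4K) followed
	 * immediately by a free of (object 5, offset 4K, length 4K) is
	 * coalesced below into one pending DRR_FREE record covering
	 * offset 0, length 8K, before anything is written to the stream.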
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
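	 *
	 * The checksum and block properties stashed in drr_key below are
	 * what allow later dedup processing of the stream (e.g., the
	 * userland "zfs send -D" path) to recognize duplicate blocks and
	 * replace them with DRR_WRITE_BYREF records referring back to
	 * this write.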
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz))
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records, and DRR_FREEOBJECTS
	 * records can only be aggregated with other DRR_FREEOBJECTS
	 * records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (SET_ERROR(EINTR));

	/* free anything past the end of the file */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
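
/*
 * BP_SPAN is the number of bytes of the object covered by a block
 * pointer at the given indirection level: a level-0 bp covers exactly
 * one data block, and each additional level multiplies the span by the
 * number of block pointers per indirect block,
 * 1 << (dn_indblkshift - SPA_BLKPTRSHIFT).
 */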
#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (SET_ERROR(EINTR));

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err != 0)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

/*
 * Releases dp, ds, and fromds, using the specified tag.
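 * They must already be held with that tag by the caller; all three
 * holds have been dropped by the time this function returns, on
 * success and failure alike.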
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
    dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;

	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
		dsl_dataset_rele(fromds, tag);
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
		return (SET_ERROR(EXDEV));
	}

	err = dmu_objset_from_ds(ds, &os);
	if (err != 0) {
		if (fromds != NULL)
			dsl_dataset_rele(fromds, tag);
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
		return (err);
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			if (fromds != NULL)
				dsl_dataset_rele(fromds, tag);
			dsl_dataset_rele(ds, tag);
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds != NULL)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds != NULL) {
		fromtxg = fromds->ds_phys->ds_creation_txg;
		dsl_dataset_rele(fromds, tag);
		fromds = NULL;
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, tag);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, tag);

	return (err);
}

int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	}

	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
}

int
dmu_send(const char *tosnap, const char *fromsnap,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	if (strchr(tosnap, '@') == NULL)
		return (SET_ERROR(EINVAL));
	if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	}
	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
}

int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	ASSERT(dsl_pool_config_held(dp));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation cancel out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);
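
	/*
	 * Illustrative example: with recordsize=128K and 1GB of changed
	 * data, the adjustment above subtracts 8192 * sizeof (blkptr_t)
	 * bytes of indirect-block overhead, and the adjustment below adds
	 * 8192 * sizeof (dmu_replay_record_t) bytes of record headers.
	 */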
	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* must not have any changes since most recent snapshot */
	if (!drba->drba_cookie->drc_force &&
	    dsl_dataset_modified_since_lastsnap(ds))
		return (SET_ERROR(ETXTBSY));

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	if (fromguid != 0) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (SET_ERROR(ENODEV));

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					return (SET_ERROR(ENODEV));
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (SET_ERROR(ENODEV));
				}
				if (snap->ds_phys->ds_guid == fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (SET_ERROR(ENODEV));
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (SET_ERROR(ENODEV));
	}

	return (0);
}
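
/*
 * Check phase of the receive-begin sync task: validate the DRR_BEGIN
 * record against this pool (stream header type, feature flags vs. pool
 * version) and confirm that the target either exists and can accept an
 * incremental stream, or does not exist and can be created as a full
 * receive or a clone.
 */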
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
	    DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA) {
		return (SET_ERROR(ENOTSUP));
	}

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!dsl_dataset_is_snapshot(origin)) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (origin->ds_phys->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}
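
/*
 * Sync phase of the receive-begin task: create either a temporary
 * %recv clone (when the target fs already exists) or a brand-new
 * dataset, take ownership of it, and mark it DS_FLAG_INCONSISTENT
 * until the receive completes.
 */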
static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	dsl_dataset_t *ds, *newds;
	uint64_t dsobj;
	int error;
	uint64_t crflags;

	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
	    DS_FLAG_CI_DATASET : 0;

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    ds->ds_prev, crflags, drba->drba_cred, tx);
		dsl_dataset_rele(ds, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd,
		    strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}
	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	dmu_replay_record_t *drr;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;

	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		drc->drc_byteswap = B_TRUE;
	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
		return (SET_ERROR(EINVAL));

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin = *drc->drc_drrb;
	if (drc->drc_byteswap) {
		fletcher_4_incremental_byteswap(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		fletcher_4_incremental_native(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	}
	kmem_free(drr, sizeof (dmu_replay_record_t));

	if (drc->drc_byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5));
}

struct restorearg {
	int err;
	boolean_t byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_long_rele(gmep->gme_ds, gmep);
		dsl_dataset_rele(gmep->gme_ds, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT0(len % 8);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = SET_ERROR(EINVAL);
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}
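
/*
 * Byteswap a replay record header in place.  Only the fields of the
 * union arm selected by drr_type are swapped here; record payloads are
 * byteswapped later by the individual restore_* handlers.
 */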
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (SET_ERROR(EINVAL));
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (SET_ERROR(EINVAL));

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err != 0)
			return (ra->err);
	}
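
	/*
	 * If the object is currently free, claim (allocate) the object
	 * number named in the record; if it already exists, reclaim it,
	 * reinitializing it with the block size and bonus attributes
	 * carried by the stream.
	 */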
	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err != 0) {
		return (SET_ERROR(EINVAL));
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (SET_ERROR(EINVAL));

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err != 0)
			return (err);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
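 *
 * The refguid-to-dataset mapping used for the lookup lives in
 * ra->guid_to_ds_map, which add_ds_to_guidmap() populates as each
 * snapshot in the stream is received.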
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (SET_ERROR(EINVAL));

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (SET_ERROR(EINVAL));
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (SET_ERROR(EINVAL));
	} else {
		ref_os = os;
	}

	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
	char name[MAXNAMELEN];
	dsl_dataset_name(drc->drc_ds, name);
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	(void) dsl_destroy_head(name);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));

	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = SET_ERROR(EBADF);
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err != 0)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = SET_ERROR(EINTR);
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
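			 *
			 * (restore_read() has already folded the DRR_END
			 * record itself into ra.cksum, so comparing
			 * against the current value could never match.)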
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = SET_ERROR(ECKSUM);
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = SET_ERROR(EINVAL);
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}

static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head);
		if (error != 0)
			return (error);
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE);
	}
	return (error);
}

static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));
		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		origin_head->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		origin_head->ds_prev->ds_phys->ds_guid =
		    drc->drc_drrb->drr_toguid;
		origin_head->ds_prev->ds_phys->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		/* check ownership before we drop our hold on origin_head */
		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		ds->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
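	/*
	 * Record the object number of the new snapshot;
	 * dmu_recv_new_end() uses it to add the snapshot to the
	 * dedup guid map.
	 */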
	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}

static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}