/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";

static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT3U(len % 8, ==, 0);

	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}

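/*
 * The dump_* routines below buffer at most one record at a time in
 * dsp->dsa_drr (the "pending op") so that runs of adjacent DRR_FREE or
 * DRR_FREEOBJECTS records can be coalesced before being pushed to the
 * stream with dump_bytes().  A record of a different type, or one that
 * does not extend the pending range, first flushes whatever is pending.
 */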
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(dsp, data, blksz) != 0)
		return (EINTR);
	return (0);
}

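/*
 * Spill blocks hold system attributes that no longer fit in the dnode's
 * bonus buffer; they travel in the stream as DRR_SPILL records.  Streams
 * that can contain them are tagged with DMU_BACKUP_FEATURE_SA_SPILL in
 * the DRR_BEGIN record (see dmu_send(), below).
 */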
static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	if (dump_bytes(dsp, data, blksz))
		return (EINTR);
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (dsp->dsa_err)
		return (EINTR);
	return (0);
}

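/*
 * BP_SPAN(dnp, level), defined below, is the number of bytes of object
 * data covered by a single block pointer at the given indirection level:
 * the data block size (dn_datablkszsec is in 512-byte sectors, hence
 * SPA_MINBLOCKSHIFT) multiplied once per level by the indirect fan-out,
 * 2^(dn_indblkshift - SPA_BLKPTRSHIFT) block pointers per indirect
 * block.  For example, with 128K data blocks and 16K indirect blocks
 * (128 blkptrs each), a level-1 pointer spans 128K * 128 = 16M.
 */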
#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (EIO);
			}
		}

		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

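/*
 * Generate a send stream on vp: a DRR_BEGIN record, then one record per
 * object, block, or free range produced by traversing the dataset with
 * backup_cb(), then a DRR_END record carrying the running fletcher4
 * checksum of everything that preceded it.
 */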
int
dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			return (EINVAL);
		}
		if (version == ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
	if (fromorigin)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds)
		fromtxg = fromds->ds_phys->ds_creation_txg;
	if (fromorigin)
		dsl_dataset_rele(fromds, FTAG);

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = tosnap;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = EINTR;

	if (err) {
		if (err == EINTR && dsp->dsa_err)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	return (err);
}

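/*
 * Estimate the size of the stream dmu_send() would generate, without
 * traversing the dataset.  The estimate starts from the uncompressed
 * size of the changed data and is then corrected per recordsize-sized
 * block: subtract one blkptr_t of indirect-block overhead and add one
 * dmu_replay_record_t of stream header overhead.  For example, with the
 * default 128K recordsize, each 128K of data sheds one 128-byte blkptr_t
 * and gains one record header.
 */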
int
dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    uint64_t *sizep)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (fromorigin)
			dsl_dataset_rele(fromds, FTAG);
		if (err)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (e.g., dnodes and DRR_OBJECT
	 * records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation cancel out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_prop_get_ds(ds, "recordsize",
	    sizeof (recordsize), 1, &recordsize, NULL);
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

struct recvbeginsyncarg {
	const char *tofs;
	const char *tosnap;
	dsl_dataset_t *origin;
	uint64_t fromguid;
	dmu_objset_type_t type;
	void *tag;
	boolean_t force;
	uint64_t dsflags;
	char clonelastname[MAXNAMELEN];
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
	cred_t *cr;
};

/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t val;
	int err;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (rbsa->origin) {
		/* make sure it's a snap in the same pool */
		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);
		if (!dsl_dataset_is_snapshot(rbsa->origin))
			return (EINVAL);
		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
			return (ENODEV);
	}

	return (0);
}

static void
recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	if (rbsa->origin == NULL) {
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
	    dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}

/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				if (err)
					return (ENODEV);
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}

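/*
 * When receiving into an existing filesystem, the stream is restored
 * into a temporary clone of the most recent snapshot (named
 * "tofs/%tosnap"); dmu_recv_existing_end() later swaps the clone's
 * contents into place and destroys it, so a failed or interrupted
 * receive never disturbs the live filesystem.
 */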
/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
	    dp->dp_spa, tx, "dataset = %lld", dsobj);
}

static boolean_t
dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
	int featureflags;

	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);

	/* Verify pool version supports SA if SA_SPILL feature set */
	return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	rbsa.cr = CRED();
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (ENOTSUP);
		}
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* tmp clone name is: tofs/%tosnap */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		drc->drc_logical_dsobj = ds->ds_object;
		drc->drc_real_ds = rbsa.ds;
		dsl_dataset_rele(ds, dmu_recv_tag);
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';
		if (err)
			return (err);

		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, FTAG);
			return (ENOTSUP);
		}

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	return (err);
}

struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_rele(gmep->gme_ds, ca);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

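/*
 * Read len bytes (a multiple of 8) from the stream into ra->buf,
 * folding them into the running fletcher4 checksum that is later
 * verified against the DRR_END record.  Returns NULL and sets ra->err
 * on failure; EINVAL indicates a short read, i.e. a truncated stream.
 */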
static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

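/*
 * Apply a DRR_OBJECT record: validate the header, then claim the object
 * number if it is currently free (or reclaim it with the new shape if
 * it is already allocated), and finally install the bonus buffer.
 */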
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err) {
		return (EINVAL);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		ref_os = os;
	}

	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (EINVAL);

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (EINVAL);

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		ra.byteswap = TRUE;

	{
		/* compute checksum of drr_begin record */
		dmu_replay_record_t *drr;
		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

		drr->drr_type = DRR_BEGIN;
		drr->drr_u.drr_begin = *drc->drc_drrb;
		if (ra.byteswap) {
			fletcher_4_incremental_byteswap(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		} else {
			fletcher_4_incremental_native(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		}
		kmem_free(drr, sizeof (dmu_replay_record_t));
	}

	if (ra.byteswap) {
		struct drr_begin *drrb = drc->drc_drrb;
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
	    DMU_SUBSTREAM);
	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = EBADF;
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = ECKSUM;
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = EINVAL;
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}

struct recvendsyncarg {
	char *tosnap;
	uint64_t creation_time;
	uint64_t toguid;
};

static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}

static void
recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);

	/* set snapshot's creation time and guid */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}

static int
add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
	if (err == 0) {
		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
	}

	rw_exit(&dp->dp_config_rwlock);
	return (err);
}

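/*
 * Finish a receive into an existing filesystem: swap the contents of
 * the temporary %-clone into the target dataset, snapshot the result as
 * tosnap, stamp the snapshot with the guid and creation time from the
 * stream's DRR_BEGIN record, and clear DS_FLAG_INCONSISTENT.
 */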
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds;
	int err, myerr;
	dsl_pool_t *dp = drc->drc_real_ds->ds_dir->dd_pool;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_dataset_own_obj(dp, drc->drc_logical_dsobj, FALSE,
	    dmu_recv_tag, &ds);
	rw_exit(&dp->dp_config_rwlock);

	if (err) {
		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		return (EBUSY);
	}

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, drc->drc_force);
	if (err)
		goto out;

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* swap back */
		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
	}

out:
	if (err == 0 && drc->drc_guid_to_ds_map != NULL)
		(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
	dsl_dataset_disown(ds, dmu_recv_tag);
	myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
	ASSERT3U(myerr, ==, 0);
	return (err);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_real_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* clean up the fs we just recv'd into */
		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
	} else {
		if (drc->drc_guid_to_ds_map != NULL)
			(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
		/* release the hold from dmu_recv_begin */
		dsl_dataset_disown(ds, dmu_recv_tag);
	}
	return (err);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
	if (!drc->drc_newfs)
		return (dmu_recv_existing_end(drc));
	else
		return (dmu_recv_new_end(drc));
}