/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";

static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT3U(len % 8, ==, 0);

	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}

static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
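
/*
 * For example, freeing offsets 0-128K and then 128K-256K of the same
 * object produces a single pending DRR_FREE record covering 0-256K
 * rather than two records.  The pending record is only pushed to the
 * stream when a non-adjacent or differently-typed operation arrives,
 * or when length is -1ULL (the free-to-end-of-object case), which is
 * written out immediately.
 */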

static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(dsp, data, blksz) != 0)
		return (EINTR);
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	if (dump_bytes(dsp, data, blksz))
		return (EINTR);
	return (0);
}
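
/*
 * For example, a single 128K block write appears on the wire as one
 * dmu_replay_record_t header (drr_type == DRR_WRITE, drr_length == 131072)
 * immediately followed by the 131072 bytes of block data.  Both the header
 * and the payload pass through dump_bytes(), so both are folded into the
 * stream's running fletcher-4 checksum (dsp->dsa_zc).
 */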

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (dsp->dsa_err)
		return (EINTR);
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
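
/*
 * Worked example: with 512-byte sectors, a dnode using 128K data blocks
 * has dn_datablkszsec == 256, so BP_SPAN(dnp, 0) == 256 << 9 == 128K.
 * With 16K indirect blocks (dn_indblkshift == 14), each indirect block
 * holds 2^(14 - SPA_BLKPTRSHIFT) == 128 block pointers, so each
 * additional level multiplies the span by 128: BP_SPAN(dnp, 1) == 16M,
 * and so on.
 */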

/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (EIO);
			}
		}

		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
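
/*
 * To summarize the dispatch above: a hole in the meta-dnode becomes a
 * DRR_FREEOBJECTS record, any other hole becomes a DRR_FREE record, each
 * dnode in a visited dnode block becomes a DRR_OBJECT record (via
 * dump_dnode), a system-attribute spill block becomes a DRR_SPILL record,
 * and an ordinary level-0 data block becomes a DRR_WRITE record.
 * Indirect blocks and the objset block itself are never sent; the
 * receiver reconstructs them as it replays the writes.
 */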

int
dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			return (EINVAL);
		}
		if (version == ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
	if (fromorigin)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds)
		fromtxg = fromds->ds_phys->ds_creation_txg;
	if (fromorigin)
		dsl_dataset_rele(fromds, FTAG);

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = tosnap;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = EINTR;

	if (err) {
		if (err == EINTR && dsp->dsa_err)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	return (err);
}
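
/*
 * The resulting substream is therefore framed as a DRR_BEGIN record,
 * followed by the records generated by backup_cb() in traversal order,
 * followed by a DRR_END record whose drr_checksum is the fletcher-4
 * checksum of every byte written before it.  An incremental send simply
 * starts the traversal at the from-snapshot's creation txg (fromtxg),
 * so only blocks born after that snapshot are visited.
 */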

int
dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    uint64_t *sizep)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (fromorigin)
			dsl_dataset_rele(fromds, FTAG);
		if (err)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (e.g., dnodes and DRR_OBJECT
	 * records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation cancel out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_prop_get_ds(ds, "recordsize",
	    sizeof (recordsize), 1, &recordsize, NULL);
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
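
/*
 * Worked example (illustrative numbers): for 1 GiB of changed data with
 * recordsize == 128K there are 8192 blocks, so the estimate subtracts
 * 8192 * sizeof (blkptr_t) == 8192 * 128 == 1 MiB of indirect-block
 * overhead that is included in the on-disk figure but not carried in
 * the stream, then adds 8192 * sizeof (dmu_replay_record_t) for the
 * per-block DRR_WRITE headers the stream does carry.
 */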

struct recvbeginsyncarg {
	const char *tofs;
	const char *tosnap;
	dsl_dataset_t *origin;
	uint64_t fromguid;
	dmu_objset_type_t type;
	void *tag;
	boolean_t force;
	uint64_t dsflags;
	char clonelastname[MAXNAMELEN];
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
	cred_t *cr;
};

/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t val;
	int err;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (rbsa->origin) {
		/* make sure it's a snap in the same pool */
		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);
		if (!dsl_dataset_is_snapshot(rbsa->origin))
			return (EINVAL);
		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
			return (ENODEV);
	}

	return (0);
}

static void
recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	if (rbsa->origin == NULL) {
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
	    dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}

/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				if (err)
					return (ENODEV);
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}
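
/*
 * A note on the fromguid walk above: an incremental receive is accepted
 * even when fromguid is not the newest snapshot on the receiver, provided
 * no block has been born since the fromguid snapshot was taken (the
 * blk_birth comparison).  In other words, intervening snapshots are
 * tolerated only if they recorded no changes.
 */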

/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
	    dp->dp_spa, tx, "dataset = %lld", dsobj);
}

static boolean_t
dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
	int featureflags;

	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);

	/*
	 * Verify that the pool version supports SA if the SA_SPILL feature
	 * is set; returns B_TRUE when the stream requires a feature this
	 * pool lacks (i.e., verification failed).
	 */
	return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
}
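
/*
 * Receive-side call sequence, as a sketch: a caller (e.g., the zfs ioctl
 * layer) invokes dmu_recv_begin() to validate the DRR_BEGIN record and
 * set up the target dataset, dmu_recv_stream() to replay the record
 * stream into it, and dmu_recv_end() to snapshot the result and clear
 * DS_FLAG_INCONSISTENT.  The dmu_recv_cookie_t carries state between
 * the three calls.
 */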

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	rbsa.cr = CRED();
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (ENOTSUP);
		}
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* must not have an incremental recv already in progress */
		if (!mutex_tryenter(&ds->ds_recvlock)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EBUSY);
		}

		/* tmp clone name is: tofs/%tosnap */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			mutex_exit(&ds->ds_recvlock);
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		drc->drc_logical_ds = ds;
		drc->drc_real_ds = rbsa.ds;
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';
		if (err)
			return (err);

		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, FTAG);
			return (ENOTSUP);
		}

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	return (err);
}

struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_rele(gmep->gme_ds, ca);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}
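
/*
 * A note on the guid map: for deduplicated streams it is allocated once
 * per zfs_onexit minor (see dmu_recv_stream()) and registered with
 * free_guid_map_onexit() as the cleanup callback, so the map, and the
 * dataset holds it contains, survive across the multiple receives of a
 * compound stream and are torn down when the cleanup fd is closed.
 */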

static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}
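
/*
 * Byteswapping example: a stream generated on a big-endian host and
 * received on a little-endian one is detected in dmu_recv_begin() by
 * drr_magic matching BSWAP_64(DMU_BACKUP_MAGIC).  restore_read() then
 * folds the raw bytes into the checksum with
 * fletcher_4_incremental_byteswap, and each record header is fixed up
 * field-by-field here before use.
 */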

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err) {
		return (EINVAL);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}
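
/*
 * The "firstobj + numobjs < firstobj" test above (and the analogous
 * drr_offset + drr_length tests in restore_write(), restore_write_byref(),
 * and restore_free() below) is a uint64_t wraparound check: a corrupt or
 * malicious stream could otherwise encode a range whose end overflows to
 * a small value and slips past the range validation.
 */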

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		ref_os = os;
	}

	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}
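
/*
 * Worked example: suppose a dedup'ed stream (the dedup send feature)
 * already carried a DRR_WRITE for object 5, offset 0 of the dataset with
 * GUID G.  A later identical block is encoded as a DRR_WRITE_BYREF with
 * drr_refguid == G, drr_refobject == 5, drr_refoffset == 0.  On receive,
 * G is looked up in guid_to_ds_map (populated by add_ds_to_guidmap() as
 * each substream completes), the already-written block is read back with
 * dmu_buf_hold(), and its contents are copied to the new location.
 */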

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (EINVAL);

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (EINVAL);

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		ra.byteswap = TRUE;

	{
		/* compute checksum of drr_begin record */
		dmu_replay_record_t *drr;
		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

		drr->drr_type = DRR_BEGIN;
		drr->drr_u.drr_begin = *drc->drc_drrb;
		if (ra.byteswap) {
			fletcher_4_incremental_byteswap(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		} else {
			fletcher_4_incremental_native(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		}
		kmem_free(drr, sizeof (dmu_replay_record_t));
	}

	if (ra.byteswap) {
		struct drr_begin *drrb = drc->drc_drrb;
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
	    DMU_SUBSTREAM);
	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = EBADF;
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}
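
	/*
	 * Checksum chaining, by example: ra.cksum is updated by every
	 * restore_read(), so after reading record N it covers the stream
	 * through record N.  pcksum is saved before the next read; when
	 * the DRR_END record arrives, its stored checksum is compared
	 * against pcksum, i.e., the checksum of everything before the
	 * END record itself.
	 */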

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = ECKSUM;
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = EINVAL;
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		if (drc->drc_real_ds != drc->drc_logical_ds) {
			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
		}
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}

struct recvendsyncarg {
	char *tosnap;
	uint64_t creation_time;
	uint64_t toguid;
};

static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}

static void
recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);

	/* set snapshot's creation time and guid */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}

static int
add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
	if (err == 0) {
		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
	}

	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
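
/*
 * Sketch of the "existing fs" completion path below: the stream was
 * received into a temporary clone (drc_real_ds) while the live dataset
 * (drc_logical_ds) stayed untouched.  dsl_dataset_clone_swap() then
 * exchanges the contents of the two datasets in syncing context, after
 * which the temporary clone, now holding the old contents, is destroyed.
 * If the snapshot sync task fails, we swap back so the user-visible
 * dataset is left unchanged.
 */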

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err, myerr;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
		    drc->drc_force);
		if (err)
			goto out;
	} else {
		mutex_exit(&ds->ds_recvlock);
		dsl_dataset_rele(ds, dmu_recv_tag);
		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		return (EBUSY);
	}

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* swap back */
		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
	}

out:
	mutex_exit(&ds->ds_recvlock);
	if (err == 0 && drc->drc_guid_to_ds_map != NULL)
		(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
	dsl_dataset_disown(ds, dmu_recv_tag);
	myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
	ASSERT3U(myerr, ==, 0);
	return (err);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* clean up the fs we just recv'd into */
		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
	} else {
		if (drc->drc_guid_to_ds_map != NULL)
			(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
		/* release the hold from dmu_recv_begin */
		dsl_dataset_disown(ds, dmu_recv_tag);
	}
	return (err);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
	if (drc->drc_logical_ds != drc->drc_real_ds)
		return (dmu_recv_existing_end(drc));
	else
		return (dmu_recv_new_end(drc));
}