/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the amount
 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *	continue calling these functions until they return NULL.
 *	Otherwise, the next caller will pick up the list walk in
 *	some indeterminate state.  (Otherwise every caller would
 *	have to pass in a cookie to keep the state represented by
 *	io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

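/*
 * Create a logical read of block pointer 'bp' into 'data'.  DDT children
 * take the dedup child read pipeline; everything else takes the normal
 * read pipeline.  Callers typically pair this with zio_wait() or
 * zio_nowait(), e.g. (illustrative usage only; real callers supply their
 * own priority, flags, and bookmark):
 *
 *	error = zio_wait(zio_read(NULL, spa, bp, buf, BP_GET_LSIZE(bp),
 *	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 */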
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (!BP_IS_EMBEDDED(bp) && BP_GET_PROP_RESERVATION(bp)) {
		memset(zio->io_orig_data, 0, zio->io_orig_size);
		zio->io_pipeline = ZIO_INTERLOCK_STAGES;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	enum zio_checksum checksum = zp->zp_checksum;
	uint8_t dedup = zp->zp_dedup;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zp->zp_zero_write && !(zio->io_pipeline & ZIO_GANG_STAGES)) {
		dedup = B_FALSE;
		compress = ZIO_COMPRESS_OFF;
		checksum = ZIO_CHECKSUM_OFF;
	}

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round up compressed size to MINBLOCKSIZE and
			 * zero the tail.
			 */
			size_t rounded =
			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
			if (rounded > psize) {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
			}
			if (psize == lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
			} else {
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_zero_write && !(zio->io_pipeline & ZIO_GANG_STAGES)) {
			boolean_t need_allocate = B_FALSE;
			if (zio->io_pipeline & ZIO_STAGE_DVA_ALLOCATE)
				need_allocate = B_TRUE;
			zio->io_pipeline = ZIO_INTERLOCK_STAGES;
			if (need_allocate)
				zio->io_pipeline |= ZIO_STAGE_DVA_ALLOCATE;
			BP_SET_PROP_RESERVATION(bp, 1);
		} else {
			BP_SET_PROP_RESERVATION(bp, 0);
		}
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
	ASSERT(zio->io_tqent.tqent_next == NULL);
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
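
/*
 * Walk the assembled gang tree rooted at 'gn', invoking the gang leader's
 * per-I/O-type callback from zio_gang_issue_func[] on each constituent bp.
 * 'data' advances through the logical buffer as each member is visited,
 * so once the walk returns to the gang leader the entire buffer has been
 * covered.
 */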
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
SPA_GANGBLOCKSIZE, 1860 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1861 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1862 if (error) { 1863 pio->io_error = error; 1864 return (ZIO_PIPELINE_CONTINUE); 1865 } 1866 1867 if (pio == gio) { 1868 gnpp = &gio->io_gang_tree; 1869 } else { 1870 gnpp = pio->io_private; 1871 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1872 } 1873 1874 gn = zio_gang_node_alloc(gnpp); 1875 gbh = gn->gn_gbh; 1876 bzero(gbh, SPA_GANGBLOCKSIZE); 1877 1878 /* 1879 * Create the gang header. 1880 */ 1881 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1882 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1883 1884 /* 1885 * Create and nowait the gang children. 1886 */ 1887 for (int g = 0; resid != 0; resid -= lsize, g++) { 1888 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1889 SPA_MINBLOCKSIZE); 1890 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1891 1892 zp.zp_checksum = gio->io_prop.zp_checksum; 1893 zp.zp_compress = ZIO_COMPRESS_OFF; 1894 zp.zp_type = DMU_OT_NONE; 1895 zp.zp_level = 0; 1896 zp.zp_copies = gio->io_prop.zp_copies; 1897 zp.zp_dedup = B_FALSE; 1898 zp.zp_dedup_verify = B_FALSE; 1899 zp.zp_zero_write = B_FALSE; 1900 zp.zp_nopwrite = B_FALSE; 1901 1902 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1903 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1904 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 1905 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1906 &pio->io_bookmark)); 1907 } 1908 1909 /* 1910 * Set pio's pipeline to just wait for zio to finish. 1911 */ 1912 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1913 1914 zio_nowait(zio); 1915 1916 return (ZIO_PIPELINE_CONTINUE); 1917 } 1918 1919 /* 1920 * The zio_nop_write stage in the pipeline determines if allocating 1921 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1922 * such as SHA256, we can compare the checksums of the new data and the old 1923 * to determine if allocating a new block is required. The nopwrite 1924 * feature can handle writes in either syncing or open context (i.e. zil 1925 * writes) and as a result is mutually exclusive with dedup. 1926 */ 1927 static int 1928 zio_nop_write(zio_t *zio) 1929 { 1930 blkptr_t *bp = zio->io_bp; 1931 blkptr_t *bp_orig = &zio->io_bp_orig; 1932 zio_prop_t *zp = &zio->io_prop; 1933 1934 ASSERT(BP_GET_LEVEL(bp) == 0); 1935 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1936 ASSERT(zp->zp_nopwrite); 1937 ASSERT(!zp->zp_dedup); 1938 ASSERT(zio->io_bp_override == NULL); 1939 ASSERT(IO_IS_ALLOCATING(zio)); 1940 1941 /* 1942 * Check to see if the original bp and the new bp have matching 1943 * characteristics (i.e. same checksum, compression algorithms, etc). 1944 * If they don't then just continue with the pipeline which will 1945 * allocate a new bp. 1946 */ 1947 if (BP_IS_HOLE(bp_orig) || 1948 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1949 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1950 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1951 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1952 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1953 return (ZIO_PIPELINE_CONTINUE); 1954 1955 /* 1956 * If the checksums match then reset the pipeline so that we 1957 * avoid allocating a new bp and issuing any I/O. 
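 * Concretely, "reset the pipeline" below means copying bp_orig back over
 * bp, switching to ZIO_INTERLOCK_PIPELINE (so no DVA is allocated and no
 * vdev I/O is issued), and tagging the zio with ZIO_FLAG_NOPWRITE;
 * zio_done() later verifies that such a bp is still BP_EQUAL() to
 * io_bp_orig.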
1958 */ 1959 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1960 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1961 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1962 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1963 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1964 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1965 sizeof (uint64_t)) == 0); 1966 1967 *bp = *bp_orig; 1968 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1969 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1970 } 1971 1972 return (ZIO_PIPELINE_CONTINUE); 1973 } 1974 1975 /* 1976 * ========================================================================== 1977 * Dedup 1978 * ========================================================================== 1979 */ 1980 static void 1981 zio_ddt_child_read_done(zio_t *zio) 1982 { 1983 blkptr_t *bp = zio->io_bp; 1984 ddt_entry_t *dde = zio->io_private; 1985 ddt_phys_t *ddp; 1986 zio_t *pio = zio_unique_parent(zio); 1987 1988 mutex_enter(&pio->io_lock); 1989 ddp = ddt_phys_select(dde, bp); 1990 if (zio->io_error == 0) 1991 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1992 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1993 dde->dde_repair_data = zio->io_data; 1994 else 1995 zio_buf_free(zio->io_data, zio->io_size); 1996 mutex_exit(&pio->io_lock); 1997 } 1998 1999 static int 2000 zio_ddt_read_start(zio_t *zio) 2001 { 2002 blkptr_t *bp = zio->io_bp; 2003 2004 ASSERT(BP_GET_DEDUP(bp)); 2005 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2006 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2007 2008 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2009 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2010 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2011 ddt_phys_t *ddp = dde->dde_phys; 2012 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2013 blkptr_t blk; 2014 2015 ASSERT(zio->io_vsd == NULL); 2016 zio->io_vsd = dde; 2017 2018 if (ddp_self == NULL) 2019 return (ZIO_PIPELINE_CONTINUE); 2020 2021 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2022 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2023 continue; 2024 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2025 &blk); 2026 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2027 zio_buf_alloc(zio->io_size), zio->io_size, 2028 zio_ddt_child_read_done, dde, zio->io_priority, 2029 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2030 &zio->io_bookmark)); 2031 } 2032 return (ZIO_PIPELINE_CONTINUE); 2033 } 2034 2035 zio_nowait(zio_read(zio, zio->io_spa, bp, 2036 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2037 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2038 2039 return (ZIO_PIPELINE_CONTINUE); 2040 } 2041 2042 static int 2043 zio_ddt_read_done(zio_t *zio) 2044 { 2045 blkptr_t *bp = zio->io_bp; 2046 2047 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2048 return (ZIO_PIPELINE_STOP); 2049 2050 ASSERT(BP_GET_DEDUP(bp)); 2051 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2052 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2053 2054 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2055 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2056 ddt_entry_t *dde = zio->io_vsd; 2057 if (ddt == NULL) { 2058 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2059 return (ZIO_PIPELINE_CONTINUE); 2060 } 2061 if (dde == NULL) { 2062 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2063 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2064 return (ZIO_PIPELINE_STOP); 2065 } 2066 if (dde->dde_repair_data != NULL) { 2067 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2068 
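			/*
			 * The repaired copy satisfies the original read, so
			 * forget the DDT child error; the entry is handed
			 * back to the DDT via ddt_repair_done() below so the
			 * damaged on-disk copies can be repaired.
			 */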
zio->io_child_error[ZIO_CHILD_DDT] = 0; 2069 } 2070 ddt_repair_done(ddt, dde); 2071 zio->io_vsd = NULL; 2072 } 2073 2074 ASSERT(zio->io_vsd == NULL); 2075 2076 return (ZIO_PIPELINE_CONTINUE); 2077 } 2078 2079 static boolean_t 2080 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2081 { 2082 spa_t *spa = zio->io_spa; 2083 2084 /* 2085 * Note: we compare the original data, not the transformed data, 2086 * because when zio->io_bp is an override bp, we will not have 2087 * pushed the I/O transforms. That's an important optimization 2088 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2089 */ 2090 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2091 zio_t *lio = dde->dde_lead_zio[p]; 2092 2093 if (lio != NULL) { 2094 return (lio->io_orig_size != zio->io_orig_size || 2095 bcmp(zio->io_orig_data, lio->io_orig_data, 2096 zio->io_orig_size) != 0); 2097 } 2098 } 2099 2100 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2101 ddt_phys_t *ddp = &dde->dde_phys[p]; 2102 2103 if (ddp->ddp_phys_birth != 0) { 2104 arc_buf_t *abuf = NULL; 2105 uint32_t aflags = ARC_WAIT; 2106 blkptr_t blk = *zio->io_bp; 2107 int error; 2108 2109 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2110 2111 ddt_exit(ddt); 2112 2113 error = arc_read(NULL, spa, &blk, 2114 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2115 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2116 &aflags, &zio->io_bookmark); 2117 2118 if (error == 0) { 2119 if (arc_buf_size(abuf) != zio->io_orig_size || 2120 bcmp(abuf->b_data, zio->io_orig_data, 2121 zio->io_orig_size) != 0) 2122 error = SET_ERROR(EEXIST); 2123 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2124 } 2125 2126 ddt_enter(ddt); 2127 return (error != 0); 2128 } 2129 } 2130 2131 return (B_FALSE); 2132 } 2133 2134 static void 2135 zio_ddt_child_write_ready(zio_t *zio) 2136 { 2137 int p = zio->io_prop.zp_copies; 2138 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2139 ddt_entry_t *dde = zio->io_private; 2140 ddt_phys_t *ddp = &dde->dde_phys[p]; 2141 zio_t *pio; 2142 2143 if (zio->io_error) 2144 return; 2145 2146 ddt_enter(ddt); 2147 2148 ASSERT(dde->dde_lead_zio[p] == zio); 2149 2150 ddt_phys_fill(ddp, zio->io_bp); 2151 2152 while ((pio = zio_walk_parents(zio)) != NULL) 2153 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2154 2155 ddt_exit(ddt); 2156 } 2157 2158 static void 2159 zio_ddt_child_write_done(zio_t *zio) 2160 { 2161 int p = zio->io_prop.zp_copies; 2162 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2163 ddt_entry_t *dde = zio->io_private; 2164 ddt_phys_t *ddp = &dde->dde_phys[p]; 2165 2166 ddt_enter(ddt); 2167 2168 ASSERT(ddp->ddp_refcnt == 0); 2169 ASSERT(dde->dde_lead_zio[p] == zio); 2170 dde->dde_lead_zio[p] = NULL; 2171 2172 if (zio->io_error == 0) { 2173 while (zio_walk_parents(zio) != NULL) 2174 ddt_phys_addref(ddp); 2175 } else { 2176 ddt_phys_clear(ddp); 2177 } 2178 2179 ddt_exit(ddt); 2180 } 2181 2182 static void 2183 zio_ddt_ditto_write_done(zio_t *zio) 2184 { 2185 int p = DDT_PHYS_DITTO; 2186 zio_prop_t *zp = &zio->io_prop; 2187 blkptr_t *bp = zio->io_bp; 2188 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2189 ddt_entry_t *dde = zio->io_private; 2190 ddt_phys_t *ddp = &dde->dde_phys[p]; 2191 ddt_key_t *ddk = &dde->dde_key; 2192 2193 ddt_enter(ddt); 2194 2195 ASSERT(ddp->ddp_refcnt == 0); 2196 ASSERT(dde->dde_lead_zio[p] == zio); 2197 dde->dde_lead_zio[p] = NULL; 2198 2199 if (zio->io_error == 0) { 2200 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2201 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2202 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2203 if (ddp->ddp_phys_birth != 0) 2204 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2205 ddt_phys_fill(ddp, bp); 2206 } 2207 2208 ddt_exit(ddt); 2209 } 2210 2211 static int 2212 zio_ddt_write(zio_t *zio) 2213 { 2214 spa_t *spa = zio->io_spa; 2215 blkptr_t *bp = zio->io_bp; 2216 uint64_t txg = zio->io_txg; 2217 zio_prop_t *zp = &zio->io_prop; 2218 int p = zp->zp_copies; 2219 int ditto_copies; 2220 zio_t *cio = NULL; 2221 zio_t *dio = NULL; 2222 ddt_t *ddt = ddt_select(spa, bp); 2223 ddt_entry_t *dde; 2224 ddt_phys_t *ddp; 2225 2226 ASSERT(BP_GET_DEDUP(bp)); 2227 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2228 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2229 2230 ddt_enter(ddt); 2231 dde = ddt_lookup(ddt, bp, B_TRUE); 2232 ddp = &dde->dde_phys[p]; 2233 2234 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2235 /* 2236 * If we're using a weak checksum, upgrade to a strong checksum 2237 * and try again. If we're already using a strong checksum, 2238 * we can't resolve it, so just convert to an ordinary write. 2239 * (And automatically e-mail a paper to Nature?) 2240 */ 2241 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2242 zp->zp_checksum = spa_dedup_checksum(spa); 2243 zio_pop_transforms(zio); 2244 zio->io_stage = ZIO_STAGE_OPEN; 2245 BP_ZERO(bp); 2246 } else { 2247 zp->zp_dedup = B_FALSE; 2248 } 2249 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2250 ddt_exit(ddt); 2251 return (ZIO_PIPELINE_CONTINUE); 2252 } 2253 2254 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2255 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2256 2257 if (ditto_copies > ddt_ditto_copies_present(dde) && 2258 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2259 zio_prop_t czp = *zp; 2260 2261 czp.zp_copies = ditto_copies; 2262 2263 /* 2264 * If we arrived here with an override bp, we won't have run 2265 * the transform stack, so we won't have the data we need to 2266 * generate a child i/o. So, toss the override bp and restart. 2267 * This is safe, because using the override bp is just an 2268 * optimization; and it's rare, so the cost doesn't matter. 
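 * "Restart" below means rewinding io_stage to ZIO_STAGE_OPEN and putting
 * the full ZIO_WRITE_PIPELINE back in place, just as in the dedup-verify
 * collision path above, so the data runs through the transform stack
 * again before it reaches this stage a second time.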
2269 */ 2270 if (zio->io_bp_override) { 2271 zio_pop_transforms(zio); 2272 zio->io_stage = ZIO_STAGE_OPEN; 2273 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2274 zio->io_bp_override = NULL; 2275 BP_ZERO(bp); 2276 ddt_exit(ddt); 2277 return (ZIO_PIPELINE_CONTINUE); 2278 } 2279 2280 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2281 zio->io_orig_size, &czp, NULL, NULL, 2282 zio_ddt_ditto_write_done, dde, zio->io_priority, 2283 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2284 2285 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2286 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2287 } 2288 2289 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2290 if (ddp->ddp_phys_birth != 0) 2291 ddt_bp_fill(ddp, bp, txg); 2292 if (dde->dde_lead_zio[p] != NULL) 2293 zio_add_child(zio, dde->dde_lead_zio[p]); 2294 else 2295 ddt_phys_addref(ddp); 2296 } else if (zio->io_bp_override) { 2297 ASSERT(bp->blk_birth == txg); 2298 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2299 ddt_phys_fill(ddp, bp); 2300 ddt_phys_addref(ddp); 2301 } else { 2302 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2303 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2304 zio_ddt_child_write_done, dde, zio->io_priority, 2305 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2306 2307 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2308 dde->dde_lead_zio[p] = cio; 2309 } 2310 2311 ddt_exit(ddt); 2312 2313 if (cio) 2314 zio_nowait(cio); 2315 if (dio) 2316 zio_nowait(dio); 2317 2318 return (ZIO_PIPELINE_CONTINUE); 2319 } 2320 2321 ddt_entry_t *freedde; /* for debugging */ 2322 2323 static int 2324 zio_ddt_free(zio_t *zio) 2325 { 2326 spa_t *spa = zio->io_spa; 2327 blkptr_t *bp = zio->io_bp; 2328 ddt_t *ddt = ddt_select(spa, bp); 2329 ddt_entry_t *dde; 2330 ddt_phys_t *ddp; 2331 2332 ASSERT(BP_GET_DEDUP(bp)); 2333 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2334 2335 ddt_enter(ddt); 2336 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2337 ddp = ddt_phys_select(dde, bp); 2338 ddt_phys_decref(ddp); 2339 ddt_exit(ddt); 2340 2341 return (ZIO_PIPELINE_CONTINUE); 2342 } 2343 2344 /* 2345 * ========================================================================== 2346 * Allocate and free blocks 2347 * ========================================================================== 2348 */ 2349 static int 2350 zio_dva_allocate(zio_t *zio) 2351 { 2352 spa_t *spa = zio->io_spa; 2353 metaslab_class_t *mc = spa_normal_class(spa); 2354 blkptr_t *bp = zio->io_bp; 2355 int error; 2356 int flags = 0; 2357 2358 if (zio->io_gang_leader == NULL) { 2359 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2360 zio->io_gang_leader = zio; 2361 } 2362 2363 ASSERT(BP_IS_HOLE(bp)); 2364 ASSERT0(BP_GET_NDVAS(bp)); 2365 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2366 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2367 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2368 2369 /* 2370 * The dump device does not support gang blocks so allocation on 2371 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2372 * the "fast" gang feature. 2373 */ 2374 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2375 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2376 METASLAB_GANG_CHILD : 0; 2377 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2378 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2379 2380 if (error) { 2381 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2382 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2383 error); 2384 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2385 return (zio_write_gang_block(zio)); 2386 zio->io_error = error; 2387 } 2388 2389 return (ZIO_PIPELINE_CONTINUE); 2390 } 2391 2392 static int 2393 zio_dva_free(zio_t *zio) 2394 { 2395 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2396 2397 return (ZIO_PIPELINE_CONTINUE); 2398 } 2399 2400 static int 2401 zio_dva_claim(zio_t *zio) 2402 { 2403 int error; 2404 2405 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2406 if (error) 2407 zio->io_error = error; 2408 2409 return (ZIO_PIPELINE_CONTINUE); 2410 } 2411 2412 /* 2413 * Undo an allocation. This is used by zio_done() when an I/O fails 2414 * and we want to give back the block we just allocated. 2415 * This handles both normal blocks and gang blocks. 2416 */ 2417 static void 2418 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2419 { 2420 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2421 ASSERT(zio->io_bp_override == NULL); 2422 2423 if (!BP_IS_HOLE(bp)) 2424 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2425 2426 if (gn != NULL) { 2427 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2428 zio_dva_unallocate(zio, gn->gn_child[g], 2429 &gn->gn_gbh->zg_blkptr[g]); 2430 } 2431 } 2432 } 2433 2434 /* 2435 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2436 */ 2437 int 2438 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2439 uint64_t size, boolean_t use_slog) 2440 { 2441 int error = 1; 2442 2443 ASSERT(txg > spa_syncing_txg(spa)); 2444 2445 /* 2446 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2447 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2448 * when allocating them. 2449 */ 2450 if (use_slog) { 2451 error = metaslab_alloc(spa, spa_log_class(spa), size, 2452 new_bp, 1, txg, old_bp, 2453 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2454 } 2455 2456 if (error) { 2457 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2458 new_bp, 1, txg, old_bp, 2459 METASLAB_HINTBP_AVOID); 2460 } 2461 2462 if (error == 0) { 2463 BP_SET_LSIZE(new_bp, size); 2464 BP_SET_PSIZE(new_bp, size); 2465 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2466 BP_SET_CHECKSUM(new_bp, 2467 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2468 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2469 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2470 BP_SET_LEVEL(new_bp, 0); 2471 BP_SET_DEDUP(new_bp, 0); 2472 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2473 } 2474 2475 return (error); 2476 } 2477 2478 /* 2479 * Free an intent log block. 
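 * The block is expected to have come from zio_alloc_zil() above, which is
 * why the ASSERTs below can insist on DMU_OT_INTENT_LOG and on the block
 * not being a gang block.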
2480 */ 2481 void 2482 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2483 { 2484 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2485 ASSERT(!BP_IS_GANG(bp)); 2486 2487 zio_free(spa, txg, bp); 2488 } 2489 2490 /* 2491 * ========================================================================== 2492 * Read and write to physical devices 2493 * ========================================================================== 2494 */ 2495 static int 2496 zio_vdev_io_start(zio_t *zio) 2497 { 2498 vdev_t *vd = zio->io_vd; 2499 uint64_t align; 2500 spa_t *spa = zio->io_spa; 2501 2502 ASSERT(zio->io_error == 0); 2503 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2504 2505 if (vd == NULL) { 2506 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2507 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2508 2509 /* 2510 * The mirror_ops handle multiple DVAs in a single BP. 2511 */ 2512 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2513 } 2514 2515 /* 2516 * We keep track of time-sensitive I/Os so that the scan thread 2517 * can quickly react to certain workloads. In particular, we care 2518 * about non-scrubbing, top-level reads and writes with the following 2519 * characteristics: 2520 * - synchronous writes of user data to non-slog devices 2521 * - any reads of user data 2522 * When these conditions are met, adjust the timestamp of spa_last_io 2523 * which allows the scan thread to adjust its workload accordingly. 2524 */ 2525 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2526 vd == vd->vdev_top && !vd->vdev_islog && 2527 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2528 zio->io_txg != spa_syncing_txg(spa)) { 2529 uint64_t old = spa->spa_last_io; 2530 uint64_t new = ddi_get_lbolt64(); 2531 if (old != new) 2532 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2533 } 2534 2535 align = 1ULL << vd->vdev_top->vdev_ashift; 2536 2537 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 2538 P2PHASE(zio->io_size, align) != 0) { 2539 /* Transform logical writes to be a full physical block size. */ 2540 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2541 char *abuf = zio_buf_alloc(asize); 2542 ASSERT(vd == vd->vdev_top); 2543 if (zio->io_type == ZIO_TYPE_WRITE) { 2544 bcopy(zio->io_data, abuf, zio->io_size); 2545 bzero(abuf + zio->io_size, asize - zio->io_size); 2546 } 2547 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2548 } 2549 2550 /* 2551 * If this is not a physical io, make sure that it is properly aligned 2552 * before proceeding. 2553 */ 2554 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2555 ASSERT0(P2PHASE(zio->io_offset, align)); 2556 ASSERT0(P2PHASE(zio->io_size, align)); 2557 } else { 2558 /* 2559 * For physical writes, we allow 512b aligned writes and assume 2560 * the device will perform a read-modify-write as necessary. 2561 */ 2562 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 2563 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 2564 } 2565 2566 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2567 2568 /* 2569 * If this is a repair I/O, and there's no self-healing involved -- 2570 * that is, we're just resilvering what we expect to resilver -- 2571 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2572 * This prevents spurious resilvering with nested replication. 2573 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2574 * A is out of date, we'll read from C+D, then use the data to 2575 * resilver A+B -- but we don't actually want to resilver B, just A. 
2576 * The top-level mirror has no way to know this, so instead we just 2577 * discard unnecessary repairs as we work our way down the vdev tree. 2578 * The same logic applies to any form of nested replication: 2579 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2580 */ 2581 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2582 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2583 zio->io_txg != 0 && /* not a delegated i/o */ 2584 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2585 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2586 zio_vdev_io_bypass(zio); 2587 return (ZIO_PIPELINE_CONTINUE); 2588 } 2589 2590 if (vd->vdev_ops->vdev_op_leaf && 2591 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2592 2593 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 2594 return (ZIO_PIPELINE_CONTINUE); 2595 2596 if ((zio = vdev_queue_io(zio)) == NULL) 2597 return (ZIO_PIPELINE_STOP); 2598 2599 if (!vdev_accessible(vd, zio)) { 2600 zio->io_error = SET_ERROR(ENXIO); 2601 zio_interrupt(zio); 2602 return (ZIO_PIPELINE_STOP); 2603 } 2604 } 2605 2606 return (vd->vdev_ops->vdev_op_io_start(zio)); 2607 } 2608 2609 static int 2610 zio_vdev_io_done(zio_t *zio) 2611 { 2612 vdev_t *vd = zio->io_vd; 2613 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2614 boolean_t unexpected_error = B_FALSE; 2615 2616 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2617 return (ZIO_PIPELINE_STOP); 2618 2619 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2620 2621 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2622 2623 vdev_queue_io_done(zio); 2624 2625 if (zio->io_type == ZIO_TYPE_WRITE) 2626 vdev_cache_write(zio); 2627 2628 if (zio_injection_enabled && zio->io_error == 0) 2629 zio->io_error = zio_handle_device_injection(vd, 2630 zio, EIO); 2631 2632 if (zio_injection_enabled && zio->io_error == 0) 2633 zio->io_error = zio_handle_label_injection(zio, EIO); 2634 2635 if (zio->io_error) { 2636 if (!vdev_accessible(vd, zio)) { 2637 zio->io_error = SET_ERROR(ENXIO); 2638 } else { 2639 unexpected_error = B_TRUE; 2640 } 2641 } 2642 } 2643 2644 ops->vdev_op_io_done(zio); 2645 2646 if (unexpected_error) 2647 VERIFY(vdev_probe(vd, zio) == NULL); 2648 2649 return (ZIO_PIPELINE_CONTINUE); 2650 } 2651 2652 /* 2653 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2654 * disk, and use that to finish the checksum ereport later. 
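 * That is what the two functions below implement:
 * zio_vsd_default_cksum_report() stashes a copy of the bad data at read
 * time, and zio_vsd_default_cksum_finish() later pairs it with the
 * known-good data when the ereport is finished.  (RAID-Z installs its own
 * io_vsd_ops and reports through its own path instead.)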
2655 */ 2656 static void 2657 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2658 const void *good_buf) 2659 { 2660 /* no processing needed */ 2661 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2662 } 2663 2664 /*ARGSUSED*/ 2665 void 2666 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2667 { 2668 void *buf = zio_buf_alloc(zio->io_size); 2669 2670 bcopy(zio->io_data, buf, zio->io_size); 2671 2672 zcr->zcr_cbinfo = zio->io_size; 2673 zcr->zcr_cbdata = buf; 2674 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2675 zcr->zcr_free = zio_buf_free; 2676 } 2677 2678 static int 2679 zio_vdev_io_assess(zio_t *zio) 2680 { 2681 vdev_t *vd = zio->io_vd; 2682 2683 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2684 return (ZIO_PIPELINE_STOP); 2685 2686 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2687 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2688 2689 if (zio->io_vsd != NULL) { 2690 zio->io_vsd_ops->vsd_free(zio); 2691 zio->io_vsd = NULL; 2692 } 2693 2694 if (zio_injection_enabled && zio->io_error == 0) 2695 zio->io_error = zio_handle_fault_injection(zio, EIO); 2696 2697 /* 2698 * If the I/O failed, determine whether we should attempt to retry it. 2699 * 2700 * On retry, we cut in line in the issue queue, since we don't want 2701 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2702 */ 2703 if (zio->io_error && vd == NULL && 2704 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2705 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2706 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2707 zio->io_error = 0; 2708 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2709 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2710 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2711 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2712 zio_requeue_io_start_cut_in_line); 2713 return (ZIO_PIPELINE_STOP); 2714 } 2715 2716 /* 2717 * If we got an error on a leaf device, convert it to ENXIO 2718 * if the device is not accessible at all. 2719 */ 2720 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2721 !vdev_accessible(vd, zio)) 2722 zio->io_error = SET_ERROR(ENXIO); 2723 2724 /* 2725 * If we can't write to an interior vdev (mirror or RAID-Z), 2726 * set vdev_cant_write so that we stop trying to allocate from it. 
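 * (vdev_cant_write is consulted by writeability checks such as
 * vdev_writeable(), so later allocations steer away from the ailing
 * top-level vdev.)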
2727 */ 2728 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2729 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2730 vd->vdev_cant_write = B_TRUE; 2731 } 2732 2733 if (zio->io_error) 2734 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2735 2736 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2737 zio->io_physdone != NULL) { 2738 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2739 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2740 zio->io_physdone(zio->io_logical); 2741 } 2742 2743 return (ZIO_PIPELINE_CONTINUE); 2744 } 2745 2746 void 2747 zio_vdev_io_reissue(zio_t *zio) 2748 { 2749 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2750 ASSERT(zio->io_error == 0); 2751 2752 zio->io_stage >>= 1; 2753 } 2754 2755 void 2756 zio_vdev_io_redone(zio_t *zio) 2757 { 2758 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2759 2760 zio->io_stage >>= 1; 2761 } 2762 2763 void 2764 zio_vdev_io_bypass(zio_t *zio) 2765 { 2766 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2767 ASSERT(zio->io_error == 0); 2768 2769 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2770 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2771 } 2772 2773 /* 2774 * ========================================================================== 2775 * Generate and verify checksums 2776 * ========================================================================== 2777 */ 2778 static int 2779 zio_checksum_generate(zio_t *zio) 2780 { 2781 blkptr_t *bp = zio->io_bp; 2782 enum zio_checksum checksum; 2783 2784 if (bp == NULL) { 2785 /* 2786 * This is zio_write_phys(). 2787 * We're either generating a label checksum, or none at all. 2788 */ 2789 checksum = zio->io_prop.zp_checksum; 2790 2791 if (checksum == ZIO_CHECKSUM_OFF) 2792 return (ZIO_PIPELINE_CONTINUE); 2793 2794 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2795 } else { 2796 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2797 ASSERT(!IO_IS_ALLOCATING(zio)); 2798 checksum = ZIO_CHECKSUM_GANG_HEADER; 2799 } else { 2800 checksum = BP_GET_CHECKSUM(bp); 2801 } 2802 } 2803 2804 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2805 2806 return (ZIO_PIPELINE_CONTINUE); 2807 } 2808 2809 static int 2810 zio_checksum_verify(zio_t *zio) 2811 { 2812 zio_bad_cksum_t info; 2813 blkptr_t *bp = zio->io_bp; 2814 int error; 2815 2816 ASSERT(zio->io_vd != NULL); 2817 2818 if (bp == NULL) { 2819 /* 2820 * This is zio_read_phys(). 2821 * We're either verifying a label checksum, or nothing at all. 2822 */ 2823 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2824 return (ZIO_PIPELINE_CONTINUE); 2825 2826 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2827 } 2828 2829 if ((error = zio_checksum_error(zio, &info)) != 0) { 2830 zio->io_error = error; 2831 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2832 zfs_ereport_start_checksum(zio->io_spa, 2833 zio->io_vd, zio, zio->io_offset, 2834 zio->io_size, NULL, &info); 2835 } 2836 } 2837 2838 return (ZIO_PIPELINE_CONTINUE); 2839 } 2840 2841 /* 2842 * Called by RAID-Z to ensure we don't compute the checksum twice. 2843 */ 2844 void 2845 zio_checksum_verified(zio_t *zio) 2846 { 2847 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2848 } 2849 2850 /* 2851 * ========================================================================== 2852 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2853 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2854 * which may be transient (e.g. unplugged) or permanent.
ECKSUM and EIO 2855 * indicate errors that are specific to one I/O, and most likely permanent. 2856 * Any other error is presumed to be worse because we weren't expecting it. 2857 * ========================================================================== 2858 */ 2859 int 2860 zio_worst_error(int e1, int e2) 2861 { 2862 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2863 int r1, r2; 2864 2865 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2866 if (e1 == zio_error_rank[r1]) 2867 break; 2868 2869 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2870 if (e2 == zio_error_rank[r2]) 2871 break; 2872 2873 return (r1 > r2 ? e1 : e2); 2874 } 2875 2876 /* 2877 * ========================================================================== 2878 * I/O completion 2879 * ========================================================================== 2880 */ 2881 static int 2882 zio_ready(zio_t *zio) 2883 { 2884 blkptr_t *bp = zio->io_bp; 2885 zio_t *pio, *pio_next; 2886 2887 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2888 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2889 return (ZIO_PIPELINE_STOP); 2890 2891 if (zio->io_ready) { 2892 ASSERT(IO_IS_ALLOCATING(zio)); 2893 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2894 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2895 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2896 2897 zio->io_ready(zio); 2898 } 2899 2900 if (bp != NULL && bp != &zio->io_bp_copy) 2901 zio->io_bp_copy = *bp; 2902 2903 if (zio->io_error) 2904 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2905 2906 mutex_enter(&zio->io_lock); 2907 zio->io_state[ZIO_WAIT_READY] = 1; 2908 pio = zio_walk_parents(zio); 2909 mutex_exit(&zio->io_lock); 2910 2911 /* 2912 * As we notify zio's parents, new parents could be added. 2913 * New parents go to the head of zio's io_parent_list, however, 2914 * so we will (correctly) not notify them. The remainder of zio's 2915 * io_parent_list, from 'pio_next' onward, cannot change because 2916 * all parents must wait for us to be done before they can be done. 2917 */ 2918 for (; pio != NULL; pio = pio_next) { 2919 pio_next = zio_walk_parents(zio); 2920 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2921 } 2922 2923 if (zio->io_flags & ZIO_FLAG_NODATA) { 2924 if (BP_IS_GANG(bp)) { 2925 zio->io_flags &= ~ZIO_FLAG_NODATA; 2926 } else { 2927 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2928 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2929 } 2930 } 2931 2932 if (zio_injection_enabled && 2933 zio->io_spa->spa_syncing_txg == zio->io_txg) 2934 zio_handle_ignored_writes(zio); 2935 2936 return (ZIO_PIPELINE_CONTINUE); 2937 } 2938 2939 static int 2940 zio_done(zio_t *zio) 2941 { 2942 spa_t *spa = zio->io_spa; 2943 zio_t *lio = zio->io_logical; 2944 blkptr_t *bp = zio->io_bp; 2945 vdev_t *vd = zio->io_vd; 2946 uint64_t psize = zio->io_size; 2947 zio_t *pio, *pio_next; 2948 2949 /* 2950 * If our children haven't all completed, 2951 * wait for them and then repeat this pipeline stage. 
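 * "All" covers every child type -- vdev, gang, ddt and logical -- which is
 * why there are four zio_wait_for_children() calls below rather than one.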
2952 */ 2953 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2954 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2955 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2956 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2957 return (ZIO_PIPELINE_STOP); 2958 2959 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2960 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2961 ASSERT(zio->io_children[c][w] == 0); 2962 2963 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 2964 ASSERT(bp->blk_pad[0] == 0); 2965 ASSERT(bp->blk_pad[1] == 0); 2966 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2967 (bp == zio_unique_parent(zio)->io_bp)); 2968 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2969 zio->io_bp_override == NULL && 2970 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2971 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2972 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2973 ASSERT(BP_COUNT_GANG(bp) == 0 || 2974 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2975 } 2976 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2977 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2978 } 2979 2980 /* 2981 * If there were child vdev/gang/ddt errors, they apply to us now. 2982 */ 2983 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2984 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2985 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2986 2987 /* 2988 * If the I/O on the transformed data was successful, generate any 2989 * checksum reports now while we still have the transformed data. 2990 */ 2991 if (zio->io_error == 0) { 2992 while (zio->io_cksum_report != NULL) { 2993 zio_cksum_report_t *zcr = zio->io_cksum_report; 2994 uint64_t align = zcr->zcr_align; 2995 uint64_t asize = P2ROUNDUP(psize, align); 2996 char *abuf = zio->io_data; 2997 2998 if (asize != psize) { 2999 abuf = zio_buf_alloc(asize); 3000 bcopy(zio->io_data, abuf, psize); 3001 bzero(abuf + psize, asize - psize); 3002 } 3003 3004 zio->io_cksum_report = zcr->zcr_next; 3005 zcr->zcr_next = NULL; 3006 zcr->zcr_finish(zcr, abuf); 3007 zfs_ereport_free_checksum(zcr); 3008 3009 if (asize != psize) 3010 zio_buf_free(abuf, asize); 3011 } 3012 } 3013 3014 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3015 3016 vdev_stat_update(zio, psize); 3017 3018 if (zio->io_error) { 3019 /* 3020 * If this I/O is attached to a particular vdev, 3021 * generate an error message describing the I/O failure 3022 * at the block level. We ignore these errors if the 3023 * device is currently unavailable. 3024 */ 3025 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3026 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3027 3028 if ((zio->io_error == EIO || !(zio->io_flags & 3029 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3030 zio == lio) { 3031 /* 3032 * For logical I/O requests, tell the SPA to log the 3033 * error and generate a logical data ereport. 3034 */ 3035 spa_log_error(spa, zio); 3036 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3037 0, 0); 3038 } 3039 } 3040 3041 if (zio->io_error && zio == lio) { 3042 /* 3043 * Determine whether zio should be reexecuted. This will 3044 * propagate all the way to the root via zio_notify_parent(). 
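 * Two flavors are set below: ZIO_REEXECUTE_NOW, for errors worth retrying
 * immediately (e.g. an allocation failure other than ENOSPC), and
 * ZIO_REEXECUTE_SUSPEND, which parks the I/O via zio_suspend() until the
 * pool's condition improves.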
3045 */ 3046 ASSERT(vd == NULL && bp != NULL); 3047 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3048 3049 if (IO_IS_ALLOCATING(zio) && 3050 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3051 if (zio->io_error != ENOSPC) 3052 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3053 else 3054 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3055 } 3056 3057 if ((zio->io_type == ZIO_TYPE_READ || 3058 zio->io_type == ZIO_TYPE_FREE) && 3059 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3060 zio->io_error == ENXIO && 3061 spa_load_state(spa) == SPA_LOAD_NONE && 3062 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3063 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3064 3065 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3066 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3067 3068 /* 3069 * Here is a possibly good place to attempt to do 3070 * either combinatorial reconstruction or error correction 3071 * based on checksums. It also might be a good place 3072 * to send out preliminary ereports before we suspend 3073 * processing. 3074 */ 3075 } 3076 3077 /* 3078 * If there were logical child errors, they apply to us now. 3079 * We defer this until now to avoid conflating logical child 3080 * errors with errors that happened to the zio itself when 3081 * updating vdev stats and reporting FMA events above. 3082 */ 3083 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3084 3085 if ((zio->io_error || zio->io_reexecute) && 3086 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3087 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3088 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3089 3090 zio_gang_tree_free(&zio->io_gang_tree); 3091 3092 /* 3093 * Godfather I/Os should never suspend. 3094 */ 3095 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3096 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3097 zio->io_reexecute = 0; 3098 3099 if (zio->io_reexecute) { 3100 /* 3101 * This is a logical I/O that wants to reexecute. 3102 * 3103 * Reexecute is top-down. When an i/o fails, if it's not 3104 * the root, it simply notifies its parent and sticks around. 3105 * The parent, seeing that it still has children in zio_done(), 3106 * does the same. This percolates all the way up to the root. 3107 * The root i/o will reexecute or suspend the entire tree. 3108 * 3109 * This approach ensures that zio_reexecute() honors 3110 * all the original i/o dependency relationships, e.g. 3111 * parents not executing until children are ready. 3112 */ 3113 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3114 3115 zio->io_gang_leader = NULL; 3116 3117 mutex_enter(&zio->io_lock); 3118 zio->io_state[ZIO_WAIT_DONE] = 1; 3119 mutex_exit(&zio->io_lock); 3120 3121 /* 3122 * "The Godfather" I/O monitors its children but is 3123 * not a true parent to them. It will track them through 3124 * the pipeline but severs its ties whenever they get into 3125 * trouble (e.g. suspended). This allows "The Godfather" 3126 * I/O to return status without blocking. 3127 */ 3128 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3129 zio_link_t *zl = zio->io_walk_link; 3130 pio_next = zio_walk_parents(zio); 3131 3132 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3133 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3134 zio_remove_child(pio, zio, zl); 3135 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3136 } 3137 } 3138 3139 if ((pio = zio_unique_parent(zio)) != NULL) { 3140 /* 3141 * We're not a root i/o, so there's nothing to do 3142 * but notify our parent. 
Don't propagate errors 3143 * upward since we haven't permanently failed yet. 3144 */ 3145 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3146 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3147 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3148 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3149 /* 3150 * We'd fail again if we reexecuted now, so suspend 3151 * until conditions improve (e.g. device comes online). 3152 */ 3153 zio_suspend(spa, zio); 3154 } else { 3155 /* 3156 * Reexecution is potentially a huge amount of work. 3157 * Hand it off to the otherwise-unused claim taskq. 3158 */ 3159 ASSERT(zio->io_tqent.tqent_next == NULL); 3160 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3161 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3162 0, &zio->io_tqent); 3163 } 3164 return (ZIO_PIPELINE_STOP); 3165 } 3166 3167 ASSERT(zio->io_child_count == 0); 3168 ASSERT(zio->io_reexecute == 0); 3169 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3170 3171 /* 3172 * Report any checksum errors, since the I/O is complete. 3173 */ 3174 while (zio->io_cksum_report != NULL) { 3175 zio_cksum_report_t *zcr = zio->io_cksum_report; 3176 zio->io_cksum_report = zcr->zcr_next; 3177 zcr->zcr_next = NULL; 3178 zcr->zcr_finish(zcr, NULL); 3179 zfs_ereport_free_checksum(zcr); 3180 } 3181 3182 /* 3183 * It is the responsibility of the done callback to ensure that this 3184 * particular zio is no longer discoverable for adoption, and as 3185 * such, cannot acquire any new parents. 3186 */ 3187 if (zio->io_done) 3188 zio->io_done(zio); 3189 3190 mutex_enter(&zio->io_lock); 3191 zio->io_state[ZIO_WAIT_DONE] = 1; 3192 mutex_exit(&zio->io_lock); 3193 3194 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3195 zio_link_t *zl = zio->io_walk_link; 3196 pio_next = zio_walk_parents(zio); 3197 zio_remove_child(pio, zio, zl); 3198 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3199 } 3200 3201 if (zio->io_waiter != NULL) { 3202 mutex_enter(&zio->io_lock); 3203 zio->io_executor = NULL; 3204 cv_broadcast(&zio->io_cv); 3205 mutex_exit(&zio->io_lock); 3206 } else { 3207 zio_destroy(zio); 3208 } 3209 3210 return (ZIO_PIPELINE_STOP); 3211 } 3212 3213 /* 3214 * ========================================================================== 3215 * I/O pipeline definition 3216 * ========================================================================== 3217 */ 3218 static zio_pipe_stage_t *zio_pipeline[] = { 3219 NULL, 3220 zio_read_bp_init, 3221 zio_free_bp_init, 3222 zio_issue_async, 3223 zio_write_bp_init, 3224 zio_checksum_generate, 3225 zio_nop_write, 3226 zio_ddt_read_start, 3227 zio_ddt_read_done, 3228 zio_ddt_write, 3229 zio_ddt_free, 3230 zio_gang_assemble, 3231 zio_gang_issue, 3232 zio_dva_allocate, 3233 zio_dva_free, 3234 zio_dva_claim, 3235 zio_ready, 3236 zio_vdev_io_start, 3237 zio_vdev_io_done, 3238 zio_vdev_io_assess, 3239 zio_checksum_verify, 3240 zio_done 3241 }; 3242 3243 /* dnp is the dnode for zb1->zb_object */ 3244 boolean_t 3245 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, 3246 const zbookmark_phys_t *zb2) 3247 { 3248 uint64_t zb1nextL0, zb2thisobj; 3249 3250 ASSERT(zb1->zb_objset == zb2->zb_objset); 3251 ASSERT(zb2->zb_level == 0); 3252 3253 /* The objset_phys_t isn't before anything. */ 3254 if (dnp == NULL) 3255 return (B_FALSE); 3256 3257 zb1nextL0 = (zb1->zb_blkid + 1) << 3258 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3259 3260 zb2thisobj = zb2->zb_object ? 
zb2->zb_object : 3261 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3262 3263 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3264 uint64_t nextobj = zb1nextL0 * 3265 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3266 return (nextobj <= zb2thisobj); 3267 } 3268 3269 if (zb1->zb_object < zb2thisobj) 3270 return (B_TRUE); 3271 if (zb1->zb_object > zb2thisobj) 3272 return (B_FALSE); 3273 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3274 return (B_FALSE); 3275 return (zb1nextL0 <= zb2->zb_blkid); 3276 }
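/*
 * Worked example for zbookmark_is_before() (illustrative; assumes 16K
 * indirect blocks, i.e. dn_indblkshift == 14, with SPA_BLKPTRSHIFT == 7):
 * each indirect block then maps 2^(14 - 7) == 128 blkptrs, so an L1
 * bookmark with zb_blkid == 3 covers L0 blkids [384, 512) and
 * zb1nextL0 == (3 + 1) << (1 * 7) == 512.  Such a bookmark is therefore
 * "before" any level-0 bookmark in the same object whose zb_blkid is
 * >= 512, and not before any earlier one.
 */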