/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

#define	ZIO_PIPELINE_CONTINUE	0x100
#define	ZIO_PIPELINE_STOP	0x101

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
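
/*
 * Roughly, with the defaults above: pass 1 of spa_sync() compresses,
 * allocates freely, and processes frees immediately; from pass 2 onward
 * frees are deferred (see zio_free() below) and same-sized blocks are
 * rewritten in place rather than reallocated (see zio_write_bp_init());
 * from pass 5 onward compression is skipped so that block sizes, and hence
 * allocations, stop changing and the txg can converge.
 */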

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = MIN(p2 >> 2, PAGESIZE);
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.
Obviously, it's 210 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 211 * excess / transient data in-core during a crashdump. 212 */ 213 void * 214 zio_buf_alloc(size_t size) 215 { 216 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 217 218 ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 219 220 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 221 } 222 223 /* 224 * Same as zio_buf_alloc, but won't sleep in case memory cannot be allocated 225 * and will instead return immediately with a failure. 226 */ 227 void * 228 zio_buf_alloc_canfail(size_t size) 229 { 230 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 231 232 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 233 234 return (kmem_cache_alloc(zio_buf_cache[c], KM_NOSLEEP | KM_NORMALPRI)); 235 } 236 237 /* 238 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 239 * crashdump if the kernel panics. This exists so that we will limit the amount 240 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 241 * of kernel heap dumped to disk when the kernel panics) 242 */ 243 void * 244 zio_data_buf_alloc(size_t size) 245 { 246 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 247 248 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 249 250 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 251 } 252 253 /* 254 * Same as zio_data_buf_alloc, but won't sleep in case memory cannot be 255 * allocated and will instead return immediately with a failure. 256 */ 257 void * 258 zio_data_buf_alloc_canfail(size_t size) 259 { 260 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 261 262 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 263 264 return (kmem_cache_alloc(zio_data_buf_cache[c], 265 KM_NOSLEEP | KM_NORMALPRI)); 266 } 267 268 void 269 zio_buf_free(void *buf, size_t size) 270 { 271 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 272 273 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 274 275 kmem_cache_free(zio_buf_cache[c], buf); 276 } 277 278 void 279 zio_data_buf_free(void *buf, size_t size) 280 { 281 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 282 283 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 284 285 kmem_cache_free(zio_data_buf_cache[c], buf); 286 } 287 288 /* 289 * ========================================================================== 290 * Push and pop I/O transform buffers 291 * ========================================================================== 292 */ 293 static void 294 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 295 zio_transform_func_t *transform) 296 { 297 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 298 299 zt->zt_orig_data = zio->io_data; 300 zt->zt_orig_size = zio->io_size; 301 zt->zt_bufsize = bufsize; 302 zt->zt_transform = transform; 303 304 zt->zt_next = zio->io_transform_stack; 305 zio->io_transform_stack = zt; 306 307 zio->io_data = data; 308 zio->io_size = size; 309 } 310 311 static void 312 zio_pop_transforms(zio_t *zio) 313 { 314 zio_transform_t *zt; 315 316 while ((zt = zio->io_transform_stack) != NULL) { 317 if (zt->zt_transform != NULL) 318 zt->zt_transform(zio, 319 zt->zt_orig_data, zt->zt_orig_size); 320 321 if (zt->zt_bufsize != 0) 322 zio_buf_free(zio->io_data, zt->zt_bufsize); 323 324 zio->io_data = zt->zt_orig_data; 325 zio->io_size = zt->zt_orig_size; 326 zio->io_transform_stack = zt->zt_next; 327 328 kmem_free(zt, sizeof (zio_transform_t)); 329 } 330 } 331 332 /* 333 * ========================================================================== 334 * I/O 
transform callbacks for subblocks and decompression 335 * ========================================================================== 336 */ 337 static void 338 zio_subblock(zio_t *zio, void *data, uint64_t size) 339 { 340 ASSERT(zio->io_size > size); 341 342 if (zio->io_type == ZIO_TYPE_READ) 343 bcopy(zio->io_data, data, size); 344 } 345 346 static void 347 zio_decompress(zio_t *zio, void *data, uint64_t size) 348 { 349 if (zio->io_error == 0 && 350 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 351 zio->io_data, data, zio->io_size, size) != 0) 352 zio->io_error = SET_ERROR(EIO); 353 } 354 355 /* 356 * ========================================================================== 357 * I/O parent/child relationships and pipeline interlocks 358 * ========================================================================== 359 */ 360 /* 361 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 362 * continue calling these functions until they return NULL. 363 * Otherwise, the next caller will pick up the list walk in 364 * some indeterminate state. (Otherwise every caller would 365 * have to pass in a cookie to keep the state represented by 366 * io_walk_link, which gets annoying.) 367 */ 368 zio_t * 369 zio_walk_parents(zio_t *cio) 370 { 371 zio_link_t *zl = cio->io_walk_link; 372 list_t *pl = &cio->io_parent_list; 373 374 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 375 cio->io_walk_link = zl; 376 377 if (zl == NULL) 378 return (NULL); 379 380 ASSERT(zl->zl_child == cio); 381 return (zl->zl_parent); 382 } 383 384 zio_t * 385 zio_walk_children(zio_t *pio) 386 { 387 zio_link_t *zl = pio->io_walk_link; 388 list_t *cl = &pio->io_child_list; 389 390 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 391 pio->io_walk_link = zl; 392 393 if (zl == NULL) 394 return (NULL); 395 396 ASSERT(zl->zl_parent == pio); 397 return (zl->zl_child); 398 } 399 400 zio_t * 401 zio_unique_parent(zio_t *cio) 402 { 403 zio_t *pio = zio_walk_parents(cio); 404 405 VERIFY(zio_walk_parents(cio) == NULL); 406 return (pio); 407 } 408 409 void 410 zio_add_child(zio_t *pio, zio_t *cio) 411 { 412 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 413 414 /* 415 * Logical I/Os can have logical, gang, or vdev children. 416 * Gang I/Os can have gang or vdev children. 417 * Vdev I/Os can only have vdev children. 418 * The following ASSERT captures all of these constraints. 
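	 *
	 * (Assuming the zio_child enum ordering in zio.h, VDEV < GANG <
	 * DDT < LOGICAL, "child type <= parent type" encodes exactly those
	 * rules: a logical zio may add a DDT, gang, or vdev child, while a
	 * vdev zio may only ever add another vdev zio.)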
419 */ 420 ASSERT(cio->io_child_type <= pio->io_child_type); 421 422 zl->zl_parent = pio; 423 zl->zl_child = cio; 424 425 mutex_enter(&cio->io_lock); 426 mutex_enter(&pio->io_lock); 427 428 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 429 430 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 431 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 432 433 list_insert_head(&pio->io_child_list, zl); 434 list_insert_head(&cio->io_parent_list, zl); 435 436 pio->io_child_count++; 437 cio->io_parent_count++; 438 439 mutex_exit(&pio->io_lock); 440 mutex_exit(&cio->io_lock); 441 } 442 443 static void 444 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 445 { 446 ASSERT(zl->zl_parent == pio); 447 ASSERT(zl->zl_child == cio); 448 449 mutex_enter(&cio->io_lock); 450 mutex_enter(&pio->io_lock); 451 452 list_remove(&pio->io_child_list, zl); 453 list_remove(&cio->io_parent_list, zl); 454 455 pio->io_child_count--; 456 cio->io_parent_count--; 457 458 mutex_exit(&pio->io_lock); 459 mutex_exit(&cio->io_lock); 460 461 kmem_cache_free(zio_link_cache, zl); 462 } 463 464 static boolean_t 465 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 466 { 467 uint64_t *countp = &zio->io_children[child][wait]; 468 boolean_t waiting = B_FALSE; 469 470 mutex_enter(&zio->io_lock); 471 ASSERT(zio->io_stall == NULL); 472 if (*countp != 0) { 473 zio->io_stage >>= 1; 474 zio->io_stall = countp; 475 waiting = B_TRUE; 476 } 477 mutex_exit(&zio->io_lock); 478 479 return (waiting); 480 } 481 482 static void 483 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 484 { 485 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 486 int *errorp = &pio->io_child_error[zio->io_child_type]; 487 488 mutex_enter(&pio->io_lock); 489 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 490 *errorp = zio_worst_error(*errorp, zio->io_error); 491 pio->io_reexecute |= zio->io_reexecute; 492 ASSERT3U(*countp, >, 0); 493 494 (*countp)--; 495 496 if (*countp == 0 && pio->io_stall == countp) { 497 pio->io_stall = NULL; 498 mutex_exit(&pio->io_lock); 499 zio_execute(pio); 500 } else { 501 mutex_exit(&pio->io_lock); 502 } 503 } 504 505 static void 506 zio_inherit_child_errors(zio_t *zio, enum zio_child c) 507 { 508 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 509 zio->io_error = zio->io_child_error[c]; 510 } 511 512 /* 513 * ========================================================================== 514 * Create the various types of I/O (read, write, free, etc) 515 * ========================================================================== 516 */ 517 static zio_t * 518 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 519 void *data, uint64_t size, zio_done_func_t *done, void *private, 520 zio_type_t type, zio_priority_t priority, enum zio_flag flags, 521 vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, 522 enum zio_stage stage, enum zio_stage pipeline) 523 { 524 zio_t *zio; 525 526 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 527 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 528 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 529 530 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 531 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 532 ASSERT(vd || stage == ZIO_STAGE_OPEN); 533 534 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 535 bzero(zio, sizeof (zio_t)); 536 537 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 538 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 539 540 list_create(&zio->io_parent_list, sizeof (zio_link_t), 
541 offsetof(zio_link_t, zl_parent_node)); 542 list_create(&zio->io_child_list, sizeof (zio_link_t), 543 offsetof(zio_link_t, zl_child_node)); 544 545 if (vd != NULL) 546 zio->io_child_type = ZIO_CHILD_VDEV; 547 else if (flags & ZIO_FLAG_GANG_CHILD) 548 zio->io_child_type = ZIO_CHILD_GANG; 549 else if (flags & ZIO_FLAG_DDT_CHILD) 550 zio->io_child_type = ZIO_CHILD_DDT; 551 else 552 zio->io_child_type = ZIO_CHILD_LOGICAL; 553 554 if (bp != NULL) { 555 zio->io_bp = (blkptr_t *)bp; 556 zio->io_bp_copy = *bp; 557 zio->io_bp_orig = *bp; 558 if (type != ZIO_TYPE_WRITE || 559 zio->io_child_type == ZIO_CHILD_DDT) 560 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 561 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 562 zio->io_logical = zio; 563 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 564 pipeline |= ZIO_GANG_STAGES; 565 } 566 567 zio->io_spa = spa; 568 zio->io_txg = txg; 569 zio->io_done = done; 570 zio->io_private = private; 571 zio->io_type = type; 572 zio->io_priority = priority; 573 zio->io_vd = vd; 574 zio->io_offset = offset; 575 zio->io_orig_data = zio->io_data = data; 576 zio->io_orig_size = zio->io_size = size; 577 zio->io_orig_flags = zio->io_flags = flags; 578 zio->io_orig_stage = zio->io_stage = stage; 579 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 580 581 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 582 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 583 584 if (zb != NULL) 585 zio->io_bookmark = *zb; 586 587 if (pio != NULL) { 588 if (zio->io_logical == NULL) 589 zio->io_logical = pio->io_logical; 590 if (zio->io_child_type == ZIO_CHILD_GANG) 591 zio->io_gang_leader = pio->io_gang_leader; 592 zio_add_child(pio, zio); 593 } 594 595 return (zio); 596 } 597 598 static void 599 zio_destroy(zio_t *zio) 600 { 601 list_destroy(&zio->io_parent_list); 602 list_destroy(&zio->io_child_list); 603 mutex_destroy(&zio->io_lock); 604 cv_destroy(&zio->io_cv); 605 kmem_cache_free(zio_cache, zio); 606 } 607 608 zio_t * 609 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 610 void *private, enum zio_flag flags) 611 { 612 zio_t *zio; 613 614 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 615 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 616 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 617 618 return (zio); 619 } 620 621 zio_t * 622 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 623 { 624 return (zio_null(NULL, spa, NULL, done, private, flags)); 625 } 626 627 zio_t * 628 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 629 void *data, uint64_t size, zio_done_func_t *done, void *private, 630 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 631 { 632 zio_t *zio; 633 634 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 635 data, size, done, private, 636 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 637 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
638 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 639 640 return (zio); 641 } 642 643 zio_t * 644 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 645 void *data, uint64_t size, const zio_prop_t *zp, 646 zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, 647 void *private, 648 zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) 649 { 650 zio_t *zio; 651 652 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 653 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 654 zp->zp_compress >= ZIO_COMPRESS_OFF && 655 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 656 DMU_OT_IS_VALID(zp->zp_type) && 657 zp->zp_level < 32 && 658 zp->zp_copies > 0 && 659 zp->zp_copies <= spa_max_replication(spa)); 660 661 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 662 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 663 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 664 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 665 666 zio->io_ready = ready; 667 zio->io_physdone = physdone; 668 zio->io_prop = *zp; 669 670 /* 671 * Data can be NULL if we are going to call zio_write_override() to 672 * provide the already-allocated BP. But we may need the data to 673 * verify a dedup hit (if requested). In this case, don't try to 674 * dedup (just take the already-allocated BP verbatim). 675 */ 676 if (data == NULL && zio->io_prop.zp_dedup_verify) { 677 zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; 678 } 679 680 return (zio); 681 } 682 683 zio_t * 684 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 685 uint64_t size, zio_done_func_t *done, void *private, 686 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) 687 { 688 zio_t *zio; 689 690 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 691 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 692 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 693 694 return (zio); 695 } 696 697 void 698 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 699 { 700 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 701 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 702 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 703 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 704 705 /* 706 * We must reset the io_prop to match the values that existed 707 * when the bp was first written by dmu_sync() keeping in mind 708 * that nopwrite and dedup are mutually exclusive. 709 */ 710 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 711 zio->io_prop.zp_nopwrite = nopwrite; 712 zio->io_prop.zp_copies = copies; 713 zio->io_bp_override = bp; 714 } 715 716 void 717 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 718 { 719 720 /* 721 * The check for EMBEDDED is a performance optimization. We 722 * process the free here (by ignoring it) rather than 723 * putting it on the list and then processing it in zio_free_sync(). 724 */ 725 if (BP_IS_EMBEDDED(bp)) 726 return; 727 metaslab_check_free(spa, bp); 728 729 /* 730 * Frees that are for the currently-syncing txg, are not going to be 731 * deferred, and which will not need to do a read (i.e. not GANG or 732 * DEDUP), can be processed immediately. Otherwise, put them on the 733 * in-memory list for later processing. 
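	 *
	 * For example, a plain (non-gang, non-dedup) block freed for the
	 * currently-syncing txg in a pass earlier than
	 * zfs_sync_pass_deferred_free is freed right here via
	 * zio_free_sync(); everything else is appended to the
	 * spa_free_bplist[] entry for its txg.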
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
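	 *
	 * (Illustrative: at pool open the ZIL is the caller here --
	 * zil_claim() walks each dataset's intent log and claims the blocks
	 * it references, all within spa_first_txg().)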
798 */ 799 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 800 ASSERT(txg == spa_first_txg(spa) || txg == 0); 801 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 802 803 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 804 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 805 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 806 807 return (zio); 808 } 809 810 zio_t * 811 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 812 zio_done_func_t *done, void *private, enum zio_flag flags) 813 { 814 zio_t *zio; 815 int c; 816 817 if (vd->vdev_children == 0) { 818 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 819 ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 820 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 821 822 zio->io_cmd = cmd; 823 } else { 824 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 825 826 for (c = 0; c < vd->vdev_children; c++) 827 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 828 done, private, flags)); 829 } 830 831 return (zio); 832 } 833 834 zio_t * 835 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 836 void *data, int checksum, zio_done_func_t *done, void *private, 837 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 838 { 839 zio_t *zio; 840 841 ASSERT(vd->vdev_children == 0); 842 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 843 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 844 ASSERT3U(offset + size, <=, vd->vdev_psize); 845 846 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 847 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 848 NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 849 850 zio->io_prop.zp_checksum = checksum; 851 852 return (zio); 853 } 854 855 zio_t * 856 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 857 void *data, int checksum, zio_done_func_t *done, void *private, 858 zio_priority_t priority, enum zio_flag flags, boolean_t labels) 859 { 860 zio_t *zio; 861 862 ASSERT(vd->vdev_children == 0); 863 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 864 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 865 ASSERT3U(offset + size, <=, vd->vdev_psize); 866 867 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 868 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, 869 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 870 871 zio->io_prop.zp_checksum = checksum; 872 873 if (zio_checksum_table[checksum].ci_eck) { 874 /* 875 * zec checksums are necessarily destructive -- they modify 876 * the end of the write buffer to hold the verifier/checksum. 877 * Therefore, we must make a local copy in case the data is 878 * being written to multiple places in parallel. 879 */ 880 void *wbuf = zio_buf_alloc(size); 881 bcopy(data, wbuf, size); 882 zio_push_transform(zio, wbuf, size, size, NULL); 883 } 884 885 return (zio); 886 } 887 888 /* 889 * Create a child I/O to do some work for us. 890 */ 891 zio_t * 892 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 893 void *data, uint64_t size, int type, zio_priority_t priority, 894 enum zio_flag flags, zio_done_func_t *done, void *private) 895 { 896 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 897 zio_t *zio; 898 899 ASSERT(vd->vdev_parent == 900 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 901 902 if (type == ZIO_TYPE_READ && bp != NULL) { 903 /* 904 * If we have the bp, then the child should perform the 905 * checksum and the parent need not. 
This pushes error 906 * detection as close to the leaves as possible and 907 * eliminates redundant checksums in the interior nodes. 908 */ 909 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 910 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 911 } 912 913 if (vd->vdev_children == 0) 914 offset += VDEV_LABEL_START_SIZE; 915 916 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 917 918 /* 919 * If we've decided to do a repair, the write is not speculative -- 920 * even if the original read was. 921 */ 922 if (flags & ZIO_FLAG_IO_REPAIR) 923 flags &= ~ZIO_FLAG_SPECULATIVE; 924 925 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 926 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 927 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 928 929 zio->io_physdone = pio->io_physdone; 930 if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 931 zio->io_logical->io_phys_children++; 932 933 return (zio); 934 } 935 936 zio_t * 937 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 938 int type, zio_priority_t priority, enum zio_flag flags, 939 zio_done_func_t *done, void *private) 940 { 941 zio_t *zio; 942 943 ASSERT(vd->vdev_ops->vdev_op_leaf); 944 945 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 946 data, size, done, private, type, priority, 947 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 948 vd, offset, NULL, 949 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 950 951 return (zio); 952 } 953 954 void 955 zio_flush(zio_t *zio, vdev_t *vd) 956 { 957 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 958 NULL, NULL, 959 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 960 } 961 962 void 963 zio_shrink(zio_t *zio, uint64_t size) 964 { 965 ASSERT(zio->io_executor == NULL); 966 ASSERT(zio->io_orig_size == zio->io_size); 967 ASSERT(size <= zio->io_size); 968 969 /* 970 * We don't shrink for raidz because of problems with the 971 * reconstruction when reading back less than the block size. 972 * Note, BP_IS_RAIDZ() assumes no compression. 973 */ 974 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 975 if (!BP_IS_RAIDZ(zio->io_bp)) 976 zio->io_orig_size = zio->io_size = size; 977 } 978 979 /* 980 * ========================================================================== 981 * Prepare to read and write logical blocks 982 * ========================================================================== 983 */ 984 985 static int 986 zio_read_bp_init(zio_t *zio) 987 { 988 blkptr_t *bp = zio->io_bp; 989 990 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 991 zio->io_child_type == ZIO_CHILD_LOGICAL && 992 !(zio->io_flags & ZIO_FLAG_RAW)) { 993 uint64_t psize = 994 BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 995 void *cbuf = zio_buf_alloc(psize); 996 997 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 998 } 999 1000 if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1001 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1002 decode_embedded_bp_compressed(bp, zio->io_data); 1003 } else { 1004 ASSERT(!BP_IS_EMBEDDED(bp)); 1005 } 1006 1007 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1008 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1009 1010 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1011 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1012 1013 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1014 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1015 1016 return (ZIO_PIPELINE_CONTINUE); 1017 } 1018 1019 static int 1020 zio_write_bp_init(zio_t *zio) 1021 { 1022 spa_t *spa = zio->io_spa; 1023 zio_prop_t *zp = &zio->io_prop; 1024 enum zio_compress compress = zp->zp_compress; 1025 blkptr_t *bp = zio->io_bp; 1026 uint64_t lsize = zio->io_size; 1027 uint64_t psize = lsize; 1028 int pass = 1; 1029 1030 /* 1031 * If our children haven't all reached the ready stage, 1032 * wait for them and then repeat this pipeline stage. 1033 */ 1034 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 1035 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 1036 return (ZIO_PIPELINE_STOP); 1037 1038 if (!IO_IS_ALLOCATING(zio)) 1039 return (ZIO_PIPELINE_CONTINUE); 1040 1041 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1042 1043 if (zio->io_bp_override) { 1044 ASSERT(bp->blk_birth != zio->io_txg); 1045 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1046 1047 *bp = *zio->io_bp_override; 1048 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1049 1050 if (BP_IS_EMBEDDED(bp)) 1051 return (ZIO_PIPELINE_CONTINUE); 1052 1053 /* 1054 * If we've been overridden and nopwrite is set then 1055 * set the flag accordingly to indicate that a nopwrite 1056 * has already occurred. 1057 */ 1058 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 1059 ASSERT(!zp->zp_dedup); 1060 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1061 return (ZIO_PIPELINE_CONTINUE); 1062 } 1063 1064 ASSERT(!zp->zp_nopwrite); 1065 1066 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1067 return (ZIO_PIPELINE_CONTINUE); 1068 1069 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 1070 zp->zp_dedup_verify); 1071 1072 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1073 BP_SET_DEDUP(bp, 1); 1074 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1075 return (ZIO_PIPELINE_CONTINUE); 1076 } 1077 zio->io_bp_override = NULL; 1078 BP_ZERO(bp); 1079 } 1080 1081 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1082 /* 1083 * We're rewriting an existing block, which means we're 1084 * working on behalf of spa_sync(). For spa_sync() to 1085 * converge, it must eventually be the case that we don't 1086 * have to allocate new blocks. But compression changes 1087 * the blocksize, which forces a reallocate, and makes 1088 * convergence take longer. Therefore, after the first 1089 * few passes, stop compressing to ensure convergence. 
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round up compressed size to MINBLOCKSIZE and
			 * zero the tail.
			 */
			size_t rounded =
			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
			if (rounded > psize) {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
			}
			if (psize == lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
			} else {
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
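	 *
	 * A rough example, using the default declared above (a tunable):
	 * with zfs_sync_pass_rewrite == 2, a block that was already written
	 * in this txg and whose physical size did not change is rewritten in
	 * place from pass 2 onward instead of being reallocated; only blocks
	 * whose size changed still require a fresh DVA allocation.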
1153 */ 1154 if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 1155 BP_GET_PSIZE(bp) == psize && 1156 pass >= zfs_sync_pass_rewrite) { 1157 ASSERT(psize != 0); 1158 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1159 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1160 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1161 } else { 1162 BP_ZERO(bp); 1163 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1164 } 1165 1166 if (psize == 0) { 1167 if (zio->io_bp_orig.blk_birth != 0 && 1168 spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 1169 BP_SET_LSIZE(bp, lsize); 1170 BP_SET_TYPE(bp, zp->zp_type); 1171 BP_SET_LEVEL(bp, zp->zp_level); 1172 BP_SET_BIRTH(bp, zio->io_txg, 0); 1173 } 1174 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1175 } else { 1176 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1177 BP_SET_LSIZE(bp, lsize); 1178 BP_SET_TYPE(bp, zp->zp_type); 1179 BP_SET_LEVEL(bp, zp->zp_level); 1180 BP_SET_PSIZE(bp, psize); 1181 BP_SET_COMPRESS(bp, compress); 1182 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1183 BP_SET_DEDUP(bp, zp->zp_dedup); 1184 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1185 if (zp->zp_dedup) { 1186 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1187 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1188 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1189 } 1190 if (zp->zp_nopwrite) { 1191 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1192 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1193 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1194 } 1195 } 1196 1197 return (ZIO_PIPELINE_CONTINUE); 1198 } 1199 1200 static int 1201 zio_free_bp_init(zio_t *zio) 1202 { 1203 blkptr_t *bp = zio->io_bp; 1204 1205 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1206 if (BP_GET_DEDUP(bp)) 1207 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1208 } 1209 1210 return (ZIO_PIPELINE_CONTINUE); 1211 } 1212 1213 /* 1214 * ========================================================================== 1215 * Execute the I/O pipeline 1216 * ========================================================================== 1217 */ 1218 1219 static void 1220 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1221 { 1222 spa_t *spa = zio->io_spa; 1223 zio_type_t t = zio->io_type; 1224 int flags = (cutinline ? TQ_FRONT : 0); 1225 1226 /* 1227 * If we're a config writer or a probe, the normal issue and 1228 * interrupt threads may all be blocked waiting for the config lock. 1229 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1230 */ 1231 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1232 t = ZIO_TYPE_NULL; 1233 1234 /* 1235 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1236 */ 1237 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1238 t = ZIO_TYPE_NULL; 1239 1240 /* 1241 * If this is a high priority I/O, then use the high priority taskq if 1242 * available. 1243 */ 1244 if (zio->io_priority == ZIO_PRIORITY_NOW && 1245 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1246 q++; 1247 1248 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1249 1250 /* 1251 * NB: We are assuming that the zio can only be dispatched 1252 * to a single taskq at a time. It would be a grievous error 1253 * to dispatch the zio to another taskq at the same time. 
1254 */ 1255 ASSERT(zio->io_tqent.tqent_next == NULL); 1256 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1257 flags, &zio->io_tqent); 1258 } 1259 1260 static boolean_t 1261 zio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1262 { 1263 kthread_t *executor = zio->io_executor; 1264 spa_t *spa = zio->io_spa; 1265 1266 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1267 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1268 uint_t i; 1269 for (i = 0; i < tqs->stqs_count; i++) { 1270 if (taskq_member(tqs->stqs_taskq[i], executor)) 1271 return (B_TRUE); 1272 } 1273 } 1274 1275 return (B_FALSE); 1276 } 1277 1278 static int 1279 zio_issue_async(zio_t *zio) 1280 { 1281 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1282 1283 return (ZIO_PIPELINE_STOP); 1284 } 1285 1286 void 1287 zio_interrupt(zio_t *zio) 1288 { 1289 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1290 } 1291 1292 /* 1293 * Execute the I/O pipeline until one of the following occurs: 1294 * 1295 * (1) the I/O completes 1296 * (2) the pipeline stalls waiting for dependent child I/Os 1297 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1298 * (4) the I/O is delegated by vdev-level caching or aggregation 1299 * (5) the I/O is deferred due to vdev-level queueing 1300 * (6) the I/O is handed off to another thread. 1301 * 1302 * In all cases, the pipeline stops whenever there's no CPU work; it never 1303 * burns a thread in cv_wait(). 1304 * 1305 * There's no locking on io_stage because there's no legitimate way 1306 * for multiple threads to be attempting to process the same I/O. 1307 */ 1308 static zio_pipe_stage_t *zio_pipeline[]; 1309 1310 void 1311 zio_execute(zio_t *zio) 1312 { 1313 zio->io_executor = curthread; 1314 1315 while (zio->io_stage < ZIO_STAGE_DONE) { 1316 enum zio_stage pipeline = zio->io_pipeline; 1317 enum zio_stage stage = zio->io_stage; 1318 int rv; 1319 1320 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1321 ASSERT(ISP2(stage)); 1322 ASSERT(zio->io_stall == NULL); 1323 1324 do { 1325 stage <<= 1; 1326 } while ((stage & pipeline) == 0); 1327 1328 ASSERT(stage <= ZIO_STAGE_DONE); 1329 1330 /* 1331 * If we are in interrupt context and this pipeline stage 1332 * will grab a config lock that is held across I/O, 1333 * or may wait for an I/O that needs an interrupt thread 1334 * to complete, issue async to avoid deadlock. 1335 * 1336 * For VDEV_IO_START, we cut in line so that the io will 1337 * be sent to disk promptly. 1338 */ 1339 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1340 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1341 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
1342 zio_requeue_io_start_cut_in_line : B_FALSE; 1343 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1344 return; 1345 } 1346 1347 zio->io_stage = stage; 1348 rv = zio_pipeline[highbit64(stage) - 1](zio); 1349 1350 if (rv == ZIO_PIPELINE_STOP) 1351 return; 1352 1353 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1354 } 1355 } 1356 1357 /* 1358 * ========================================================================== 1359 * Initiate I/O, either sync or async 1360 * ========================================================================== 1361 */ 1362 int 1363 zio_wait(zio_t *zio) 1364 { 1365 int error; 1366 1367 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1368 ASSERT(zio->io_executor == NULL); 1369 1370 zio->io_waiter = curthread; 1371 1372 zio_execute(zio); 1373 1374 mutex_enter(&zio->io_lock); 1375 while (zio->io_executor != NULL) 1376 cv_wait(&zio->io_cv, &zio->io_lock); 1377 mutex_exit(&zio->io_lock); 1378 1379 error = zio->io_error; 1380 zio_destroy(zio); 1381 1382 return (error); 1383 } 1384 1385 void 1386 zio_nowait(zio_t *zio) 1387 { 1388 ASSERT(zio->io_executor == NULL); 1389 1390 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1391 zio_unique_parent(zio) == NULL) { 1392 /* 1393 * This is a logical async I/O with no parent to wait for it. 1394 * We add it to the spa_async_root_zio "Godfather" I/O which 1395 * will ensure they complete prior to unloading the pool. 1396 */ 1397 spa_t *spa = zio->io_spa; 1398 1399 zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1400 } 1401 1402 zio_execute(zio); 1403 } 1404 1405 /* 1406 * ========================================================================== 1407 * Reexecute or suspend/resume failed I/O 1408 * ========================================================================== 1409 */ 1410 1411 static void 1412 zio_reexecute(zio_t *pio) 1413 { 1414 zio_t *cio, *cio_next; 1415 1416 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1417 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1418 ASSERT(pio->io_gang_leader == NULL); 1419 ASSERT(pio->io_gang_tree == NULL); 1420 1421 pio->io_flags = pio->io_orig_flags; 1422 pio->io_stage = pio->io_orig_stage; 1423 pio->io_pipeline = pio->io_orig_pipeline; 1424 pio->io_reexecute = 0; 1425 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1426 pio->io_error = 0; 1427 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1428 pio->io_state[w] = 0; 1429 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1430 pio->io_child_error[c] = 0; 1431 1432 if (IO_IS_ALLOCATING(pio)) 1433 BP_ZERO(pio->io_bp); 1434 1435 /* 1436 * As we reexecute pio's children, new children could be created. 1437 * New children go to the head of pio's io_child_list, however, 1438 * so we will (correctly) not reexecute them. The key is that 1439 * the remainder of pio's io_child_list, from 'cio_next' onward, 1440 * cannot be affected by any side effects of reexecuting 'cio'. 1441 */ 1442 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1443 cio_next = zio_walk_children(pio); 1444 mutex_enter(&pio->io_lock); 1445 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1446 pio->io_children[cio->io_child_type][w]++; 1447 mutex_exit(&pio->io_lock); 1448 zio_reexecute(cio); 1449 } 1450 1451 /* 1452 * Now that all children have been reexecuted, execute the parent. 1453 * We don't reexecute "The Godfather" I/O here as it's the 1454 * responsibility of the caller to wait on him. 
1455 */ 1456 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1457 zio_execute(pio); 1458 } 1459 1460 void 1461 zio_suspend(spa_t *spa, zio_t *zio) 1462 { 1463 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1464 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1465 "failure and the failure mode property for this pool " 1466 "is set to panic.", spa_name(spa)); 1467 1468 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1469 1470 mutex_enter(&spa->spa_suspend_lock); 1471 1472 if (spa->spa_suspend_zio_root == NULL) 1473 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1474 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1475 ZIO_FLAG_GODFATHER); 1476 1477 spa->spa_suspended = B_TRUE; 1478 1479 if (zio != NULL) { 1480 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1481 ASSERT(zio != spa->spa_suspend_zio_root); 1482 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1483 ASSERT(zio_unique_parent(zio) == NULL); 1484 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1485 zio_add_child(spa->spa_suspend_zio_root, zio); 1486 } 1487 1488 mutex_exit(&spa->spa_suspend_lock); 1489 } 1490 1491 int 1492 zio_resume(spa_t *spa) 1493 { 1494 zio_t *pio; 1495 1496 /* 1497 * Reexecute all previously suspended i/o. 1498 */ 1499 mutex_enter(&spa->spa_suspend_lock); 1500 spa->spa_suspended = B_FALSE; 1501 cv_broadcast(&spa->spa_suspend_cv); 1502 pio = spa->spa_suspend_zio_root; 1503 spa->spa_suspend_zio_root = NULL; 1504 mutex_exit(&spa->spa_suspend_lock); 1505 1506 if (pio == NULL) 1507 return (0); 1508 1509 zio_reexecute(pio); 1510 return (zio_wait(pio)); 1511 } 1512 1513 void 1514 zio_resume_wait(spa_t *spa) 1515 { 1516 mutex_enter(&spa->spa_suspend_lock); 1517 while (spa_suspended(spa)) 1518 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1519 mutex_exit(&spa->spa_suspend_lock); 1520 } 1521 1522 /* 1523 * ========================================================================== 1524 * Gang blocks. 1525 * 1526 * A gang block is a collection of small blocks that looks to the DMU 1527 * like one large block. When zio_dva_allocate() cannot find a block 1528 * of the requested size, due to either severe fragmentation or the pool 1529 * being nearly full, it calls zio_write_gang_block() to construct the 1530 * block from smaller fragments. 1531 * 1532 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1533 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1534 * an indirect block: it's an array of block pointers. It consumes 1535 * only one sector and hence is allocatable regardless of fragmentation. 1536 * The gang header's bps point to its gang members, which hold the data. 1537 * 1538 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1539 * as the verifier to ensure uniqueness of the SHA256 checksum. 1540 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1541 * not the gang header. This ensures that data block signatures (needed for 1542 * deduplication) are independent of how the block is physically stored. 1543 * 1544 * Gang blocks can be nested: a gang member may itself be a gang block. 1545 * Thus every gang block is a tree in which root and all interior nodes are 1546 * gang headers, and the leaves are normal blocks that contain user data. 1547 * The root of the gang tree is called the gang leader. 
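 *
 * (Illustrative: with up to three bps per gang header, a two-level gang
 * tree can map as many as nine data fragments, and each additional level
 * of nesting multiplies that fan-out by three.)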
1548 * 1549 * To perform any operation (read, rewrite, free, claim) on a gang block, 1550 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1551 * in the io_gang_tree field of the original logical i/o by recursively 1552 * reading the gang leader and all gang headers below it. This yields 1553 * an in-core tree containing the contents of every gang header and the 1554 * bps for every constituent of the gang block. 1555 * 1556 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1557 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1558 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1559 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1560 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1561 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1562 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1563 * of the gang header plus zio_checksum_compute() of the data to update the 1564 * gang header's blk_cksum as described above. 1565 * 1566 * The two-phase assemble/issue model solves the problem of partial failure -- 1567 * what if you'd freed part of a gang block but then couldn't read the 1568 * gang header for another part? Assembling the entire gang tree first 1569 * ensures that all the necessary gang header I/O has succeeded before 1570 * starting the actual work of free, claim, or write. Once the gang tree 1571 * is assembled, free and claim are in-memory operations that cannot fail. 1572 * 1573 * In the event that a gang write fails, zio_dva_unallocate() walks the 1574 * gang tree to immediately free (i.e. insert back into the space map) 1575 * everything we've allocated. This ensures that we don't get ENOSPC 1576 * errors during repeated suspend/resume cycles due to a flaky device. 1577 * 1578 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1579 * the gang tree, we won't modify the block, so we can safely defer the free 1580 * (knowing that the block is still intact). If we *can* assemble the gang 1581 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1582 * each constituent bp and we can allocate a new block on the next sync pass. 1583 * 1584 * In all cases, the gang tree allows complete recovery from partial failure. 1585 * ========================================================================== 1586 */ 1587 1588 static zio_t * 1589 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1590 { 1591 if (gn != NULL) 1592 return (pio); 1593 1594 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1595 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1596 &pio->io_bookmark)); 1597 } 1598 1599 zio_t * 1600 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1601 { 1602 zio_t *zio; 1603 1604 if (gn != NULL) { 1605 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1606 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1607 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1608 /* 1609 * As we rewrite each gang header, the pipeline will compute 1610 * a new gang block header checksum for it; but no one will 1611 * compute a new data checksum, so we do that here. The one 1612 * exception is the gang leader: the pipeline already computed 1613 * its data checksum because that stage precedes gang assembly. 
1614 * (Presently, nothing actually uses interior data checksums; 1615 * this is just good hygiene.) 1616 */ 1617 if (gn != pio->io_gang_leader->io_gang_tree) { 1618 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1619 data, BP_GET_PSIZE(bp)); 1620 } 1621 /* 1622 * If we are here to damage data for testing purposes, 1623 * leave the GBH alone so that we can detect the damage. 1624 */ 1625 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1626 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1627 } else { 1628 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1629 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1630 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1631 } 1632 1633 return (zio); 1634 } 1635 1636 /* ARGSUSED */ 1637 zio_t * 1638 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1639 { 1640 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1641 ZIO_GANG_CHILD_FLAGS(pio))); 1642 } 1643 1644 /* ARGSUSED */ 1645 zio_t * 1646 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1647 { 1648 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1649 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1650 } 1651 1652 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1653 NULL, 1654 zio_read_gang, 1655 zio_rewrite_gang, 1656 zio_free_gang, 1657 zio_claim_gang, 1658 NULL 1659 }; 1660 1661 static void zio_gang_tree_assemble_done(zio_t *zio); 1662 1663 static zio_gang_node_t * 1664 zio_gang_node_alloc(zio_gang_node_t **gnpp) 1665 { 1666 zio_gang_node_t *gn; 1667 1668 ASSERT(*gnpp == NULL); 1669 1670 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1671 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1672 *gnpp = gn; 1673 1674 return (gn); 1675 } 1676 1677 static void 1678 zio_gang_node_free(zio_gang_node_t **gnpp) 1679 { 1680 zio_gang_node_t *gn = *gnpp; 1681 1682 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1683 ASSERT(gn->gn_child[g] == NULL); 1684 1685 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1686 kmem_free(gn, sizeof (*gn)); 1687 *gnpp = NULL; 1688 } 1689 1690 static void 1691 zio_gang_tree_free(zio_gang_node_t **gnpp) 1692 { 1693 zio_gang_node_t *gn = *gnpp; 1694 1695 if (gn == NULL) 1696 return; 1697 1698 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1699 zio_gang_tree_free(&gn->gn_child[g]); 1700 1701 zio_gang_node_free(gnpp); 1702 } 1703 1704 static void 1705 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1706 { 1707 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1708 1709 ASSERT(gio->io_gang_leader == gio); 1710 ASSERT(BP_IS_GANG(bp)); 1711 1712 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1713 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1714 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1715 } 1716 1717 static void 1718 zio_gang_tree_assemble_done(zio_t *zio) 1719 { 1720 zio_t *gio = zio->io_gang_leader; 1721 zio_gang_node_t *gn = zio->io_private; 1722 blkptr_t *bp = zio->io_bp; 1723 1724 ASSERT(gio == zio_unique_parent(zio)); 1725 ASSERT(zio->io_child_count == 0); 1726 1727 if (zio->io_error) 1728 return; 1729 1730 if (BP_SHOULD_BYTESWAP(bp)) 1731 byteswap_uint64_array(zio->io_data, zio->io_size); 1732 1733 ASSERT(zio->io_data == gn->gn_gbh); 1734 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1735 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1736 1737 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1738 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1739 if (!BP_IS_GANG(gbp)) 1740 continue; 1741 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1742 } 1743 } 
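
/*
 * Walk an assembled gang tree, invoking the per-I/O-type callback from
 * zio_gang_issue_func[] on every bp.  Gang header bps are taken from the
 * in-core tree; each data leaf consumes the next BP_GET_PSIZE() bytes of
 * 'data' as the walk advances.
 */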
1744 1745 static void 1746 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1747 { 1748 zio_t *gio = pio->io_gang_leader; 1749 zio_t *zio; 1750 1751 ASSERT(BP_IS_GANG(bp) == !!gn); 1752 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1753 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1754 1755 /* 1756 * If you're a gang header, your data is in gn->gn_gbh. 1757 * If you're a gang member, your data is in 'data' and gn == NULL. 1758 */ 1759 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1760 1761 if (gn != NULL) { 1762 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1763 1764 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1765 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1766 if (BP_IS_HOLE(gbp)) 1767 continue; 1768 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1769 data = (char *)data + BP_GET_PSIZE(gbp); 1770 } 1771 } 1772 1773 if (gn == gio->io_gang_tree) 1774 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1775 1776 if (zio != pio) 1777 zio_nowait(zio); 1778 } 1779 1780 static int 1781 zio_gang_assemble(zio_t *zio) 1782 { 1783 blkptr_t *bp = zio->io_bp; 1784 1785 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1786 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1787 1788 zio->io_gang_leader = zio; 1789 1790 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1791 1792 return (ZIO_PIPELINE_CONTINUE); 1793 } 1794 1795 static int 1796 zio_gang_issue(zio_t *zio) 1797 { 1798 blkptr_t *bp = zio->io_bp; 1799 1800 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1801 return (ZIO_PIPELINE_STOP); 1802 1803 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1804 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1805 1806 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1807 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1808 else 1809 zio_gang_tree_free(&zio->io_gang_tree); 1810 1811 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1812 1813 return (ZIO_PIPELINE_CONTINUE); 1814 } 1815 1816 static void 1817 zio_write_gang_member_ready(zio_t *zio) 1818 { 1819 zio_t *pio = zio_unique_parent(zio); 1820 zio_t *gio = zio->io_gang_leader; 1821 dva_t *cdva = zio->io_bp->blk_dva; 1822 dva_t *pdva = pio->io_bp->blk_dva; 1823 uint64_t asize; 1824 1825 if (BP_IS_HOLE(zio->io_bp)) 1826 return; 1827 1828 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1829 1830 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1831 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1832 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1833 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1834 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1835 1836 mutex_enter(&pio->io_lock); 1837 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1838 ASSERT(DVA_GET_GANG(&pdva[d])); 1839 asize = DVA_GET_ASIZE(&pdva[d]); 1840 asize += DVA_GET_ASIZE(&cdva[d]); 1841 DVA_SET_ASIZE(&pdva[d], asize); 1842 } 1843 mutex_exit(&pio->io_lock); 1844 } 1845 1846 static int 1847 zio_write_gang_block(zio_t *pio) 1848 { 1849 spa_t *spa = pio->io_spa; 1850 blkptr_t *bp = pio->io_bp; 1851 zio_t *gio = pio->io_gang_leader; 1852 zio_t *zio; 1853 zio_gang_node_t *gn, **gnpp; 1854 zio_gbh_phys_t *gbh; 1855 uint64_t txg = pio->io_txg; 1856 uint64_t resid = pio->io_size; 1857 uint64_t lsize; 1858 int copies = gio->io_prop.zp_copies; 1859 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1860 zio_prop_t zp; 1861 int error; 1862 1863 error = metaslab_alloc(spa, spa_normal_class(spa), 
SPA_GANGBLOCKSIZE, 1864 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1865 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1866 if (error) { 1867 pio->io_error = error; 1868 return (ZIO_PIPELINE_CONTINUE); 1869 } 1870 1871 if (pio == gio) { 1872 gnpp = &gio->io_gang_tree; 1873 } else { 1874 gnpp = pio->io_private; 1875 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1876 } 1877 1878 gn = zio_gang_node_alloc(gnpp); 1879 gbh = gn->gn_gbh; 1880 bzero(gbh, SPA_GANGBLOCKSIZE); 1881 1882 /* 1883 * Create the gang header. 1884 */ 1885 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1886 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1887 1888 /* 1889 * Create and nowait the gang children. 1890 */ 1891 for (int g = 0; resid != 0; resid -= lsize, g++) { 1892 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1893 SPA_MINBLOCKSIZE); 1894 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1895 1896 zp.zp_checksum = gio->io_prop.zp_checksum; 1897 zp.zp_compress = ZIO_COMPRESS_OFF; 1898 zp.zp_type = DMU_OT_NONE; 1899 zp.zp_level = 0; 1900 zp.zp_copies = gio->io_prop.zp_copies; 1901 zp.zp_dedup = B_FALSE; 1902 zp.zp_dedup_verify = B_FALSE; 1903 zp.zp_nopwrite = B_FALSE; 1904 1905 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1906 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1907 zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], 1908 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1909 &pio->io_bookmark)); 1910 } 1911 1912 /* 1913 * Set pio's pipeline to just wait for zio to finish. 1914 */ 1915 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1916 1917 zio_nowait(zio); 1918 1919 return (ZIO_PIPELINE_CONTINUE); 1920 } 1921 1922 /* 1923 * The zio_nop_write stage in the pipeline determines if allocating 1924 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1925 * such as SHA256, we can compare the checksums of the new data and the old 1926 * to determine if allocating a new block is required. The nopwrite 1927 * feature can handle writes in either syncing or open context (i.e. zil 1928 * writes) and as a result is mutually exclusive with dedup. 1929 */ 1930 static int 1931 zio_nop_write(zio_t *zio) 1932 { 1933 blkptr_t *bp = zio->io_bp; 1934 blkptr_t *bp_orig = &zio->io_bp_orig; 1935 zio_prop_t *zp = &zio->io_prop; 1936 1937 ASSERT(BP_GET_LEVEL(bp) == 0); 1938 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1939 ASSERT(zp->zp_nopwrite); 1940 ASSERT(!zp->zp_dedup); 1941 ASSERT(zio->io_bp_override == NULL); 1942 ASSERT(IO_IS_ALLOCATING(zio)); 1943 1944 /* 1945 * Check to see if the original bp and the new bp have matching 1946 * characteristics (i.e. same checksum, compression algorithms, etc). 1947 * If they don't then just continue with the pipeline which will 1948 * allocate a new bp. 1949 */ 1950 if (BP_IS_HOLE(bp_orig) || 1951 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1952 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1953 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1954 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1955 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1956 return (ZIO_PIPELINE_CONTINUE); 1957 1958 /* 1959 * If the checksums match then reset the pipeline so that we 1960 * avoid allocating a new bp and issuing any I/O. 
1961 */ 1962 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1963 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1964 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1965 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1966 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1967 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1968 sizeof (uint64_t)) == 0); 1969 1970 *bp = *bp_orig; 1971 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1972 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1973 } 1974 1975 return (ZIO_PIPELINE_CONTINUE); 1976 } 1977 1978 /* 1979 * ========================================================================== 1980 * Dedup 1981 * ========================================================================== 1982 */ 1983 static void 1984 zio_ddt_child_read_done(zio_t *zio) 1985 { 1986 blkptr_t *bp = zio->io_bp; 1987 ddt_entry_t *dde = zio->io_private; 1988 ddt_phys_t *ddp; 1989 zio_t *pio = zio_unique_parent(zio); 1990 1991 mutex_enter(&pio->io_lock); 1992 ddp = ddt_phys_select(dde, bp); 1993 if (zio->io_error == 0) 1994 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1995 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1996 dde->dde_repair_data = zio->io_data; 1997 else 1998 zio_buf_free(zio->io_data, zio->io_size); 1999 mutex_exit(&pio->io_lock); 2000 } 2001 2002 static int 2003 zio_ddt_read_start(zio_t *zio) 2004 { 2005 blkptr_t *bp = zio->io_bp; 2006 2007 ASSERT(BP_GET_DEDUP(bp)); 2008 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2009 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2010 2011 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2012 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2013 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2014 ddt_phys_t *ddp = dde->dde_phys; 2015 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2016 blkptr_t blk; 2017 2018 ASSERT(zio->io_vsd == NULL); 2019 zio->io_vsd = dde; 2020 2021 if (ddp_self == NULL) 2022 return (ZIO_PIPELINE_CONTINUE); 2023 2024 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2025 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2026 continue; 2027 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2028 &blk); 2029 zio_nowait(zio_read(zio, zio->io_spa, &blk, 2030 zio_buf_alloc(zio->io_size), zio->io_size, 2031 zio_ddt_child_read_done, dde, zio->io_priority, 2032 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 2033 &zio->io_bookmark)); 2034 } 2035 return (ZIO_PIPELINE_CONTINUE); 2036 } 2037 2038 zio_nowait(zio_read(zio, zio->io_spa, bp, 2039 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 2040 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2041 2042 return (ZIO_PIPELINE_CONTINUE); 2043 } 2044 2045 static int 2046 zio_ddt_read_done(zio_t *zio) 2047 { 2048 blkptr_t *bp = zio->io_bp; 2049 2050 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 2051 return (ZIO_PIPELINE_STOP); 2052 2053 ASSERT(BP_GET_DEDUP(bp)); 2054 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2055 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2056 2057 if (zio->io_child_error[ZIO_CHILD_DDT]) { 2058 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2059 ddt_entry_t *dde = zio->io_vsd; 2060 if (ddt == NULL) { 2061 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2062 return (ZIO_PIPELINE_CONTINUE); 2063 } 2064 if (dde == NULL) { 2065 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 2066 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2067 return (ZIO_PIPELINE_STOP); 2068 } 2069 if (dde->dde_repair_data != NULL) { 2070 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 2071 
zio->io_child_error[ZIO_CHILD_DDT] = 0; 2072 } 2073 ddt_repair_done(ddt, dde); 2074 zio->io_vsd = NULL; 2075 } 2076 2077 ASSERT(zio->io_vsd == NULL); 2078 2079 return (ZIO_PIPELINE_CONTINUE); 2080 } 2081 2082 static boolean_t 2083 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2084 { 2085 spa_t *spa = zio->io_spa; 2086 2087 /* 2088 * Note: we compare the original data, not the transformed data, 2089 * because when zio->io_bp is an override bp, we will not have 2090 * pushed the I/O transforms. That's an important optimization 2091 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2092 */ 2093 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2094 zio_t *lio = dde->dde_lead_zio[p]; 2095 2096 if (lio != NULL) { 2097 return (lio->io_orig_size != zio->io_orig_size || 2098 bcmp(zio->io_orig_data, lio->io_orig_data, 2099 zio->io_orig_size) != 0); 2100 } 2101 } 2102 2103 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2104 ddt_phys_t *ddp = &dde->dde_phys[p]; 2105 2106 if (ddp->ddp_phys_birth != 0) { 2107 arc_buf_t *abuf = NULL; 2108 uint32_t aflags = ARC_WAIT; 2109 blkptr_t blk = *zio->io_bp; 2110 int error; 2111 2112 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2113 2114 ddt_exit(ddt); 2115 2116 error = arc_read(NULL, spa, &blk, 2117 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2118 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2119 &aflags, &zio->io_bookmark); 2120 2121 if (error == 0) { 2122 if (arc_buf_size(abuf) != zio->io_orig_size || 2123 bcmp(abuf->b_data, zio->io_orig_data, 2124 zio->io_orig_size) != 0) 2125 error = SET_ERROR(EEXIST); 2126 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2127 } 2128 2129 ddt_enter(ddt); 2130 return (error != 0); 2131 } 2132 } 2133 2134 return (B_FALSE); 2135 } 2136 2137 static void 2138 zio_ddt_child_write_ready(zio_t *zio) 2139 { 2140 int p = zio->io_prop.zp_copies; 2141 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2142 ddt_entry_t *dde = zio->io_private; 2143 ddt_phys_t *ddp = &dde->dde_phys[p]; 2144 zio_t *pio; 2145 2146 if (zio->io_error) 2147 return; 2148 2149 ddt_enter(ddt); 2150 2151 ASSERT(dde->dde_lead_zio[p] == zio); 2152 2153 ddt_phys_fill(ddp, zio->io_bp); 2154 2155 while ((pio = zio_walk_parents(zio)) != NULL) 2156 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2157 2158 ddt_exit(ddt); 2159 } 2160 2161 static void 2162 zio_ddt_child_write_done(zio_t *zio) 2163 { 2164 int p = zio->io_prop.zp_copies; 2165 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2166 ddt_entry_t *dde = zio->io_private; 2167 ddt_phys_t *ddp = &dde->dde_phys[p]; 2168 2169 ddt_enter(ddt); 2170 2171 ASSERT(ddp->ddp_refcnt == 0); 2172 ASSERT(dde->dde_lead_zio[p] == zio); 2173 dde->dde_lead_zio[p] = NULL; 2174 2175 if (zio->io_error == 0) { 2176 while (zio_walk_parents(zio) != NULL) 2177 ddt_phys_addref(ddp); 2178 } else { 2179 ddt_phys_clear(ddp); 2180 } 2181 2182 ddt_exit(ddt); 2183 } 2184 2185 static void 2186 zio_ddt_ditto_write_done(zio_t *zio) 2187 { 2188 int p = DDT_PHYS_DITTO; 2189 zio_prop_t *zp = &zio->io_prop; 2190 blkptr_t *bp = zio->io_bp; 2191 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2192 ddt_entry_t *dde = zio->io_private; 2193 ddt_phys_t *ddp = &dde->dde_phys[p]; 2194 ddt_key_t *ddk = &dde->dde_key; 2195 2196 ddt_enter(ddt); 2197 2198 ASSERT(ddp->ddp_refcnt == 0); 2199 ASSERT(dde->dde_lead_zio[p] == zio); 2200 dde->dde_lead_zio[p] = NULL; 2201 2202 if (zio->io_error == 0) { 2203 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2204 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2205 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2206 if (ddp->ddp_phys_birth != 0) 2207 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2208 ddt_phys_fill(ddp, bp); 2209 } 2210 2211 ddt_exit(ddt); 2212 } 2213 2214 static int 2215 zio_ddt_write(zio_t *zio) 2216 { 2217 spa_t *spa = zio->io_spa; 2218 blkptr_t *bp = zio->io_bp; 2219 uint64_t txg = zio->io_txg; 2220 zio_prop_t *zp = &zio->io_prop; 2221 int p = zp->zp_copies; 2222 int ditto_copies; 2223 zio_t *cio = NULL; 2224 zio_t *dio = NULL; 2225 ddt_t *ddt = ddt_select(spa, bp); 2226 ddt_entry_t *dde; 2227 ddt_phys_t *ddp; 2228 2229 ASSERT(BP_GET_DEDUP(bp)); 2230 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2231 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2232 2233 ddt_enter(ddt); 2234 dde = ddt_lookup(ddt, bp, B_TRUE); 2235 ddp = &dde->dde_phys[p]; 2236 2237 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2238 /* 2239 * If we're using a weak checksum, upgrade to a strong checksum 2240 * and try again. If we're already using a strong checksum, 2241 * we can't resolve it, so just convert to an ordinary write. 2242 * (And automatically e-mail a paper to Nature?) 2243 */ 2244 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2245 zp->zp_checksum = spa_dedup_checksum(spa); 2246 zio_pop_transforms(zio); 2247 zio->io_stage = ZIO_STAGE_OPEN; 2248 BP_ZERO(bp); 2249 } else { 2250 zp->zp_dedup = B_FALSE; 2251 } 2252 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2253 ddt_exit(ddt); 2254 return (ZIO_PIPELINE_CONTINUE); 2255 } 2256 2257 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2258 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2259 2260 if (ditto_copies > ddt_ditto_copies_present(dde) && 2261 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2262 zio_prop_t czp = *zp; 2263 2264 czp.zp_copies = ditto_copies; 2265 2266 /* 2267 * If we arrived here with an override bp, we won't have run 2268 * the transform stack, so we won't have the data we need to 2269 * generate a child i/o. So, toss the override bp and restart. 2270 * This is safe, because using the override bp is just an 2271 * optimization; and it's rare, so the cost doesn't matter. 
2272 */ 2273 if (zio->io_bp_override) { 2274 zio_pop_transforms(zio); 2275 zio->io_stage = ZIO_STAGE_OPEN; 2276 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2277 zio->io_bp_override = NULL; 2278 BP_ZERO(bp); 2279 ddt_exit(ddt); 2280 return (ZIO_PIPELINE_CONTINUE); 2281 } 2282 2283 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2284 zio->io_orig_size, &czp, NULL, NULL, 2285 zio_ddt_ditto_write_done, dde, zio->io_priority, 2286 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2287 2288 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2289 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2290 } 2291 2292 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2293 if (ddp->ddp_phys_birth != 0) 2294 ddt_bp_fill(ddp, bp, txg); 2295 if (dde->dde_lead_zio[p] != NULL) 2296 zio_add_child(zio, dde->dde_lead_zio[p]); 2297 else 2298 ddt_phys_addref(ddp); 2299 } else if (zio->io_bp_override) { 2300 ASSERT(bp->blk_birth == txg); 2301 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2302 ddt_phys_fill(ddp, bp); 2303 ddt_phys_addref(ddp); 2304 } else { 2305 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2306 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2307 zio_ddt_child_write_done, dde, zio->io_priority, 2308 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2309 2310 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2311 dde->dde_lead_zio[p] = cio; 2312 } 2313 2314 ddt_exit(ddt); 2315 2316 if (cio) 2317 zio_nowait(cio); 2318 if (dio) 2319 zio_nowait(dio); 2320 2321 return (ZIO_PIPELINE_CONTINUE); 2322 } 2323 2324 ddt_entry_t *freedde; /* for debugging */ 2325 2326 static int 2327 zio_ddt_free(zio_t *zio) 2328 { 2329 spa_t *spa = zio->io_spa; 2330 blkptr_t *bp = zio->io_bp; 2331 ddt_t *ddt = ddt_select(spa, bp); 2332 ddt_entry_t *dde; 2333 ddt_phys_t *ddp; 2334 2335 ASSERT(BP_GET_DEDUP(bp)); 2336 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2337 2338 ddt_enter(ddt); 2339 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2340 ddp = ddt_phys_select(dde, bp); 2341 ddt_phys_decref(ddp); 2342 ddt_exit(ddt); 2343 2344 return (ZIO_PIPELINE_CONTINUE); 2345 } 2346 2347 /* 2348 * ========================================================================== 2349 * Allocate and free blocks 2350 * ========================================================================== 2351 */ 2352 static int 2353 zio_dva_allocate(zio_t *zio) 2354 { 2355 spa_t *spa = zio->io_spa; 2356 metaslab_class_t *mc = spa_normal_class(spa); 2357 blkptr_t *bp = zio->io_bp; 2358 int error; 2359 int flags = 0; 2360 2361 if (zio->io_gang_leader == NULL) { 2362 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2363 zio->io_gang_leader = zio; 2364 } 2365 2366 ASSERT(BP_IS_HOLE(bp)); 2367 ASSERT0(BP_GET_NDVAS(bp)); 2368 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2369 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2370 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2371 2372 /* 2373 * The dump device does not support gang blocks so allocation on 2374 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2375 * the "fast" gang feature. 2376 */ 2377 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2378 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2379 METASLAB_GANG_CHILD : 0; 2380 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2381 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2382 2383 if (error) { 2384 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2385 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2386 error); 2387 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2388 return (zio_write_gang_block(zio)); 2389 zio->io_error = error; 2390 } 2391 2392 return (ZIO_PIPELINE_CONTINUE); 2393 } 2394 2395 static int 2396 zio_dva_free(zio_t *zio) 2397 { 2398 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2399 2400 return (ZIO_PIPELINE_CONTINUE); 2401 } 2402 2403 static int 2404 zio_dva_claim(zio_t *zio) 2405 { 2406 int error; 2407 2408 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2409 if (error) 2410 zio->io_error = error; 2411 2412 return (ZIO_PIPELINE_CONTINUE); 2413 } 2414 2415 /* 2416 * Undo an allocation. This is used by zio_done() when an I/O fails 2417 * and we want to give back the block we just allocated. 2418 * This handles both normal blocks and gang blocks. 2419 */ 2420 static void 2421 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2422 { 2423 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2424 ASSERT(zio->io_bp_override == NULL); 2425 2426 if (!BP_IS_HOLE(bp)) 2427 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2428 2429 if (gn != NULL) { 2430 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2431 zio_dva_unallocate(zio, gn->gn_child[g], 2432 &gn->gn_gbh->zg_blkptr[g]); 2433 } 2434 } 2435 } 2436 2437 /* 2438 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2439 */ 2440 int 2441 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2442 uint64_t size, boolean_t use_slog) 2443 { 2444 int error = 1; 2445 2446 ASSERT(txg > spa_syncing_txg(spa)); 2447 2448 /* 2449 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2450 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2451 * when allocating them. 2452 */ 2453 if (use_slog) { 2454 error = metaslab_alloc(spa, spa_log_class(spa), size, 2455 new_bp, 1, txg, old_bp, 2456 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2457 } 2458 2459 if (error) { 2460 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2461 new_bp, 1, txg, old_bp, 2462 METASLAB_HINTBP_AVOID); 2463 } 2464 2465 if (error == 0) { 2466 BP_SET_LSIZE(new_bp, size); 2467 BP_SET_PSIZE(new_bp, size); 2468 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2469 BP_SET_CHECKSUM(new_bp, 2470 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2471 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2472 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2473 BP_SET_LEVEL(new_bp, 0); 2474 BP_SET_DEDUP(new_bp, 0); 2475 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2476 } 2477 2478 return (error); 2479 } 2480 2481 /* 2482 * Free an intent log block. 2483 */ 2484 void 2485 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2486 { 2487 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2488 ASSERT(!BP_IS_GANG(bp)); 2489 2490 zio_free(spa, txg, bp); 2491 } 2492 2493 /* 2494 * ========================================================================== 2495 * Read and write to physical devices 2496 * ========================================================================== 2497 */ 2498 2499 2500 /* 2501 * Issue an I/O to the underlying vdev. Typically the issue pipeline 2502 * stops after this stage and will resume upon I/O completion. 
2503 * However, there are instances where the vdev layer may need to 2504 * continue the pipeline when an I/O was not issued. Since the I/O 2505 * that was sent to the vdev layer might be different than the one 2506 * currently active in the pipeline (see vdev_queue_io()), we explicitly 2507 * force the underlying vdev layers to call either zio_execute() or 2508 * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 2509 */ 2510 static int 2511 zio_vdev_io_start(zio_t *zio) 2512 { 2513 vdev_t *vd = zio->io_vd; 2514 uint64_t align; 2515 spa_t *spa = zio->io_spa; 2516 2517 ASSERT(zio->io_error == 0); 2518 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2519 2520 if (vd == NULL) { 2521 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2522 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2523 2524 /* 2525 * The mirror_ops handle multiple DVAs in a single BP. 2526 */ 2527 vdev_mirror_ops.vdev_op_io_start(zio); 2528 return (ZIO_PIPELINE_STOP); 2529 } 2530 2531 /* 2532 * We keep track of time-sensitive I/Os so that the scan thread 2533 * can quickly react to certain workloads. In particular, we care 2534 * about non-scrubbing, top-level reads and writes with the following 2535 * characteristics: 2536 * - synchronous writes of user data to non-slog devices 2537 * - any reads of user data 2538 * When these conditions are met, adjust the timestamp of spa_last_io 2539 * which allows the scan thread to adjust its workload accordingly. 2540 */ 2541 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2542 vd == vd->vdev_top && !vd->vdev_islog && 2543 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2544 zio->io_txg != spa_syncing_txg(spa)) { 2545 uint64_t old = spa->spa_last_io; 2546 uint64_t new = ddi_get_lbolt64(); 2547 if (old != new) 2548 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2549 } 2550 2551 align = 1ULL << vd->vdev_top->vdev_ashift; 2552 2553 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 2554 P2PHASE(zio->io_size, align) != 0) { 2555 /* Transform logical writes to be a full physical block size. */ 2556 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2557 char *abuf = zio_buf_alloc(asize); 2558 ASSERT(vd == vd->vdev_top); 2559 if (zio->io_type == ZIO_TYPE_WRITE) { 2560 bcopy(zio->io_data, abuf, zio->io_size); 2561 bzero(abuf + zio->io_size, asize - zio->io_size); 2562 } 2563 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2564 } 2565 2566 /* 2567 * If this is not a physical io, make sure that it is properly aligned 2568 * before proceeding. 2569 */ 2570 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 2571 ASSERT0(P2PHASE(zio->io_offset, align)); 2572 ASSERT0(P2PHASE(zio->io_size, align)); 2573 } else { 2574 /* 2575 * For physical writes, we allow 512b aligned writes and assume 2576 * the device will perform a read-modify-write as necessary. 2577 */ 2578 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 2579 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 2580 } 2581 2582 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2583 2584 /* 2585 * If this is a repair I/O, and there's no self-healing involved -- 2586 * that is, we're just resilvering what we expect to resilver -- 2587 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2588 * This prevents spurious resilvering with nested replication. 2589 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2590 * A is out of date, we'll read from C+D, then use the data to 2591 * resilver A+B -- but we don't actually want to resilver B, just A. 
2592 * The top-level mirror has no way to know this, so instead we just 2593 * discard unnecessary repairs as we work our way down the vdev tree. 2594 * The same logic applies to any form of nested replication: 2595 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2596 */ 2597 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2598 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2599 zio->io_txg != 0 && /* not a delegated i/o */ 2600 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2601 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2602 zio_vdev_io_bypass(zio); 2603 return (ZIO_PIPELINE_CONTINUE); 2604 } 2605 2606 if (vd->vdev_ops->vdev_op_leaf && 2607 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2608 2609 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 2610 return (ZIO_PIPELINE_CONTINUE); 2611 2612 if ((zio = vdev_queue_io(zio)) == NULL) 2613 return (ZIO_PIPELINE_STOP); 2614 2615 if (!vdev_accessible(vd, zio)) { 2616 zio->io_error = SET_ERROR(ENXIO); 2617 zio_interrupt(zio); 2618 return (ZIO_PIPELINE_STOP); 2619 } 2620 } 2621 2622 vd->vdev_ops->vdev_op_io_start(zio); 2623 return (ZIO_PIPELINE_STOP); 2624 } 2625 2626 static int 2627 zio_vdev_io_done(zio_t *zio) 2628 { 2629 vdev_t *vd = zio->io_vd; 2630 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2631 boolean_t unexpected_error = B_FALSE; 2632 2633 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2634 return (ZIO_PIPELINE_STOP); 2635 2636 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2637 2638 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2639 2640 vdev_queue_io_done(zio); 2641 2642 if (zio->io_type == ZIO_TYPE_WRITE) 2643 vdev_cache_write(zio); 2644 2645 if (zio_injection_enabled && zio->io_error == 0) 2646 zio->io_error = zio_handle_device_injection(vd, 2647 zio, EIO); 2648 2649 if (zio_injection_enabled && zio->io_error == 0) 2650 zio->io_error = zio_handle_label_injection(zio, EIO); 2651 2652 if (zio->io_error) { 2653 if (!vdev_accessible(vd, zio)) { 2654 zio->io_error = SET_ERROR(ENXIO); 2655 } else { 2656 unexpected_error = B_TRUE; 2657 } 2658 } 2659 } 2660 2661 ops->vdev_op_io_done(zio); 2662 2663 if (unexpected_error) 2664 VERIFY(vdev_probe(vd, zio) == NULL); 2665 2666 return (ZIO_PIPELINE_CONTINUE); 2667 } 2668 2669 /* 2670 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2671 * disk, and use that to finish the checksum ereport later. 
2672 */ 2673 static void 2674 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2675 const void *good_buf) 2676 { 2677 /* no processing needed */ 2678 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2679 } 2680 2681 /*ARGSUSED*/ 2682 void 2683 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2684 { 2685 void *buf = zio_buf_alloc(zio->io_size); 2686 2687 bcopy(zio->io_data, buf, zio->io_size); 2688 2689 zcr->zcr_cbinfo = zio->io_size; 2690 zcr->zcr_cbdata = buf; 2691 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2692 zcr->zcr_free = zio_buf_free; 2693 } 2694 2695 static int 2696 zio_vdev_io_assess(zio_t *zio) 2697 { 2698 vdev_t *vd = zio->io_vd; 2699 2700 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2701 return (ZIO_PIPELINE_STOP); 2702 2703 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2704 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2705 2706 if (zio->io_vsd != NULL) { 2707 zio->io_vsd_ops->vsd_free(zio); 2708 zio->io_vsd = NULL; 2709 } 2710 2711 if (zio_injection_enabled && zio->io_error == 0) 2712 zio->io_error = zio_handle_fault_injection(zio, EIO); 2713 2714 /* 2715 * If the I/O failed, determine whether we should attempt to retry it. 2716 * 2717 * On retry, we cut in line in the issue queue, since we don't want 2718 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2719 */ 2720 if (zio->io_error && vd == NULL && 2721 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2722 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2723 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2724 zio->io_error = 0; 2725 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2726 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2727 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2728 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2729 zio_requeue_io_start_cut_in_line); 2730 return (ZIO_PIPELINE_STOP); 2731 } 2732 2733 /* 2734 * If we got an error on a leaf device, convert it to ENXIO 2735 * if the device is not accessible at all. 2736 */ 2737 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2738 !vdev_accessible(vd, zio)) 2739 zio->io_error = SET_ERROR(ENXIO); 2740 2741 /* 2742 * If we can't write to an interior vdev (mirror or RAID-Z), 2743 * set vdev_cant_write so that we stop trying to allocate from it. 
2744 */ 2745 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2746 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2747 vd->vdev_cant_write = B_TRUE; 2748 } 2749 2750 if (zio->io_error) 2751 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2752 2753 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2754 zio->io_physdone != NULL) { 2755 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2756 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2757 zio->io_physdone(zio->io_logical); 2758 } 2759 2760 return (ZIO_PIPELINE_CONTINUE); 2761 } 2762 2763 void 2764 zio_vdev_io_reissue(zio_t *zio) 2765 { 2766 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2767 ASSERT(zio->io_error == 0); 2768 2769 zio->io_stage >>= 1; 2770 } 2771 2772 void 2773 zio_vdev_io_redone(zio_t *zio) 2774 { 2775 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2776 2777 zio->io_stage >>= 1; 2778 } 2779 2780 void 2781 zio_vdev_io_bypass(zio_t *zio) 2782 { 2783 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2784 ASSERT(zio->io_error == 0); 2785 2786 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2787 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2788 } 2789 2790 /* 2791 * ========================================================================== 2792 * Generate and verify checksums 2793 * ========================================================================== 2794 */ 2795 static int 2796 zio_checksum_generate(zio_t *zio) 2797 { 2798 blkptr_t *bp = zio->io_bp; 2799 enum zio_checksum checksum; 2800 2801 if (bp == NULL) { 2802 /* 2803 * This is zio_write_phys(). 2804 * We're either generating a label checksum, or none at all. 2805 */ 2806 checksum = zio->io_prop.zp_checksum; 2807 2808 if (checksum == ZIO_CHECKSUM_OFF) 2809 return (ZIO_PIPELINE_CONTINUE); 2810 2811 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2812 } else { 2813 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2814 ASSERT(!IO_IS_ALLOCATING(zio)); 2815 checksum = ZIO_CHECKSUM_GANG_HEADER; 2816 } else { 2817 checksum = BP_GET_CHECKSUM(bp); 2818 } 2819 } 2820 2821 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2822 2823 return (ZIO_PIPELINE_CONTINUE); 2824 } 2825 2826 static int 2827 zio_checksum_verify(zio_t *zio) 2828 { 2829 zio_bad_cksum_t info; 2830 blkptr_t *bp = zio->io_bp; 2831 int error; 2832 2833 ASSERT(zio->io_vd != NULL); 2834 2835 if (bp == NULL) { 2836 /* 2837 * This is zio_read_phys(). 2838 * We're either verifying a label checksum, or nothing at all. 2839 */ 2840 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2841 return (ZIO_PIPELINE_CONTINUE); 2842 2843 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2844 } 2845 2846 if ((error = zio_checksum_error(zio, &info)) != 0) { 2847 zio->io_error = error; 2848 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2849 zfs_ereport_start_checksum(zio->io_spa, 2850 zio->io_vd, zio, zio->io_offset, 2851 zio->io_size, NULL, &info); 2852 } 2853 } 2854 2855 return (ZIO_PIPELINE_CONTINUE); 2856 } 2857 2858 /* 2859 * Called by RAID-Z to ensure we don't compute the checksum twice. 2860 */ 2861 void 2862 zio_checksum_verified(zio_t *zio) 2863 { 2864 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2865 } 2866 2867 /* 2868 * ========================================================================== 2869 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2870 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2871 * which may be transient (e.g. unplugged) or permanent.
ECKSUM and EIO 2872 * indicate errors that are specific to one I/O, and most likely permanent. 2873 * Any other error is presumed to be worse because we weren't expecting it. 2874 * ========================================================================== 2875 */ 2876 int 2877 zio_worst_error(int e1, int e2) 2878 { 2879 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2880 int r1, r2; 2881 2882 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2883 if (e1 == zio_error_rank[r1]) 2884 break; 2885 2886 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2887 if (e2 == zio_error_rank[r2]) 2888 break; 2889 2890 return (r1 > r2 ? e1 : e2); 2891 } 2892 2893 /* 2894 * ========================================================================== 2895 * I/O completion 2896 * ========================================================================== 2897 */ 2898 static int 2899 zio_ready(zio_t *zio) 2900 { 2901 blkptr_t *bp = zio->io_bp; 2902 zio_t *pio, *pio_next; 2903 2904 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2905 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2906 return (ZIO_PIPELINE_STOP); 2907 2908 if (zio->io_ready) { 2909 ASSERT(IO_IS_ALLOCATING(zio)); 2910 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2911 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2912 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2913 2914 zio->io_ready(zio); 2915 } 2916 2917 if (bp != NULL && bp != &zio->io_bp_copy) 2918 zio->io_bp_copy = *bp; 2919 2920 if (zio->io_error) 2921 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2922 2923 mutex_enter(&zio->io_lock); 2924 zio->io_state[ZIO_WAIT_READY] = 1; 2925 pio = zio_walk_parents(zio); 2926 mutex_exit(&zio->io_lock); 2927 2928 /* 2929 * As we notify zio's parents, new parents could be added. 2930 * New parents go to the head of zio's io_parent_list, however, 2931 * so we will (correctly) not notify them. The remainder of zio's 2932 * io_parent_list, from 'pio_next' onward, cannot change because 2933 * all parents must wait for us to be done before they can be done. 2934 */ 2935 for (; pio != NULL; pio = pio_next) { 2936 pio_next = zio_walk_parents(zio); 2937 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2938 } 2939 2940 if (zio->io_flags & ZIO_FLAG_NODATA) { 2941 if (BP_IS_GANG(bp)) { 2942 zio->io_flags &= ~ZIO_FLAG_NODATA; 2943 } else { 2944 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2945 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2946 } 2947 } 2948 2949 if (zio_injection_enabled && 2950 zio->io_spa->spa_syncing_txg == zio->io_txg) 2951 zio_handle_ignored_writes(zio); 2952 2953 return (ZIO_PIPELINE_CONTINUE); 2954 } 2955 2956 static int 2957 zio_done(zio_t *zio) 2958 { 2959 spa_t *spa = zio->io_spa; 2960 zio_t *lio = zio->io_logical; 2961 blkptr_t *bp = zio->io_bp; 2962 vdev_t *vd = zio->io_vd; 2963 uint64_t psize = zio->io_size; 2964 zio_t *pio, *pio_next; 2965 2966 /* 2967 * If our children haven't all completed, 2968 * wait for them and then repeat this pipeline stage. 
2969 */ 2970 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2971 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2972 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2973 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2974 return (ZIO_PIPELINE_STOP); 2975 2976 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2977 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2978 ASSERT(zio->io_children[c][w] == 0); 2979 2980 if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 2981 ASSERT(bp->blk_pad[0] == 0); 2982 ASSERT(bp->blk_pad[1] == 0); 2983 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2984 (bp == zio_unique_parent(zio)->io_bp)); 2985 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2986 zio->io_bp_override == NULL && 2987 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2988 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2989 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2990 ASSERT(BP_COUNT_GANG(bp) == 0 || 2991 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2992 } 2993 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2994 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2995 } 2996 2997 /* 2998 * If there were child vdev/gang/ddt errors, they apply to us now. 2999 */ 3000 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 3001 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 3002 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 3003 3004 /* 3005 * If the I/O on the transformed data was successful, generate any 3006 * checksum reports now while we still have the transformed data. 3007 */ 3008 if (zio->io_error == 0) { 3009 while (zio->io_cksum_report != NULL) { 3010 zio_cksum_report_t *zcr = zio->io_cksum_report; 3011 uint64_t align = zcr->zcr_align; 3012 uint64_t asize = P2ROUNDUP(psize, align); 3013 char *abuf = zio->io_data; 3014 3015 if (asize != psize) { 3016 abuf = zio_buf_alloc(asize); 3017 bcopy(zio->io_data, abuf, psize); 3018 bzero(abuf + psize, asize - psize); 3019 } 3020 3021 zio->io_cksum_report = zcr->zcr_next; 3022 zcr->zcr_next = NULL; 3023 zcr->zcr_finish(zcr, abuf); 3024 zfs_ereport_free_checksum(zcr); 3025 3026 if (asize != psize) 3027 zio_buf_free(abuf, asize); 3028 } 3029 } 3030 3031 zio_pop_transforms(zio); /* note: may set zio->io_error */ 3032 3033 vdev_stat_update(zio, psize); 3034 3035 if (zio->io_error) { 3036 /* 3037 * If this I/O is attached to a particular vdev, 3038 * generate an error message describing the I/O failure 3039 * at the block level. We ignore these errors if the 3040 * device is currently unavailable. 3041 */ 3042 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 3043 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 3044 3045 if ((zio->io_error == EIO || !(zio->io_flags & 3046 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 3047 zio == lio) { 3048 /* 3049 * For logical I/O requests, tell the SPA to log the 3050 * error and generate a logical data ereport. 3051 */ 3052 spa_log_error(spa, zio); 3053 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 3054 0, 0); 3055 } 3056 } 3057 3058 if (zio->io_error && zio == lio) { 3059 /* 3060 * Determine whether zio should be reexecuted. This will 3061 * propagate all the way to the root via zio_notify_parent(). 
3062 */ 3063 ASSERT(vd == NULL && bp != NULL); 3064 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3065 3066 if (IO_IS_ALLOCATING(zio) && 3067 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 3068 if (zio->io_error != ENOSPC) 3069 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 3070 else 3071 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3072 } 3073 3074 if ((zio->io_type == ZIO_TYPE_READ || 3075 zio->io_type == ZIO_TYPE_FREE) && 3076 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 3077 zio->io_error == ENXIO && 3078 spa_load_state(spa) == SPA_LOAD_NONE && 3079 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 3080 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3081 3082 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 3083 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 3084 3085 /* 3086 * Here is a possibly good place to attempt to do 3087 * either combinatorial reconstruction or error correction 3088 * based on checksums. It also might be a good place 3089 * to send out preliminary ereports before we suspend 3090 * processing. 3091 */ 3092 } 3093 3094 /* 3095 * If there were logical child errors, they apply to us now. 3096 * We defer this until now to avoid conflating logical child 3097 * errors with errors that happened to the zio itself when 3098 * updating vdev stats and reporting FMA events above. 3099 */ 3100 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 3101 3102 if ((zio->io_error || zio->io_reexecute) && 3103 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 3104 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 3105 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 3106 3107 zio_gang_tree_free(&zio->io_gang_tree); 3108 3109 /* 3110 * Godfather I/Os should never suspend. 3111 */ 3112 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 3113 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3114 zio->io_reexecute = 0; 3115 3116 if (zio->io_reexecute) { 3117 /* 3118 * This is a logical I/O that wants to reexecute. 3119 * 3120 * Reexecute is top-down. When an i/o fails, if it's not 3121 * the root, it simply notifies its parent and sticks around. 3122 * The parent, seeing that it still has children in zio_done(), 3123 * does the same. This percolates all the way up to the root. 3124 * The root i/o will reexecute or suspend the entire tree. 3125 * 3126 * This approach ensures that zio_reexecute() honors 3127 * all the original i/o dependency relationships, e.g. 3128 * parents not executing until children are ready. 3129 */ 3130 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3131 3132 zio->io_gang_leader = NULL; 3133 3134 mutex_enter(&zio->io_lock); 3135 zio->io_state[ZIO_WAIT_DONE] = 1; 3136 mutex_exit(&zio->io_lock); 3137 3138 /* 3139 * "The Godfather" I/O monitors its children but is 3140 * not a true parent to them. It will track them through 3141 * the pipeline but severs its ties whenever they get into 3142 * trouble (e.g. suspended). This allows "The Godfather" 3143 * I/O to return status without blocking. 3144 */ 3145 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3146 zio_link_t *zl = zio->io_walk_link; 3147 pio_next = zio_walk_parents(zio); 3148 3149 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3150 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3151 zio_remove_child(pio, zio, zl); 3152 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3153 } 3154 } 3155 3156 if ((pio = zio_unique_parent(zio)) != NULL) { 3157 /* 3158 * We're not a root i/o, so there's nothing to do 3159 * but notify our parent. 
Don't propagate errors 3160 * upward since we haven't permanently failed yet. 3161 */ 3162 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3163 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3164 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3165 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3166 /* 3167 * We'd fail again if we reexecuted now, so suspend 3168 * until conditions improve (e.g. device comes online). 3169 */ 3170 zio_suspend(spa, zio); 3171 } else { 3172 /* 3173 * Reexecution is potentially a huge amount of work. 3174 * Hand it off to the otherwise-unused claim taskq. 3175 */ 3176 ASSERT(zio->io_tqent.tqent_next == NULL); 3177 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3178 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3179 0, &zio->io_tqent); 3180 } 3181 return (ZIO_PIPELINE_STOP); 3182 } 3183 3184 ASSERT(zio->io_child_count == 0); 3185 ASSERT(zio->io_reexecute == 0); 3186 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3187 3188 /* 3189 * Report any checksum errors, since the I/O is complete. 3190 */ 3191 while (zio->io_cksum_report != NULL) { 3192 zio_cksum_report_t *zcr = zio->io_cksum_report; 3193 zio->io_cksum_report = zcr->zcr_next; 3194 zcr->zcr_next = NULL; 3195 zcr->zcr_finish(zcr, NULL); 3196 zfs_ereport_free_checksum(zcr); 3197 } 3198 3199 /* 3200 * It is the responsibility of the done callback to ensure that this 3201 * particular zio is no longer discoverable for adoption, and as 3202 * such, cannot acquire any new parents. 3203 */ 3204 if (zio->io_done) 3205 zio->io_done(zio); 3206 3207 mutex_enter(&zio->io_lock); 3208 zio->io_state[ZIO_WAIT_DONE] = 1; 3209 mutex_exit(&zio->io_lock); 3210 3211 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3212 zio_link_t *zl = zio->io_walk_link; 3213 pio_next = zio_walk_parents(zio); 3214 zio_remove_child(pio, zio, zl); 3215 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3216 } 3217 3218 if (zio->io_waiter != NULL) { 3219 mutex_enter(&zio->io_lock); 3220 zio->io_executor = NULL; 3221 cv_broadcast(&zio->io_cv); 3222 mutex_exit(&zio->io_lock); 3223 } else { 3224 zio_destroy(zio); 3225 } 3226 3227 return (ZIO_PIPELINE_STOP); 3228 } 3229 3230 /* 3231 * ========================================================================== 3232 * I/O pipeline definition 3233 * ========================================================================== 3234 */ 3235 static zio_pipe_stage_t *zio_pipeline[] = { 3236 NULL, 3237 zio_read_bp_init, 3238 zio_free_bp_init, 3239 zio_issue_async, 3240 zio_write_bp_init, 3241 zio_checksum_generate, 3242 zio_nop_write, 3243 zio_ddt_read_start, 3244 zio_ddt_read_done, 3245 zio_ddt_write, 3246 zio_ddt_free, 3247 zio_gang_assemble, 3248 zio_gang_issue, 3249 zio_dva_allocate, 3250 zio_dva_free, 3251 zio_dva_claim, 3252 zio_ready, 3253 zio_vdev_io_start, 3254 zio_vdev_io_done, 3255 zio_vdev_io_assess, 3256 zio_checksum_verify, 3257 zio_done 3258 }; 3259 3260 /* dnp is the dnode for zb1->zb_object */ 3261 boolean_t 3262 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, 3263 const zbookmark_phys_t *zb2) 3264 { 3265 uint64_t zb1nextL0, zb2thisobj; 3266 3267 ASSERT(zb1->zb_objset == zb2->zb_objset); 3268 ASSERT(zb2->zb_level == 0); 3269 3270 /* The objset_phys_t isn't before anything. */ 3271 if (dnp == NULL) 3272 return (B_FALSE); 3273 3274 zb1nextL0 = (zb1->zb_blkid + 1) << 3275 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3276 3277 zb2thisobj = zb2->zb_object ? 
zb2->zb_object : 3278 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3279 3280 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3281 uint64_t nextobj = zb1nextL0 * 3282 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3283 return (nextobj <= zb2thisobj); 3284 } 3285 3286 if (zb1->zb_object < zb2thisobj) 3287 return (B_TRUE); 3288 if (zb1->zb_object > zb2thisobj) 3289 return (B_FALSE); 3290 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3291 return (B_FALSE); 3292 return (zb1nextL0 <= zb2->zb_blkid); 3293 }
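/*
 * Illustrative sketch of the arithmetic in zbookmark_is_before(), using
 * hypothetical stand-in names: the comparison works entirely in units of
 * level-0 block ids.  A bookmark at (level, blkid) covers the half-open L0
 * range [blkid << (level * epbs), (blkid + 1) << (level * epbs)), where
 * epbs = dn_indblkshift - SPA_BLKPTRSHIFT is log2 of the number of block
 * pointers per indirect block.  The helper below evaluates the zb1nextL0
 * bound for an assumed dnode with 16K indirect blocks (dn_indblkshift == 14,
 * a typical but not universal value) and 128-byte block pointers
 * (SPA_BLKPTRSHIFT == 7); the ex_* names are not ZFS identifiers.
 */
#include <stdio.h>
#include <stdint.h>

#define	EX_SPA_BLKPTRSHIFT	7	/* log2(sizeof (blkptr_t)) */
#define	EX_DN_INDBLKSHIFT	14	/* assumed 16K indirect blocks */

/* First L0 blkid that is *not* covered by a bookmark at (level, blkid). */
static uint64_t
ex_next_l0(uint64_t blkid, int level)
{
	return ((blkid + 1) <<
	    (level * (EX_DN_INDBLKSHIFT - EX_SPA_BLKPTRSHIFT)));
}

int
main(void)
{
	/*
	 * A level-2 bookmark at blkid 3 covers L0 blkids
	 * [3 * 128 * 128, 4 * 128 * 128) = [49152, 65536), so within one
	 * object it is "before" exactly those level-0 bookmarks whose
	 * zb_blkid is >= 65536 -- the zb1nextL0 <= zb2->zb_blkid test above.
	 */
	(void) printf("next L0 after (level 2, blkid 3): %llu\n",
	    (unsigned long long)ex_next_l0(3, 2));
	return (0);
}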