/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
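/*
 * To make the defaults concrete (a sketch): in sync pass 1, frees are
 * issued immediately and writes are compressed and allocated normally;
 * from pass 2 onward (zfs_sync_pass_deferred_free, zfs_sync_pass_rewrite)
 * frees are batched onto an in-memory list and same-sized blocks born in
 * this txg are rewritten in place rather than reallocated; and from pass
 * 5 onward (zfs_sync_pass_dont_compress) new data is written uncompressed
 * so that block sizes stop changing and spa_sync() can converge.
 */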
/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	/*
	 * The zio write taskqs have 1 thread per cpu; allow 1/2 of the taskqs
	 * to fail 3 times per txg (i.e. 3 * max_ncpus / 2) or 8 failures,
	 * whichever is greater.
	 */
	zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);

	zio_inject_init();
}
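/*
 * To illustrate the cache sizing rules above (a sketch, assuming 4K
 * pages and a 512-byte SPA_MINBLOCKSIZE): a 10K buffer is not
 * page-aligned but is a multiple of its quarter-power-of-2 (2K), so it
 * gets its own cache aligned at 2K; a 9K buffer satisfies none of the
 * rules, so no cache is created for it and the trailing loop in
 * zio_init() maps 9K allocations onto the next cache up (10K).
 */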
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
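/*
 * An example of the transform stack in action (a sketch): for a
 * compressed read, zio_read_bp_init() below pushes a psize-sized buffer
 * with zio_decompress as the transform; the device I/O then fills that
 * buffer, and zio_pop_transforms() decompresses it back into the
 * caller's original lsize buffer at done time.
 */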
/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}
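/*
 * A minimal usage sketch for the walkers: always run the walk to
 * completion so io_walk_link is left NULL for the next caller:
 *
 *	zio_t *pio;
 *	while ((pio = zio_walk_parents(cio)) != NULL)
 *		visit(pio);		(visit() is a hypothetical callback)
 */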
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

/*
 * If the zio still has outstanding children of the given type in the
 * given wait state, rewind io_stage by one stage and record the counter
 * we are stalled on; zio_notify_parent() re-executes the zio when that
 * count drops to zero.
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

/*
 * Fold this child's error and reexecute state into the parent, drop the
 * parent's outstanding-child count, and restart the parent's pipeline
 * if it was stalled on exactly this count.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}
/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}
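/*
 * A minimal consumer sketch: issue several reads under one root zio and
 * wait for all of them at once (error handling elided; 'bps', 'bufs',
 * 'sizes', and 'zb' are hypothetical):
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for (int i = 0; i < n; i++)
 *		zio_nowait(zio_read(rio, spa, &bps[i], bufs[i], sizes[i],
 *		    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL,
 *		    &zb));
 *	error = zio_wait(rio);
 */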
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync(), keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}
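/*
 * To make the decision above concrete (a sketch, using the default
 * tunables): a plain non-gang, non-dedup block freed during sync pass 1
 * of its own txg takes the synchronous zio_free_sync() path; a gang or
 * dedup block, a free issued for some other txg, or any free from pass
 * 2 onward is appended to spa_free_bplist[] and processed later.
 */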
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
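/*
 * zio_read_phys() above and zio_write_phys() below operate on raw
 * device offsets with an explicit checksum, bypassing block pointers
 * entirely; the label reading and writing code in vdev_label.c is the
 * canonical consumer (with labels == B_TRUE to enforce the
 * label-bounds ASSERTs).
 */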
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
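/*
 * Since zio_ioctl() fans out across interior vdevs on its own, flushing
 * every disk in the pool is just (a sketch):
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_flush(rio, spa->spa_root_vdev);
 *	(void) zio_wait(rio);
 *
 * The ZIL does something similar after writing log blocks, flushing
 * just the vdevs it touched.
 */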
void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
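/*
 * zio_write_bp_init() below embodies the sync-to-convergence tunables.
 * A concrete sketch with the defaults: a block dirtied again in sync
 * pass 3 that compresses to the same psize is rewritten in place
 * (pass >= zfs_sync_pass_rewrite) rather than reallocated; by pass 5,
 * compression is skipped entirely so the block size cannot change.
 */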
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
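/*
 * Pipeline mechanics, briefly: each stage is a distinct bit in an
 * enum zio_stage bitmask, ordered by pipeline position, and io_pipeline
 * is the set of stages this zio will execute.  zio_execute() advances
 * by shifting io_stage left until it hits the next bit present in
 * io_pipeline, so re-dispatching a zio whose io_stage was shifted right
 * by one (as zio_wait_for_children() does) simply repeats the
 * interrupted stage.
 */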
/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
	ASSERT(zio->io_tqent.tqent_next == NULL);
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}
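/*
 * Rule of thumb for the two initiators above (a sketch): zio_wait()
 * blocks the caller, returns the I/O's error, and destroys the zio;
 * zio_nowait() returns immediately, with completion observed either
 * through a parent zio or through the done callback.  A nowait'ed zio
 * with no parent at all is adopted by the pool's "Godfather" zio so
 * that pool unload can still wait for it.
 */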
/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}
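/*
 * Putting suspend/resume together (a sketch, assuming failmode=wait):
 * when an uncorrectable error suspends the pool, failed logical zios
 * are parked as children of spa_suspend_zio_root; once the underlying
 * problem is cleared (e.g. via zpool clear), zio_resume() rewinds each
 * of them to ZIO_STAGE_OPEN with zio_reexecute() and waits for the
 * whole tree to complete.
 */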
/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
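/*
 * zio_write_gang_block() below splits the remaining size evenly across
 * the gang members that are left, rounding up to SPA_MINBLOCKSIZE.
 * A worked example (a sketch): for a 100K (102400-byte) gang write,
 * the three members get P2ROUNDUP(102400 / 3, 512) = 34304 bytes,
 * then P2ROUNDUP(68096 / 2, 512) = 34304 bytes, then the remaining
 * 33792 bytes; a member that is still too large to allocate simply
 * gangs again, one level deeper.
 */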
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * The zio_nop_write stage in the pipeline determines if allocating
 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
 * such as SHA256, we can compare the checksums of the new data and the old
 * to determine if allocating a new block is required.  The nopwrite
 * feature can handle writes in either syncing or open context (i.e. zil
 * writes) and as a result is mutually exclusive with dedup.
 */
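/*
 * For example (a sketch): overwriting a block with byte-identical data
 * under checksum=sha256, with compression and copies unchanged, leaves
 * blk_cksum and psize identical to the original bp, so the stage below
 * sets ZIO_FLAG_NOPWRITE and restores the original bp -- no new
 * allocation and no device write occur.
 */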
1867 */ 1868 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1869 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1870 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1871 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1872 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1873 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1874 sizeof (uint64_t)) == 0); 1875 1876 *bp = *bp_orig; 1877 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1878 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1879 } 1880 1881 return (ZIO_PIPELINE_CONTINUE); 1882 } 1883 1884 /* 1885 * ========================================================================== 1886 * Dedup 1887 * ========================================================================== 1888 */ 1889 static void 1890 zio_ddt_child_read_done(zio_t *zio) 1891 { 1892 blkptr_t *bp = zio->io_bp; 1893 ddt_entry_t *dde = zio->io_private; 1894 ddt_phys_t *ddp; 1895 zio_t *pio = zio_unique_parent(zio); 1896 1897 mutex_enter(&pio->io_lock); 1898 ddp = ddt_phys_select(dde, bp); 1899 if (zio->io_error == 0) 1900 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1901 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1902 dde->dde_repair_data = zio->io_data; 1903 else 1904 zio_buf_free(zio->io_data, zio->io_size); 1905 mutex_exit(&pio->io_lock); 1906 } 1907 1908 static int 1909 zio_ddt_read_start(zio_t *zio) 1910 { 1911 blkptr_t *bp = zio->io_bp; 1912 1913 ASSERT(BP_GET_DEDUP(bp)); 1914 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1915 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1916 1917 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1918 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1919 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1920 ddt_phys_t *ddp = dde->dde_phys; 1921 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1922 blkptr_t blk; 1923 1924 ASSERT(zio->io_vsd == NULL); 1925 zio->io_vsd = dde; 1926 1927 if (ddp_self == NULL) 1928 return (ZIO_PIPELINE_CONTINUE); 1929 1930 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1931 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1932 continue; 1933 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1934 &blk); 1935 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1936 zio_buf_alloc(zio->io_size), zio->io_size, 1937 zio_ddt_child_read_done, dde, zio->io_priority, 1938 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1939 &zio->io_bookmark)); 1940 } 1941 return (ZIO_PIPELINE_CONTINUE); 1942 } 1943 1944 zio_nowait(zio_read(zio, zio->io_spa, bp, 1945 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1946 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1947 1948 return (ZIO_PIPELINE_CONTINUE); 1949 } 1950 1951 static int 1952 zio_ddt_read_done(zio_t *zio) 1953 { 1954 blkptr_t *bp = zio->io_bp; 1955 1956 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1957 return (ZIO_PIPELINE_STOP); 1958 1959 ASSERT(BP_GET_DEDUP(bp)); 1960 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1961 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1962 1963 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1964 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1965 ddt_entry_t *dde = zio->io_vsd; 1966 if (ddt == NULL) { 1967 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 1968 return (ZIO_PIPELINE_CONTINUE); 1969 } 1970 if (dde == NULL) { 1971 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1972 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1973 return (ZIO_PIPELINE_STOP); 1974 } 1975 if (dde->dde_repair_data != NULL) { 1976 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1977 
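			/*
			 * The bcopy() above satisfied this read from an
			 * alternate copy found via the DDT; clearing the
			 * DDT child error below lets the zio complete
			 * successfully despite the original read failure.
			 */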
zio->io_child_error[ZIO_CHILD_DDT] = 0; 1978 } 1979 ddt_repair_done(ddt, dde); 1980 zio->io_vsd = NULL; 1981 } 1982 1983 ASSERT(zio->io_vsd == NULL); 1984 1985 return (ZIO_PIPELINE_CONTINUE); 1986 } 1987 1988 static boolean_t 1989 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 1990 { 1991 spa_t *spa = zio->io_spa; 1992 1993 /* 1994 * Note: we compare the original data, not the transformed data, 1995 * because when zio->io_bp is an override bp, we will not have 1996 * pushed the I/O transforms. That's an important optimization 1997 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 1998 */ 1999 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2000 zio_t *lio = dde->dde_lead_zio[p]; 2001 2002 if (lio != NULL) { 2003 return (lio->io_orig_size != zio->io_orig_size || 2004 bcmp(zio->io_orig_data, lio->io_orig_data, 2005 zio->io_orig_size) != 0); 2006 } 2007 } 2008 2009 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2010 ddt_phys_t *ddp = &dde->dde_phys[p]; 2011 2012 if (ddp->ddp_phys_birth != 0) { 2013 arc_buf_t *abuf = NULL; 2014 uint32_t aflags = ARC_WAIT; 2015 blkptr_t blk = *zio->io_bp; 2016 int error; 2017 2018 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2019 2020 ddt_exit(ddt); 2021 2022 error = arc_read(NULL, spa, &blk, 2023 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2024 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2025 &aflags, &zio->io_bookmark); 2026 2027 if (error == 0) { 2028 if (arc_buf_size(abuf) != zio->io_orig_size || 2029 bcmp(abuf->b_data, zio->io_orig_data, 2030 zio->io_orig_size) != 0) 2031 error = SET_ERROR(EEXIST); 2032 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2033 } 2034 2035 ddt_enter(ddt); 2036 return (error != 0); 2037 } 2038 } 2039 2040 return (B_FALSE); 2041 } 2042 2043 static void 2044 zio_ddt_child_write_ready(zio_t *zio) 2045 { 2046 int p = zio->io_prop.zp_copies; 2047 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2048 ddt_entry_t *dde = zio->io_private; 2049 ddt_phys_t *ddp = &dde->dde_phys[p]; 2050 zio_t *pio; 2051 2052 if (zio->io_error) 2053 return; 2054 2055 ddt_enter(ddt); 2056 2057 ASSERT(dde->dde_lead_zio[p] == zio); 2058 2059 ddt_phys_fill(ddp, zio->io_bp); 2060 2061 while ((pio = zio_walk_parents(zio)) != NULL) 2062 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2063 2064 ddt_exit(ddt); 2065 } 2066 2067 static void 2068 zio_ddt_child_write_done(zio_t *zio) 2069 { 2070 int p = zio->io_prop.zp_copies; 2071 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2072 ddt_entry_t *dde = zio->io_private; 2073 ddt_phys_t *ddp = &dde->dde_phys[p]; 2074 2075 ddt_enter(ddt); 2076 2077 ASSERT(ddp->ddp_refcnt == 0); 2078 ASSERT(dde->dde_lead_zio[p] == zio); 2079 dde->dde_lead_zio[p] = NULL; 2080 2081 if (zio->io_error == 0) { 2082 while (zio_walk_parents(zio) != NULL) 2083 ddt_phys_addref(ddp); 2084 } else { 2085 ddt_phys_clear(ddp); 2086 } 2087 2088 ddt_exit(ddt); 2089 } 2090 2091 static void 2092 zio_ddt_ditto_write_done(zio_t *zio) 2093 { 2094 int p = DDT_PHYS_DITTO; 2095 zio_prop_t *zp = &zio->io_prop; 2096 blkptr_t *bp = zio->io_bp; 2097 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2098 ddt_entry_t *dde = zio->io_private; 2099 ddt_phys_t *ddp = &dde->dde_phys[p]; 2100 ddt_key_t *ddk = &dde->dde_key; 2101 2102 ddt_enter(ddt); 2103 2104 ASSERT(ddp->ddp_refcnt == 0); 2105 ASSERT(dde->dde_lead_zio[p] == zio); 2106 dde->dde_lead_zio[p] = NULL; 2107 2108 if (zio->io_error == 0) { 2109 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2110 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2111 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2112 if (ddp->ddp_phys_birth != 0) 2113 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2114 ddt_phys_fill(ddp, bp); 2115 } 2116 2117 ddt_exit(ddt); 2118 } 2119 2120 static int 2121 zio_ddt_write(zio_t *zio) 2122 { 2123 spa_t *spa = zio->io_spa; 2124 blkptr_t *bp = zio->io_bp; 2125 uint64_t txg = zio->io_txg; 2126 zio_prop_t *zp = &zio->io_prop; 2127 int p = zp->zp_copies; 2128 int ditto_copies; 2129 zio_t *cio = NULL; 2130 zio_t *dio = NULL; 2131 ddt_t *ddt = ddt_select(spa, bp); 2132 ddt_entry_t *dde; 2133 ddt_phys_t *ddp; 2134 2135 ASSERT(BP_GET_DEDUP(bp)); 2136 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2137 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2138 2139 ddt_enter(ddt); 2140 dde = ddt_lookup(ddt, bp, B_TRUE); 2141 ddp = &dde->dde_phys[p]; 2142 2143 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2144 /* 2145 * If we're using a weak checksum, upgrade to a strong checksum 2146 * and try again. If we're already using a strong checksum, 2147 * we can't resolve it, so just convert to an ordinary write. 2148 * (And automatically e-mail a paper to Nature?) 2149 */ 2150 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2151 zp->zp_checksum = spa_dedup_checksum(spa); 2152 zio_pop_transforms(zio); 2153 zio->io_stage = ZIO_STAGE_OPEN; 2154 BP_ZERO(bp); 2155 } else { 2156 zp->zp_dedup = B_FALSE; 2157 } 2158 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2159 ddt_exit(ddt); 2160 return (ZIO_PIPELINE_CONTINUE); 2161 } 2162 2163 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2164 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2165 2166 if (ditto_copies > ddt_ditto_copies_present(dde) && 2167 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2168 zio_prop_t czp = *zp; 2169 2170 czp.zp_copies = ditto_copies; 2171 2172 /* 2173 * If we arrived here with an override bp, we won't have run 2174 * the transform stack, so we won't have the data we need to 2175 * generate a child i/o. So, toss the override bp and restart. 2176 * This is safe, because using the override bp is just an 2177 * optimization; and it's rare, so the cost doesn't matter. 
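		 *
		 * (For a sense of scale -- the exact rule lives in
		 * ddt_ditto_copies_needed() -- with dedupditto set to its
		 * minimum of 100 (the feature is off by default), a deduped
		 * block wants a second DVA once its total refcount reaches
		 * roughly 100, and a third at roughly 100^2.)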
2178 */ 2179 if (zio->io_bp_override) { 2180 zio_pop_transforms(zio); 2181 zio->io_stage = ZIO_STAGE_OPEN; 2182 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2183 zio->io_bp_override = NULL; 2184 BP_ZERO(bp); 2185 ddt_exit(ddt); 2186 return (ZIO_PIPELINE_CONTINUE); 2187 } 2188 2189 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2190 zio->io_orig_size, &czp, NULL, NULL, 2191 zio_ddt_ditto_write_done, dde, zio->io_priority, 2192 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2193 2194 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2195 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2196 } 2197 2198 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2199 if (ddp->ddp_phys_birth != 0) 2200 ddt_bp_fill(ddp, bp, txg); 2201 if (dde->dde_lead_zio[p] != NULL) 2202 zio_add_child(zio, dde->dde_lead_zio[p]); 2203 else 2204 ddt_phys_addref(ddp); 2205 } else if (zio->io_bp_override) { 2206 ASSERT(bp->blk_birth == txg); 2207 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2208 ddt_phys_fill(ddp, bp); 2209 ddt_phys_addref(ddp); 2210 } else { 2211 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2212 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2213 zio_ddt_child_write_done, dde, zio->io_priority, 2214 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2215 2216 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2217 dde->dde_lead_zio[p] = cio; 2218 } 2219 2220 ddt_exit(ddt); 2221 2222 if (cio) 2223 zio_nowait(cio); 2224 if (dio) 2225 zio_nowait(dio); 2226 2227 return (ZIO_PIPELINE_CONTINUE); 2228 } 2229 2230 ddt_entry_t *freedde; /* for debugging */ 2231 2232 static int 2233 zio_ddt_free(zio_t *zio) 2234 { 2235 spa_t *spa = zio->io_spa; 2236 blkptr_t *bp = zio->io_bp; 2237 ddt_t *ddt = ddt_select(spa, bp); 2238 ddt_entry_t *dde; 2239 ddt_phys_t *ddp; 2240 2241 ASSERT(BP_GET_DEDUP(bp)); 2242 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2243 2244 ddt_enter(ddt); 2245 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2246 ddp = ddt_phys_select(dde, bp); 2247 ddt_phys_decref(ddp); 2248 ddt_exit(ddt); 2249 2250 return (ZIO_PIPELINE_CONTINUE); 2251 } 2252 2253 /* 2254 * ========================================================================== 2255 * Allocate and free blocks 2256 * ========================================================================== 2257 */ 2258 static int 2259 zio_dva_allocate(zio_t *zio) 2260 { 2261 spa_t *spa = zio->io_spa; 2262 metaslab_class_t *mc = spa_normal_class(spa); 2263 blkptr_t *bp = zio->io_bp; 2264 int error; 2265 int flags = 0; 2266 2267 if (zio->io_gang_leader == NULL) { 2268 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2269 zio->io_gang_leader = zio; 2270 } 2271 2272 ASSERT(BP_IS_HOLE(bp)); 2273 ASSERT0(BP_GET_NDVAS(bp)); 2274 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2275 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2276 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2277 2278 /* 2279 * The dump device does not support gang blocks so allocation on 2280 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2281 * the "fast" gang feature. 2282 */ 2283 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2284 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2285 METASLAB_GANG_CHILD : 0; 2286 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2287 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2288 2289 if (error) { 2290 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2291 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2292 error); 2293 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2294 return (zio_write_gang_block(zio)); 2295 zio->io_error = error; 2296 } 2297 2298 return (ZIO_PIPELINE_CONTINUE); 2299 } 2300 2301 static int 2302 zio_dva_free(zio_t *zio) 2303 { 2304 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2305 2306 return (ZIO_PIPELINE_CONTINUE); 2307 } 2308 2309 static int 2310 zio_dva_claim(zio_t *zio) 2311 { 2312 int error; 2313 2314 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2315 if (error) 2316 zio->io_error = error; 2317 2318 return (ZIO_PIPELINE_CONTINUE); 2319 } 2320 2321 /* 2322 * Undo an allocation. This is used by zio_done() when an I/O fails 2323 * and we want to give back the block we just allocated. 2324 * This handles both normal blocks and gang blocks. 2325 */ 2326 static void 2327 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2328 { 2329 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2330 ASSERT(zio->io_bp_override == NULL); 2331 2332 if (!BP_IS_HOLE(bp)) 2333 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2334 2335 if (gn != NULL) { 2336 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2337 zio_dva_unallocate(zio, gn->gn_child[g], 2338 &gn->gn_gbh->zg_blkptr[g]); 2339 } 2340 } 2341 } 2342 2343 /* 2344 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2345 */ 2346 int 2347 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2348 uint64_t size, boolean_t use_slog) 2349 { 2350 int error = 1; 2351 2352 ASSERT(txg > spa_syncing_txg(spa)); 2353 2354 /* 2355 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2356 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2357 * when allocating them. 2358 */ 2359 if (use_slog) { 2360 error = metaslab_alloc(spa, spa_log_class(spa), size, 2361 new_bp, 1, txg, old_bp, 2362 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2363 } 2364 2365 if (error) { 2366 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2367 new_bp, 1, txg, old_bp, 2368 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2369 } 2370 2371 if (error == 0) { 2372 BP_SET_LSIZE(new_bp, size); 2373 BP_SET_PSIZE(new_bp, size); 2374 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2375 BP_SET_CHECKSUM(new_bp, 2376 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2377 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2378 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2379 BP_SET_LEVEL(new_bp, 0); 2380 BP_SET_DEDUP(new_bp, 0); 2381 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2382 } 2383 2384 return (error); 2385 } 2386 2387 /* 2388 * Free an intent log block. 
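 * (The converse of zio_alloc_zil() above: allocation tries the log class
 * first and falls back to the normal class, but the free path needs no
 * class hint at all, since metaslab_free() locates the metaslab directly
 * from the block pointer's DVA.)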
2389 */ 2390 void 2391 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2392 { 2393 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2394 ASSERT(!BP_IS_GANG(bp)); 2395 2396 zio_free(spa, txg, bp); 2397 } 2398 2399 /* 2400 * ========================================================================== 2401 * Read and write to physical devices 2402 * ========================================================================== 2403 */ 2404 static int 2405 zio_vdev_io_start(zio_t *zio) 2406 { 2407 vdev_t *vd = zio->io_vd; 2408 uint64_t align; 2409 spa_t *spa = zio->io_spa; 2410 2411 ASSERT(zio->io_error == 0); 2412 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2413 2414 if (vd == NULL) { 2415 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2416 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2417 2418 /* 2419 * The mirror_ops handle multiple DVAs in a single BP. 2420 */ 2421 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2422 } 2423 2424 /* 2425 * We keep track of time-sensitive I/Os so that the scan thread 2426 * can quickly react to certain workloads. In particular, we care 2427 * about non-scrubbing, top-level reads and writes with the following 2428 * characteristics: 2429 * - synchronous writes of user data to non-slog devices 2430 * - any reads of user data 2431 * When these conditions are met, adjust the timestamp of spa_last_io 2432 * which allows the scan thread to adjust its workload accordingly. 2433 */ 2434 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2435 vd == vd->vdev_top && !vd->vdev_islog && 2436 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2437 zio->io_txg != spa_syncing_txg(spa)) { 2438 uint64_t old = spa->spa_last_io; 2439 uint64_t new = ddi_get_lbolt64(); 2440 if (old != new) 2441 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2442 } 2443 2444 align = 1ULL << vd->vdev_top->vdev_ashift; 2445 2446 if (P2PHASE(zio->io_size, align) != 0) { 2447 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2448 char *abuf = zio_buf_alloc(asize); 2449 ASSERT(vd == vd->vdev_top); 2450 if (zio->io_type == ZIO_TYPE_WRITE) { 2451 bcopy(zio->io_data, abuf, zio->io_size); 2452 bzero(abuf + zio->io_size, asize - zio->io_size); 2453 } 2454 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2455 } 2456 2457 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2458 ASSERT(P2PHASE(zio->io_size, align) == 0); 2459 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2460 2461 /* 2462 * If this is a repair I/O, and there's no self-healing involved -- 2463 * that is, we're just resilvering what we expect to resilver -- 2464 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2465 * This prevents spurious resilvering with nested replication. 2466 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2467 * A is out of date, we'll read from C+D, then use the data to 2468 * resilver A+B -- but we don't actually want to resilver B, just A. 2469 * The top-level mirror has no way to know this, so instead we just 2470 * discard unnecessary repairs as we work our way down the vdev tree. 2471 * The same logic applies to any form of nested replication: 2472 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
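	 *
	 * (Concretely, vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)
	 * asks whether anything born in that single txg might be missing
	 * on this vdev; if not, the repair write is provably unnecessary
	 * and is bypassed below.)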
2473 */ 2474 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2475 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2476 zio->io_txg != 0 && /* not a delegated i/o */ 2477 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2478 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2479 zio_vdev_io_bypass(zio); 2480 return (ZIO_PIPELINE_CONTINUE); 2481 } 2482 2483 if (vd->vdev_ops->vdev_op_leaf && 2484 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2485 2486 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2487 return (ZIO_PIPELINE_CONTINUE); 2488 2489 if ((zio = vdev_queue_io(zio)) == NULL) 2490 return (ZIO_PIPELINE_STOP); 2491 2492 if (!vdev_accessible(vd, zio)) { 2493 zio->io_error = SET_ERROR(ENXIO); 2494 zio_interrupt(zio); 2495 return (ZIO_PIPELINE_STOP); 2496 } 2497 } 2498 2499 return (vd->vdev_ops->vdev_op_io_start(zio)); 2500 } 2501 2502 static int 2503 zio_vdev_io_done(zio_t *zio) 2504 { 2505 vdev_t *vd = zio->io_vd; 2506 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2507 boolean_t unexpected_error = B_FALSE; 2508 2509 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2510 return (ZIO_PIPELINE_STOP); 2511 2512 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2513 2514 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2515 2516 vdev_queue_io_done(zio); 2517 2518 if (zio->io_type == ZIO_TYPE_WRITE) 2519 vdev_cache_write(zio); 2520 2521 if (zio_injection_enabled && zio->io_error == 0) 2522 zio->io_error = zio_handle_device_injection(vd, 2523 zio, EIO); 2524 2525 if (zio_injection_enabled && zio->io_error == 0) 2526 zio->io_error = zio_handle_label_injection(zio, EIO); 2527 2528 if (zio->io_error) { 2529 if (!vdev_accessible(vd, zio)) { 2530 zio->io_error = SET_ERROR(ENXIO); 2531 } else { 2532 unexpected_error = B_TRUE; 2533 } 2534 } 2535 } 2536 2537 ops->vdev_op_io_done(zio); 2538 2539 if (unexpected_error) 2540 VERIFY(vdev_probe(vd, zio) == NULL); 2541 2542 return (ZIO_PIPELINE_CONTINUE); 2543 } 2544 2545 /* 2546 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2547 * disk, and use that to finish the checksum ereport later. 2548 */ 2549 static void 2550 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2551 const void *good_buf) 2552 { 2553 /* no processing needed */ 2554 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2555 } 2556 2557 /*ARGSUSED*/ 2558 void 2559 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2560 { 2561 void *buf = zio_buf_alloc(zio->io_size); 2562 2563 bcopy(zio->io_data, buf, zio->io_size); 2564 2565 zcr->zcr_cbinfo = zio->io_size; 2566 zcr->zcr_cbdata = buf; 2567 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2568 zcr->zcr_free = zio_buf_free; 2569 } 2570 2571 static int 2572 zio_vdev_io_assess(zio_t *zio) 2573 { 2574 vdev_t *vd = zio->io_vd; 2575 2576 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2577 return (ZIO_PIPELINE_STOP); 2578 2579 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2580 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2581 2582 if (zio->io_vsd != NULL) { 2583 zio->io_vsd_ops->vsd_free(zio); 2584 zio->io_vsd = NULL; 2585 } 2586 2587 if (zio_injection_enabled && zio->io_error == 0) 2588 zio->io_error = zio_handle_fault_injection(zio, EIO); 2589 2590 /* 2591 * If the I/O failed, determine whether we should attempt to retry it. 2592 * 2593 * On retry, we cut in line in the issue queue, since we don't want 2594 * compression/checksumming/etc. 
work to prevent our (cheap) IO reissue. 2595 */ 2596 if (zio->io_error && vd == NULL && 2597 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2598 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2599 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2600 zio->io_error = 0; 2601 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2602 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2603 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2604 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2605 zio_requeue_io_start_cut_in_line); 2606 return (ZIO_PIPELINE_STOP); 2607 } 2608 2609 /* 2610 * If we got an error on a leaf device, convert it to ENXIO 2611 * if the device is not accessible at all. 2612 */ 2613 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2614 !vdev_accessible(vd, zio)) 2615 zio->io_error = SET_ERROR(ENXIO); 2616 2617 /* 2618 * If we can't write to an interior vdev (mirror or RAID-Z), 2619 * set vdev_cant_write so that we stop trying to allocate from it. 2620 */ 2621 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2622 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2623 vd->vdev_cant_write = B_TRUE; 2624 } 2625 2626 if (zio->io_error) 2627 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2628 2629 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2630 zio->io_physdone != NULL) { 2631 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2632 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2633 zio->io_physdone(zio->io_logical); 2634 } 2635 2636 return (ZIO_PIPELINE_CONTINUE); 2637 } 2638 2639 void 2640 zio_vdev_io_reissue(zio_t *zio) 2641 { 2642 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2643 ASSERT(zio->io_error == 0); 2644 2645 zio->io_stage >>= 1; 2646 } 2647 2648 void 2649 zio_vdev_io_redone(zio_t *zio) 2650 { 2651 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2652 2653 zio->io_stage >>= 1; 2654 } 2655 2656 void 2657 zio_vdev_io_bypass(zio_t *zio) 2658 { 2659 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2660 ASSERT(zio->io_error == 0); 2661 2662 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2663 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2664 } 2665 2666 /* 2667 * ========================================================================== 2668 * Generate and verify checksums 2669 * ========================================================================== 2670 */ 2671 static int 2672 zio_checksum_generate(zio_t *zio) 2673 { 2674 blkptr_t *bp = zio->io_bp; 2675 enum zio_checksum checksum; 2676 2677 if (bp == NULL) { 2678 /* 2679 * This is zio_write_phys(). 2680 * We're either generating a label checksum, or none at all. 2681 */ 2682 checksum = zio->io_prop.zp_checksum; 2683 2684 if (checksum == ZIO_CHECKSUM_OFF) 2685 return (ZIO_PIPELINE_CONTINUE); 2686 2687 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2688 } else { 2689 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2690 ASSERT(!IO_IS_ALLOCATING(zio)); 2691 checksum = ZIO_CHECKSUM_GANG_HEADER; 2692 } else { 2693 checksum = BP_GET_CHECKSUM(bp); 2694 } 2695 } 2696 2697 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2698 2699 return (ZIO_PIPELINE_CONTINUE); 2700 } 2701 2702 static int 2703 zio_checksum_verify(zio_t *zio) 2704 { 2705 zio_bad_cksum_t info; 2706 blkptr_t *bp = zio->io_bp; 2707 int error; 2708 2709 ASSERT(zio->io_vd != NULL); 2710 2711 if (bp == NULL) { 2712 /* 2713 * This is zio_read_phys(). 2714 * We're either verifying a label checksum, or nothing at all. 
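		 *
		 * (Label checksums are self-describing: ZIO_CHECKSUM_LABEL
		 * keeps its checksum in an embedded trailer within the
		 * block itself rather than in a parent block pointer,
		 * which is why no bp is available -- or needed -- here.)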
2715		 */
2716		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2717			return (ZIO_PIPELINE_CONTINUE);
2718
2719		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2720	}
2721
2722	if ((error = zio_checksum_error(zio, &info)) != 0) {
2723		zio->io_error = error;
2724		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2725			zfs_ereport_start_checksum(zio->io_spa,
2726			    zio->io_vd, zio, zio->io_offset,
2727			    zio->io_size, NULL, &info);
2728		}
2729	}
2730
2731	return (ZIO_PIPELINE_CONTINUE);
2732 }
2733
2734 /*
2735  * Called by RAID-Z to ensure we don't compute the checksum twice.
2736  */
2737 void
2738 zio_checksum_verified(zio_t *zio)
2739 {
2740 	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2741 }
2742
2743 /*
2744  * ==========================================================================
2745  * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2746  * An error of 0 indicates success. ENXIO indicates whole-device failure,
2747  * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
2748  * indicate errors that are specific to one I/O, and most likely permanent.
2749  * Any other error is presumed to be worse because we weren't expecting it.
2750  * ==========================================================================
2751  */
2752 int
2753 zio_worst_error(int e1, int e2)
2754 {
2755 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2756 	int r1, r2;
2757
2758 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2759 		if (e1 == zio_error_rank[r1])
2760 			break;
2761
2762 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2763 		if (e2 == zio_error_rank[r2])
2764 			break;
2765
2766 	return (r1 > r2 ? e1 : e2);
2767 }
2768
2769 /*
2770  * ==========================================================================
2771  * I/O completion
2772  * ==========================================================================
2773  */
2774 static int
2775 zio_ready(zio_t *zio)
2776 {
2777 	blkptr_t *bp = zio->io_bp;
2778 	zio_t *pio, *pio_next;
2779
2780 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2781 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2782 		return (ZIO_PIPELINE_STOP);
2783
2784 	if (zio->io_ready) {
2785 		ASSERT(IO_IS_ALLOCATING(zio));
2786 		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2787 		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
2788 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2789
2790 		zio->io_ready(zio);
2791 	}
2792
2793 	if (bp != NULL && bp != &zio->io_bp_copy)
2794 		zio->io_bp_copy = *bp;
2795
2796 	if (zio->io_error)
2797 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2798
2799 	mutex_enter(&zio->io_lock);
2800 	zio->io_state[ZIO_WAIT_READY] = 1;
2801 	pio = zio_walk_parents(zio);
2802 	mutex_exit(&zio->io_lock);
2803
2804 	/*
2805 	 * As we notify zio's parents, new parents could be added.
2806 	 * New parents go to the head of zio's io_parent_list, however,
2807 	 * so we will (correctly) not notify them. The remainder of zio's
2808 	 * io_parent_list, from 'pio_next' onward, cannot change because
2809 	 * all parents must wait for us to be done before they can be done.
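	 *
	 * (A parent added after this point is also safe: zio_add_child()
	 * consults the child's io_state[] and does not wait on a stage the
	 * child has already passed, so it needs no READY notification
	 * from us.)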
2810 */ 2811 for (; pio != NULL; pio = pio_next) { 2812 pio_next = zio_walk_parents(zio); 2813 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2814 } 2815 2816 if (zio->io_flags & ZIO_FLAG_NODATA) { 2817 if (BP_IS_GANG(bp)) { 2818 zio->io_flags &= ~ZIO_FLAG_NODATA; 2819 } else { 2820 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2821 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2822 } 2823 } 2824 2825 if (zio_injection_enabled && 2826 zio->io_spa->spa_syncing_txg == zio->io_txg) 2827 zio_handle_ignored_writes(zio); 2828 2829 return (ZIO_PIPELINE_CONTINUE); 2830 } 2831 2832 static int 2833 zio_done(zio_t *zio) 2834 { 2835 spa_t *spa = zio->io_spa; 2836 zio_t *lio = zio->io_logical; 2837 blkptr_t *bp = zio->io_bp; 2838 vdev_t *vd = zio->io_vd; 2839 uint64_t psize = zio->io_size; 2840 zio_t *pio, *pio_next; 2841 2842 /* 2843 * If our children haven't all completed, 2844 * wait for them and then repeat this pipeline stage. 2845 */ 2846 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2847 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2848 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2849 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2850 return (ZIO_PIPELINE_STOP); 2851 2852 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2853 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2854 ASSERT(zio->io_children[c][w] == 0); 2855 2856 if (bp != NULL) { 2857 ASSERT(bp->blk_pad[0] == 0); 2858 ASSERT(bp->blk_pad[1] == 0); 2859 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2860 (bp == zio_unique_parent(zio)->io_bp)); 2861 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2862 zio->io_bp_override == NULL && 2863 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2864 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2865 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2866 ASSERT(BP_COUNT_GANG(bp) == 0 || 2867 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2868 } 2869 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2870 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2871 } 2872 2873 /* 2874 * If there were child vdev/gang/ddt errors, they apply to us now. 2875 */ 2876 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2877 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2878 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2879 2880 /* 2881 * If the I/O on the transformed data was successful, generate any 2882 * checksum reports now while we still have the transformed data. 2883 */ 2884 if (zio->io_error == 0) { 2885 while (zio->io_cksum_report != NULL) { 2886 zio_cksum_report_t *zcr = zio->io_cksum_report; 2887 uint64_t align = zcr->zcr_align; 2888 uint64_t asize = P2ROUNDUP(psize, align); 2889 char *abuf = zio->io_data; 2890 2891 if (asize != psize) { 2892 abuf = zio_buf_alloc(asize); 2893 bcopy(zio->io_data, abuf, psize); 2894 bzero(abuf + psize, asize - psize); 2895 } 2896 2897 zio->io_cksum_report = zcr->zcr_next; 2898 zcr->zcr_next = NULL; 2899 zcr->zcr_finish(zcr, abuf); 2900 zfs_ereport_free_checksum(zcr); 2901 2902 if (asize != psize) 2903 zio_buf_free(abuf, asize); 2904 } 2905 } 2906 2907 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2908 2909 vdev_stat_update(zio, psize); 2910 2911 if (zio->io_error) { 2912 /* 2913 * If this I/O is attached to a particular vdev, 2914 * generate an error message describing the I/O failure 2915 * at the block level. We ignore these errors if the 2916 * device is currently unavailable. 
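		 *
		 * (The vdev_is_dead() test keeps the log quiet: a dead
		 * vdev has presumably been reported already through its
		 * own fault events, so per-zio EIO ereports would only
		 * add noise.)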
2917 */ 2918 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2919 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2920 2921 if ((zio->io_error == EIO || !(zio->io_flags & 2922 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2923 zio == lio) { 2924 /* 2925 * For logical I/O requests, tell the SPA to log the 2926 * error and generate a logical data ereport. 2927 */ 2928 spa_log_error(spa, zio); 2929 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2930 0, 0); 2931 } 2932 } 2933 2934 if (zio->io_error && zio == lio) { 2935 /* 2936 * Determine whether zio should be reexecuted. This will 2937 * propagate all the way to the root via zio_notify_parent(). 2938 */ 2939 ASSERT(vd == NULL && bp != NULL); 2940 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2941 2942 if (IO_IS_ALLOCATING(zio) && 2943 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2944 if (zio->io_error != ENOSPC) 2945 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2946 else 2947 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2948 } 2949 2950 if ((zio->io_type == ZIO_TYPE_READ || 2951 zio->io_type == ZIO_TYPE_FREE) && 2952 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 2953 zio->io_error == ENXIO && 2954 spa_load_state(spa) == SPA_LOAD_NONE && 2955 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2956 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2957 2958 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2959 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2960 2961 /* 2962 * Here is a possibly good place to attempt to do 2963 * either combinatorial reconstruction or error correction 2964 * based on checksums. It also might be a good place 2965 * to send out preliminary ereports before we suspend 2966 * processing. 2967 */ 2968 } 2969 2970 /* 2971 * If there were logical child errors, they apply to us now. 2972 * We defer this until now to avoid conflating logical child 2973 * errors with errors that happened to the zio itself when 2974 * updating vdev stats and reporting FMA events above. 2975 */ 2976 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2977 2978 if ((zio->io_error || zio->io_reexecute) && 2979 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2980 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 2981 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2982 2983 zio_gang_tree_free(&zio->io_gang_tree); 2984 2985 /* 2986 * Godfather I/Os should never suspend. 2987 */ 2988 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2989 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2990 zio->io_reexecute = 0; 2991 2992 if (zio->io_reexecute) { 2993 /* 2994 * This is a logical I/O that wants to reexecute. 2995 * 2996 * Reexecute is top-down. When an i/o fails, if it's not 2997 * the root, it simply notifies its parent and sticks around. 2998 * The parent, seeing that it still has children in zio_done(), 2999 * does the same. This percolates all the way up to the root. 3000 * The root i/o will reexecute or suspend the entire tree. 3001 * 3002 * This approach ensures that zio_reexecute() honors 3003 * all the original i/o dependency relationships, e.g. 3004 * parents not executing until children are ready. 3005 */ 3006 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3007 3008 zio->io_gang_leader = NULL; 3009 3010 mutex_enter(&zio->io_lock); 3011 zio->io_state[ZIO_WAIT_DONE] = 1; 3012 mutex_exit(&zio->io_lock); 3013 3014 /* 3015 * "The Godfather" I/O monitors its children but is 3016 * not a true parent to them. 
It will track them through 3017 * the pipeline but severs its ties whenever they get into 3018 * trouble (e.g. suspended). This allows "The Godfather" 3019 * I/O to return status without blocking. 3020 */ 3021 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3022 zio_link_t *zl = zio->io_walk_link; 3023 pio_next = zio_walk_parents(zio); 3024 3025 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3026 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3027 zio_remove_child(pio, zio, zl); 3028 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3029 } 3030 } 3031 3032 if ((pio = zio_unique_parent(zio)) != NULL) { 3033 /* 3034 * We're not a root i/o, so there's nothing to do 3035 * but notify our parent. Don't propagate errors 3036 * upward since we haven't permanently failed yet. 3037 */ 3038 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3039 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3040 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3041 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3042 /* 3043 * We'd fail again if we reexecuted now, so suspend 3044 * until conditions improve (e.g. device comes online). 3045 */ 3046 zio_suspend(spa, zio); 3047 } else { 3048 /* 3049 * Reexecution is potentially a huge amount of work. 3050 * Hand it off to the otherwise-unused claim taskq. 3051 */ 3052 ASSERT(zio->io_tqent.tqent_next == NULL); 3053 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3054 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3055 0, &zio->io_tqent); 3056 } 3057 return (ZIO_PIPELINE_STOP); 3058 } 3059 3060 ASSERT(zio->io_child_count == 0); 3061 ASSERT(zio->io_reexecute == 0); 3062 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3063 3064 /* 3065 * Report any checksum errors, since the I/O is complete. 3066 */ 3067 while (zio->io_cksum_report != NULL) { 3068 zio_cksum_report_t *zcr = zio->io_cksum_report; 3069 zio->io_cksum_report = zcr->zcr_next; 3070 zcr->zcr_next = NULL; 3071 zcr->zcr_finish(zcr, NULL); 3072 zfs_ereport_free_checksum(zcr); 3073 } 3074 3075 /* 3076 * It is the responsibility of the done callback to ensure that this 3077 * particular zio is no longer discoverable for adoption, and as 3078 * such, cannot acquire any new parents. 
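	 *
	 * (That guarantee is what makes the final parent walk below safe:
	 * once io_done returns, the parent list can only shrink as we
	 * remove ourselves from each remaining parent.)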
3079 */ 3080 if (zio->io_done) 3081 zio->io_done(zio); 3082 3083 mutex_enter(&zio->io_lock); 3084 zio->io_state[ZIO_WAIT_DONE] = 1; 3085 mutex_exit(&zio->io_lock); 3086 3087 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3088 zio_link_t *zl = zio->io_walk_link; 3089 pio_next = zio_walk_parents(zio); 3090 zio_remove_child(pio, zio, zl); 3091 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3092 } 3093 3094 if (zio->io_waiter != NULL) { 3095 mutex_enter(&zio->io_lock); 3096 zio->io_executor = NULL; 3097 cv_broadcast(&zio->io_cv); 3098 mutex_exit(&zio->io_lock); 3099 } else { 3100 zio_destroy(zio); 3101 } 3102 3103 return (ZIO_PIPELINE_STOP); 3104 } 3105 3106 /* 3107 * ========================================================================== 3108 * I/O pipeline definition 3109 * ========================================================================== 3110 */ 3111 static zio_pipe_stage_t *zio_pipeline[] = { 3112 NULL, 3113 zio_read_bp_init, 3114 zio_free_bp_init, 3115 zio_issue_async, 3116 zio_write_bp_init, 3117 zio_checksum_generate, 3118 zio_nop_write, 3119 zio_ddt_read_start, 3120 zio_ddt_read_done, 3121 zio_ddt_write, 3122 zio_ddt_free, 3123 zio_gang_assemble, 3124 zio_gang_issue, 3125 zio_dva_allocate, 3126 zio_dva_free, 3127 zio_dva_claim, 3128 zio_ready, 3129 zio_vdev_io_start, 3130 zio_vdev_io_done, 3131 zio_vdev_io_assess, 3132 zio_checksum_verify, 3133 zio_done 3134 }; 3135 3136 /* dnp is the dnode for zb1->zb_object */ 3137 boolean_t 3138 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3139 const zbookmark_t *zb2) 3140 { 3141 uint64_t zb1nextL0, zb2thisobj; 3142 3143 ASSERT(zb1->zb_objset == zb2->zb_objset); 3144 ASSERT(zb2->zb_level == 0); 3145 3146 /* 3147 * A bookmark in the deadlist is considered to be after 3148 * everything else. 3149 */ 3150 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3151 return (B_TRUE); 3152 3153 /* The objset_phys_t isn't before anything. */ 3154 if (dnp == NULL) 3155 return (B_FALSE); 3156 3157 zb1nextL0 = (zb1->zb_blkid + 1) << 3158 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3159 3160 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3161 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3162 3163 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3164 uint64_t nextobj = zb1nextL0 * 3165 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3166 return (nextobj <= zb2thisobj); 3167 } 3168 3169 if (zb1->zb_object < zb2thisobj) 3170 return (B_TRUE); 3171 if (zb1->zb_object > zb2thisobj) 3172 return (B_FALSE); 3173 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3174 return (B_FALSE); 3175 return (zb1nextL0 <= zb2->zb_blkid); 3176 }
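
/*
 * Worked example for zbookmark_is_before() -- an illustrative sketch,
 * assuming dn_indblkshift == 14 and SPA_BLKPTRSHIFT == 7, i.e. 128
 * block pointers per indirect block:
 *
 *	A level-1 bookmark at zb_blkid == 2 covers L0 blkids [256, 384),
 *	so zb1nextL0 == (2 + 1) << (1 * (14 - 7)) == 384, and zb1 is
 *	"before" a level-0 bookmark zb2 in the same object exactly when
 *	384 <= zb2->zb_blkid.
 */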