1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 Joyent, Inc. All rights reserved. 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa_impl.h> 30 #include <sys/refcount.h> 31 #include <sys/vdev_disk.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/abd.h> 34 #include <sys/fs/zfs.h> 35 #include <sys/zio.h> 36 #include <sys/sunldi.h> 37 #include <sys/efi_partition.h> 38 #include <sys/fm/fs/zfs.h> 39 40 /* 41 * Virtual device vector for disks. 42 */ 43 44 extern ldi_ident_t zfs_li; 45 46 static void vdev_disk_close(vdev_t *); 47 48 typedef struct vdev_disk_ldi_cb { 49 list_node_t lcb_next; 50 ldi_callback_id_t lcb_id; 51 } vdev_disk_ldi_cb_t; 52 53 static void 54 vdev_disk_alloc(vdev_t *vd) 55 { 56 vdev_disk_t *dvd; 57 58 dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 59 /* 60 * Create the LDI event callback list. 61 */ 62 list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), 63 offsetof(vdev_disk_ldi_cb_t, lcb_next)); 64 } 65 66 static void 67 vdev_disk_free(vdev_t *vd) 68 { 69 vdev_disk_t *dvd = vd->vdev_tsd; 70 vdev_disk_ldi_cb_t *lcb; 71 72 if (dvd == NULL) 73 return; 74 75 /* 76 * We have already closed the LDI handle. Clean up the LDI event 77 * callbacks and free vd->vdev_tsd. 78 */ 79 while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { 80 list_remove(&dvd->vd_ldi_cbs, lcb); 81 (void) ldi_ev_remove_callbacks(lcb->lcb_id); 82 kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); 83 } 84 list_destroy(&dvd->vd_ldi_cbs); 85 kmem_free(dvd, sizeof (vdev_disk_t)); 86 vd->vdev_tsd = NULL; 87 } 88 89 /* ARGSUSED */ 90 static int 91 vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, 92 void *ev_data) 93 { 94 vdev_t *vd = (vdev_t *)arg; 95 vdev_disk_t *dvd = vd->vdev_tsd; 96 97 /* 98 * Ignore events other than offline. 99 */ 100 if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) 101 return (LDI_EV_SUCCESS); 102 103 /* 104 * All LDI handles must be closed for the state change to succeed, so 105 * call on vdev_disk_close() to do this. 106 * 107 * We inform vdev_disk_close that it is being called from offline 108 * notify context so it will defer cleanup of LDI event callbacks and 109 * freeing of vd->vdev_tsd to the offline finalize or a reopen. 110 */ 111 dvd->vd_ldi_offline = B_TRUE; 112 vdev_disk_close(vd); 113 114 /* 115 * Now that the device is closed, request that the spa_async_thread 116 * mark the device as REMOVED and notify FMA of the removal. 117 */ 118 zfs_post_remove(vd->vdev_spa, vd); 119 vd->vdev_remove_wanted = B_TRUE; 120 spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); 121 122 return (LDI_EV_SUCCESS); 123 } 124 125 /* ARGSUSED */ 126 static void 127 vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, 128 int ldi_result, void *arg, void *ev_data) 129 { 130 vdev_t *vd = (vdev_t *)arg; 131 132 /* 133 * Ignore events other than offline. 134 */ 135 if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) 136 return; 137 138 /* 139 * We have already closed the LDI handle in notify. 140 * Clean up the LDI event callbacks and free vd->vdev_tsd. 141 */ 142 vdev_disk_free(vd); 143 144 /* 145 * Request that the vdev be reopened if the offline state change was 146 * unsuccessful. 147 */ 148 if (ldi_result != LDI_EV_SUCCESS) { 149 vd->vdev_probe_wanted = B_TRUE; 150 spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); 151 } 152 } 153 154 static ldi_ev_callback_t vdev_disk_off_callb = { 155 .cb_vers = LDI_EV_CB_VERS, 156 .cb_notify = vdev_disk_off_notify, 157 .cb_finalize = vdev_disk_off_finalize 158 }; 159 160 /* ARGSUSED */ 161 static void 162 vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, 163 int ldi_result, void *arg, void *ev_data) 164 { 165 vdev_t *vd = (vdev_t *)arg; 166 167 /* 168 * Ignore events other than degrade. 169 */ 170 if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) 171 return; 172 173 /* 174 * Degrade events always succeed. Mark the vdev as degraded. 175 * This status is purely informative for the user. 176 */ 177 (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); 178 } 179 180 static ldi_ev_callback_t vdev_disk_dgrd_callb = { 181 .cb_vers = LDI_EV_CB_VERS, 182 .cb_notify = NULL, 183 .cb_finalize = vdev_disk_dgrd_finalize 184 }; 185 186 static void 187 vdev_disk_hold(vdev_t *vd) 188 { 189 ddi_devid_t devid; 190 char *minor; 191 192 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 193 194 /* 195 * We must have a pathname, and it must be absolute. 196 */ 197 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 198 return; 199 200 /* 201 * Only prefetch path and devid info if the device has 202 * never been opened. 203 */ 204 if (vd->vdev_tsd != NULL) 205 return; 206 207 if (vd->vdev_wholedisk == -1ULL) { 208 size_t len = strlen(vd->vdev_path) + 3; 209 char *buf = kmem_alloc(len, KM_SLEEP); 210 211 (void) snprintf(buf, len, "%ss0", vd->vdev_path); 212 213 (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); 214 kmem_free(buf, len); 215 } 216 217 if (vd->vdev_name_vp == NULL) 218 (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); 219 220 if (vd->vdev_devid != NULL && 221 ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { 222 (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); 223 ddi_devid_str_free(minor); 224 ddi_devid_free(devid); 225 } 226 } 227 228 static void 229 vdev_disk_rele(vdev_t *vd) 230 { 231 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 232 233 if (vd->vdev_name_vp) { 234 VN_RELE_ASYNC(vd->vdev_name_vp, 235 dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 236 vd->vdev_name_vp = NULL; 237 } 238 if (vd->vdev_devid_vp) { 239 VN_RELE_ASYNC(vd->vdev_devid_vp, 240 dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 241 vd->vdev_devid_vp = NULL; 242 } 243 } 244 245 /* 246 * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when 247 * even a fallback to DKIOCGMEDIAINFO fails. 248 */ 249 #ifdef DEBUG 250 #define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) 251 #else 252 #define VDEV_DEBUG(...) /* Nothing... */ 253 #endif 254 255 static int 256 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 257 uint64_t *ashift) 258 { 259 spa_t *spa = vd->vdev_spa; 260 vdev_disk_t *dvd = vd->vdev_tsd; 261 ldi_ev_cookie_t ecookie; 262 vdev_disk_ldi_cb_t *lcb; 263 union { 264 struct dk_minfo_ext ude; 265 struct dk_minfo ud; 266 } dks; 267 struct dk_minfo_ext *dkmext = &dks.ude; 268 struct dk_minfo *dkm = &dks.ud; 269 int error; 270 dev_t dev; 271 int otyp; 272 boolean_t validate_devid = B_FALSE; 273 ddi_devid_t devid; 274 uint64_t capacity = 0, blksz = 0, pbsize; 275 276 /* 277 * We must have a pathname, and it must be absolute. 278 */ 279 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 280 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 281 return (SET_ERROR(EINVAL)); 282 } 283 284 /* 285 * Reopen the device if it's not currently open. Otherwise, 286 * just update the physical size of the device. 287 */ 288 if (dvd != NULL) { 289 if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { 290 /* 291 * If we are opening a device in its offline notify 292 * context, the LDI handle was just closed. Clean 293 * up the LDI event callbacks and free vd->vdev_tsd. 294 */ 295 vdev_disk_free(vd); 296 } else { 297 ASSERT(vd->vdev_reopening); 298 goto skip_open; 299 } 300 } 301 302 /* 303 * Create vd->vdev_tsd. 304 */ 305 vdev_disk_alloc(vd); 306 dvd = vd->vdev_tsd; 307 308 /* 309 * When opening a disk device, we want to preserve the user's original 310 * intent. We always want to open the device by the path the user gave 311 * us, even if it is one of multiple paths to the same device. But we 312 * also want to be able to survive disks being removed/recabled. 313 * Therefore the sequence of opening devices is: 314 * 315 * 1. Try opening the device by path. For legacy pools without the 316 * 'whole_disk' property, attempt to fix the path by appending 's0'. 317 * 318 * 2. If the devid of the device matches the stored value, return 319 * success. 320 * 321 * 3. Otherwise, the device may have moved. Try opening the device 322 * by the devid instead. 323 */ 324 if (vd->vdev_devid != NULL) { 325 if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, 326 &dvd->vd_minor) != 0) { 327 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 328 return (SET_ERROR(EINVAL)); 329 } 330 } 331 332 error = EINVAL; /* presume failure */ 333 334 if (vd->vdev_path != NULL) { 335 336 if (vd->vdev_wholedisk == -1ULL) { 337 size_t len = strlen(vd->vdev_path) + 3; 338 char *buf = kmem_alloc(len, KM_SLEEP); 339 340 (void) snprintf(buf, len, "%ss0", vd->vdev_path); 341 342 error = ldi_open_by_name(buf, spa_mode(spa), kcred, 343 &dvd->vd_lh, zfs_li); 344 if (error == 0) { 345 spa_strfree(vd->vdev_path); 346 vd->vdev_path = buf; 347 vd->vdev_wholedisk = 1ULL; 348 } else { 349 kmem_free(buf, len); 350 } 351 } 352 353 /* 354 * If we have not yet opened the device, try to open it by the 355 * specified path. 356 */ 357 if (error != 0) { 358 error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 359 kcred, &dvd->vd_lh, zfs_li); 360 } 361 362 /* 363 * Compare the devid to the stored value. 364 */ 365 if (error == 0 && vd->vdev_devid != NULL && 366 ldi_get_devid(dvd->vd_lh, &devid) == 0) { 367 if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { 368 error = SET_ERROR(EINVAL); 369 (void) ldi_close(dvd->vd_lh, spa_mode(spa), 370 kcred); 371 dvd->vd_lh = NULL; 372 } 373 ddi_devid_free(devid); 374 } 375 376 /* 377 * If we succeeded in opening the device, but 'vdev_wholedisk' 378 * is not yet set, then this must be a slice. 379 */ 380 if (error == 0 && vd->vdev_wholedisk == -1ULL) 381 vd->vdev_wholedisk = 0; 382 } 383 384 /* 385 * If we were unable to open by path, or the devid check fails, open by 386 * devid instead. 387 */ 388 if (error != 0 && vd->vdev_devid != NULL) { 389 error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, 390 spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); 391 } 392 393 /* 394 * If all else fails, then try opening by physical path (if available) 395 * or the logical path (if we failed due to the devid check). While not 396 * as reliable as the devid, this will give us something, and the higher 397 * level vdev validation will prevent us from opening the wrong device. 398 */ 399 if (error) { 400 if (vd->vdev_devid != NULL) 401 validate_devid = B_TRUE; 402 403 if (vd->vdev_physpath != NULL && 404 (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) 405 error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), 406 kcred, &dvd->vd_lh, zfs_li); 407 408 /* 409 * Note that we don't support the legacy auto-wholedisk support 410 * as above. This hasn't been used in a very long time and we 411 * don't need to propagate its oddities to this edge condition. 412 */ 413 if (error && vd->vdev_path != NULL) 414 error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 415 kcred, &dvd->vd_lh, zfs_li); 416 } 417 418 if (error) { 419 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 420 return (error); 421 } 422 423 /* 424 * Now that the device has been successfully opened, update the devid 425 * if necessary. 426 */ 427 if (validate_devid && spa_writeable(spa) && 428 ldi_get_devid(dvd->vd_lh, &devid) == 0) { 429 if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { 430 char *vd_devid; 431 432 vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); 433 zfs_dbgmsg("vdev %s: update devid from %s, " 434 "to %s", vd->vdev_path, vd->vdev_devid, vd_devid); 435 spa_strfree(vd->vdev_devid); 436 vd->vdev_devid = spa_strdup(vd_devid); 437 ddi_devid_str_free(vd_devid); 438 } 439 ddi_devid_free(devid); 440 } 441 442 /* 443 * Once a device is opened, verify that the physical device path (if 444 * available) is up to date. 445 */ 446 if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && 447 ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { 448 char *physpath, *minorname; 449 450 physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); 451 minorname = NULL; 452 if (ddi_dev_pathname(dev, otyp, physpath) == 0 && 453 ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && 454 (vd->vdev_physpath == NULL || 455 strcmp(vd->vdev_physpath, physpath) != 0)) { 456 if (vd->vdev_physpath) 457 spa_strfree(vd->vdev_physpath); 458 (void) strlcat(physpath, ":", MAXPATHLEN); 459 (void) strlcat(physpath, minorname, MAXPATHLEN); 460 vd->vdev_physpath = spa_strdup(physpath); 461 } 462 if (minorname) 463 kmem_free(minorname, strlen(minorname) + 1); 464 kmem_free(physpath, MAXPATHLEN); 465 } 466 467 /* 468 * Register callbacks for the LDI offline event. 469 */ 470 if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == 471 LDI_EV_SUCCESS) { 472 lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); 473 list_insert_tail(&dvd->vd_ldi_cbs, lcb); 474 (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, 475 &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); 476 } 477 478 /* 479 * Register callbacks for the LDI degrade event. 480 */ 481 if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == 482 LDI_EV_SUCCESS) { 483 lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); 484 list_insert_tail(&dvd->vd_ldi_cbs, lcb); 485 (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, 486 &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); 487 } 488 skip_open: 489 /* 490 * Determine the actual size of the device. 491 */ 492 if (ldi_get_size(dvd->vd_lh, psize) != 0) { 493 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 494 return (SET_ERROR(EINVAL)); 495 } 496 497 *max_psize = *psize; 498 499 /* 500 * Determine the device's minimum transfer size. 501 * If the ioctl isn't supported, assume DEV_BSIZE. 502 */ 503 if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, 504 (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { 505 capacity = dkmext->dki_capacity - 1; 506 blksz = dkmext->dki_lbsize; 507 pbsize = dkmext->dki_pbsize; 508 } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, 509 (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { 510 VDEV_DEBUG( 511 "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", 512 vd->vdev_path); 513 capacity = dkm->dki_capacity - 1; 514 blksz = dkm->dki_lbsize; 515 pbsize = blksz; 516 } else { 517 VDEV_DEBUG("vdev_disk_open(\"%s\"): " 518 "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", 519 vd->vdev_path, error); 520 pbsize = DEV_BSIZE; 521 } 522 523 *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; 524 525 if (vd->vdev_wholedisk == 1) { 526 int wce = 1; 527 528 if (error == 0) { 529 /* 530 * If we have the capability to expand, we'd have 531 * found out via success from DKIOCGMEDIAINFO{,EXT}. 532 * Adjust max_psize upward accordingly since we know 533 * we own the whole disk now. 534 */ 535 *max_psize = capacity * blksz; 536 } 537 538 /* 539 * Since we own the whole disk, try to enable disk write 540 * caching. We ignore errors because it's OK if we can't do it. 541 */ 542 (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, 543 FKIOCTL, kcred, NULL); 544 } 545 546 /* 547 * Clear the nowritecache bit, so that on a vdev_reopen() we will 548 * try again. 549 */ 550 vd->vdev_nowritecache = B_FALSE; 551 552 return (0); 553 } 554 555 static void 556 vdev_disk_close(vdev_t *vd) 557 { 558 vdev_disk_t *dvd = vd->vdev_tsd; 559 560 if (vd->vdev_reopening || dvd == NULL) 561 return; 562 563 if (dvd->vd_minor != NULL) { 564 ddi_devid_str_free(dvd->vd_minor); 565 dvd->vd_minor = NULL; 566 } 567 568 if (dvd->vd_devid != NULL) { 569 ddi_devid_free(dvd->vd_devid); 570 dvd->vd_devid = NULL; 571 } 572 573 if (dvd->vd_lh != NULL) { 574 (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); 575 dvd->vd_lh = NULL; 576 } 577 578 vd->vdev_delayed_close = B_FALSE; 579 /* 580 * If we closed the LDI handle due to an offline notify from LDI, 581 * don't free vd->vdev_tsd or unregister the callbacks here; 582 * the offline finalize callback or a reopen will take care of it. 583 */ 584 if (dvd->vd_ldi_offline) 585 return; 586 587 vdev_disk_free(vd); 588 } 589 590 int 591 vdev_disk_physio(vdev_t *vd, caddr_t data, 592 size_t size, uint64_t offset, int flags, boolean_t isdump) 593 { 594 vdev_disk_t *dvd = vd->vdev_tsd; 595 596 /* 597 * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 598 * Nothing to be done here but return failure. 599 */ 600 if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) 601 return (EIO); 602 603 ASSERT(vd->vdev_ops == &vdev_disk_ops); 604 605 /* 606 * If in the context of an active crash dump, use the ldi_dump(9F) 607 * call instead of ldi_strategy(9F) as usual. 608 */ 609 if (isdump) { 610 ASSERT3P(dvd, !=, NULL); 611 return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), 612 lbtodb(size))); 613 } 614 615 return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); 616 } 617 618 int 619 vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, 620 size_t size, uint64_t offset, int flags) 621 { 622 buf_t *bp; 623 int error = 0; 624 625 if (vd_lh == NULL) 626 return (SET_ERROR(EINVAL)); 627 628 ASSERT(flags & B_READ || flags & B_WRITE); 629 630 bp = getrbuf(KM_SLEEP); 631 bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; 632 bp->b_bcount = size; 633 bp->b_un.b_addr = (void *)data; 634 bp->b_lblkno = lbtodb(offset); 635 bp->b_bufsize = size; 636 637 error = ldi_strategy(vd_lh, bp); 638 ASSERT(error == 0); 639 if ((error = biowait(bp)) == 0 && bp->b_resid != 0) 640 error = SET_ERROR(EIO); 641 freerbuf(bp); 642 643 return (error); 644 } 645 646 static void 647 vdev_disk_io_intr(buf_t *bp) 648 { 649 vdev_buf_t *vb = (vdev_buf_t *)bp; 650 zio_t *zio = vb->vb_io; 651 652 /* 653 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 654 * Rather than teach the rest of the stack about other error 655 * possibilities (EFAULT, etc), we normalize the error value here. 656 */ 657 zio->io_error = (geterror(bp) != 0 ? EIO : 0); 658 659 if (zio->io_error == 0 && bp->b_resid != 0) 660 zio->io_error = SET_ERROR(EIO); 661 662 if (zio->io_type == ZIO_TYPE_READ) { 663 abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); 664 } else { 665 abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); 666 } 667 668 kmem_free(vb, sizeof (vdev_buf_t)); 669 670 zio_delay_interrupt(zio); 671 } 672 673 static void 674 vdev_disk_ioctl_free(zio_t *zio) 675 { 676 kmem_free(zio->io_vsd, sizeof (struct dk_callback)); 677 } 678 679 static const zio_vsd_ops_t vdev_disk_vsd_ops = { 680 vdev_disk_ioctl_free, 681 zio_vsd_default_cksum_report 682 }; 683 684 static void 685 vdev_disk_ioctl_done(void *zio_arg, int error) 686 { 687 zio_t *zio = zio_arg; 688 689 zio->io_error = error; 690 691 zio_interrupt(zio); 692 } 693 694 static void 695 vdev_disk_io_start(zio_t *zio) 696 { 697 vdev_t *vd = zio->io_vd; 698 vdev_disk_t *dvd = vd->vdev_tsd; 699 vdev_buf_t *vb; 700 struct dk_callback *dkc; 701 buf_t *bp; 702 int error; 703 704 /* 705 * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 706 * Nothing to be done here but return failure. 707 */ 708 if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { 709 zio->io_error = ENXIO; 710 zio_interrupt(zio); 711 return; 712 } 713 714 if (zio->io_type == ZIO_TYPE_IOCTL) { 715 /* XXPOLICY */ 716 if (!vdev_readable(vd)) { 717 zio->io_error = SET_ERROR(ENXIO); 718 zio_interrupt(zio); 719 return; 720 } 721 722 switch (zio->io_cmd) { 723 724 case DKIOCFLUSHWRITECACHE: 725 726 if (zfs_nocacheflush) 727 break; 728 729 if (vd->vdev_nowritecache) { 730 zio->io_error = SET_ERROR(ENOTSUP); 731 break; 732 } 733 734 zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); 735 zio->io_vsd_ops = &vdev_disk_vsd_ops; 736 737 dkc->dkc_callback = vdev_disk_ioctl_done; 738 dkc->dkc_flag = FLUSH_VOLATILE; 739 dkc->dkc_cookie = zio; 740 741 error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, 742 (uintptr_t)dkc, FKIOCTL, kcred, NULL); 743 744 if (error == 0) { 745 /* 746 * The ioctl will be done asychronously, 747 * and will call vdev_disk_ioctl_done() 748 * upon completion. 749 */ 750 return; 751 } 752 753 zio->io_error = error; 754 755 break; 756 757 default: 758 zio->io_error = SET_ERROR(ENOTSUP); 759 } 760 761 zio_execute(zio); 762 return; 763 } 764 765 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 766 zio->io_target_timestamp = zio_handle_io_delay(zio); 767 768 vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); 769 770 vb->vb_io = zio; 771 bp = &vb->vb_buf; 772 773 bioinit(bp); 774 bp->b_flags = B_BUSY | B_NOCACHE | 775 (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 776 if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) 777 bp->b_flags |= B_FAILFAST; 778 bp->b_bcount = zio->io_size; 779 780 if (zio->io_type == ZIO_TYPE_READ) { 781 bp->b_un.b_addr = 782 abd_borrow_buf(zio->io_abd, zio->io_size); 783 } else { 784 bp->b_un.b_addr = 785 abd_borrow_buf_copy(zio->io_abd, zio->io_size); 786 } 787 788 bp->b_lblkno = lbtodb(zio->io_offset); 789 bp->b_bufsize = zio->io_size; 790 bp->b_iodone = (int (*)())vdev_disk_io_intr; 791 792 /* ldi_strategy() will return non-zero only on programming errors */ 793 VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); 794 } 795 796 static void 797 vdev_disk_io_done(zio_t *zio) 798 { 799 vdev_t *vd = zio->io_vd; 800 801 /* 802 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if 803 * the device has been removed. If this is the case, then we trigger an 804 * asynchronous removal of the device. Otherwise, probe the device and 805 * make sure it's still accessible. 806 */ 807 if (zio->io_error == EIO && !vd->vdev_remove_wanted) { 808 vdev_disk_t *dvd = vd->vdev_tsd; 809 int state = DKIO_NONE; 810 811 if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, 812 FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { 813 /* 814 * We post the resource as soon as possible, instead of 815 * when the async removal actually happens, because the 816 * DE is using this information to discard previous I/O 817 * errors. 818 */ 819 zfs_post_remove(zio->io_spa, vd); 820 vd->vdev_remove_wanted = B_TRUE; 821 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 822 } else if (!vd->vdev_delayed_close) { 823 vd->vdev_delayed_close = B_TRUE; 824 } 825 } 826 } 827 828 vdev_ops_t vdev_disk_ops = { 829 vdev_disk_open, 830 vdev_disk_close, 831 vdev_default_asize, 832 vdev_disk_io_start, 833 vdev_disk_io_done, 834 NULL, 835 vdev_disk_hold, 836 vdev_disk_rele, 837 VDEV_TYPE_DISK, /* name of this vdev type */ 838 B_TRUE /* leaf vdev */ 839 }; 840 841 /* 842 * Given the root disk device devid or pathname, read the label from 843 * the device, and construct a configuration nvlist. 844 */ 845 int 846 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) 847 { 848 ldi_handle_t vd_lh; 849 vdev_label_t *label; 850 uint64_t s, size; 851 int l; 852 ddi_devid_t tmpdevid; 853 int error = -1; 854 char *minor_name; 855 856 /* 857 * Read the device label and build the nvlist. 858 */ 859 if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, 860 &minor_name) == 0) { 861 error = ldi_open_by_devid(tmpdevid, minor_name, 862 FREAD, kcred, &vd_lh, zfs_li); 863 ddi_devid_free(tmpdevid); 864 ddi_devid_str_free(minor_name); 865 } 866 867 if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, 868 zfs_li))) 869 return (error); 870 871 if (ldi_get_size(vd_lh, &s)) { 872 (void) ldi_close(vd_lh, FREAD, kcred); 873 return (SET_ERROR(EIO)); 874 } 875 876 size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); 877 label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); 878 879 *config = NULL; 880 for (l = 0; l < VDEV_LABELS; l++) { 881 uint64_t offset, state, txg = 0; 882 883 /* read vdev label */ 884 offset = vdev_label_offset(size, l, 0); 885 if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, 886 VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) 887 continue; 888 889 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, 890 sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { 891 *config = NULL; 892 continue; 893 } 894 895 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, 896 &state) != 0 || state >= POOL_STATE_DESTROYED) { 897 nvlist_free(*config); 898 *config = NULL; 899 continue; 900 } 901 902 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, 903 &txg) != 0 || txg == 0) { 904 nvlist_free(*config); 905 *config = NULL; 906 continue; 907 } 908 909 break; 910 } 911 912 kmem_free(label, sizeof (vdev_label_t)); 913 (void) ldi_close(vd_lh, FREAD, kcred); 914 if (*config == NULL) 915 error = SET_ERROR(EIDRM); 916 917 return (error); 918 }