/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/zfs_zone.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/efi_partition.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for disks.
 */

extern ldi_ident_t zfs_li;

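/*
 * The buf_t is embedded as the first member so that the LDI completion
 * routine, which receives a buf_t *, can be cast back to the containing
 * vdev_disk_buf_t to recover the originating zio (see vdev_disk_io_intr()).
 */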
typedef struct vdev_disk_buf {
	buf_t	vdb_buf;
	zio_t	*vdb_io;
} vdev_disk_buf_t;

static void
vdev_disk_hold(vdev_t *vd)
{
	ddi_devid_t devid;
	char *minor;

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

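	/*
	 * vdev_wholedisk is still -1ULL when it has not yet been
	 * determined; in that case also try the path with "s0" appended,
	 * matching the whole-disk slice convention that vdev_disk_open()
	 * uses below.
	 */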
	if (vd->vdev_wholedisk == -1ULL) {
		size_t len = strlen(vd->vdev_path) + 3;
		char *buf = kmem_alloc(len, KM_SLEEP);

		(void) snprintf(buf, len, "%ss0", vd->vdev_path);

		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
		kmem_free(buf, len);
	}

	if (vd->vdev_name_vp == NULL)
		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);

	if (vd->vdev_devid != NULL &&
	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
		ddi_devid_str_free(minor);
		ddi_devid_free(devid);
	}
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	if (vd->vdev_name_vp) {
		VN_RELE_ASYNC(vd->vdev_name_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_name_vp = NULL;
	}
	if (vd->vdev_devid_vp) {
		VN_RELE_ASYNC(vd->vdev_devid_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_devid_vp = NULL;
	}
}

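/*
 * Read the EFI label via DKIOCGETEFI and compare the backup GPT header
 * location (AlternateLBA, at the end of the labeled region) against the
 * device's current capacity. If the device has grown since it was labeled,
 * the difference is space we could expand into; return it in bytes.
 */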
static uint64_t
vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
{
	ASSERT(vd->vdev_wholedisk);

	vdev_disk_t *dvd = vd->vdev_tsd;
	dk_efi_t dk_ioc;
	efi_gpt_t *efi;
	uint64_t avail_space = 0;
	int efisize = EFI_LABEL_SIZE * 2;

	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
	dk_ioc.dki_lba = 1;
	dk_ioc.dki_length = efisize;
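	/*
	 * dki_data and dki_data_64 overlay one another in dk_efi_t; store
	 * the buffer pointer through the 64-bit view as well so the ioctl
	 * sees a well-formed address regardless of data model.
	 */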
	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
	efi = dk_ioc.dki_data;

	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
	    FKIOCTL, kcred, NULL) == 0) {
		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);

		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
		    vd->vdev_path, capacity, efi_altern_lba);
		if (capacity > efi_altern_lba)
			avail_space = (capacity - efi_altern_lba) * blksz;
	}
	kmem_free(dk_ioc.dki_data, efisize);
	return (avail_space);
}

static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd;
	struct dk_minfo_ext dkmext;
	int error;
	dev_t dev;
	int otyp;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (vd->vdev_tsd != NULL) {
		ASSERT(vd->vdev_reopening);
		dvd = vd->vdev_tsd;
		goto skip_open;
	}

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent. We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device. But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path. For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved. Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {
		ddi_devid_t devid;

		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);
			ldi_handle_t lh;

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &lh, zfs_li) == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
				(void) ldi_close(lh, spa_mode(spa), kcred);
			} else {
				kmem_free(buf, len);
			}
		}

		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
		    &dvd->vd_lh, zfs_li);

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				error = EINVAL;
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}
			ddi_devid_free(devid);
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL)
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check). While not
	 * as reliable as the devid, this will give us something, and the
	 * higher level vdev validation will prevent us from opening the wrong
	 * device.
	 */
	if (error) {
		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);

		/*
		 * Note that we don't apply the legacy auto-wholedisk handling
		 * from above here. It hasn't been used in a very long time
		 * and we don't need to propagate its oddities to this edge
		 * condition.
		 */
		if (error && vd->vdev_path != NULL)
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
	}

	if (error) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (EINVAL);
	}

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
	    FKIOCTL, kcred, NULL) != 0)
		dkmext.dki_pbsize = DEV_BSIZE;

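	/*
	 * highbit() returns the 1-indexed position of the highest set bit,
	 * so this computes log2 of the physical block size: a 4Kn device
	 * reporting dki_pbsize == 4096 yields highbit(4096) - 1 == 12,
	 * while SPA_MINBLOCKSIZE (512) keeps the floor at an ashift of 9.
	 */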
	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;

	if (vd->vdev_wholedisk == 1) {
		uint64_t capacity = dkmext.dki_capacity - 1;
		uint64_t blksz = dkmext.dki_lbsize;
		int wce = 1;

		/*
		 * If we own the whole disk, try to enable disk write caching.
		 * We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);

		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
		zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
		    "max_psize %llu", vd->vdev_path, *psize, *max_psize);
	} else {
		*max_psize = *psize;
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}

static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	if (vd->vdev_reopening || dvd == NULL)
		return;

	if (dvd->vd_minor != NULL)
		ddi_devid_str_free(dvd->vd_minor);

	if (dvd->vd_devid != NULL)
		ddi_devid_free(dvd->vd_devid);

	if (dvd->vd_lh != NULL)
		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);

	vd->vdev_delayed_close = B_FALSE;
	kmem_free(dvd, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
}

int
vdev_disk_physio(vdev_t *vd, caddr_t data,
    size_t size, uint64_t offset, int flags)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL)
		return (EIO);

	ASSERT(vd->vdev_ops == &vdev_disk_ops);
	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
}

int
vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
    size_t size, uint64_t offset, int flags)
{
	buf_t *bp;
	int error = 0;

	if (vd_lh == NULL)
		return (EINVAL);

	ASSERT(flags & B_READ || flags & B_WRITE);

	bp = getrbuf(KM_SLEEP);
	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
	bp->b_bcount = size;
	bp->b_un.b_addr = (void *)data;
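	/* lbtodb() converts the byte offset into DEV_BSIZE (512-byte) blocks */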
	bp->b_lblkno = lbtodb(offset);
	bp->b_bufsize = size;

	error = ldi_strategy(vd_lh, bp);
	ASSERT(error == 0);
	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
		error = EIO;
	freerbuf(bp);

	return (error);
}

static void
vdev_disk_io_intr(buf_t *bp)
{
	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
	zio_t *zio = vdb->vdb_io;

	/*
	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
	 * Rather than teach the rest of the stack about other error
	 * possibilities (EFAULT, etc), we normalize the error value here.
	 */
	zio->io_error = (geterror(bp) != 0 ? EIO : 0);

	if (zio->io_error == 0 && bp->b_resid != 0)
		zio->io_error = EIO;

	kmem_free(vdb, sizeof (vdev_disk_buf_t));

	zio_interrupt(zio);
}

static void
vdev_disk_ioctl_free(zio_t *zio)
{
	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}

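/*
 * vsd (vdev-specific data) ops for the cache-flush ioctl path: free the
 * dk_callback allocated in vdev_disk_io_start() and fall back to the
 * default checksum reporting.
 */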
static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};

static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
}

static int
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vdev_disk_buf_t *vdb;
	struct dk_callback *dkc;
	buf_t *bp;
	int error;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = ENXIO;
			return (ZIO_PIPELINE_CONTINUE);
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
			zio->io_vsd_ops = &vdev_disk_vsd_ops;

			dkc->dkc_callback = vdev_disk_ioctl_done;
			dkc->dkc_flag = FLUSH_VOLATILE;
			dkc->dkc_cookie = zio;

			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);

			if (error == 0) {
				/*
				 * The ioctl will be done asynchronously,
				 * and will call vdev_disk_ioctl_done()
				 * upon completion.
				 */
				return (ZIO_PIPELINE_STOP);
			}

			if (error == ENOTSUP || error == ENOTTY) {
				/*
				 * If we get ENOTSUP or ENOTTY, we know that
				 * no future attempts will ever succeed.
				 * In this case we set a persistent bit so
				 * that we don't bother with the ioctl in the
				 * future.
				 */
				vd->vdev_nowritecache = B_TRUE;
			}
			zio->io_error = error;

			break;

		default:
			zio->io_error = ENOTSUP;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);

	vdb->vdb_io = zio;
	bp = &vdb->vdb_buf;

	bioinit(bp);
	bp->b_flags = B_BUSY | B_NOCACHE |
	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
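	/*
	 * First-attempt I/Os use B_FAILFAST so the driver fails quickly;
	 * retried or "try hard" I/Os forgo it so the driver runs its full
	 * error recovery.
	 */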
	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bp->b_flags |= B_FAILFAST;
	bp->b_bcount = zio->io_size;
	bp->b_un.b_addr = zio->io_data;
	bp->b_lblkno = lbtodb(zio->io_offset);
	bp->b_bufsize = zio->io_size;
	bp->b_iodone = (int (*)())vdev_disk_io_intr;

	zfs_zone_zio_start(zio);

	/* ldi_strategy() will return non-zero only on programming errors */
	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);

	return (ZIO_PIPELINE_STOP);
}

static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	zfs_zone_zio_done(zio);

	/*
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
	 * the device has been removed. If this is the case, then we trigger an
	 * asynchronous removal of the device. Otherwise, probe the device and
	 * make sure it's still accessible.
	 */
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		vdev_disk_t *dvd = vd->vdev_tsd;
		int state = DKIO_NONE;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
			/*
			 * We post the resource as soon as possible, instead of
			 * when the async removal actually happens, because the
			 * DE is using this information to discard previous I/O
			 * errors.
			 */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
}

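/*
 * The entries below follow vdev_ops_t member order; the NULL slot is the
 * optional state-change callback, which disk vdevs don't need.
 */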
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
	ldi_handle_t vd_lh;
	vdev_label_t *label;
	uint64_t s, size;
	int l;
	ddi_devid_t tmpdevid;
	int error = -1;
	char *minor_name;

	/*
	 * Read the device label and build the nvlist.
	 */
	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
	    &minor_name) == 0) {
		error = ldi_open_by_devid(tmpdevid, minor_name,
		    FREAD, kcred, &vd_lh, zfs_li);
		ddi_devid_free(tmpdevid);
		ddi_devid_str_free(minor_name);
	}

	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
	    zfs_li)))
		return (error);

	if (ldi_get_size(vd_lh, &s)) {
		(void) ldi_close(vd_lh, FREAD, kcred);
		return (EIO);
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	*config = NULL;
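	/*
	 * A vdev keeps VDEV_LABELS (four) copies of its label: two at the
	 * front of the device and two at the end. Walk them in order and
	 * accept the first one that unpacks into a config for an active
	 * (not destroyed) pool with a nonzero txg.
	 */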
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, l, 0);
		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, sizeof (vdev_label_t));
	(void) ldi_close(vd_lh, FREAD, kcred);
	if (*config == NULL)
		error = EIDRM;

	return (error);
}