1 /*
2 * GRUB -- GRand Unified Bootloader
3 * Copyright (C) 1999,2000,2001,2002,2003,2004 Free Software Foundation, Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 /*
21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
22 * Use is subject to license terms.
23 */
24
25 /*
26 * Copyright (c) 2012 by Delphix. All rights reserved.
27 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
28 */
29
30 /*
31 * The zfs plug-in routines for GRUB are:
32 *
33 * zfs_mount() - locates a valid uberblock of the root pool and reads
34 * in its MOS at the memory address MOS.
35 *
36 * zfs_open() - locates a plain file object by following the MOS
37 * and places its dnode at the memory address DNODE.
38 *
39 * zfs_read() - read in the data blocks pointed by the DNODE.
40 *
41 * ZFS_SCRATCH is used as a working area.
42 *
43 * (memory addr) MOS DNODE ZFS_SCRATCH
44 * | | |
45 * +-------V---------V----------V---------------+
46 * memory | | dnode | dnode | scratch |
47 * | | 512B | 512B | area |
48 * +--------------------------------------------+
49 */
50
51 #ifdef FSYS_ZFS
52
53 #include "shared.h"
54 #include "filesys.h"
55 #include "fsys_zfs.h"
56
/*
 * Cache for a file block of the currently zfs_open()-ed file.
 * NOTE(review): file_start/file_end presumably delimit the part of the file
 * held in file_buf; their users are outside the reviewed section -- confirm.
 */
static void *file_buf = NULL;
static uint64_t file_start = 0;
static uint64_t file_end = 0;

/*
 * Cache for a dnode block, consulted and refilled by dnode_get().
 * dnode_mdn is the metadnode the cached block was read through and
 * [dnode_start, dnode_end) is the range of object numbers it covers.
 */
static dnode_phys_t *dnode_buf = NULL;
static dnode_phys_t *dnode_mdn = NULL;
static uint64_t dnode_start = 0;
static uint64_t dnode_end = 0;

/*
 * NOTE(review): the three variables below are not referenced in the reviewed
 * section of the file; their meaning must be confirmed against their users.
 */
static uint64_t pool_guid = 0;
static uberblock_t current_uberblock;
static char *stackbase;
71
/*
 * Decompression dispatch table, indexed by the compression value taken from
 * a block pointer (see zio_read()).  An entry whose decompressor is 0 is not
 * supported: zio_read() refuses such blocks, with the exception of
 * ZIO_COMPRESS_OFF, which needs no decompression at all.
 */
decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
{
	{"inherit", 0},			/* ZIO_COMPRESS_INHERIT */
	{"on", lzjb_decompress},	/* ZIO_COMPRESS_ON */
	{"off", 0},			/* ZIO_COMPRESS_OFF */
	{"lzjb", lzjb_decompress},	/* ZIO_COMPRESS_LZJB */
	{"empty", 0},			/* ZIO_COMPRESS_EMPTY */
	{"gzip-1", 0},			/* ZIO_COMPRESS_GZIP_1 */
	{"gzip-2", 0},			/* ZIO_COMPRESS_GZIP_2 */
	{"gzip-3", 0},			/* ZIO_COMPRESS_GZIP_3 */
	{"gzip-4", 0},			/* ZIO_COMPRESS_GZIP_4 */
	{"gzip-5", 0},			/* ZIO_COMPRESS_GZIP_5 */
	{"gzip-6", 0},			/* ZIO_COMPRESS_GZIP_6 */
	{"gzip-7", 0},			/* ZIO_COMPRESS_GZIP_7 */
	{"gzip-8", 0},			/* ZIO_COMPRESS_GZIP_8 */
	{"gzip-9", 0},			/* ZIO_COMPRESS_GZIP_9 */
	{"zle", 0},			/* ZIO_COMPRESS_ZLE */
	{"lz4", lz4_decompress}		/* ZIO_COMPRESS_LZ4 */
};
91
/*
 * Forward declaration: zio_read_gang() and zio_read_data() call each other,
 * and zio_read_gang() is defined first.
 */
static int zio_read_data(blkptr_t *bp, void *buf, char *stack);
93
94 /*
95 * Our own version of bcmp().
96 */
97 static int
98 zfs_bcmp(const void *s1, const void *s2, size_t n)
99 {
100 const uchar_t *ps1 = s1;
101 const uchar_t *ps2 = s2;
102
103 if (s1 != s2 && n != 0) {
104 do {
105 if (*ps1++ != *ps2++)
106 return (1);
107 } while (--n != 0);
108 }
109
110 return (0);
111 }
112
/*
 * Integer base-2 logarithm, rounded down; for non-zero input this is the
 * same thing as highbit()-1.  Both 0 and 1 yield 0.
 */
static int
zfs_log2(uint64_t num)
{
	int bits;

	/* count how many times num can be halved before reaching 1 (or 0) */
	for (bits = 0; num > 1; num >>= 1)
		bits++;

	return (bits);
}
128
129 /* Checksum Functions */
130 static void
131 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
132 {
133 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
134 }
135
/*
 * Checksum Table and Values
 *
 * Indexed by the checksum value taken from a block pointer (see
 * zio_checksum_verify()).  Each entry holds a pair of checksum functions
 * (native first, byteswapping second), two flags and the algorithm name.
 * zio_checksum_verify() rejects every algorithm whose first function is
 * NULL.
 * NOTE(review): the two integer fields appear to be "correctable" and
 * "embedded checksum" (ci_eck, read by zio_checksum_verify()) in that
 * order -- confirm against the definition of zio_checksum_info_t.
 */
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, 0, 0, "inherit"},
	{{NULL, NULL}, 0, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
	{{NULL, NULL}, 0, 0, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
	{{NULL, NULL}, 0, 0, "zilog2"},
	/* no byteswapping variant; byteswapped blocks are rejected anyway */
	{{zio_checksum_SHA512, NULL}, 0, 0, "SHA512"}
};
150
151 /*
152 * zio_checksum_verify: Provides support for checksum verification.
153 *
154 * Fletcher2, Fletcher4, SHA-256 and SHA-512/256 are supported.
155 *
156 * Return:
157 * -1 = Failure
158 * 0 = Success
159 */
160 static int
161 zio_checksum_verify(blkptr_t *bp, char *data, int size)
162 {
163 zio_cksum_t zc = bp->blk_cksum;
164 uint32_t checksum = BP_GET_CHECKSUM(bp);
165 int byteswap = BP_SHOULD_BYTESWAP(bp);
166 zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
167 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
168 zio_cksum_t actual_cksum, expected_cksum;
169
170 /* byteswap is not supported */
171 if (byteswap)
172 return (-1);
173
174 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
175 return (-1);
176
177 if (ci->ci_eck) {
178 expected_cksum = zec->zec_cksum;
179 zec->zec_cksum = zc;
180 ci->ci_func[0](data, size, &actual_cksum);
181 zec->zec_cksum = expected_cksum;
182 zc = expected_cksum;
183
184 } else {
185 ci->ci_func[byteswap](data, size, &actual_cksum);
186 }
187
188 if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
189 (actual_cksum.zc_word[1] - zc.zc_word[1]) |
190 (actual_cksum.zc_word[2] - zc.zc_word[2]) |
191 (actual_cksum.zc_word[3] - zc.zc_word[3]))
192 return (-1);
193
194 return (0);
195 }
196
197 /*
198 * vdev_label_start returns the physical disk offset (in bytes) of
199 * label "l".
200 */
201 static uint64_t
202 vdev_label_start(uint64_t psize, int l)
203 {
204 return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
205 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
206 }
207
208 /*
209 * vdev_uberblock_compare takes two uberblock structures and returns an integer
210 * indicating the more recent of the two.
211 * Return Value = 1 if ub2 is more recent
212 * Return Value = -1 if ub1 is more recent
213 * The most recent uberblock is determined using its transaction number and
214 * timestamp. The uberblock with the highest transaction number is
215 * considered "newer". If the transaction numbers of the two blocks match, the
216 * timestamps are compared to determine the "newer" of the two.
217 */
218 static int
219 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
220 {
221 if (ub1->ub_txg < ub2->ub_txg)
222 return (-1);
223 if (ub1->ub_txg > ub2->ub_txg)
224 return (1);
225
226 if (ub1->ub_timestamp < ub2->ub_timestamp)
227 return (-1);
228 if (ub1->ub_timestamp > ub2->ub_timestamp)
229 return (1);
230
231 return (0);
232 }
233
234 /*
235 * Three pieces of information are needed to verify an uberblock: the magic
236 * number, the version number, and the checksum.
237 *
238 * Return:
239 * 0 - Success
240 * -1 - Failure
241 */
242 static int
243 uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset)
244 {
245 blkptr_t bp;
246
247 BP_ZERO(&bp);
248 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
249 BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
250 ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0);
251
252 if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0)
253 return (-1);
254
255 if (uber->ub_magic == UBERBLOCK_MAGIC &&
256 SPA_VERSION_IS_SUPPORTED(uber->ub_version))
257 return (0);
258
259 return (-1);
260 }
261
262 /*
263 * Find the best uberblock.
264 * Return:
265 * Success - Pointer to the best uberblock.
266 * Failure - NULL
267 */
268 static uberblock_t *
269 find_bestub(char *ub_array, uint64_t ashift, uint64_t sector)
270 {
271 uberblock_t *ubbest = NULL;
272 uberblock_t *ubnext;
273 uint64_t offset, ub_size;
274 int i;
275
276 ub_size = VDEV_UBERBLOCK_SIZE(ashift);
277
278 for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) {
279 ubnext = (uberblock_t *)ub_array;
280 ub_array += ub_size;
281 offset = (sector << SPA_MINBLOCKSHIFT) +
282 VDEV_UBERBLOCK_OFFSET(ashift, i);
283
284 if (uberblock_verify(ubnext, ub_size, offset) != 0)
285 continue;
286
287 if (ubbest == NULL ||
288 vdev_uberblock_compare(ubnext, ubbest) > 0)
289 ubbest = ubnext;
290 }
291
292 return (ubbest);
293 }
294
295 /*
296 * Read a block of data based on the gang block address dva,
297 * and put its data in buf.
298 *
299 * Return:
300 * 0 - success
301 * 1 - failure
302 */
303 static int
304 zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack)
305 {
306 zio_gbh_phys_t *zio_gb;
307 uint64_t offset, sector;
308 blkptr_t tmpbp;
309 int i;
310
311 zio_gb = (zio_gbh_phys_t *)stack;
312 stack += SPA_GANGBLOCKSIZE;
313 offset = DVA_GET_OFFSET(dva);
314 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
315
316 /* read in the gang block header */
317 if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) {
318 grub_printf("failed to read in a gang block header\n");
319 return (1);
320 }
321
322 /* self checksuming the gang block header */
323 BP_ZERO(&tmpbp);
324 BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER);
325 BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER);
326 ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva),
327 DVA_GET_OFFSET(dva), bp->blk_birth, 0);
328 if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) {
329 grub_printf("failed to checksum a gang block header\n");
330 return (1);
331 }
332
333 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
334 if (zio_gb->zg_blkptr[i].blk_birth == 0)
335 continue;
336
337 if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack))
338 return (1);
339 buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]);
340 }
341
342 return (0);
343 }
344
345 /*
346 * Read in a block of raw data to buf.
347 *
348 * Return:
349 * 0 - success
350 * 1 - failure
351 */
352 static int
353 zio_read_data(blkptr_t *bp, void *buf, char *stack)
354 {
355 int i, psize;
356
357 psize = BP_GET_PSIZE(bp);
358
359 /* pick a good dva from the block pointer */
360 for (i = 0; i < SPA_DVAS_PER_BP; i++) {
361 uint64_t offset, sector;
362
363 if (bp->blk_dva[i].dva_word[0] == 0 &&
364 bp->blk_dva[i].dva_word[1] == 0)
365 continue;
366
367 if (DVA_GET_GANG(&bp->blk_dva[i])) {
368 if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) == 0)
369 return (0);
370 } else {
371 /* read in a data block */
372 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
373 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
374 if (devread(sector, 0, psize, buf) != 0)
375 return (0);
376 }
377 }
378
379 return (1);
380 }
381
382 /*
383 * Read in a block of data, verify its checksum, decompress if needed,
384 * and put the uncompressed data in buf.
385 *
386 * Return:
387 * 0 - success
388 * errnum - failure
389 */
390 static int
391 zio_read(blkptr_t *bp, void *buf, char *stack)
392 {
393 int lsize, psize, comp;
394 char *retbuf;
395
396 comp = BP_GET_COMPRESS(bp);
397 lsize = BP_GET_LSIZE(bp);
398 psize = BP_GET_PSIZE(bp);
399
400 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
401 (comp != ZIO_COMPRESS_OFF &&
402 decomp_table[comp].decomp_func == NULL)) {
403 grub_printf("compression algorithm not supported\n");
404 return (ERR_FSYS_CORRUPT);
405 }
406
407 if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
408 grub_printf("not enough memory allocated\n");
409 return (ERR_WONT_FIT);
410 }
411
412 retbuf = buf;
413 if (comp != ZIO_COMPRESS_OFF) {
414 buf = stack;
415 stack += psize;
416 }
417
418 if (zio_read_data(bp, buf, stack) != 0) {
419 grub_printf("zio_read_data failed\n");
420 return (ERR_FSYS_CORRUPT);
421 }
422
423 if (zio_checksum_verify(bp, buf, psize) != 0) {
424 grub_printf("checksum verification failed\n");
425 return (ERR_FSYS_CORRUPT);
426 }
427
428 if (comp != ZIO_COMPRESS_OFF) {
429 if (decomp_table[comp].decomp_func(buf, retbuf, psize,
430 lsize) != 0) {
431 grub_printf("zio_read decompression failed\n");
432 return (ERR_FSYS_CORRUPT);
433 }
434 }
435
436 return (0);
437 }
438
439 /*
440 * Get the block from a block id.
441 * push the block onto the stack.
442 *
443 * Return:
444 * 0 - success
445 * errnum - failure
446 */
447 static int
448 dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack)
449 {
450 int idx, level;
451 blkptr_t *bp_array = dn->dn_blkptr;
452 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
453 blkptr_t *bp, *tmpbuf;
454
455 bp = (blkptr_t *)stack;
456 stack += sizeof (blkptr_t);
457
458 tmpbuf = (blkptr_t *)stack;
459 stack += 1<<dn->dn_indblkshift;
460
461 for (level = dn->dn_nlevels - 1; level >= 0; level--) {
462 idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
463 *bp = bp_array[idx];
464 if (level == 0)
465 tmpbuf = buf;
466 if (BP_IS_HOLE(bp)) {
467 grub_memset(buf, 0,
468 dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
469 break;
470 } else if (errnum = zio_read(bp, tmpbuf, stack)) {
471 return (errnum);
472 }
473
474 bp_array = tmpbuf;
475 }
476
477 return (0);
478 }
479
480 /*
481 * mzap_lookup: Looks up property described by "name" and returns the value
482 * in "value".
483 *
484 * Return:
485 * 0 - success
486 * errnum - failure
487 */
488 static int
489 mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name,
490 uint64_t *value)
491 {
492 int i, chunks;
493 mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
494
495 chunks = objsize / MZAP_ENT_LEN - 1;
496 for (i = 0; i < chunks; i++) {
497 if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) {
498 *value = mzap_ent[i].mze_value;
499 return (0);
500 }
501 }
502
503 return (ERR_FSYS_CORRUPT);
504 }
505
506 static uint64_t
507 zap_hash(uint64_t salt, const char *name)
508 {
509 static uint64_t table[256];
510 const uint8_t *cp;
511 uint8_t c;
512 uint64_t crc = salt;
513
514 if (table[128] == 0) {
515 uint64_t *ct;
516 int i, j;
517 for (i = 0; i < 256; i++) {
518 for (ct = table + i, *ct = i, j = 8; j > 0; j--)
519 *ct = (*ct >> 1) ^ (-(*ct & 1) &
520 ZFS_CRC64_POLY);
521 }
522 }
523
524 if (crc == 0 || table[128] != ZFS_CRC64_POLY) {
525 errnum = ERR_FSYS_CORRUPT;
526 return (0);
527 }
528
529 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
530 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
531
532 /*
533 * Only use 28 bits, since we need 4 bits in the cookie for the
534 * collision differentiator. We MUST use the high bits, since
535 * those are the ones that we first pay attention to when
536 * choosing the bucket.
537 */
538 crc &= ~((1ULL << (64 - 28)) - 1);
539
540 return (crc);
541 }
542
543 /*
544 * Only to be used on 8-bit arrays.
545 * array_len is actual len in bytes (not encoded le_value_length).
546 * buf is null-terminated.
547 */
548 static int
549 zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
550 int array_len, const char *buf)
551 {
552 int bseen = 0;
553
554 while (bseen < array_len) {
555 struct zap_leaf_array *la =
556 &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
557 int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
558
559 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
560 return (0);
561
562 if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
563 break;
564 chunk = la->la_next;
565 bseen += toread;
566 }
567 return (bseen == array_len);
568 }
569
570 /*
571 * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the
572 * value for the property "name".
573 *
574 * Return:
575 * 0 - success
576 * errnum - failure
577 */
578 static int
579 zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
580 const char *name, uint64_t *value)
581 {
582 uint16_t chunk;
583 struct zap_leaf_entry *le;
584
585 /* Verify if this is a valid leaf block */
586 if (l->l_hdr.lh_block_type != ZBT_LEAF)
587 return (ERR_FSYS_CORRUPT);
588 if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
589 return (ERR_FSYS_CORRUPT);
590
591 for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
592 chunk != CHAIN_END; chunk = le->le_next) {
593
594 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
595 return (ERR_FSYS_CORRUPT);
596
597 le = ZAP_LEAF_ENTRY(l, blksft, chunk);
598
599 /* Verify the chunk entry */
600 if (le->le_type != ZAP_CHUNK_ENTRY)
601 return (ERR_FSYS_CORRUPT);
602
603 if (le->le_hash != h)
604 continue;
605
606 if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
607 le->le_name_length, name)) {
608
609 struct zap_leaf_array *la;
610 uint8_t *ip;
611
612 if (le->le_int_size != 8 || le->le_value_length != 1)
613 return (ERR_FSYS_CORRUPT);
614
615 /* get the uint64_t property value */
616 la = &ZAP_LEAF_CHUNK(l, blksft,
617 le->le_value_chunk).l_array;
618 ip = la->la_array;
619
620 *value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
621 (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
622 (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
623 (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
624
625 return (0);
626 }
627 }
628
629 return (ERR_FSYS_CORRUPT);
630 }
631
632 /*
633 * Fat ZAP lookup
634 *
635 * Return:
636 * 0 - success
637 * errnum - failure
638 */
639 static int
640 fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap,
641 const char *name, uint64_t *value, char *stack)
642 {
643 zap_leaf_phys_t *l;
644 uint64_t hash, idx, blkid;
645 int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
646
647 /* Verify if this is a fat zap header block */
648 if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
649 zap->zap_flags != 0)
650 return (ERR_FSYS_CORRUPT);
651
652 hash = zap_hash(zap->zap_salt, name);
653 if (errnum)
654 return (errnum);
655
656 /* get block id from index */
657 if (zap->zap_ptrtbl.zt_numblks != 0) {
658 /* external pointer tables not supported */
659 return (ERR_FSYS_CORRUPT);
660 }
661 idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
662 blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];
663
664 /* Get the leaf block */
665 l = (zap_leaf_phys_t *)stack;
666 stack += 1<<blksft;
667 if ((1<<blksft) < sizeof (zap_leaf_phys_t))
668 return (ERR_FSYS_CORRUPT);
669 if (errnum = dmu_read(zap_dnode, blkid, l, stack))
670 return (errnum);
671
672 return (zap_leaf_lookup(l, blksft, hash, name, value));
673 }
674
675 /*
676 * Read in the data of a zap object and find the value for a matching
677 * property name.
678 *
679 * Return:
680 * 0 - success
681 * errnum - failure
682 */
683 static int
684 zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val,
685 char *stack)
686 {
687 uint64_t block_type;
688 int size;
689 void *zapbuf;
690
691 /* Read in the first block of the zap object data. */
692 zapbuf = stack;
693 size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
694 stack += size;
695
696 if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0)
697 return (errnum);
698
699 block_type = *((uint64_t *)zapbuf);
700
701 if (block_type == ZBT_MICRO) {
702 return (mzap_lookup(zapbuf, size, name, val));
703 } else if (block_type == ZBT_HEADER) {
704 /* this is a fat zap */
705 return (fzap_lookup(zap_dnode, zapbuf, name,
706 val, stack));
707 }
708
709 return (ERR_FSYS_CORRUPT);
710 }
711
/*
 * Description of one zap entry, handed to the callback of zap_iterate().
 */
typedef struct zap_attribute {
	int za_integer_length;		/* zap_iterate() always sets 8 */
	uint64_t za_num_integers;	/* zap_iterate() always sets 1 */
	uint64_t za_first_integer;	/* value of the entry */
	char *za_name;			/* name of the entry */
} zap_attribute_t;

/*
 * Callback type for zap_iterate().  It receives the entry, the caller's
 * opaque argument and the scratch area.  A non-zero return value stops the
 * iteration and becomes the return value of zap_iterate().
 */
typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);
720
721 static int
722 zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
723 {
724 uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
725 zap_attribute_t za;
726 int i;
727 mzap_phys_t *mzp = (mzap_phys_t *)stack;
728 stack += size;
729
730 if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0)
731 return (errnum);
732
733 /*
734 * Iteration over fatzap objects has not yet been implemented.
735 * If we encounter a pool in which there are more features for
736 * read than can fit inside a microzap (i.e., more than 2048
737 * features for read), we can add support for fatzap iteration.
738 * For now, fail.
739 */
740 if (mzp->mz_block_type != ZBT_MICRO) {
741 grub_printf("feature information stored in fatzap, pool "
742 "version not supported\n");
743 return (1);
744 }
745
746 za.za_integer_length = 8;
747 za.za_num_integers = 1;
748 for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) {
749 mzap_ent_phys_t *mzep = &mzp->mz_chunk[i];
750 int err;
751
752 za.za_first_integer = mzep->mze_value;
753 za.za_name = mzep->mze_name;
754 err = cb(&za, arg, stack);
755 if (err != 0)
756 return (err);
757 }
758
759 return (0);
760 }
761
762 /*
763 * Get the dnode of an object number from the metadnode of an object set.
764 *
765 * Input
766 * mdn - metadnode to get the object dnode
767 * objnum - object number for the object dnode
768 * buf - data buffer that holds the returning dnode
769 * stack - scratch area
770 *
771 * Return:
772 * 0 - success
773 * errnum - failure
774 */
775 static int
776 dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf,
777 char *stack)
778 {
779 uint64_t blkid, blksz; /* the block id this object dnode is in */
780 int epbs; /* shift of number of dnodes in a block */
781 int idx; /* index within a block */
782 dnode_phys_t *dnbuf;
783
784 blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
785 epbs = zfs_log2(blksz) - DNODE_SHIFT;
786 blkid = objnum >> epbs;
787 idx = objnum & ((1<<epbs)-1);
788
789 if (dnode_buf != NULL && dnode_mdn == mdn &&
790 objnum >= dnode_start && objnum < dnode_end) {
791 grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
792 VERIFY_DN_TYPE(buf, type);
793 return (0);
794 }
795
796 if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
797 dnbuf = dnode_buf;
798 dnode_mdn = mdn;
799 dnode_start = blkid << epbs;
800 dnode_end = (blkid + 1) << epbs;
801 } else {
802 dnbuf = (dnode_phys_t *)stack;
803 stack += blksz;
804 }
805
806 if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack))
807 return (errnum);
808
809 grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
810 VERIFY_DN_TYPE(buf, type);
811
812 return (0);
813 }
814
815 /*
816 * Check if this is a special file that resides at the top
817 * dataset of the pool. Currently this is the GRUB menu,
818 * boot signature and boot signature backup.
819 * str starts with '/'.
820 */
821 static int
822 is_top_dataset_file(char *str)
823 {
824 char *tptr;
825
826 if ((tptr = grub_strstr(str, "menu.lst")) &&
827 (tptr[8] == '\0' || tptr[8] == ' ') &&
828 *(tptr-1) == '/')
829 return (1);
830
831 if (grub_strncmp(str, BOOTSIGN_DIR"/",
832 grub_strlen(BOOTSIGN_DIR) + 1) == 0)
833 return (1);
834
835 if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0)
836 return (1);
837
838 return (0);
839 }
840
841 static int
842 check_feature(zap_attribute_t *za, void *arg, char *stack)
843 {
844 const char **names = arg;
845 int i;
846
847 if (za->za_first_integer == 0)
848 return (0);
849
850 for (i = 0; names[i] != NULL; i++) {
851 if (grub_strcmp(za->za_name, names[i]) == 0) {
852 return (0);
853 }
854 }
855 grub_printf("missing feature for read '%s'\n", za->za_name);
856 return (ERR_NEWER_VERSION);
857 }
858
859 /*
860 * Get the file dnode for a given file name where mdn is the meta dnode
861 * for this ZFS object set. When found, place the file dnode in dn.
862 * The 'path' argument will be mangled.
863 *
864 * Return:
865 * 0 - success
866 * errnum - failure
867 */
868 static int
869 dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn,
870 char *stack)
871 {
872 uint64_t objnum, version;
873 char *cname, ch;
874
875 if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
876 dn, stack))
877 return (errnum);
878
879 if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack))
880 return (errnum);
881 if (version > ZPL_VERSION)
882 return (-1);
883
884 if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack))
885 return (errnum);
886
887 if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
888 dn, stack))
889 return (errnum);
890
891 /* skip leading slashes */
892 while (*path == '/')
893 path++;
894
895 while (*path && !grub_isspace(*path)) {
896
897 /* get the next component name */
898 cname = path;
899 while (*path && !grub_isspace(*path) && *path != '/')
900 path++;
901 ch = *path;
902 *path = 0; /* ensure null termination */
903
904 if (errnum = zap_lookup(dn, cname, &objnum, stack))
905 return (errnum);
906
907 objnum = ZFS_DIRENT_OBJ(objnum);
908 if (errnum = dnode_get(mdn, objnum, 0, dn, stack))
909 return (errnum);
910
911 *path = ch;
912 while (*path == '/')
913 path++;
914 }
915
916 /* We found the dnode for this file. Verify if it is a plain file. */
917 VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);
918
919 return (0);
920 }
921
922 /*
923 * Get the default 'bootfs' property value from the rootpool.
924 *
925 * Return:
926 * 0 - success
927 * errnum -failure
928 */
929 static int
930 get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack)
931 {
932 uint64_t objnum = 0;
933 dnode_phys_t *dn = (dnode_phys_t *)stack;
934 stack += DNODE_SIZE;
935
936 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
937 DMU_OT_OBJECT_DIRECTORY, dn, stack))
938 return (errnum);
939
940 /*
941 * find the object number for 'pool_props', and get the dnode
942 * of the 'pool_props'.
943 */
944 if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack))
945 return (ERR_FILESYSTEM_NOT_FOUND);
946
947 if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack))
948 return (errnum);
949
950 if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack))
951 return (ERR_FILESYSTEM_NOT_FOUND);
952
953 if (!objnum)
954 return (ERR_FILESYSTEM_NOT_FOUND);
955
956 *obj = objnum;
957 return (0);
958 }
959
/*
 * List of pool features that the grub implementation of ZFS supports for
 * read. Note that features that are only required for write do not need
 * to be listed here since grub opens pools in read-only mode.
 *
 * The list is NULL-terminated; check_feature() walks it until it reaches
 * the NULL entry.
 *
 * When this list is updated the version number in usr/src/grub/capability
 * must be incremented to ensure the new grub gets installed.
 */
static const char *spa_feature_names[] = {
	"org.illumos:lz4_compress",
	"org.illumos:sha512",
	NULL
};
973
974 /*
975 * Checks whether the MOS features that are active are supported by this
976 * (GRUB's) implementation of ZFS.
977 *
978 * Return:
979 * 0: Success.
980 * errnum: Failure.
981 */
982 static int
983 check_mos_features(dnode_phys_t *mosmdn, char *stack)
984 {
985 uint64_t objnum;
986 dnode_phys_t *dn;
987 uint8_t error = 0;
988
989 dn = (dnode_phys_t *)stack;
990 stack += DNODE_SIZE;
991
992 if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
993 DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0)
994 return (errnum);
995
996 /*
997 * Find the object number for 'features_for_read' and retrieve its
998 * corresponding dnode. Note that we don't check features_for_write
999 * because GRUB is not opening the pool for write.
1000 */
1001 if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum,
1002 stack)) != 0)
1003 return (errnum);
1004
1005 if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA,
1006 dn, stack)) != 0)
1007 return (errnum);
1008
1009 return (zap_iterate(dn, check_feature, spa_feature_names, stack));
1010 }
1011
/*
 * Given a MOS metadnode, get the object set metadnode of a given filesystem
 * name (fsname), e.g. pool/rootfs, or of a given dataset object number
 * (obj), e.g. the object number of pool/rootfs.
 *
 * If no fsname and no obj are given, the head dataset of the pool's root
 * DSL directory is used.
 * If fsname is given, return its metadnode and, when obj is not NULL, its
 * matching object number in *obj.  A component of the form name@snap
 * selects the snapshot "snap" of that dataset.
 * If only obj is given, return the metadnode for this object number.
 *
 * fsname is modified in place while it is being parsed: the component
 * under examination is temporarily null-terminated.  The original
 * characters are put back after each component, but not on the error
 * returns inside the parsing loop.
 *
 * mdn doubles as working space for the intermediate dnodes and receives
 * the resulting metadnode.
 *
 * Return:
 *	0 - success
 *	errnum - failure
 */
static int
get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj,
    dnode_phys_t *mdn, char *stack)
{
	uint64_t objnum, headobj;
	char *cname, ch;
	blkptr_t *bp;
	objset_phys_t *osp;
	int issnapshot = 0;
	char *snapname;

	/* only an object number was given: go straight to the dataset */
	if (fsname == NULL && obj) {
		headobj = *obj;
		goto skip;
	}

	/* locate the root DSL directory through the pool directory object */
	if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, mdn, stack))
		return (errnum);

	if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum,
	    stack))
		return (errnum);

	if (errnum = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, stack))
		return (errnum);

	/* no name either: use the head dataset of the root directory */
	if (fsname == NULL) {
		headobj =
		    ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
		goto skip;
	}

	/* take out the pool name */
	while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
		fsname++;

	/* descend one DSL directory per remaining name component */
	while (*fsname && !grub_isspace(*fsname)) {
		uint64_t childobj;

		while (*fsname == '/')
			fsname++;

		/* isolate the component and null-terminate it in place */
		cname = fsname;
		while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
			fsname++;
		ch = *fsname;
		*fsname = 0;

		/* cut off a "@snapshot" suffix, remembering where it was */
		snapname = cname;
		while (*snapname && !grub_isspace(*snapname) && *snapname !=
		    '@')
			snapname++;
		if (*snapname == '@') {
			issnapshot = 1;
			*snapname = 0;
		}
		childobj =
		    ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
		if (errnum = dnode_get(mosmdn, childobj,
		    DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))
			return (errnum);

		if (zap_lookup(mdn, cname, &objnum, stack))
			return (ERR_FILESYSTEM_NOT_FOUND);

		if (errnum = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR,
		    mdn, stack))
			return (errnum);

		/* put back the characters overwritten above */
		*fsname = ch;
		if (issnapshot)
			*snapname = '@';
	}
	headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
	if (obj)
		*obj = headobj;

skip:
	if (errnum = dnode_get(mosmdn, headobj, DMU_OT_DSL_DATASET, mdn, stack))
		return (errnum);
	if (issnapshot) {
		uint64_t snapobj;

		/* look the snapshot up in the dataset's snapshot name zap */
		snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))->
		    ds_snapnames_zapobj;

		if (errnum = dnode_get(mosmdn, snapobj,
		    DMU_OT_DSL_DS_SNAP_MAP, mdn, stack))
			return (errnum);
		/* snapname points at the '@'; the name starts right after */
		if (zap_lookup(mdn, snapname + 1, &headobj, stack))
			return (ERR_FILESYSTEM_NOT_FOUND);
		if (errnum = dnode_get(mosmdn, headobj,
		    DMU_OT_DSL_DATASET, mdn, stack))
			return (errnum);
		if (obj)
			*obj = headobj;
	}

	/* read the object set and hand back its meta dnode */
	bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
	osp = (objset_phys_t *)stack;
	stack += sizeof (objset_phys_t);
	if (errnum = zio_read(bp, osp, stack))
		return (errnum);

	grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);

	return (0);
}
1134
1135 /*
1136 * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1137 *
1138 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1139 *
1140 * encoding method/host endian (4 bytes)
1141 * nvl_version (4 bytes)
1142 * nvl_nvflag (4 bytes)
1143 * encoded nvpairs:
1144 * encoded size of the nvpair (4 bytes)
1145 * decoded size of the nvpair (4 bytes)
1146 * name string size (4 bytes)
1147 * name string data (sizeof(NV_ALIGN4(string))
1148 * data type (4 bytes)
1149 * # of elements in the nvpair (4 bytes)
1150 * data
1151 * 2 zero's for the last nvpair
1152 * (end of the entire list) (8 bytes)
1153 *
1154 * Return:
1155 * 0 - success
1156 * 1 - failure
1157 */
1158 static int
1159 nvlist_unpack(char *nvlist, char **out)
1160 {
1161 /* Verify if the 1st and 2nd byte in the nvlist are valid. */
1162 if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN)
1163 return (1);
1164
1165 *out = nvlist + 4;
1166 return (0);
1167 }
1168
/*
 * Given a pointer to the first element of an array of packed nvlists,
 * return a pointer to element "index" by stepping over the preceding
 * nvlists one nvpair at a time.  With index <= 0 the input pointer is
 * returned unchanged.
 */
static char *
nvlist_array(char *nvlist, int index)
{
	int remaining;

	for (remaining = index; remaining > 0; remaining--) {
		/* skip the header, nvl_version, and nvl_nvflag */
		nvlist = nvlist + 4 * 2;

		/* hop from nvpair to nvpair until the zero size marker */
		for (;;) {
			int pair_size = BSWAP_32(*(uint32_t *)nvlist);

			if (pair_size == 0)
				break;
			nvlist += pair_size;
		}

		/* skip the ending 2 zeros - 8 bytes */
		nvlist = nvlist + 4 * 2;
	}

	return (nvlist);
}
1186
/*
 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the
 * list following nvpair. If nvpair is NULL, the first pair is returned. If
 * nvpair is the last pair in the nvlist, NULL is returned.
 *
 * nvl points at the nvl_version field of the list, i.e. just past the
 * 4-byte encoding header.  NULL is also returned when nvl is NULL.
 */
static char *
nvlist_next_nvpair(char *nvl, char *nvpair)
{
	int encode_size;

	if (nvl == NULL)
		return (NULL);

	if (nvpair == NULL) {
		/* skip over nvl_version and nvl_nvflag */
		nvpair = nvl + 4 * 2;
	} else {
		/* skip to the next nvpair */
		encode_size = BSWAP_32(*(uint32_t *)nvpair);
		nvpair += encode_size;
	}

	/* 8 bytes of 0 marks the end of the list */
	if (*(uint64_t *)nvpair == 0)
		return (NULL);

	return (nvpair);
}
1216
/*
 * Copy the name of the nvpair nvp into buf as a NUL-terminated string.
 *
 * nvp    - handle to an encoded nvpair (points at its encoded-size word)
 * buf    - destination buffer
 * buflen - size of buf in bytes, including room for the terminating NUL
 *
 * Return:
 *	0 - success, buf holds the name
 *	1 - failure: the stored name length is negative or the name plus its
 *	    terminator does not fit in buflen bytes (buf is not written)
 */
static int
nvpair_name(char *nvp, char *buf, int buflen)
{
	int len;

	/* skip over encode/decode size */
	nvp += 4 * 2;

	len = BSWAP_32(*(uint32_t *)nvp);

	/*
	 * The length comes straight from disk.  Reject negative values
	 * explicitly: they would otherwise pass a "buflen < len + 1" style
	 * test and be handed to grub_memmove() as a huge size.  Comparing
	 * len against buflen directly also avoids overflowing len + 1.
	 */
	if (len < 0 || len >= buflen)
		return (1);

	/* the name bytes follow the 4-byte length word */
	grub_memmove(buf, nvp + 4, len);
	buf[len] = '\0';

	return (0);
}
1238
/*
 * Return the data type word of the nvpair nvp (an enumerated data_type_t
 * value).  The result tells the caller which type to request from
 * nvpair_value().
 *
 * The type word sits after the encoded/decoded size words, the name length
 * word and the name itself, whose storage is rounded up to a multiple of
 * four bytes.
 */
static int
nvpair_type(char *nvp)
{
	/* the name length follows the encode/decode size words */
	char *name_len_field = nvp + 4 * 2;
	int name_len = BSWAP_32(*(uint32_t *)name_len_field);

	/* hop over the length word and the 4-byte aligned name */
	char *type_field = name_len_field + 4 + ((name_len + 3) & ~3);

	return (BSWAP_32(*(uint32_t *)type_field));
}
1263
1264 static int
1265 nvpair_value(char *nvp, void *val, int valtype, int *nelmp)
1266 {
1267 int name_len, type, slen;
1268 char *strval = val;
1269 uint64_t *intval = val;
1270
1271 /* skip over encode/decode size */
1272 nvp += 4 * 2;
1273
1274 /* skip over name_len */
1275 name_len = BSWAP_32(*(uint32_t *)nvp);
1276 nvp += 4;
1277
1278 /* skip over name */
1279 nvp = nvp + ((name_len + 3) & ~3); /* align */
1280
1281 /* skip over type */
1282 type = BSWAP_32(*(uint32_t *)nvp);
1283 nvp += 4;
1284
1285 if (type == valtype) {
1286 int nelm;
1287
1288 nelm = BSWAP_32(*(uint32_t *)nvp);
1289 if (valtype != DATA_TYPE_BOOLEAN && nelm < 1)
1290 return (1);
1291 nvp += 4;
1292
1293 switch (valtype) {
1294 case DATA_TYPE_BOOLEAN:
1295 return (0);
1296
1297 case DATA_TYPE_STRING:
1298 slen = BSWAP_32(*(uint32_t *)nvp);
1299 nvp += 4;
1300 grub_memmove(strval, nvp, slen);
1301 strval[slen] = '\0';
1302 return (0);
1303
1304 case DATA_TYPE_UINT64:
1305 *intval = BSWAP_64(*(uint64_t *)nvp);
1306 return (0);
1307
1308 case DATA_TYPE_NVLIST:
1309 *(void **)val = (void *)nvp;
1310 return (0);
1311
1312 case DATA_TYPE_NVLIST_ARRAY:
1313 *(void **)val = (void *)nvp;
1314 if (nelmp)
1315 *nelmp = nelm;
1316 return (0);
1317 }
1318 }
1319
1320 return (1);
1321 }
1322
/*
 * Search nvlist for an nvpair called name whose type is valtype and decode
 * its value with nvpair_value().
 *
 * nvlist  - list to search (pointing at its nvl_version word)
 * name    - NUL-terminated name to look for
 * val     - where the value is stored; see nvpair_value()
 * valtype - required DATA_TYPE_* of the pair
 * nelmp   - passed through to nvpair_value()
 *
 * Return:
 *	0 - success
 *	1 - no pair with that exact name and type exists, or nvpair_value()
 *	    failed on the first such pair
 */
static int
nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype,
    int *nelmp)
{
	char *nvpair;
	int want_len = grub_strlen(name);

	for (nvpair = nvlist_next_nvpair(nvlist, NULL);
	    nvpair != NULL;
	    nvpair = nvlist_next_nvpair(nvlist, nvpair)) {
		int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 2));
		char *nvp_name = nvpair + 4 * 3;

		/*
		 * The stored name is not NUL-terminated, so it is compared
		 * over name_len bytes.  The lengths must be equal as well:
		 * otherwise a pair whose name is merely a prefix of the
		 * requested name would be accepted.
		 */
		if (name_len == want_len &&
		    grub_strncmp(nvp_name, name, name_len) == 0 &&
		    nvpair_type(nvpair) == valtype) {
			return (nvpair_value(nvpair, val, valtype, nelmp));
		}
	}
	return (1);
}
1342
1343 /*
1344 * Check if this vdev is online and is in a good state.
1345 */
1346 static int
1347 vdev_validate(char *nv)
1348 {
1349 uint64_t ival;
1350
1351 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival,
1352 DATA_TYPE_UINT64, NULL) == 0 ||
1353 nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival,
1354 DATA_TYPE_UINT64, NULL) == 0 ||
1355 nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival,
1356 DATA_TYPE_UINT64, NULL) == 0)
1357 return (ERR_DEV_VALUES);
1358
1359 return (0);
1360 }
1361
1362 /*
1363 * Get a valid vdev pathname/devid from the boot device.
1364 * The caller should already allocate MAXPATHLEN memory for bootpath and devid.
1365 */
1366 static int
1367 vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath,
1368 int is_spare)
1369 {
1370 char type[16];
1371
1372 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING,
1373 NULL))
1374 return (ERR_FSYS_CORRUPT);
1375
1376 if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) {
1377 uint64_t guid;
1378
1379 if (vdev_validate(nv) != 0)
1380 return (ERR_NO_BOOTPATH);
1381
1382 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID,
1383 &guid, DATA_TYPE_UINT64, NULL) != 0)
1384 return (ERR_NO_BOOTPATH);
1385
1386 if (guid != inguid)
1387 return (ERR_NO_BOOTPATH);
1388
1389 /* for a spare vdev, pick the disk labeled with "is_spare" */
1390 if (is_spare) {
1391 uint64_t spare = 0;
1392 (void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE,
1393 &spare, DATA_TYPE_UINT64, NULL);
1394 if (!spare)
1395 return (ERR_NO_BOOTPATH);
1396 }
1397
1398 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH,
1399 bootpath, DATA_TYPE_STRING, NULL) != 0)
1400 bootpath[0] = '\0';
1401
1402 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID,
1403 devid, DATA_TYPE_STRING, NULL) != 0)
1404 devid[0] = '\0';
1405
1406 if (grub_strlen(bootpath) >= MAXPATHLEN ||
1407 grub_strlen(devid) >= MAXPATHLEN)
1408 return (ERR_WONT_FIT);
1409
1410 return (0);
1411
1412 } else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
1413 grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
1414 (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) {
1415 int nelm, i;
1416 char *child;
1417
1418 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child,
1419 DATA_TYPE_NVLIST_ARRAY, &nelm))
1420 return (ERR_FSYS_CORRUPT);
1421
1422 for (i = 0; i < nelm; i++) {
1423 char *child_i;
1424
1425 child_i = nvlist_array(child, i);
1426 if (vdev_get_bootpath(child_i, inguid, devid,
1427 bootpath, is_spare) == 0)
1428 return (0);
1429 }
1430 }
1431
1432 return (ERR_NO_BOOTPATH);
1433 }
1434
1435 /*
1436 * Check the disk label information and retrieve needed vdev name-value pairs.
1437 *
1438 * Return:
1439 * 0 - success
1440 * ERR_* - failure
1441 */
1442 static int
1443 check_pool_label(uint64_t sector, char *stack, char *outdevid,
1444 char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion)
1445 {
1446 vdev_phys_t *vdev;
1447 uint64_t pool_state, txg = 0;
1448 char *nvlist, *nv, *features;
1449 uint64_t diskguid;
1450
1451 sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT);
1452
1453 /* Read in the vdev name-value pair list (112K). */
1454 if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0)
1455 return (ERR_READ);
1456
1457 vdev = (vdev_phys_t *)stack;
1458 stack += sizeof (vdev_phys_t);
1459
1460 if (nvlist_unpack(vdev->vp_nvlist, &nvlist))
1461 return (ERR_FSYS_CORRUPT);
1462
1463 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state,
1464 DATA_TYPE_UINT64, NULL))
1465 return (ERR_FSYS_CORRUPT);
1466
1467 if (pool_state == POOL_STATE_DESTROYED)
1468 return (ERR_FILESYSTEM_NOT_FOUND);
1469
1470 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME,
1471 current_rootpool, DATA_TYPE_STRING, NULL))
1472 return (ERR_FSYS_CORRUPT);
1473
1474 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg,
1475 DATA_TYPE_UINT64, NULL))
1476 return (ERR_FSYS_CORRUPT);
1477
1478 /* not an active device */
1479 if (txg == 0)
1480 return (ERR_NO_BOOTPATH);
1481
1482 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion,
1483 DATA_TYPE_UINT64, NULL))
1484 return (ERR_FSYS_CORRUPT);
1485 if (!SPA_VERSION_IS_SUPPORTED(*outversion))
1486 return (ERR_NEWER_VERSION);
1487 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv,
1488 DATA_TYPE_NVLIST, NULL))
1489 return (ERR_FSYS_CORRUPT);
1490 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid,
1491 DATA_TYPE_UINT64, NULL))
1492 return (ERR_FSYS_CORRUPT);
1493 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift,
1494 DATA_TYPE_UINT64, NULL) != 0)
1495 return (ERR_FSYS_CORRUPT);
1496 if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0))
1497 return (ERR_NO_BOOTPATH);
1498 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid,
1499 DATA_TYPE_UINT64, NULL))
1500 return (ERR_FSYS_CORRUPT);
1501
1502 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1503 &features, DATA_TYPE_NVLIST, NULL) == 0) {
1504 char *nvp;
1505 char *name = stack;
1506 stack += MAXNAMELEN;
1507
1508 for (nvp = nvlist_next_nvpair(features, NULL);
1509 nvp != NULL;
1510 nvp = nvlist_next_nvpair(features, nvp)) {
1511 zap_attribute_t za;
1512
1513 if (nvpair_name(nvp, name, MAXNAMELEN) != 0)
1514 return (ERR_FSYS_CORRUPT);
1515
1516 za.za_integer_length = 8;
1517 za.za_num_integers = 1;
1518 za.za_first_integer = 1;
1519 za.za_name = name;
1520 if (check_feature(&za, spa_feature_names, stack) != 0)
1521 return (ERR_NEWER_VERSION);
1522 }
1523 }
1524
1525 return (0);
1526 }
1527
1528 /*
1529 * zfs_mount() locates a valid uberblock of the root pool and read in its MOS
1530 * to the memory address MOS.
1531 *
1532 * Return:
1533 * 1 - success
1534 * 0 - failure
1535 */
1536 int
1537 zfs_mount(void)
1538 {
1539 char *stack, *ub_array;
1540 int label = 0;
1541 uberblock_t *ubbest;
1542 objset_phys_t *osp;
1543 char tmp_bootpath[MAXNAMELEN];
1544 char tmp_devid[MAXNAMELEN];
1545 uint64_t tmp_guid, ashift, version;
1546 uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT;
1547 int err = errnum; /* preserve previous errnum state */
1548
1549 /* if it's our first time here, zero the best uberblock out */
1550 if (best_drive == 0 && best_part == 0 && find_best_root) {
1551 grub_memset(¤t_uberblock, 0, sizeof (uberblock_t));
1552 pool_guid = 0;
1553 }
1554
1555 stackbase = ZFS_SCRATCH;
1556 stack = stackbase;
1557 ub_array = stack;
1558 stack += VDEV_UBERBLOCK_RING;
1559
1560 osp = (objset_phys_t *)stack;
1561 stack += sizeof (objset_phys_t);
1562 adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t));
1563
1564 for (label = 0; label < VDEV_LABELS; label++) {
1565
1566 /*
1567 * some eltorito stacks don't give us a size and
1568 * we end up setting the size to MAXUINT, further
1569 * some of these devices stop working once a single
1570 * read past the end has been issued. Checking
1571 * for a maximum part_length and skipping the backup
1572 * labels at the end of the slice/partition/device
1573 * avoids breaking down on such devices.
1574 */
1575 if (part_length == MAXUINT && label == 2)
1576 break;
1577
1578 uint64_t sector = vdev_label_start(adjpl,
1579 label) >> SPA_MINBLOCKSHIFT;
1580
1581 /* Read in the uberblock ring (128K). */
1582 if (devread(sector +
1583 ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT),
1584 0, VDEV_UBERBLOCK_RING, ub_array) == 0)
1585 continue;
1586
1587 if (check_pool_label(sector, stack, tmp_devid,
1588 tmp_bootpath, &tmp_guid, &ashift, &version))
1589 continue;
1590
1591 if (pool_guid == 0)
1592 pool_guid = tmp_guid;
1593
1594 if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL ||
1595 zio_read(&ubbest->ub_rootbp, osp, stack) != 0)
1596 continue;
1597
1598 VERIFY_OS_TYPE(osp, DMU_OST_META);
1599
1600 if (version >= SPA_VERSION_FEATURES &&
1601 check_mos_features(&osp->os_meta_dnode, stack) != 0)
1602 continue;
1603
1604 if (find_best_root && ((pool_guid != tmp_guid) ||
1605 vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0))
1606 continue;
1607
1608 /* Got the MOS. Save it at the memory addr MOS. */
1609 grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE);
1610 grub_memmove(¤t_uberblock, ubbest, sizeof (uberblock_t));
1611 grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN);
1612 grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid));
1613 is_zfs_mount = 1;
1614 return (1);
1615 }
1616
1617 /*
1618 * While some fs impls. (tftp) rely on setting and keeping
1619 * global errnums set, others won't reset it and will break
1620 * when issuing rawreads. The goal here is to simply not
1621 * have zfs mount attempts impact the previous state.
1622 */
1623 errnum = err;
1624 return (0);
1625 }
1626
1627 /*
1628 * zfs_open() locates a file in the rootpool by following the
1629 * MOS and places the dnode of the file in the memory address DNODE.
1630 *
1631 * Return:
1632 * 1 - success
1633 * 0 - failure
1634 */
1635 int
1636 zfs_open(char *filename)
1637 {
1638 char *stack;
1639 dnode_phys_t *mdn;
1640
1641 file_buf = NULL;
1642 stackbase = ZFS_SCRATCH;
1643 stack = stackbase;
1644
1645 mdn = (dnode_phys_t *)stack;
1646 stack += sizeof (dnode_phys_t);
1647
1648 dnode_mdn = NULL;
1649 dnode_buf = (dnode_phys_t *)stack;
1650 stack += 1<<DNODE_BLOCK_SHIFT;
1651
1652 /*
1653 * menu.lst is placed at the root pool filesystem level,
1654 * do not goto 'current_bootfs'.
1655 */
1656 if (is_top_dataset_file(filename)) {
1657 if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack))
1658 return (0);
1659
1660 current_bootfs_obj = 0;
1661 } else {
1662 if (current_bootfs[0] == '\0') {
1663 /* Get the default root filesystem object number */
1664 if (errnum = get_default_bootfsobj(MOS,
1665 ¤t_bootfs_obj, stack))
1666 return (0);
1667
1668 if (errnum = get_objset_mdn(MOS, NULL,
1669 ¤t_bootfs_obj, mdn, stack))
1670 return (0);
1671 } else {
1672 if (errnum = get_objset_mdn(MOS, current_bootfs,
1673 ¤t_bootfs_obj, mdn, stack)) {
1674 grub_memset(current_bootfs, 0, MAXNAMELEN);
1675 return (0);
1676 }
1677 }
1678 }
1679
1680 if (dnode_get_path(mdn, filename, DNODE, stack)) {
1681 errnum = ERR_FILE_NOT_FOUND;
1682 return (0);
1683 }
1684
1685 /* get the file size and set the file position to 0 */
1686
1687 /*
1688 * For DMU_OT_SA we will need to locate the SIZE attribute
1689 * attribute, which could be either in the bonus buffer
1690 * or the "spill" block.
1691 */
1692 if (DNODE->dn_bonustype == DMU_OT_SA) {
1693 sa_hdr_phys_t *sahdrp;
1694 int hdrsize;
1695
1696 if (DNODE->dn_bonuslen != 0) {
1697 sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
1698 } else {
1699 if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1700 blkptr_t *bp = &DNODE->dn_spill;
1701 void *buf;
1702
1703 buf = (void *)stack;
1704 stack += BP_GET_LSIZE(bp);
1705
1706 /* reset errnum to rawread() failure */
1707 errnum = 0;
1708 if (zio_read(bp, buf, stack) != 0) {
1709 return (0);
1710 }
1711 sahdrp = buf;
1712 } else {
1713 errnum = ERR_FSYS_CORRUPT;
1714 return (0);
1715 }
1716 }
1717 hdrsize = SA_HDR_SIZE(sahdrp);
1718 filemax = *(uint64_t *)((char *)sahdrp + hdrsize +
1719 SA_SIZE_OFFSET);
1720 } else {
1721 filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
1722 }
1723 filepos = 0;
1724
1725 dnode_buf = NULL;
1726 return (1);
1727 }
1728
1729 /*
1730 * zfs_read reads in the data blocks pointed by the DNODE.
1731 *
1732 * Return:
1733 * len - the length successfully read in to the buffer
1734 * 0 - failure
1735 */
1736 int
1737 zfs_read(char *buf, int len)
1738 {
1739 char *stack;
1740 int blksz, length, movesize;
1741
1742 if (file_buf == NULL) {
1743 file_buf = stackbase;
1744 stackbase += SPA_MAXBLOCKSIZE;
1745 file_start = file_end = 0;
1746 }
1747 stack = stackbase;
1748
1749 /*
1750 * If offset is in memory, move it into the buffer provided and return.
1751 */
1752 if (filepos >= file_start && filepos+len <= file_end) {
1753 grub_memmove(buf, file_buf + filepos - file_start, len);
1754 filepos += len;
1755 return (len);
1756 }
1757
1758 blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1759
1760 /*
1761 * Entire Dnode is too big to fit into the space available. We
1762 * will need to read it in chunks. This could be optimized to
1763 * read in as large a chunk as there is space available, but for
1764 * now, this only reads in one data block at a time.
1765 */
1766 length = len;
1767 while (length) {
1768 /*
1769 * Find requested blkid and the offset within that block.
1770 */
1771 uint64_t blkid = filepos / blksz;
1772
1773 if (errnum = dmu_read(DNODE, blkid, file_buf, stack))
1774 return (0);
1775
1776 file_start = blkid * blksz;
1777 file_end = file_start + blksz;
1778
1779 movesize = MIN(length, file_end - filepos);
1780
1781 grub_memmove(buf, file_buf + filepos - file_start,
1782 movesize);
1783 buf += movesize;
1784 length -= movesize;
1785 filepos += movesize;
1786 }
1787
1788 return (len);
1789 }
1790
/*
 * zfs_embed() is a stub: it does no work, ignores both of its arguments
 * and always returns 1.
 */
int
zfs_embed(int *start_sector, int needed_sectors)
{
	/* neither argument is consulted */
	(void) start_sector;
	(void) needed_sectors;

	return (1);
}
1799
1800 #endif /* FSYS_ZFS */